Merge branch 'master' of ssh://rsync.linux-nfs.org/home/trondmy/www_sites/rsync.linux...

[linux-2.6] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 6da13bba3e23c18add93ec275e904106dfdda2af..1f31a528fdba8ef941f5a75acb65c3ac6a4bf815 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -294,6 +294,10 @@ static inline void prepare_lock_switch(runqueue_t *rq, task_t *next)
  
  static inline void finish_lock_switch(runqueue_t *rq, task_t *prev)
  {
+#ifdef CONFIG_DEBUG_SPINLOCK
+       /* this is a valid case when another task releases the spinlock */
+       rq->lock.owner = current;
+#endif
         spin_unlock_irq(&rq->lock);
  }
  
@@ -1529,10 +1533,6 @@ static inline void finish_task_switch(runqueue_t *rq, task_t *prev)
          *              Manfred Spraul <manfred@colorfullife.com>
          */
         prev_task_flags = prev->flags;
-#ifdef CONFIG_DEBUG_SPINLOCK
-       /* this is a valid case when another task releases the spinlock */
-       rq->lock.owner = current;
-#endif
         finish_arch_switch(prev);
         finish_lock_switch(rq, prev);
         if (mm)
@@ -1906,10 +1906,11 @@ out:
   */
  static struct sched_group *
  find_busiest_group(struct sched_domain *sd, int this_cpu,
-                  unsigned long *imbalance, enum idle_type idle)
+                  unsigned long *imbalance, enum idle_type idle, int *sd_idle)
  {
         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+       unsigned long max_pull;
         int load_idx;
  
         max_load = this_load = total_load = total_pwr = 0;
@@ -1931,6 +1932,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 avg_load = 0;
  
                 for_each_cpu_mask(i, group->cpumask) {
+                       if (*sd_idle && !idle_cpu(i))
+                               *sd_idle = 0;
+
                         /* Bias balancing toward cpus of our domain */
                         if (local_group)
                                 load = target_load(i, load_idx);
@@ -1956,7 +1960,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 group = group->next;
         } while (group != sd->groups);
  
-       if (!busiest || this_load >= max_load)
+       if (!busiest || this_load >= max_load || max_load <= SCHED_LOAD_SCALE)
                 goto out_balanced;
  
         avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
@@ -1976,8 +1980,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
          * by pulling tasks to us.  Be careful of negative numbers as they'll
          * appear as very large values with unsigned longs.
          */
+
+       /* Don't want to pull so many tasks that a group would go idle */
+       max_pull = min(max_load - avg_load, max_load - SCHED_LOAD_SCALE);
+
         /* How much load to actually move to equalise the imbalance */
-       *imbalance = min((max_load - avg_load) * busiest->cpu_power,
+       *imbalance = min(max_pull * busiest->cpu_power,
                                 (avg_load - this_load) * this->cpu_power)
                         / SCHED_LOAD_SCALE;
  
@@ -2074,11 +2082,14 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
         unsigned long imbalance;
         int nr_moved, all_pinned = 0;
         int active_balance = 0;
+       int sd_idle = 0;
+
+       if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER)
+               sd_idle = 1;
  
-       spin_lock(&this_rq->lock);
         schedstat_inc(sd, lb_cnt[idle]);
  
-       group = find_busiest_group(sd, this_cpu, &imbalance, idle);
+       group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle);
         if (!group) {
                 schedstat_inc(sd, lb_nobusyg[idle]);
                 goto out_balanced;
@@ -2102,19 +2113,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                  * still unbalanced. nr_moved simply stays zero, so it is
                  * correctly treated as an imbalance.
                  */
-               double_lock_balance(this_rq, busiest);
+               double_rq_lock(this_rq, busiest);
                 nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                               imbalance, sd, idle,
-                                               &all_pinned);
-               spin_unlock(&busiest->lock);
+                                       imbalance, sd, idle, &all_pinned);
+               double_rq_unlock(this_rq, busiest);
  
                 /* All tasks on this runqueue were pinned by CPU affinity */
                 if (unlikely(all_pinned))
                         goto out_balanced;
         }
  
-       spin_unlock(&this_rq->lock);
-
         if (!nr_moved) {
                 schedstat_inc(sd, lb_failed[idle]);
                 sd->nr_balance_failed++;
@@ -2122,6 +2130,16 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                 if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
  
                         spin_lock(&busiest->lock);
+
+                       /* don't kick the migration_thread, if the curr
+                        * task on busiest cpu can't be moved to this_cpu
+                        */
+                       if (!cpu_isset(this_cpu, busiest->curr->cpus_allowed)) {
+                               spin_unlock(&busiest->lock);
+                               all_pinned = 1;
+                               goto out_one_pinned;
+                       }
+
                         if (!busiest->active_balance) {
                                 busiest->active_balance = 1;
                                 busiest->push_cpu = this_cpu;
@@ -2154,19 +2172,23 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                         sd->balance_interval *= 2;
         }
  
+       if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+               return -1;
         return nr_moved;
  
  out_balanced:
-       spin_unlock(&this_rq->lock);
-
         schedstat_inc(sd, lb_balanced[idle]);
  
         sd->nr_balance_failed = 0;
+
+out_one_pinned:
         /* tune up the balancing interval */
         if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) ||
                         (sd->balance_interval < sd->max_interval))
                 sd->balance_interval *= 2;
  
+       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+               return -1;
         return 0;
  }
  
@@ -2184,9 +2206,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
         runqueue_t *busiest = NULL;
         unsigned long imbalance;
         int nr_moved = 0;
+       int sd_idle = 0;
+
+       if (sd->flags & SD_SHARE_CPUPOWER)
+               sd_idle = 1;
  
         schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
-       group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
+       group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, &sd_idle);
         if (!group) {
                 schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
                 goto out_balanced;
@@ -2200,22 +2226,30 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
  
         BUG_ON(busiest == this_rq);
  
-       /* Attempt to move tasks */
-       double_lock_balance(this_rq, busiest);
-
         schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
-       nr_moved = move_tasks(this_rq, this_cpu, busiest,
+
+       nr_moved = 0;
+       if (busiest->nr_running > 1) {
+               /* Attempt to move tasks */
+               double_lock_balance(this_rq, busiest);
+               nr_moved = move_tasks(this_rq, this_cpu, busiest,
                                         imbalance, sd, NEWLY_IDLE, NULL);
-       if (!nr_moved)
+               spin_unlock(&busiest->lock);
+       }
+
+       if (!nr_moved) {
                 schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
-       else
+               if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+                       return -1;
+       } else
                 sd->nr_balance_failed = 0;
  
-       spin_unlock(&busiest->lock);
         return nr_moved;
  
  out_balanced:
         schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+       if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER)
+               return -1;
         sd->nr_balance_failed = 0;
         return 0;
  }
@@ -2340,7 +2374,11 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
  
                 if (j - sd->last_balance >= interval) {
                         if (load_balance(this_cpu, this_rq, sd, idle)) {
-                               /* We've pulled tasks over so no longer idle */
+                               /*
+                                * We've pulled tasks over so either we're no
+                                * longer idle, or one of our SMT siblings is
+                                * not idle.
+                                */
                                 idle = NOT_IDLE;
                         }
                         sd->last_balance += interval;
@@ -2650,6 +2688,16 @@ static inline void wake_sleeping_dependent(int this_cpu, runqueue_t *this_rq)
          */
  }
  
+/*
+ * number of 'lost' timeslices this task won't be able to fully
+ * utilize, if another task runs on a sibling. This models the
+ * slowdown effect of other tasks running on siblings:
+ */
+static inline unsigned long smt_slice(task_t *p, struct sched_domain *sd)
+{
+       return p->time_slice * (100 - sd->per_cpu_gain) / 100;
+}
+
  static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
  {
         struct sched_domain *tmp, *sd = NULL;
@@ -2714,8 +2762,9 @@ static inline int dependent_sleeper(int this_cpu, runqueue_t *this_rq)
                                 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
                                         ret = 1;
                 } else
-                       if (((smt_curr->time_slice * (100 - sd->per_cpu_gain) /
-                               100) > task_timeslice(p)))
+                       if (smt_curr->static_prio < p->static_prio &&
+                               !TASK_PREEMPTS_CURR(p, smt_rq) &&
+                               smt_slice(smt_curr, sd) > task_timeslice(p))
                                         ret = 1;
  
  check_smt_task:
@@ -2737,8 +2786,8 @@ check_smt_task:
                                 (sd->per_cpu_gain * DEF_TIMESLICE / 100))
                                         resched_task(smt_curr);
                 } else {
-                       if ((p->time_slice * (100 - sd->per_cpu_gain) / 100) >
-                               task_timeslice(smt_curr))
+                       if (TASK_PREEMPTS_CURR(p, smt_rq) &&
+                               smt_slice(p, sd) > task_timeslice(smt_curr))
                                         resched_task(smt_curr);
                         else
                                 wakeup_busy_runqueue(smt_rq);
@@ -3907,7 +3956,7 @@ asmlinkage long sys_sched_yield(void)
         if (rt_task(current))
                 target = rq->active;
  
-       if (current->array->nr_active == 1) {
+       if (array->nr_active == 1) {
                 schedstat_inc(rq, yld_act_empty);
                 if (!rq->expired->nr_active)
                         schedstat_inc(rq, yld_both_empty);
@@ -5553,3 +5602,47 @@ void normalize_rt_tasks(void)
  }
  
  #endif /* CONFIG_MAGIC_SYSRQ */
+
+#ifdef CONFIG_IA64
+/*
+ * These functions are only useful for the IA64 MCA handling.
+ *
+ * They can only be called when the whole system has been
+ * stopped - every CPU needs to be quiescent, and no scheduling
+ * activity can take place. Using them for anything else would
+ * be a serious bug, and as a result, they aren't even visible
+ * under any other configuration.
+ */
+
+/**
+ * curr_task - return the current task for a given cpu.
+ * @cpu: the processor in question.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+task_t *curr_task(int cpu)
+{
+       return cpu_curr(cpu);
+}
+
+/**
+ * set_curr_task - set the current task for a given cpu.
+ * @cpu: the processor in question.
+ * @p: the task pointer to set.
+ *
+ * Description: This function must only be used when non-maskable interrupts
+ * are serviced on a separate stack.  It allows the architecture to switch the
+ * notion of the current task on a cpu in a non-blocking manner.  This function
+ * must be called with all CPUs synchronized and interrupts disabled, and the
+ * caller must save the original value of the current task (see
+ * curr_task() above) and restore that value before reenabling interrupts and
+ * re-starting the system.
+ *
+ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
+ */
+void set_curr_task(int cpu, task_t *p)
+{
+       cpu_curr(cpu) = p;
+}
+
+#endif