*/
unsigned long nr_running;
#ifdef CONFIG_SMP
- unsigned long cpu_load;
+ unsigned long cpu_load[3];
#endif
unsigned long long nr_switches;
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, int type)
{
runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ if (type == 0)
+ return load_now;
- return min(rq->cpu_load, load_now);
+ return min(rq->cpu_load[type-1], load_now);
}
/*
* Return a high guess at the load of a migration-target cpu
*/
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, int type)
{
runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ if (type == 0)
+ return load_now;
- return max(rq->cpu_load, load_now);
+ return max(rq->cpu_load[type-1], load_now);
}
#endif
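As a quick illustration of the biasing that the new type argument gives source_load()/target_load() (0 means "use the instantaneous load", higher values consult progressively longer cpu_load[] histories), here is a minimal user-space sketch. It is illustrative only: fake_rq stands in for runqueue_t, and SCHED_LOAD_SCALE plus the sample numbers are assumed rather than taken from the kernel.

/* sketch only: not kernel code, values are made up */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed for the sketch */

struct fake_rq {
	unsigned long nr_running;
	unsigned long cpu_load[3];	/* short, medium, long averaging periods */
};

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

/* under-estimate a migration source, as source_load() does */
static unsigned long src_load(struct fake_rq *rq, int type)
{
	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;

	if (type == 0)
		return load_now;
	return min_ul(rq->cpu_load[type-1], load_now);
}

/* over-estimate a migration target, as target_load() does */
static unsigned long tgt_load(struct fake_rq *rq, int type)
{
	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;

	if (type == 0)
		return load_now;
	return max_ul(rq->cpu_load[type-1], load_now);
}

int main(void)
{
	/* one task runnable right now, but a history of roughly three */
	struct fake_rq rq = { 1, { 3*SCHED_LOAD_SCALE, 3*SCHED_LOAD_SCALE, 3*SCHED_LOAD_SCALE } };
	int type;

	for (type = 0; type < 4; type++)
		printf("type %d: source %lu, target %lu\n",
		       type, src_load(&rq, type), tgt_load(&rq, type));
	return 0;
}

With these numbers the source estimate stays at the instantaneous 1-task load for every index, while the target estimate keeps the higher historical value, which is exactly the conservative bias the comment above describes.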
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_IDLE) {
- cpus_and(tmp, sd->span, cpu_online_map);
- cpus_and(tmp, tmp, p->cpus_allowed);
+ cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
if (idle_cpu(i))
return i;
}
}
- else break;
+ else
+ break;
}
return cpu;
}
runqueue_t *rq;
#ifdef CONFIG_SMP
unsigned long load, this_load;
- struct sched_domain *sd;
+ struct sched_domain *sd, *this_sd = NULL;
int new_cpu;
#endif
if (unlikely(task_running(rq, p)))
goto out_activate;
-#ifdef CONFIG_SCHEDSTATS
+ new_cpu = cpu;
+
schedstat_inc(rq, ttwu_cnt);
if (cpu == this_cpu) {
schedstat_inc(rq, ttwu_local);
- } else {
- for_each_domain(this_cpu, sd) {
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_wake_remote);
- break;
- }
+ goto out_set_cpu;
+ }
+
+ for_each_domain(this_cpu, sd) {
+ if (cpu_isset(cpu, sd->span)) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ this_sd = sd;
+ break;
}
}
-#endif
- new_cpu = cpu;
- if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+ if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu;
- load = source_load(cpu);
- this_load = target_load(this_cpu);
-
/*
- * If sync wakeup then subtract the (maximum possible) effect of
- * the currently running task from the load of the current CPU:
+ * Check for affine wakeup and passive balancing possibilities.
*/
- if (sync)
- this_load -= SCHED_LOAD_SCALE;
+ if (this_sd) {
+ int idx = this_sd->wake_idx;
+ unsigned int imbalance;
- /* Don't pull the task off an idle CPU to a busy one */
- if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
- goto out_set_cpu;
+ imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
- new_cpu = this_cpu; /* Wake to this CPU if we can */
+ load = source_load(cpu, idx);
+ this_load = target_load(this_cpu, idx);
- /*
- * Scan domains for affine wakeup and passive balancing
- * possibilities.
- */
- for_each_domain(this_cpu, sd) {
- unsigned int imbalance;
- /*
- * Start passive balancing when half the imbalance_pct
- * limit is reached.
- */
- imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+ new_cpu = this_cpu; /* Wake to this CPU if we can */
- if ((sd->flags & SD_WAKE_AFFINE) &&
- !task_hot(p, rq->timestamp_last_tick, sd)) {
+ if (this_sd->flags & SD_WAKE_AFFINE) {
+ unsigned long tl = this_load;
/*
- * This domain has SD_WAKE_AFFINE and p is cache cold
- * in this domain.
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
*/
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_move_affine);
+ if (sync)
+ tl -= SCHED_LOAD_SCALE;
+
+ if ((tl <= load &&
+ tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
+ 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+ /*
+ * This domain has SD_WAKE_AFFINE and the
+ * wakeup does not create a bad imbalance,
+ * so pull p to this CPU.
+ */
+ schedstat_inc(this_sd, ttwu_move_affine);
goto out_set_cpu;
}
- } else if ((sd->flags & SD_WAKE_BALANCE) &&
- imbalance*this_load <= 100*load) {
- /*
- * This domain has SD_WAKE_BALANCE and there is
- * an imbalance.
- */
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_move_balance);
+ }
+
+ /*
+ * Start passive balancing when half the imbalance_pct
+ * limit is reached.
+ */
+ if (this_sd->flags & SD_WAKE_BALANCE) {
+ if (imbalance*this_load <= 100*load) {
+ schedstat_inc(this_sd, ttwu_move_balance);
goto out_set_cpu;
}
}
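To see what the two tests above amount to, here is a rough user-space sketch with a typical imbalance_pct of 125, so the derived threshold is 100 + 25/2 = 112. The load figures, imbalance_pct and SCHED_LOAD_SCALE are assumed for illustration; the two conditions are copied from the wakeup path above.

/* sketch only: numbers assumed, tests mirror the wakeup path above */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed for the sketch */

int main(void)
{
	unsigned int imbalance_pct = 125;		/* assumed domain tuning */
	unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

	unsigned long load = 3 * SCHED_LOAD_SCALE;	/* waker's previous CPU, source_load() */
	unsigned long this_load = 1 * SCHED_LOAD_SCALE;	/* waking CPU, target_load() */
	unsigned long remote = 3 * SCHED_LOAD_SCALE;	/* target_load() of the previous CPU */
	unsigned long tl = this_load;			/* no sync wakeup in this example */

	int affine = (tl <= load && tl + remote <= SCHED_LOAD_SCALE) ||
		     100*(tl + SCHED_LOAD_SCALE) <= imbalance*load;
	int passive = imbalance*this_load <= 100*load;

	printf("imbalance=%u affine=%d passive=%d\n", imbalance, affine, passive);
	return 0;
}

In this example the affine test already succeeds (25600 <= 112*384), so the task is woken on this CPU before passive balancing is even consulted.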
cpus_and(mask, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, mask) {
- load = target_load(i);
+ load = target_load(i, sd->wake_idx);
if (load < min_load) {
min_cpu = i;
}
/* add +1 to account for the new task */
- this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+ this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
/*
* Would with the addition of the new task to the
* current CPU there be an imbalance between this
* CPU and the idlest CPU?
*/
static inline
int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
- struct sched_domain *sd, enum idle_type idle)
+ struct sched_domain *sd, enum idle_type idle, int *all_pinned)
{
/*
* We do not migrate tasks that are:
* 1) running (obviously), or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
*/
- if (task_running(rq, p))
- return 0;
if (!cpu_isset(this_cpu, p->cpus_allowed))
return 0;
+ *all_pinned = 0;
+
+ if (task_running(rq, p))
+ return 0;
/*
* Aggressive migration if:
return 1;
if (task_hot(p, rq->timestamp_last_tick, sd))
- return 0;
+ return 0;
return 1;
}
*/
static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
unsigned long max_nr_move, struct sched_domain *sd,
- enum idle_type idle)
+ enum idle_type idle, int *all_pinned)
{
prio_array_t *array, *dst_array;
struct list_head *head, *curr;
- int idx, pulled = 0;
+ int idx, pulled = 0, pinned = 0;
task_t *tmp;
- if (max_nr_move <= 0 || busiest->nr_running <= 1)
+ if (max_nr_move == 0)
goto out;
+ pinned = 1;
+
/*
* We first consider expired tasks. Those will likely not be
* executed in the near future, and they are most likely to
curr = curr->prev;
- if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+ if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
if (curr != head)
goto skip_queue;
idx++;
* inside pull_task().
*/
schedstat_add(sd, lb_gained[idle], pulled);
+
+ if (all_pinned)
+ *all_pinned = pinned;
return pulled;
}
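The pinned flag starts at 1 and can_migrate_task() clears it as soon as any candidate task is at least allowed on the destination CPU, even if that task is then rejected for another reason. A reduced user-space sketch of that contract follows; mock_task and the field names are mocked stand-ins, not kernel code.

/* sketch only: the checks mirror can_migrate_task()'s ordering */
#include <stdio.h>

struct mock_task {
	int allowed_on_dst;	/* stands in for cpu_isset(this_cpu, p->cpus_allowed) */
	int running;		/* stands in for task_running(rq, p) */
};

static int can_migrate(struct mock_task *p, int *all_pinned)
{
	if (!p->allowed_on_dst)
		return 0;	/* affinity-pinned: leave *all_pinned alone */

	*all_pinned = 0;	/* at least one task could legally move here */

	if (p->running)
		return 0;	/* rejected, but not because of affinity */
	return 1;
}

int main(void)
{
	struct mock_task queue[] = {
		{ 0, 0 },	/* pinned elsewhere */
		{ 0, 1 },	/* pinned elsewhere and running */
	};
	int pinned = 1, moved = 0;
	unsigned int i;

	for (i = 0; i < sizeof(queue)/sizeof(queue[0]); i++)
		moved += can_migrate(&queue[i], &pinned);

	/* the caller uses pinned==1 to bail out instead of counting this
	 * as a balance failure that could escalate to active balancing */
	printf("moved=%d all_pinned=%d\n", moved, pinned);
	return 0;
}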
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+ int load_idx;
max_load = this_load = total_load = total_pwr = 0;
+ if (idle == NOT_IDLE)
+ load_idx = sd->busy_idx;
+ else if (idle == NEWLY_IDLE)
+ load_idx = sd->newidle_idx;
+ else
+ load_idx = sd->idle_idx;
do {
unsigned long load;
for_each_cpu_mask(i, group->cpumask) {
/* Bias balancing toward cpus of our domain */
if (local_group)
- load = target_load(i);
+ load = target_load(i, load_idx);
else
- load = source_load(i);
+ load = source_load(i, load_idx);
avg_load += load;
}
/* Get rid of the scaling factor, rounding down as we divide */
*imbalance = *imbalance / SCHED_LOAD_SCALE;
-
return busiest;
out_balanced:
- if (busiest && (idle == NEWLY_IDLE ||
- (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
- *imbalance = 1;
- return busiest;
- }
*imbalance = 0;
return NULL;
int i;
for_each_cpu_mask(i, group->cpumask) {
- load = source_load(i);
+ load = source_load(i, 0);
if (load > max_load) {
max_load = load;
struct sched_group *group;
runqueue_t *busiest;
unsigned long imbalance;
- int nr_moved;
+ int nr_moved, all_pinned;
+ int active_balance = 0;
spin_lock(&this_rq->lock);
schedstat_inc(sd, lb_cnt[idle]);
goto out_balanced;
}
- /*
- * This should be "impossible", but since load
- * balancing is inherently racy and statistical,
- * it could happen in theory.
- */
- if (unlikely(busiest == this_rq)) {
- WARN_ON(1);
- goto out_balanced;
- }
+ BUG_ON(busiest == this_rq);
schedstat_add(sd, lb_imbalance[idle], imbalance);
*/
double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle);
+ imbalance, sd, idle,
+ &all_pinned);
spin_unlock(&busiest->lock);
+
+ /* All tasks on this runqueue were pinned by CPU affinity */
+ if (unlikely(all_pinned))
+ goto out_balanced;
}
+
spin_unlock(&this_rq->lock);
if (!nr_moved) {
sd->nr_balance_failed++;
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
- int wake = 0;
spin_lock(&busiest->lock);
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
- wake = 1;
+ active_balance = 1;
}
spin_unlock(&busiest->lock);
- if (wake)
+ if (active_balance)
wake_up_process(busiest->migration_thread);
/*
* We've kicked active balancing, reset the failure
* counter.
*/
- sd->nr_balance_failed = sd->cache_nice_tries;
+ sd->nr_balance_failed = sd->cache_nice_tries+1;
}
-
- /*
- * We were unbalanced, but unsuccessful in move_tasks(),
- * so bump the balance_interval to lessen the lock contention.
- */
- if (sd->balance_interval < sd->max_interval)
- sd->balance_interval++;
- } else {
+ } else
sd->nr_balance_failed = 0;
+ if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
+ } else {
+ /*
+ * If we've begun active balancing, start to back off. This
+ * case may not be covered by the all_pinned logic if there
+ * is only 1 task on the busy runqueue (because we don't call
+ * move_tasks).
+ */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval *= 2;
}
return nr_moved;
schedstat_inc(sd, lb_balanced[idle]);
+ sd->nr_balance_failed = 0;
/* tune up the balancing interval */
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
if (!group) {
- schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
- goto out;
+ goto out_balanced;
}
busiest = find_busiest_queue(group);
- if (!busiest || busiest == this_rq) {
- schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ if (!busiest) {
schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
- goto out;
+ goto out_balanced;
}
+ BUG_ON(busiest == this_rq);
+
/* Attempt to move tasks */
double_lock_balance(this_rq, busiest);
schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, NEWLY_IDLE);
+ imbalance, sd, NEWLY_IDLE, NULL);
if (!nr_moved)
schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+ else
+ sd->nr_balance_failed = 0;
spin_unlock(&busiest->lock);
-
-out:
return nr_moved;
+
+out_balanced:
+ schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ sd->nr_balance_failed = 0;
+ return 0;
}
/*
static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
{
struct sched_domain *sd;
- struct sched_group *cpu_group;
runqueue_t *target_rq;
- cpumask_t visited_cpus;
- int cpu;
+ int target_cpu = busiest_rq->push_cpu;
+
+ if (busiest_rq->nr_running <= 1)
+ /* no task to move */
+ return;
+
+ target_rq = cpu_rq(target_cpu);
/*
- * Search for suitable CPUs to push tasks to in successively higher
- * domains with SD_LOAD_BALANCE set.
+ * This condition is "impossible"; if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
*/
- visited_cpus = CPU_MASK_NONE;
- for_each_domain(busiest_cpu, sd) {
- if (!(sd->flags & SD_LOAD_BALANCE))
- /* no more domains to search */
- break;
+ BUG_ON(busiest_rq == target_rq);
- schedstat_inc(sd, alb_cnt);
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
- cpu_group = sd->groups;
- do {
- for_each_cpu_mask(cpu, cpu_group->cpumask) {
- if (busiest_rq->nr_running <= 1)
- /* no more tasks left to move */
- return;
- if (cpu_isset(cpu, visited_cpus))
- continue;
- cpu_set(cpu, visited_cpus);
- if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
- continue;
-
- target_rq = cpu_rq(cpu);
- /*
- * This condition is "impossible", if it occurs
- * we need to fix it. Originally reported by
- * Bjorn Helgaas on a 128-cpu setup.
- */
- BUG_ON(busiest_rq == target_rq);
-
- /* move a task from busiest_rq to target_rq */
- double_lock_balance(busiest_rq, target_rq);
- if (move_tasks(target_rq, cpu, busiest_rq,
- 1, sd, SCHED_IDLE)) {
- schedstat_inc(sd, alb_pushed);
- } else {
- schedstat_inc(sd, alb_failed);
- }
- spin_unlock(&target_rq->lock);
- }
- cpu_group = cpu_group->next;
- } while (cpu_group != sd->groups);
- }
+ /* Search for an sd spanning us and the target CPU. */
+ for_each_domain(target_cpu, sd)
+ if ((sd->flags & SD_LOAD_BALANCE) &&
+ cpu_isset(busiest_cpu, sd->span))
+ break;
+
+ if (unlikely(sd == NULL))
+ goto out;
+
+ schedstat_inc(sd, alb_cnt);
+
+ if (move_tasks(target_rq, target_cpu, busiest_rq,
+ 1, sd, SCHED_IDLE, NULL))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+out:
+ spin_unlock(&target_rq->lock);
}
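The rewritten active_load_balance() walks the target CPU's domain hierarchy upward until it finds a balancing domain whose span also covers the busiest CPU. A toy illustration of that walk is below; the bitmask cpumasks, the two-level tree and the flag value are assumptions for the sketch, not kernel definitions.

/* sketch only: two-level domain tree for CPU 2, plain bitmask spans */
#include <stdio.h>

#define SD_LOAD_BALANCE 0x01	/* assumed flag value for the sketch */

struct mock_domain {
	unsigned long span;		/* stands in for sd->span */
	int flags;
	struct mock_domain *parent;
};

int main(void)
{
	struct mock_domain node = { 0x0f, SD_LOAD_BALANCE, NULL };	/* CPUs 0-3 */
	struct mock_domain core = { 0x0c, SD_LOAD_BALANCE, &node };	/* CPUs 2-3 */
	struct mock_domain *sd;
	int busiest_cpu = 0;	/* the overloaded CPU we are pulling from */

	/* same idea as above: lowest balancing domain spanning both CPUs */
	for (sd = &core; sd; sd = sd->parent)
		if ((sd->flags & SD_LOAD_BALANCE) && (sd->span & (1UL << busiest_cpu)))
			break;

	printf("balancing domain span: 0x%02lx\n", sd ? sd->span : 0UL);
	return 0;
}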
/*
unsigned long old_load, this_load;
unsigned long j = jiffies + CPU_OFFSET(this_cpu);
struct sched_domain *sd;
+ int i;
- /* Update our load */
- old_load = this_rq->cpu_load;
this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (this_load > old_load)
- old_load++;
- this_rq->cpu_load = (old_load + this_load) / 2;
+ /* Update our load */
+ for (i = 0; i < 3; i++) {
+ unsigned long new_load = this_load;
+ int scale = 1 << i;
+ old_load = this_rq->cpu_load[i];
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale-1;
+ this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+ }
for_each_domain(this_cpu, sd) {
unsigned long interval;
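Index i of cpu_load[] decays with scale 1 << i, so cpu_load[0] is effectively the instantaneous load while cpu_load[2] is the slowest-moving average. The standalone sketch below replays the averaging loop above in user space to show the convergence when the load jumps; SCHED_LOAD_SCALE and the scenario are assumed values for illustration.

/* sketch only: replays the cpu_load[] averaging loop in user space */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed for the sketch */

int main(void)
{
	unsigned long cpu_load[3] = { 0, 0, 0 };
	unsigned long this_load = 4 * SCHED_LOAD_SCALE;	/* load jumps to 4 tasks */
	int tick, i;

	for (tick = 0; tick < 8; tick++) {
		for (i = 0; i < 3; i++) {
			unsigned long old_load = cpu_load[i];
			unsigned long new_load = this_load;
			int scale = 1 << i;

			/* round up when rising, as in the patch */
			if (new_load > old_load)
				new_load += scale - 1;
			cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
		}
		printf("tick %d: %4lu %4lu %4lu\n",
		       tick, cpu_load[0], cpu_load[1], cpu_load[2]);
	}
	return 0;
}

cpu_load[0] reaches the new load immediately while cpu_load[2] approaches it over several ticks, which is what lets source_load()/target_load() pick between a reactive and a conservative view of the same CPU.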
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
{
- task_t *p = curr->task;
+ task_t *p = curr->private;
return try_to_wake_up(p, mode, sync);
}
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
+ rq->nr_running = 0;
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
rq->sd = &sched_domain_dummy;
- rq->cpu_load = 0;
+ for (j = 0; j < 3; j++)
+ rq->cpu_load[j] = 0;
rq->active_balance = 0;
rq->push_cpu = 0;
rq->migration_thread = NULL;