[PATCH] sched: tweak affine wakeups

[linux-2.6] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index 03d737791c1ad4f188b68dcd6a467c7bb21bab31..5ae3568eed0b4c2a13d52be4b19367ae1e36b935 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -206,7 +206,7 @@ struct runqueue {
          */
         unsigned long nr_running;
  #ifdef CONFIG_SMP
-       unsigned long cpu_load;
+       unsigned long cpu_load[3];
  #endif
         unsigned long long nr_switches;
  
@@ -886,23 +886,27 @@ void kick_process(task_t *p)
   * We want to under-estimate the load of migration sources, to
   * balance conservatively.
   */
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, int type)
  {
         runqueue_t *rq = cpu_rq(cpu);
         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+       if (type == 0)
+               return load_now;
  
-       return min(rq->cpu_load, load_now);
+       return min(rq->cpu_load[type-1], load_now);
  }
  
  /*
   * Return a high guess at the load of a migration-target cpu
   */
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, int type)
  {
         runqueue_t *rq = cpu_rq(cpu);
         unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+       if (type == 0)
+               return load_now;
  
-       return max(rq->cpu_load, load_now);
+       return max(rq->cpu_load[type-1], load_now);
  }
  
  #endif
@@ -967,7 +971,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
         runqueue_t *rq;
  #ifdef CONFIG_SMP
         unsigned long load, this_load;
-       struct sched_domain *sd;
+       struct sched_domain *sd, *this_sd = NULL;
         int new_cpu;
  #endif
  
@@ -986,70 +990,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
         if (unlikely(task_running(rq, p)))
                 goto out_activate;
  
-#ifdef CONFIG_SCHEDSTATS
+       new_cpu = cpu;
+
         schedstat_inc(rq, ttwu_cnt);
         if (cpu == this_cpu) {
                 schedstat_inc(rq, ttwu_local);
-       } else {
-               for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_wake_remote);
-                               break;
-                       }
+               goto out_set_cpu;
+       }
+
+       for_each_domain(this_cpu, sd) {
+               if (cpu_isset(cpu, sd->span)) {
+                       schedstat_inc(sd, ttwu_wake_remote);
+                       this_sd = sd;
+                       break;
                 }
         }
-#endif
  
-       new_cpu = cpu;
-       if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+       if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                 goto out_set_cpu;
  
-       load = source_load(cpu);
-       this_load = target_load(this_cpu);
-
         /*
-        * If sync wakeup then subtract the (maximum possible) effect of
-        * the currently running task from the load of the current CPU:
+        * Check for affine wakeup and passive balancing possibilities.
          */
-       if (sync)
-               this_load -= SCHED_LOAD_SCALE;
+       if (this_sd) {
+               int idx = this_sd->wake_idx;
+               unsigned int imbalance;
  
-       /* Don't pull the task off an idle CPU to a busy one */
-       if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
-               goto out_set_cpu;
+               imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
  
-       new_cpu = this_cpu; /* Wake to this CPU if we can */
+               load = source_load(cpu, idx);
+               this_load = target_load(this_cpu, idx);
  
-       /*
-        * Scan domains for affine wakeup and passive balancing
-        * possibilities.
-        */
-       for_each_domain(this_cpu, sd) {
-               unsigned int imbalance;
-               /*
-                * Start passive balancing when half the imbalance_pct
-                * limit is reached.
-                */
-               imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+               new_cpu = this_cpu; /* Wake to this CPU if we can */
  
-               if ((sd->flags & SD_WAKE_AFFINE) &&
-                               !task_hot(p, rq->timestamp_last_tick, sd)) {
+               if (this_sd->flags & SD_WAKE_AFFINE) {
+                       unsigned long tl = this_load;
                         /*
-                        * This domain has SD_WAKE_AFFINE and p is cache cold
-                        * in this domain.
+                        * If sync wakeup then subtract the (maximum possible)
+                        * effect of the currently running task from the load
+                        * of the current CPU:
                          */
-                       if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_move_affine);
+                       if (sync)
+                               tl -= SCHED_LOAD_SCALE;
+
+                       if ((tl <= load &&
+                               tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
+                               100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+                               /*
+                                * This domain has SD_WAKE_AFFINE and
+                                * there is no bad imbalance (p's cache
+                                * hotness is no longer checked here).
+                                */
+                               schedstat_inc(this_sd, ttwu_move_affine);
                                 goto out_set_cpu;
                         }
-               } else if ((sd->flags & SD_WAKE_BALANCE) &&
-                               imbalance*this_load <= 100*load) {
-                       /*
-                        * This domain has SD_WAKE_BALANCE and there is
-                        * an imbalance.
-                        */
-                       if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_move_balance);
+               }
+
+               /*
+                * Start passive balancing when half the imbalance_pct
+                * limit is reached.
+                */
+               if (this_sd->flags & SD_WAKE_BALANCE) {
+                       if (imbalance*this_load <= 100*load) {
+                               schedstat_inc(this_sd, ttwu_move_balance);
                                 goto out_set_cpu;
                         }
                 }
@@ -1509,7 +1512,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
         cpus_and(mask, sd->span, p->cpus_allowed);
  
         for_each_cpu_mask(i, mask) {
-               load = target_load(i);
+               load = target_load(i, sd->wake_idx);
  
                 if (load < min_load) {
                         min_cpu = i;
@@ -1522,7 +1525,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
         }
  
         /* add +1 to account for the new task */
-       this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+       this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
  
         /*
          * Would with the addition of the new task to the
@@ -1767,8 +1770,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
  {
         struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
         unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+       int load_idx;
  
         max_load = this_load = total_load = total_pwr = 0;
+       if (idle == NOT_IDLE)
+               load_idx = sd->busy_idx;
+       else if (idle == NEWLY_IDLE)
+               load_idx = sd->newidle_idx;
+       else
+               load_idx = sd->idle_idx;
  
         do {
                 unsigned long load;
@@ -1783,9 +1793,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                 for_each_cpu_mask(i, group->cpumask) {
                         /* Bias balancing toward cpus of our domain */
                         if (local_group)
-                               load = target_load(i);
+                               load = target_load(i, load_idx);
                         else
-                               load = source_load(i);
+                               load = source_load(i, load_idx);
  
                         avg_load += load;
                 }
@@ -1877,15 +1887,9 @@ nextgroup:
  
         /* Get rid of the scaling factor, rounding down as we divide */
         *imbalance = *imbalance / SCHED_LOAD_SCALE;
-
         return busiest;
  
  out_balanced:
-       if (busiest && (idle == NEWLY_IDLE ||
-                       (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
-               *imbalance = 1;
-               return busiest;
-       }
  
         *imbalance = 0;
         return NULL;
@@ -1901,7 +1905,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
         int i;
  
         for_each_cpu_mask(i, group->cpumask) {
-               load = source_load(i);
+               load = source_load(i, 0);
  
                 if (load > max_load) {
                         max_load = load;
@@ -1942,15 +1946,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                 goto out_balanced;
         }
  
-       /*
-        * This should be "impossible", but since load
-        * balancing is inherently racy and statistical,
-        * it could happen in theory.
-        */
-       if (unlikely(busiest == this_rq)) {
-               WARN_ON(1);
-               goto out_balanced;
-       }
+       BUG_ON(busiest == this_rq);
  
         schedstat_add(sd, lb_imbalance[idle], imbalance);
  
@@ -1995,7 +1991,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                          * We've kicked active balancing, reset the failure
                          * counter.
                          */
-                       sd->nr_balance_failed = sd->cache_nice_tries;
+                       sd->nr_balance_failed = sd->cache_nice_tries+1;
                 }
         } else
                 sd->nr_balance_failed = 0;
@@ -2052,11 +2048,13 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
         }
  
         busiest = find_busiest_queue(group);
-       if (!busiest || busiest == this_rq) {
+       if (!busiest) {
                 schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
                 goto out_balanced;
         }
  
+       BUG_ON(busiest == this_rq);
+
         /* Attempt to move tasks */
         double_lock_balance(this_rq, busiest);
  
@@ -2106,56 +2104,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
  static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
  {
         struct sched_domain *sd;
-       struct sched_group *cpu_group;
         runqueue_t *target_rq;
-       cpumask_t visited_cpus;
-       int cpu;
+       int target_cpu = busiest_rq->push_cpu;
+
+       if (busiest_rq->nr_running <= 1)
+               /* no task to move */
+               return;
+
+       target_rq = cpu_rq(target_cpu);
  
         /*
-        * Search for suitable CPUs to push tasks to in successively higher
-        * domains with SD_LOAD_BALANCE set.
+        * This condition is "impossible", if it occurs
+        * we need to fix it.  Originally reported by
+        * Bjorn Helgaas on a 128-cpu setup.
          */
-       visited_cpus = CPU_MASK_NONE;
-       for_each_domain(busiest_cpu, sd) {
-               if (!(sd->flags & SD_LOAD_BALANCE))
-                       /* no more domains to search */
-                       break;
+       BUG_ON(busiest_rq == target_rq);
  
-               schedstat_inc(sd, alb_cnt);
+       /* move a task from busiest_rq to target_rq */
+       double_lock_balance(busiest_rq, target_rq);
  
-               cpu_group = sd->groups;
-               do {
-                       for_each_cpu_mask(cpu, cpu_group->cpumask) {
-                               if (busiest_rq->nr_running <= 1)
-                                       /* no more tasks left to move */
-                                       return;
-                               if (cpu_isset(cpu, visited_cpus))
-                                       continue;
-                               cpu_set(cpu, visited_cpus);
-                               if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
-                                       continue;
-
-                               target_rq = cpu_rq(cpu);
-                               /*
-                                * This condition is "impossible", if it occurs
-                                * we need to fix it.  Originally reported by
-                                * Bjorn Helgaas on a 128-cpu setup.
-                                */
-                               BUG_ON(busiest_rq == target_rq);
-
-                               /* move a task from busiest_rq to target_rq */
-                               double_lock_balance(busiest_rq, target_rq);
-                               if (move_tasks(target_rq, cpu, busiest_rq,
-                                               1, sd, SCHED_IDLE, NULL)) {
-                                       schedstat_inc(sd, alb_pushed);
-                               } else {
-                                       schedstat_inc(sd, alb_failed);
-                               }
-                               spin_unlock(&target_rq->lock);
-                       }
-                       cpu_group = cpu_group->next;
-               } while (cpu_group != sd->groups);
-       }
+       /* Search for an sd spanning us and the target CPU. */
+       for_each_domain(target_cpu, sd)
+               if ((sd->flags & SD_LOAD_BALANCE) &&
+                       cpu_isset(busiest_cpu, sd->span))
+                               break;
+
+       if (unlikely(sd == NULL))
+               goto out;
+
+       schedstat_inc(sd, alb_cnt);
+
+       if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+               schedstat_inc(sd, alb_pushed);
+       else
+               schedstat_inc(sd, alb_failed);
+out:
+       spin_unlock(&target_rq->lock);
  }
  
  /*
@@ -2176,18 +2160,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
         unsigned long old_load, this_load;
         unsigned long j = jiffies + CPU_OFFSET(this_cpu);
         struct sched_domain *sd;
+       int i;
  
-       /* Update our load */
-       old_load = this_rq->cpu_load;
         this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
-       /*
-        * Round up the averaging division if load is increasing. This
-        * prevents us from getting stuck on 9 if the load is 10, for
-        * example.
-        */
-       if (this_load > old_load)
-               old_load++;
-       this_rq->cpu_load = (old_load + this_load) / 2;
+       /* Update our load */
+       for (i = 0; i < 3; i++) {
+               unsigned long new_load = this_load;
+               int scale = 1 << i;
+               old_load = this_rq->cpu_load[i];
+               /*
+                * Round up the averaging division if load is increasing. This
+                * prevents us from getting stuck on 9 if the load is 10, for
+                * example.
+                */
+               if (new_load > old_load)
+                       new_load += scale-1;
+               this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+       }
  
         for_each_domain(this_cpu, sd) {
                 unsigned long interval;
@@ -4947,13 +4936,15 @@ void __init sched_init(void)
  
                 rq = cpu_rq(i);
                 spin_lock_init(&rq->lock);
+               rq->nr_running = 0;
                 rq->active = rq->arrays;
                 rq->expired = rq->arrays + 1;
                 rq->best_expired_prio = MAX_PRIO;
  
  #ifdef CONFIG_SMP
                 rq->sd = &sched_domain_dummy;
-               rq->cpu_load = 0;
+               for (j = 0; j < 3; j++)
+                       rq->cpu_load[j] = 0;
                 rq->active_balance = 0;
                 rq->push_cpu = 0;
                 rq->migration_thread = NULL;