[PATCH] sched: tweak affine wakeups
diff --git a/kernel/sched.c b/kernel/sched.c
index 66b2ed784822739b1c2e46c317dca2e3324fc6f1..5ae3568eed0b4c2a13d52be4b19367ae1e36b935 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -206,7 +206,7 @@ struct runqueue {
         */
        unsigned long nr_running;
 #ifdef CONFIG_SMP
-       unsigned long cpu_load;
+       unsigned long cpu_load[3];
 #endif
        unsigned long long nr_switches;
 
@@ -886,23 +886,27 @@ void kick_process(task_t *p)
  * We want to under-estimate the load of migration sources, to
  * balance conservatively.
  */
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, int type)
 {
        runqueue_t *rq = cpu_rq(cpu);
        unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+       if (type == 0)
+               return load_now;
 
-       return min(rq->cpu_load, load_now);
+       return min(rq->cpu_load[type-1], load_now);
 }
 
 /*
  * Return a high guess at the load of a migration-target cpu
  */
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, int type)
 {
        runqueue_t *rq = cpu_rq(cpu);
        unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+       if (type == 0)
+               return load_now;
 
-       return max(rq->cpu_load, load_now);
+       return max(rq->cpu_load[type-1], load_now);
 }
 
 #endif
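
The new type argument selects how much history to trust: type 0 returns the
instantaneous load, while types 1..3 index the progressively slower-moving
averages kept in cpu_load[] (updated in rebalance_tick(), below). The min/max
asymmetry is the conservative part: a migration source is deliberately
under-estimated and a target over-estimated, so both sides err toward leaving
tasks where they are. A self-contained sketch with mock types
(MOCK_LOAD_SCALE standing in for SCHED_LOAD_SCALE, 128UL in this era):

        struct mock_rq {
                unsigned long nr_running;
                unsigned long cpu_load[3];  /* decaying averages, slowest last */
        };
        #define MOCK_LOAD_SCALE 128UL       /* stands in for SCHED_LOAD_SCALE */

        static unsigned long mock_source_load(struct mock_rq *rq, int type)
        {
                unsigned long load_now = rq->nr_running * MOCK_LOAD_SCALE;

                if (type == 0)
                        return load_now;    /* no damping at all */
                /* under-estimate: whichever view shows LESS load */
                return load_now < rq->cpu_load[type-1] ?
                        load_now : rq->cpu_load[type-1];
        }

        static unsigned long mock_target_load(struct mock_rq *rq, int type)
        {
                unsigned long load_now = rq->nr_running * MOCK_LOAD_SCALE;

                if (type == 0)
                        return load_now;
                /* over-estimate: whichever view shows MORE load */
                return load_now > rq->cpu_load[type-1] ?
                        load_now : rq->cpu_load[type-1];
        }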
@@ -927,14 +931,14 @@ static int wake_idle(int cpu, task_t *p)
 
        for_each_domain(cpu, sd) {
                if (sd->flags & SD_WAKE_IDLE) {
-                       cpus_and(tmp, sd->span, cpu_online_map);
-                       cpus_and(tmp, tmp, p->cpus_allowed);
+                       cpus_and(tmp, sd->span, p->cpus_allowed);
                        for_each_cpu_mask(i, tmp) {
                                if (idle_cpu(i))
                                        return i;
                        }
                }
-               else break;
+               else
+                       break;
        }
        return cpu;
 }
@@ -967,7 +971,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
        runqueue_t *rq;
 #ifdef CONFIG_SMP
        unsigned long load, this_load;
-       struct sched_domain *sd;
+       struct sched_domain *sd, *this_sd = NULL;
        int new_cpu;
 #endif
 
@@ -986,70 +990,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync)
        if (unlikely(task_running(rq, p)))
                goto out_activate;
 
-#ifdef CONFIG_SCHEDSTATS
+       new_cpu = cpu;
+
        schedstat_inc(rq, ttwu_cnt);
        if (cpu == this_cpu) {
                schedstat_inc(rq, ttwu_local);
-       } else {
-               for_each_domain(this_cpu, sd) {
-                       if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_wake_remote);
-                               break;
-                       }
+               goto out_set_cpu;
+       }
+
+       for_each_domain(this_cpu, sd) {
+               if (cpu_isset(cpu, sd->span)) {
+                       schedstat_inc(sd, ttwu_wake_remote);
+                       this_sd = sd;
+                       break;
                }
        }
-#endif
 
-       new_cpu = cpu;
-       if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+       if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
                goto out_set_cpu;
 
-       load = source_load(cpu);
-       this_load = target_load(this_cpu);
-
        /*
-        * If sync wakeup then subtract the (maximum possible) effect of
-        * the currently running task from the load of the current CPU:
+        * Check for affine wakeup and passive balancing possibilities.
         */
-       if (sync)
-               this_load -= SCHED_LOAD_SCALE;
+       if (this_sd) {
+               int idx = this_sd->wake_idx;
+               unsigned int imbalance;
 
-       /* Don't pull the task off an idle CPU to a busy one */
-       if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
-               goto out_set_cpu;
+               imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
 
-       new_cpu = this_cpu; /* Wake to this CPU if we can */
+               load = source_load(cpu, idx);
+               this_load = target_load(this_cpu, idx);
 
-       /*
-        * Scan domains for affine wakeup and passive balancing
-        * possibilities.
-        */
-       for_each_domain(this_cpu, sd) {
-               unsigned int imbalance;
-               /*
-                * Start passive balancing when half the imbalance_pct
-                * limit is reached.
-                */
-               imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+               new_cpu = this_cpu; /* Wake to this CPU if we can */
 
-               if ((sd->flags & SD_WAKE_AFFINE) &&
-                               !task_hot(p, rq->timestamp_last_tick, sd)) {
+               if (this_sd->flags & SD_WAKE_AFFINE) {
+                       unsigned long tl = this_load;
                        /*
-                        * This domain has SD_WAKE_AFFINE and p is cache cold
-                        * in this domain.
+                        * If sync wakeup then subtract the (maximum possible)
+                        * effect of the currently running task from the load
+                        * of the current CPU:
                         */
-                       if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_move_affine);
+                       if (sync)
+                               tl -= SCHED_LOAD_SCALE;
+
+                       if ((tl <= load &&
+                               tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
+                               100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+                               /*
+                                * This domain has SD_WAKE_AFFINE and
+                                * p is cache cold in this domain, and
+                                * there is no bad imbalance.
+                                */
+                               schedstat_inc(this_sd, ttwu_move_affine);
                                goto out_set_cpu;
                        }
-               } else if ((sd->flags & SD_WAKE_BALANCE) &&
-                               imbalance*this_load <= 100*load) {
-                       /*
-                        * This domain has SD_WAKE_BALANCE and there is
-                        * an imbalance.
-                        */
-                       if (cpu_isset(cpu, sd->span)) {
-                               schedstat_inc(sd, ttwu_move_balance);
+               }
+
+               /*
+                * Start passive balancing when half the imbalance_pct
+                * limit is reached.
+                */
+               if (this_sd->flags & SD_WAKE_BALANCE) {
+                       if (imbalance*this_load <= 100*load) {
+                               schedstat_inc(this_sd, ttwu_move_balance);
                                goto out_set_cpu;
                        }
                }
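
It may help to restate the new wake-affine test as a standalone predicate.
This is a sketch, not kernel code: tl is this CPU's load after the sync
adjustment, load is source_load() of the waking task's current CPU, and
target_now is target_load() of that same CPU. With an imbalance_pct of 125
(a typical CPU-level value in this era), imbalance works out to
100 + 25/2 = 112, i.e. passive balancing engages at roughly half the usual
imbalance threshold, as the comment further down says:

        #define MOCK_LOAD_SCALE 128UL   /* stands in for SCHED_LOAD_SCALE */

        static int wake_affine_ok(unsigned long tl, unsigned long load,
                                  unsigned long target_now,
                                  unsigned int imbalance)
        {
                /* the combined load of both CPUs fits in one unit of
                 * scale (both essentially idle), this CPU no busier... */
                if (tl <= load && tl + target_now <= MOCK_LOAD_SCALE)
                        return 1;
                /* ...or this CPU, even charged one full task, stays below
                 * the damped imbalance threshold vs. the waker's CPU */
                return 100*(tl + MOCK_LOAD_SCALE) <= imbalance*load;
        }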
@@ -1509,7 +1512,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
        cpus_and(mask, sd->span, p->cpus_allowed);
 
        for_each_cpu_mask(i, mask) {
-               load = target_load(i);
+               load = target_load(i, sd->wake_idx);
 
                if (load < min_load) {
                        min_cpu = i;
@@ -1522,7 +1525,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu,
        }
 
        /* add +1 to account for the new task */
-       this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+       this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
 
        /*
         * Would with the addition of the new task to the
@@ -1632,7 +1635,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p,
  */
 static inline
 int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
-                    struct sched_domain *sd, enum idle_type idle)
+            struct sched_domain *sd, enum idle_type idle, int *all_pinned)
 {
        /*
         * We do not migrate tasks that are:
@@ -1640,10 +1643,12 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
         * 2) cannot be migrated to this CPU due to cpus_allowed, or
         * 3) are cache-hot on their current CPU.
         */
-       if (task_running(rq, p))
-               return 0;
        if (!cpu_isset(this_cpu, p->cpus_allowed))
                return 0;
+       *all_pinned = 0;
+
+       if (task_running(rq, p))
+               return 0;
 
        /*
         * Aggressive migration if:
@@ -1656,7 +1661,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
                return 1;
 
        if (task_hot(p, rq->timestamp_last_tick, sd))
-                       return 0;
+               return 0;
        return 1;
 }
 
@@ -1669,16 +1674,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
  */
 static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
                      unsigned long max_nr_move, struct sched_domain *sd,
-                     enum idle_type idle)
+                     enum idle_type idle, int *all_pinned)
 {
        prio_array_t *array, *dst_array;
        struct list_head *head, *curr;
-       int idx, pulled = 0;
+       int idx, pulled = 0, pinned = 0;
        task_t *tmp;
 
-       if (max_nr_move <= 0 || busiest->nr_running <= 1)
+       if (max_nr_move == 0)
                goto out;
 
+       pinned = 1;
+
        /*
         * We first consider expired tasks. Those will likely not be
         * executed in the near future, and they are most likely to
@@ -1717,7 +1724,7 @@ skip_queue:
 
        curr = curr->prev;
 
-       if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+       if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
                if (curr != head)
                        goto skip_queue;
                idx++;
@@ -1746,6 +1753,9 @@ out:
         * inside pull_task().
         */
        schedstat_add(sd, lb_gained[idle], pulled);
+
+       if (all_pinned)
+               *all_pinned = pinned;
        return pulled;
 }
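
The all_pinned handshake lets load_balance() distinguish an unmovable
imbalance from a transient one: pinned starts at 1, and can_migrate_task()
clears it the first time any task passes the cpus_allowed check, even if
that task is then rejected as running or cache-hot. Only when every
candidate is pinned does the caller back off entirely rather than escalating
toward active balancing. A schematic, self-contained model with mock types
(migration mechanics elided):

        struct mock_task {
                unsigned long cpus_allowed;  /* bitmask of permitted CPUs */
                struct mock_task *next;
        };

        static int mock_move_tasks(struct mock_task *head, int this_cpu,
                                   unsigned long max_nr_move, int *all_pinned)
        {
                int pulled = 0, pinned = 1;
                struct mock_task *p;

                for (p = head; p && pulled < max_nr_move; p = p->next) {
                        if (!(p->cpus_allowed & (1UL << this_cpu)))
                                continue;  /* pinned stays 1 only if ALL
                                            * candidates take this branch */
                        pinned = 0;        /* allowed here, even if the
                                            * running/cache-hot checks
                                            * (elided) still reject it */
                        pulled++;
                }
                if (all_pinned)            /* optional, as in the patch */
                        *all_pinned = pinned;
                return pulled;
        }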
 
@@ -1760,8 +1770,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 {
        struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
        unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+       int load_idx;
 
        max_load = this_load = total_load = total_pwr = 0;
+       if (idle == NOT_IDLE)
+               load_idx = sd->busy_idx;
+       else if (idle == NEWLY_IDLE)
+               load_idx = sd->newidle_idx;
+       else
+               load_idx = sd->idle_idx;
 
        do {
                unsigned long load;
@@ -1776,9 +1793,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
                for_each_cpu_mask(i, group->cpumask) {
                        /* Bias balancing toward cpus of our domain */
                        if (local_group)
-                               load = target_load(i);
+                               load = target_load(i, load_idx);
                        else
-                               load = source_load(i);
+                               load = source_load(i, load_idx);
 
                        avg_load += load;
                }
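
Presumably the point of per-state indices (busy_idx, newidle_idx, idle_idx,
and wake_idx above) is reaction speed: a CPU that has just gone idle wants
the freshest possible picture so it can pull work immediately, while
periodic rebalancing on a busy CPU should look at a long average so
momentary spikes don't bounce tasks between CPUs. Restated as a sketch (the
index values themselves come from the per-domain topology initializers):

        enum mock_idle_type { NOT_IDLE, NEWLY_IDLE, SCHED_IDLE };

        static int pick_load_idx(enum mock_idle_type idle,
                                 int busy_idx, int newidle_idx, int idle_idx)
        {
                if (idle == NOT_IDLE)
                        return busy_idx;     /* longest history: stability */
                if (idle == NEWLY_IDLE)
                        return newidle_idx;  /* freshest view: react fast */
                return idle_idx;             /* periodic idle balancing */
        }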
@@ -1870,15 +1887,9 @@ nextgroup:
 
        /* Get rid of the scaling factor, rounding down as we divide */
        *imbalance = *imbalance / SCHED_LOAD_SCALE;
-
        return busiest;
 
 out_balanced:
-       if (busiest && (idle == NEWLY_IDLE ||
-                       (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
-               *imbalance = 1;
-               return busiest;
-       }
 
        *imbalance = 0;
        return NULL;
@@ -1894,7 +1905,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group)
        int i;
 
        for_each_cpu_mask(i, group->cpumask) {
-               load = source_load(i);
+               load = source_load(i, 0);
 
                if (load > max_load) {
                        max_load = load;
@@ -1917,7 +1928,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
        struct sched_group *group;
        runqueue_t *busiest;
        unsigned long imbalance;
-       int nr_moved;
+       int nr_moved, all_pinned;
+       int active_balance = 0;
 
        spin_lock(&this_rq->lock);
        schedstat_inc(sd, lb_cnt[idle]);
@@ -1934,15 +1946,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                goto out_balanced;
        }
 
-       /*
-        * This should be "impossible", but since load
-        * balancing is inherently racy and statistical,
-        * it could happen in theory.
-        */
-       if (unlikely(busiest == this_rq)) {
-               WARN_ON(1);
-               goto out_balanced;
-       }
+       BUG_ON(busiest == this_rq);
 
        schedstat_add(sd, lb_imbalance[idle], imbalance);
 
@@ -1956,9 +1960,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                 */
                double_lock_balance(this_rq, busiest);
                nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                               imbalance, sd, idle);
+                                               imbalance, sd, idle,
+                                               &all_pinned);
                spin_unlock(&busiest->lock);
+
+               /* All tasks on this runqueue were pinned by CPU affinity */
+               if (unlikely(all_pinned))
+                       goto out_balanced;
        }
+
        spin_unlock(&this_rq->lock);
 
        if (!nr_moved) {
@@ -1966,36 +1976,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq,
                sd->nr_balance_failed++;
 
                if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
-                       int wake = 0;
 
                        spin_lock(&busiest->lock);
                        if (!busiest->active_balance) {
                                busiest->active_balance = 1;
                                busiest->push_cpu = this_cpu;
-                               wake = 1;
+                               active_balance = 1;
                        }
                        spin_unlock(&busiest->lock);
-                       if (wake)
+                       if (active_balance)
                                wake_up_process(busiest->migration_thread);
 
                        /*
                         * We've kicked active balancing, reset the failure
                         * counter.
                         */
-                       sd->nr_balance_failed = sd->cache_nice_tries;
+                       sd->nr_balance_failed = sd->cache_nice_tries+1;
                }
-
-               /*
-                * We were unbalanced, but unsuccessful in move_tasks(),
-                * so bump the balance_interval to lessen the lock contention.
-                */
-               if (sd->balance_interval < sd->max_interval)
-                       sd->balance_interval++;
-       } else {
+       } else
                sd->nr_balance_failed = 0;
 
+       if (likely(!active_balance)) {
                /* We were unbalanced, so reset the balancing interval */
                sd->balance_interval = sd->min_interval;
+       } else {
+               /*
+                * If we've begun active balancing, start to back off. This
+                * case may not be covered by the all_pinned logic if there
+                * is only 1 task on the busy runqueue (because we don't call
+                * move_tasks).
+                */
+               if (sd->balance_interval < sd->max_interval)
+                       sd->balance_interval *= 2;
        }
 
        return nr_moved;
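
To make the retry arithmetic concrete (hypothetical numbers): with
cache_nice_tries == 1, active balancing kicks in once nr_balance_failed
exceeds cache_nice_tries+2 == 3. Resetting the counter to
cache_nice_tries+1 == 2, rather than to cache_nice_tries as before, means
only two further failed attempts re-arm it, while the doubled
balance_interval spaces those attempts further apart.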
@@ -2005,6 +2017,7 @@ out_balanced:
 
        schedstat_inc(sd, lb_balanced[idle]);
 
+       sd->nr_balance_failed = 0;
        /* tune up the balancing interval */
        if (sd->balance_interval < sd->max_interval)
                sd->balance_interval *= 2;
@@ -2030,31 +2043,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq,
        schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
        group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
        if (!group) {
-               schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
                schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
-               goto out;
+               goto out_balanced;
        }
 
        busiest = find_busiest_queue(group);
-       if (!busiest || busiest == this_rq) {
-               schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+       if (!busiest) {
                schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
-               goto out;
+               goto out_balanced;
        }
 
+       BUG_ON(busiest == this_rq);
+
        /* Attempt to move tasks */
        double_lock_balance(this_rq, busiest);
 
        schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
        nr_moved = move_tasks(this_rq, this_cpu, busiest,
-                                       imbalance, sd, NEWLY_IDLE);
+                                       imbalance, sd, NEWLY_IDLE, NULL);
        if (!nr_moved)
                schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+       else
+               sd->nr_balance_failed = 0;
 
        spin_unlock(&busiest->lock);
-
-out:
        return nr_moved;
+
+out_balanced:
+       schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+       sd->nr_balance_failed = 0;
+       return 0;
 }
 
 /*
@@ -2086,56 +2104,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq)
 static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
 {
        struct sched_domain *sd;
-       struct sched_group *cpu_group;
        runqueue_t *target_rq;
-       cpumask_t visited_cpus;
-       int cpu;
+       int target_cpu = busiest_rq->push_cpu;
+
+       if (busiest_rq->nr_running <= 1)
+               /* no task to move */
+               return;
+
+       target_rq = cpu_rq(target_cpu);
 
        /*
-        * Search for suitable CPUs to push tasks to in successively higher
-        * domains with SD_LOAD_BALANCE set.
+        * This condition is "impossible", if it occurs
+        * we need to fix it.  Originally reported by
+        * Bjorn Helgaas on a 128-cpu setup.
         */
-       visited_cpus = CPU_MASK_NONE;
-       for_each_domain(busiest_cpu, sd) {
-               if (!(sd->flags & SD_LOAD_BALANCE))
-                       /* no more domains to search */
-                       break;
+       BUG_ON(busiest_rq == target_rq);
 
-               schedstat_inc(sd, alb_cnt);
+       /* move a task from busiest_rq to target_rq */
+       double_lock_balance(busiest_rq, target_rq);
 
-               cpu_group = sd->groups;
-               do {
-                       for_each_cpu_mask(cpu, cpu_group->cpumask) {
-                               if (busiest_rq->nr_running <= 1)
-                                       /* no more tasks left to move */
-                                       return;
-                               if (cpu_isset(cpu, visited_cpus))
-                                       continue;
-                               cpu_set(cpu, visited_cpus);
-                               if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
-                                       continue;
-
-                               target_rq = cpu_rq(cpu);
-                               /*
-                                * This condition is "impossible", if it occurs
-                                * we need to fix it.  Originally reported by
-                                * Bjorn Helgaas on a 128-cpu setup.
-                                */
-                               BUG_ON(busiest_rq == target_rq);
-
-                               /* move a task from busiest_rq to target_rq */
-                               double_lock_balance(busiest_rq, target_rq);
-                               if (move_tasks(target_rq, cpu, busiest_rq,
-                                               1, sd, SCHED_IDLE)) {
-                                       schedstat_inc(sd, alb_pushed);
-                               } else {
-                                       schedstat_inc(sd, alb_failed);
-                               }
-                               spin_unlock(&target_rq->lock);
-                       }
-                       cpu_group = cpu_group->next;
-               } while (cpu_group != sd->groups);
-       }
+       /* Search for an sd spanning us and the target CPU. */
+       for_each_domain(target_cpu, sd)
+               if ((sd->flags & SD_LOAD_BALANCE) &&
+                       cpu_isset(busiest_cpu, sd->span))
+                               break;
+
+       if (unlikely(sd == NULL))
+               goto out;
+
+       schedstat_inc(sd, alb_cnt);
+
+       if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL))
+               schedstat_inc(sd, alb_pushed);
+       else
+               schedstat_inc(sd, alb_failed);
+out:
+       spin_unlock(&target_rq->lock);
 }
 
 /*
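
The rewrite replaces the visited-CPU scan with a much simpler contract: the
migration thread pushes exactly one task toward the CPU recorded in
push_cpu, through the lowest scheduler domain that both has SD_LOAD_BALANCE
set and spans the busiest CPU. The search could be factored out as below (a
sketch using the same iterators as this file); walking bottom-up means the
first match is the most local domain covering the pair, which is also the
one whose alb_* statistics get charged:

        /* Sketch only: lowest domain of target_cpu that participates in
         * load balancing and spans busiest_cpu; NULL if none does. */
        static struct sched_domain *lowest_common_domain(int target_cpu,
                                                         int busiest_cpu)
        {
                struct sched_domain *sd;

                for_each_domain(target_cpu, sd)
                        if ((sd->flags & SD_LOAD_BALANCE) &&
                            cpu_isset(busiest_cpu, sd->span))
                                return sd;
                return NULL;
        }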
@@ -2156,18 +2160,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq,
        unsigned long old_load, this_load;
        unsigned long j = jiffies + CPU_OFFSET(this_cpu);
        struct sched_domain *sd;
+       int i;
 
-       /* Update our load */
-       old_load = this_rq->cpu_load;
        this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
-       /*
-        * Round up the averaging division if load is increasing. This
-        * prevents us from getting stuck on 9 if the load is 10, for
-        * example.
-        */
-       if (this_load > old_load)
-               old_load++;
-       this_rq->cpu_load = (old_load + this_load) / 2;
+       /* Update our load */
+       for (i = 0; i < 3; i++) {
+               unsigned long new_load = this_load;
+               int scale = 1 << i;
+               old_load = this_rq->cpu_load[i];
+               /*
+                * Round up the averaging division if load is increasing. This
+                * prevents us from getting stuck on 9 if the load is 10, for
+                * example.
+                */
+               if (new_load > old_load)
+                       new_load += scale-1;
+               this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+       }
 
        for_each_domain(this_cpu, sd) {
                unsigned long interval;
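
Each cpu_load[i] is now an exponentially decaying average giving the new
sample a weight of 1/2^i: cpu_load[0] tracks the instantaneous load exactly,
cpu_load[1] moves half-way toward it each tick, cpu_load[2] a quarter of the
way. A standalone userspace demo (plain C, not kernel code) of both the
convergence and the rounding fix the comment describes:

        #include <stdio.h>

        int main(void)
        {
                unsigned long cpu_load[3] = { 0, 0, 0 };
                unsigned long this_load = 10; /* nr_running*SCALE == 10 */
                int tick, i;

                for (tick = 0; tick < 8; tick++) {
                        for (i = 0; i < 3; i++) {
                                unsigned long new_load = this_load;
                                int scale = 1 << i;

                                /* round up while rising; without this,
                                 * cpu_load[1] would stall at 9 */
                                if (new_load > cpu_load[i])
                                        new_load += scale - 1;
                                cpu_load[i] = (cpu_load[i]*(scale-1) +
                                                new_load) / scale;
                        }
                        printf("tick %d: %lu %lu %lu\n", tick,
                               cpu_load[0], cpu_load[1], cpu_load[2]);
                }
                return 0;
        }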
@@ -2576,7 +2585,7 @@ void fastcall add_preempt_count(int val)
        /*
         * Underflow?
         */
-       BUG_ON(((int)preempt_count() < 0));
+       BUG_ON((preempt_count() < 0));
        preempt_count() += val;
        /*
         * Spinlock count overflowing soon?
@@ -2869,7 +2878,7 @@ need_resched:
 
 int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
 {
-       task_t *p = curr->task;
+       task_t *p = curr->private;
        return try_to_wake_up(p, mode, sync);
 }
 
@@ -3755,19 +3764,22 @@ EXPORT_SYMBOL(cond_resched);
  */
 int cond_resched_lock(spinlock_t * lock)
 {
+       int ret = 0;
+
        if (need_lockbreak(lock)) {
                spin_unlock(lock);
                cpu_relax();
+               ret = 1;
                spin_lock(lock);
        }
        if (need_resched()) {
                _raw_spin_unlock(lock);
                preempt_enable_no_resched();
                __cond_resched();
+               ret = 1;
                spin_lock(lock);
-               return 1;
        }
-       return 0;
+       return ret;
 }
 
 EXPORT_SYMBOL(cond_resched_lock);
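
The behavioral fix: cond_resched_lock() now returns 1 whenever the lock was
dropped, for a lock-break as well as for an actual reschedule; previously
the lock-break path fell through and returned 0, so a caller could miss the
drop. A hypothetical caller that depends on this (mylist_lock, first_item,
next_item and process are made-up names):

        static void scan_list(void)
        {
                struct item *it;

                spin_lock(&mylist_lock);
        restart:
                for (it = first_item(&mylist); it; it = next_item(it)) {
                        process(it);
                        /* any drop of the lock can invalidate state cached
                         * under it, so restart on a nonzero return */
                        if (cond_resched_lock(&mylist_lock))
                                goto restart;
                }
                spin_unlock(&mylist_lock);
        }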
@@ -3811,7 +3823,7 @@ EXPORT_SYMBOL(yield);
  */
 void __sched io_schedule(void)
 {
-       struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+       struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
 
        atomic_inc(&rq->nr_iowait);
        schedule();
@@ -3822,7 +3834,7 @@ EXPORT_SYMBOL(io_schedule);
 
 long __sched io_schedule_timeout(long timeout)
 {
-       struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id());
+       struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id());
        long ret;
 
        atomic_inc(&rq->nr_iowait);
@@ -4924,13 +4936,15 @@ void __init sched_init(void)
 
                rq = cpu_rq(i);
                spin_lock_init(&rq->lock);
+               rq->nr_running = 0;
                rq->active = rq->arrays;
                rq->expired = rq->arrays + 1;
                rq->best_expired_prio = MAX_PRIO;
 
 #ifdef CONFIG_SMP
                rq->sd = &sched_domain_dummy;
-               rq->cpu_load = 0;
+               for (j = 1; j < 3; j++)
+                       rq->cpu_load[j] = 0;
                rq->active_balance = 0;
                rq->push_cpu = 0;
                rq->migration_thread = NULL;