X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched.c;h=5ae3568eed0b4c2a13d52be4b19367ae1e36b935;hb=a3f21bce1fefdf92a4d1705e888d390b10f3ac6f;hp=66b2ed784822739b1c2e46c317dca2e3324fc6f1;hpb=ae20ea8525a80a863f70d332cf47b71bd9f54c1f;p=linux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index 66b2ed7848..5ae3568eed 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -206,7 +206,7 @@ struct runqueue { */ unsigned long nr_running; #ifdef CONFIG_SMP - unsigned long cpu_load; + unsigned long cpu_load[3]; #endif unsigned long long nr_switches; @@ -886,23 +886,27 @@ void kick_process(task_t *p) * We want to under-estimate the load of migration sources, to * balance conservatively. */ -static inline unsigned long source_load(int cpu) +static inline unsigned long source_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; - return min(rq->cpu_load, load_now); + return min(rq->cpu_load[type-1], load_now); } /* * Return a high guess at the load of a migration-target cpu */ -static inline unsigned long target_load(int cpu) +static inline unsigned long target_load(int cpu, int type) { runqueue_t *rq = cpu_rq(cpu); unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE; + if (type == 0) + return load_now; - return max(rq->cpu_load, load_now); + return max(rq->cpu_load[type-1], load_now); } #endif @@ -927,14 +931,14 @@ static int wake_idle(int cpu, task_t *p) for_each_domain(cpu, sd) { if (sd->flags & SD_WAKE_IDLE) { - cpus_and(tmp, sd->span, cpu_online_map); - cpus_and(tmp, tmp, p->cpus_allowed); + cpus_and(tmp, sd->span, p->cpus_allowed); for_each_cpu_mask(i, tmp) { if (idle_cpu(i)) return i; } } - else break; + else + break; } return cpu; } @@ -967,7 +971,7 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) runqueue_t *rq; #ifdef CONFIG_SMP unsigned long load, this_load; - struct sched_domain *sd; + struct sched_domain *sd, *this_sd = NULL; int new_cpu; #endif @@ -986,70 +990,69 @@ static int try_to_wake_up(task_t * p, unsigned int state, int sync) if (unlikely(task_running(rq, p))) goto out_activate; -#ifdef CONFIG_SCHEDSTATS + new_cpu = cpu; + schedstat_inc(rq, ttwu_cnt); if (cpu == this_cpu) { schedstat_inc(rq, ttwu_local); - } else { - for_each_domain(this_cpu, sd) { - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_wake_remote); - break; - } + goto out_set_cpu; + } + + for_each_domain(this_cpu, sd) { + if (cpu_isset(cpu, sd->span)) { + schedstat_inc(sd, ttwu_wake_remote); + this_sd = sd; + break; } } -#endif - new_cpu = cpu; - if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) goto out_set_cpu; - load = source_load(cpu); - this_load = target_load(this_cpu); - /* - * If sync wakeup then subtract the (maximum possible) effect of - * the currently running task from the load of the current CPU: + * Check for affine wakeup and passive balancing possibilities. */ - if (sync) - this_load -= SCHED_LOAD_SCALE; + if (this_sd) { + int idx = this_sd->wake_idx; + unsigned int imbalance; - /* Don't pull the task off an idle CPU to a busy one */ - if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2) - goto out_set_cpu; + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; - new_cpu = this_cpu; /* Wake to this CPU if we can */ + load = source_load(cpu, idx); + this_load = target_load(this_cpu, idx); - /* - * Scan domains for affine wakeup and passive balancing - * possibilities. - */ - for_each_domain(this_cpu, sd) { - unsigned int imbalance; - /* - * Start passive balancing when half the imbalance_pct - * limit is reached. - */ - imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2; + new_cpu = this_cpu; /* Wake to this CPU if we can */ - if ((sd->flags & SD_WAKE_AFFINE) && - !task_hot(p, rq->timestamp_last_tick, sd)) { + if (this_sd->flags & SD_WAKE_AFFINE) { + unsigned long tl = this_load; /* - * This domain has SD_WAKE_AFFINE and p is cache cold - * in this domain. + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: */ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_move_affine); + if (sync) + tl -= SCHED_LOAD_SCALE; + + if ((tl <= load && + tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) || + 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); goto out_set_cpu; } - } else if ((sd->flags & SD_WAKE_BALANCE) && - imbalance*this_load <= 100*load) { - /* - * This domain has SD_WAKE_BALANCE and there is - * an imbalance. - */ - if (cpu_isset(cpu, sd->span)) { - schedstat_inc(sd, ttwu_move_balance); + } + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. + */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); goto out_set_cpu; } } @@ -1509,7 +1512,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu, cpus_and(mask, sd->span, p->cpus_allowed); for_each_cpu_mask(i, mask) { - load = target_load(i); + load = target_load(i, sd->wake_idx); if (load < min_load) { min_cpu = i; @@ -1522,7 +1525,7 @@ static int find_idlest_cpu(struct task_struct *p, int this_cpu, } /* add +1 to account for the new task */ - this_load = source_load(this_cpu) + SCHED_LOAD_SCALE; + this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE; /* * Would with the addition of the new task to the @@ -1632,7 +1635,7 @@ void pull_task(runqueue_t *src_rq, prio_array_t *src_array, task_t *p, */ static inline int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle, int *all_pinned) { /* * We do not migrate tasks that are: @@ -1640,10 +1643,12 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) are cache-hot on their current CPU. */ - if (task_running(rq, p)) - return 0; if (!cpu_isset(this_cpu, p->cpus_allowed)) return 0; + *all_pinned = 0; + + if (task_running(rq, p)) + return 0; /* * Aggressive migration if: @@ -1656,7 +1661,7 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, return 1; if (task_hot(p, rq->timestamp_last_tick, sd)) - return 0; + return 0; return 1; } @@ -1669,16 +1674,18 @@ int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu, */ static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest, unsigned long max_nr_move, struct sched_domain *sd, - enum idle_type idle) + enum idle_type idle, int *all_pinned) { prio_array_t *array, *dst_array; struct list_head *head, *curr; - int idx, pulled = 0; + int idx, pulled = 0, pinned = 0; task_t *tmp; - if (max_nr_move <= 0 || busiest->nr_running <= 1) + if (max_nr_move == 0) goto out; + pinned = 1; + /* * We first consider expired tasks. Those will likely not be * executed in the near future, and they are most likely to @@ -1717,7 +1724,7 @@ skip_queue: curr = curr->prev; - if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) { + if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) { if (curr != head) goto skip_queue; idx++; @@ -1746,6 +1753,9 @@ out: * inside pull_task(). */ schedstat_add(sd, lb_gained[idle], pulled); + + if (all_pinned) + *all_pinned = pinned; return pulled; } @@ -1760,8 +1770,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; + int load_idx; max_load = this_load = total_load = total_pwr = 0; + if (idle == NOT_IDLE) + load_idx = sd->busy_idx; + else if (idle == NEWLY_IDLE) + load_idx = sd->newidle_idx; + else + load_idx = sd->idle_idx; do { unsigned long load; @@ -1776,9 +1793,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, for_each_cpu_mask(i, group->cpumask) { /* Bias balancing toward cpus of our domain */ if (local_group) - load = target_load(i); + load = target_load(i, load_idx); else - load = source_load(i); + load = source_load(i, load_idx); avg_load += load; } @@ -1870,15 +1887,9 @@ nextgroup: /* Get rid of the scaling factor, rounding down as we divide */ *imbalance = *imbalance / SCHED_LOAD_SCALE; - return busiest; out_balanced: - if (busiest && (idle == NEWLY_IDLE || - (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) { - *imbalance = 1; - return busiest; - } *imbalance = 0; return NULL; @@ -1894,7 +1905,7 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) int i; for_each_cpu_mask(i, group->cpumask) { - load = source_load(i); + load = source_load(i, 0); if (load > max_load) { max_load = load; @@ -1917,7 +1928,8 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, struct sched_group *group; runqueue_t *busiest; unsigned long imbalance; - int nr_moved; + int nr_moved, all_pinned; + int active_balance = 0; spin_lock(&this_rq->lock); schedstat_inc(sd, lb_cnt[idle]); @@ -1934,15 +1946,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, goto out_balanced; } - /* - * This should be "impossible", but since load - * balancing is inherently racy and statistical, - * it could happen in theory. - */ - if (unlikely(busiest == this_rq)) { - WARN_ON(1); - goto out_balanced; - } + BUG_ON(busiest == this_rq); schedstat_add(sd, lb_imbalance[idle], imbalance); @@ -1956,9 +1960,15 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, */ double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, idle); + imbalance, sd, idle, + &all_pinned); spin_unlock(&busiest->lock); + + /* All tasks on this runqueue were pinned by CPU affinity */ + if (unlikely(all_pinned)) + goto out_balanced; } + spin_unlock(&this_rq->lock); if (!nr_moved) { @@ -1966,36 +1976,38 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, sd->nr_balance_failed++; if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) { - int wake = 0; spin_lock(&busiest->lock); if (!busiest->active_balance) { busiest->active_balance = 1; busiest->push_cpu = this_cpu; - wake = 1; + active_balance = 1; } spin_unlock(&busiest->lock); - if (wake) + if (active_balance) wake_up_process(busiest->migration_thread); /* * We've kicked active balancing, reset the failure * counter. */ - sd->nr_balance_failed = sd->cache_nice_tries; + sd->nr_balance_failed = sd->cache_nice_tries+1; } - - /* - * We were unbalanced, but unsuccessful in move_tasks(), - * so bump the balance_interval to lessen the lock contention. - */ - if (sd->balance_interval < sd->max_interval) - sd->balance_interval++; - } else { + } else sd->nr_balance_failed = 0; + if (likely(!active_balance)) { /* We were unbalanced, so reset the balancing interval */ sd->balance_interval = sd->min_interval; + } else { + /* + * If we've begun active balancing, start to back off. This + * case may not be covered by the all_pinned logic if there + * is only 1 task on the busy runqueue (because we don't call + * move_tasks). + */ + if (sd->balance_interval < sd->max_interval) + sd->balance_interval *= 2; } return nr_moved; @@ -2005,6 +2017,7 @@ out_balanced: schedstat_inc(sd, lb_balanced[idle]); + sd->nr_balance_failed = 0; /* tune up the balancing interval */ if (sd->balance_interval < sd->max_interval) sd->balance_interval *= 2; @@ -2030,31 +2043,36 @@ static int load_balance_newidle(int this_cpu, runqueue_t *this_rq, schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE); if (!group) { - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); - goto out; + goto out_balanced; } busiest = find_busiest_queue(group); - if (!busiest || busiest == this_rq) { - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + if (!busiest) { schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); - goto out; + goto out_balanced; } + BUG_ON(busiest == this_rq); + /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); nr_moved = move_tasks(this_rq, this_cpu, busiest, - imbalance, sd, NEWLY_IDLE); + imbalance, sd, NEWLY_IDLE, NULL); if (!nr_moved) schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + else + sd->nr_balance_failed = 0; spin_unlock(&busiest->lock); - -out: return nr_moved; + +out_balanced: + schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + sd->nr_balance_failed = 0; + return 0; } /* @@ -2086,56 +2104,42 @@ static inline void idle_balance(int this_cpu, runqueue_t *this_rq) static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu) { struct sched_domain *sd; - struct sched_group *cpu_group; runqueue_t *target_rq; - cpumask_t visited_cpus; - int cpu; + int target_cpu = busiest_rq->push_cpu; + + if (busiest_rq->nr_running <= 1) + /* no task to move */ + return; + + target_rq = cpu_rq(target_cpu); /* - * Search for suitable CPUs to push tasks to in successively higher - * domains with SD_LOAD_BALANCE set. + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by + * Bjorn Helgaas on a 128-cpu setup. */ - visited_cpus = CPU_MASK_NONE; - for_each_domain(busiest_cpu, sd) { - if (!(sd->flags & SD_LOAD_BALANCE)) - /* no more domains to search */ - break; + BUG_ON(busiest_rq == target_rq); - schedstat_inc(sd, alb_cnt); + /* move a task from busiest_rq to target_rq */ + double_lock_balance(busiest_rq, target_rq); - cpu_group = sd->groups; - do { - for_each_cpu_mask(cpu, cpu_group->cpumask) { - if (busiest_rq->nr_running <= 1) - /* no more tasks left to move */ - return; - if (cpu_isset(cpu, visited_cpus)) - continue; - cpu_set(cpu, visited_cpus); - if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu) - continue; - - target_rq = cpu_rq(cpu); - /* - * This condition is "impossible", if it occurs - * we need to fix it. Originally reported by - * Bjorn Helgaas on a 128-cpu setup. - */ - BUG_ON(busiest_rq == target_rq); - - /* move a task from busiest_rq to target_rq */ - double_lock_balance(busiest_rq, target_rq); - if (move_tasks(target_rq, cpu, busiest_rq, - 1, sd, SCHED_IDLE)) { - schedstat_inc(sd, alb_pushed); - } else { - schedstat_inc(sd, alb_failed); - } - spin_unlock(&target_rq->lock); - } - cpu_group = cpu_group->next; - } while (cpu_group != sd->groups); - } + /* Search for an sd spanning us and the target CPU. */ + for_each_domain(target_cpu, sd) + if ((sd->flags & SD_LOAD_BALANCE) && + cpu_isset(busiest_cpu, sd->span)) + break; + + if (unlikely(sd == NULL)) + goto out; + + schedstat_inc(sd, alb_cnt); + + if (move_tasks(target_rq, target_cpu, busiest_rq, 1, sd, SCHED_IDLE, NULL)) + schedstat_inc(sd, alb_pushed); + else + schedstat_inc(sd, alb_failed); +out: + spin_unlock(&target_rq->lock); } /* @@ -2156,18 +2160,23 @@ static void rebalance_tick(int this_cpu, runqueue_t *this_rq, unsigned long old_load, this_load; unsigned long j = jiffies + CPU_OFFSET(this_cpu); struct sched_domain *sd; + int i; - /* Update our load */ - old_load = this_rq->cpu_load; this_load = this_rq->nr_running * SCHED_LOAD_SCALE; - /* - * Round up the averaging division if load is increasing. This - * prevents us from getting stuck on 9 if the load is 10, for - * example. - */ - if (this_load > old_load) - old_load++; - this_rq->cpu_load = (old_load + this_load) / 2; + /* Update our load */ + for (i = 0; i < 3; i++) { + unsigned long new_load = this_load; + int scale = 1 << i; + old_load = this_rq->cpu_load[i]; + /* + * Round up the averaging division if load is increasing. This + * prevents us from getting stuck on 9 if the load is 10, for + * example. + */ + if (new_load > old_load) + new_load += scale-1; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; + } for_each_domain(this_cpu, sd) { unsigned long interval; @@ -2576,7 +2585,7 @@ void fastcall add_preempt_count(int val) /* * Underflow? */ - BUG_ON(((int)preempt_count() < 0)); + BUG_ON((preempt_count() < 0)); preempt_count() += val; /* * Spinlock count overflowing soon? @@ -2869,7 +2878,7 @@ need_resched: int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key) { - task_t *p = curr->task; + task_t *p = curr->private; return try_to_wake_up(p, mode, sync); } @@ -3755,19 +3764,22 @@ EXPORT_SYMBOL(cond_resched); */ int cond_resched_lock(spinlock_t * lock) { + int ret = 0; + if (need_lockbreak(lock)) { spin_unlock(lock); cpu_relax(); + ret = 1; spin_lock(lock); } if (need_resched()) { _raw_spin_unlock(lock); preempt_enable_no_resched(); __cond_resched(); + ret = 1; spin_lock(lock); - return 1; } - return 0; + return ret; } EXPORT_SYMBOL(cond_resched_lock); @@ -3811,7 +3823,7 @@ EXPORT_SYMBOL(yield); */ void __sched io_schedule(void) { - struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); atomic_inc(&rq->nr_iowait); schedule(); @@ -3822,7 +3834,7 @@ EXPORT_SYMBOL(io_schedule); long __sched io_schedule_timeout(long timeout) { - struct runqueue *rq = &per_cpu(runqueues, _smp_processor_id()); + struct runqueue *rq = &per_cpu(runqueues, raw_smp_processor_id()); long ret; atomic_inc(&rq->nr_iowait); @@ -4924,13 +4936,15 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); + rq->nr_running = 0; rq->active = rq->arrays; rq->expired = rq->arrays + 1; rq->best_expired_prio = MAX_PRIO; #ifdef CONFIG_SMP rq->sd = &sched_domain_dummy; - rq->cpu_load = 0; + for (j = 1; j < 3; j++) + rq->cpu_load[j] = 0; rq->active_balance = 0; rq->push_cpu = 0; rq->migration_thread = NULL;