*/
unsigned long nr_running;
#ifdef CONFIG_SMP
- unsigned long cpu_load;
+ unsigned long cpu_load[3];
#endif
unsigned long long nr_switches;
* We want to under-estimate the load of migration sources, to
* balance conservatively.
*/
-static inline unsigned long source_load(int cpu)
+static inline unsigned long source_load(int cpu, int type)
{
runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ if (type == 0)
+ return load_now;
- return min(rq->cpu_load, load_now);
+ return min(rq->cpu_load[type-1], load_now);
}
/*
* Return a high guess at the load of a migration-target cpu
*/
-static inline unsigned long target_load(int cpu)
+static inline unsigned long target_load(int cpu, int type)
{
runqueue_t *rq = cpu_rq(cpu);
unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;
+ if (type == 0)
+ return load_now;
- return max(rq->cpu_load, load_now);
+ return max(rq->cpu_load[type-1], load_now);
}
#endif
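As a quick illustration of the biasing that the new type argument gives source_load()/target_load() (0 means "use the instantaneous load", higher values consult progressively longer cpu_load[] histories), here is a minimal user-space sketch. It is illustrative only: fake_rq stands in for runqueue_t, and SCHED_LOAD_SCALE plus the sample numbers are assumed rather than taken from the kernel.

/* sketch only: not kernel code, values are made up */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed for the sketch */

struct fake_rq {
	unsigned long nr_running;
	unsigned long cpu_load[3];	/* short, medium, long averaging periods */
};

static unsigned long min_ul(unsigned long a, unsigned long b) { return a < b ? a : b; }
static unsigned long max_ul(unsigned long a, unsigned long b) { return a > b ? a : b; }

/* under-estimate a migration source, as source_load() does */
static unsigned long src_load(struct fake_rq *rq, int type)
{
	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;

	if (type == 0)
		return load_now;
	return min_ul(rq->cpu_load[type-1], load_now);
}

/* over-estimate a migration target, as target_load() does */
static unsigned long tgt_load(struct fake_rq *rq, int type)
{
	unsigned long load_now = rq->nr_running * SCHED_LOAD_SCALE;

	if (type == 0)
		return load_now;
	return max_ul(rq->cpu_load[type-1], load_now);
}

int main(void)
{
	/* one task runnable right now, but a history of roughly three */
	struct fake_rq rq = { 1, { 3*SCHED_LOAD_SCALE, 3*SCHED_LOAD_SCALE, 3*SCHED_LOAD_SCALE } };
	int type;

	for (type = 0; type < 4; type++)
		printf("type %d: source %lu, target %lu\n",
		       type, src_load(&rq, type), tgt_load(&rq, type));
	return 0;
}

With these numbers the source estimate stays at the instantaneous 1-task load for every index, while the target estimate keeps the higher historical value, which is exactly the conservative bias the comment above describes.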
for_each_domain(cpu, sd) {
if (sd->flags & SD_WAKE_IDLE) {
- cpus_and(tmp, sd->span, cpu_online_map);
- cpus_and(tmp, tmp, p->cpus_allowed);
+ cpus_and(tmp, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, tmp) {
if (idle_cpu(i))
return i;
}
}
- else break;
+ else
+ break;
}
return cpu;
}
runqueue_t *rq;
#ifdef CONFIG_SMP
unsigned long load, this_load;
- struct sched_domain *sd;
+ struct sched_domain *sd, *this_sd = NULL;
int new_cpu;
#endif
if (unlikely(task_running(rq, p)))
goto out_activate;
-#ifdef CONFIG_SCHEDSTATS
+ new_cpu = cpu;
+
schedstat_inc(rq, ttwu_cnt);
if (cpu == this_cpu) {
schedstat_inc(rq, ttwu_local);
- } else {
- for_each_domain(this_cpu, sd) {
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_wake_remote);
- break;
- }
+ goto out_set_cpu;
+ }
+
+ for_each_domain(this_cpu, sd) {
+ if (cpu_isset(cpu, sd->span)) {
+ schedstat_inc(sd, ttwu_wake_remote);
+ this_sd = sd;
+ break;
}
}
-#endif
- new_cpu = cpu;
- if (cpu == this_cpu || unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
+ if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed)))
goto out_set_cpu;
- load = source_load(cpu);
- this_load = target_load(this_cpu);
-
/*
- * If sync wakeup then subtract the (maximum possible) effect of
- * the currently running task from the load of the current CPU:
+ * Check for affine wakeup and passive balancing possibilities.
*/
- if (sync)
- this_load -= SCHED_LOAD_SCALE;
+ if (this_sd) {
+ int idx = this_sd->wake_idx;
+ unsigned int imbalance;
- /* Don't pull the task off an idle CPU to a busy one */
- if (load < SCHED_LOAD_SCALE/2 && this_load > SCHED_LOAD_SCALE/2)
- goto out_set_cpu;
+ imbalance = 100 + (this_sd->imbalance_pct - 100) / 2;
- new_cpu = this_cpu; /* Wake to this CPU if we can */
+ load = source_load(cpu, idx);
+ this_load = target_load(this_cpu, idx);
- /*
- * Scan domains for affine wakeup and passive balancing
- * possibilities.
- */
- for_each_domain(this_cpu, sd) {
- unsigned int imbalance;
- /*
- * Start passive balancing when half the imbalance_pct
- * limit is reached.
- */
- imbalance = sd->imbalance_pct + (sd->imbalance_pct - 100) / 2;
+ new_cpu = this_cpu; /* Wake to this CPU if we can */
- if ((sd->flags & SD_WAKE_AFFINE) &&
- !task_hot(p, rq->timestamp_last_tick, sd)) {
+ if (this_sd->flags & SD_WAKE_AFFINE) {
+ unsigned long tl = this_load;
/*
- * This domain has SD_WAKE_AFFINE and p is cache cold
- * in this domain.
+ * If sync wakeup then subtract the (maximum possible)
+ * effect of the currently running task from the load
+ * of the current CPU:
*/
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_move_affine);
+ if (sync)
+ tl -= SCHED_LOAD_SCALE;
+
+ if ((tl <= load &&
+ tl + target_load(cpu, idx) <= SCHED_LOAD_SCALE) ||
+ 100*(tl + SCHED_LOAD_SCALE) <= imbalance*load) {
+ /*
+ * This domain has SD_WAKE_AFFINE and the
+ * wakeup does not create a bad imbalance,
+ * so pull p to this CPU.
+ */
+ schedstat_inc(this_sd, ttwu_move_affine);
goto out_set_cpu;
}
- } else if ((sd->flags & SD_WAKE_BALANCE) &&
- imbalance*this_load <= 100*load) {
- /*
- * This domain has SD_WAKE_BALANCE and there is
- * an imbalance.
- */
- if (cpu_isset(cpu, sd->span)) {
- schedstat_inc(sd, ttwu_move_balance);
+ }
+
+ /*
+ * Start passive balancing when half the imbalance_pct
+ * limit is reached.
+ */
+ if (this_sd->flags & SD_WAKE_BALANCE) {
+ if (imbalance*this_load <= 100*load) {
+ schedstat_inc(this_sd, ttwu_move_balance);
goto out_set_cpu;
}
}
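To see what the two tests above amount to, here is a rough user-space sketch with a typical imbalance_pct of 125, so the derived threshold is 100 + 25/2 = 112. The load figures, imbalance_pct and SCHED_LOAD_SCALE are assumed for illustration; the two conditions are copied from the wakeup path above.

/* sketch only: numbers assumed, tests mirror the wakeup path above */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed for the sketch */

int main(void)
{
	unsigned int imbalance_pct = 125;		/* assumed domain tuning */
	unsigned int imbalance = 100 + (imbalance_pct - 100) / 2;

	unsigned long load = 3 * SCHED_LOAD_SCALE;	/* waker's previous CPU, source_load() */
	unsigned long this_load = 1 * SCHED_LOAD_SCALE;	/* waking CPU, target_load() */
	unsigned long remote = 3 * SCHED_LOAD_SCALE;	/* target_load() of the previous CPU */
	unsigned long tl = this_load;			/* no sync wakeup in this example */

	int affine = (tl <= load && tl + remote <= SCHED_LOAD_SCALE) ||
		     100*(tl + SCHED_LOAD_SCALE) <= imbalance*load;
	int passive = imbalance*this_load <= 100*load;

	printf("imbalance=%u affine=%d passive=%d\n", imbalance, affine, passive);
	return 0;
}

In this example the affine test already succeeds (25600 <= 112*384), so the task is woken on this CPU before passive balancing is even consulted.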
cpus_and(mask, sd->span, p->cpus_allowed);
for_each_cpu_mask(i, mask) {
- load = target_load(i);
+ load = target_load(i, sd->wake_idx);
if (load < min_load) {
min_cpu = i;
}
/* add +1 to account for the new task */
- this_load = source_load(this_cpu) + SCHED_LOAD_SCALE;
+ this_load = source_load(this_cpu, sd->wake_idx) + SCHED_LOAD_SCALE;
/*
* Would with the addition of the new task to the
* current CPU there be an imbalance between this
* CPU and the idlest CPU?
*/
static inline
int can_migrate_task(task_t *p, runqueue_t *rq, int this_cpu,
- struct sched_domain *sd, enum idle_type idle)
+ struct sched_domain *sd, enum idle_type idle, int *all_pinned)
{
/*
* We do not migrate tasks that are:
* 1) running (obviously), or
* 2) cannot be migrated to this CPU due to cpus_allowed, or
* 3) are cache-hot on their current CPU.
*/
- if (task_running(rq, p))
- return 0;
if (!cpu_isset(this_cpu, p->cpus_allowed))
return 0;
+ *all_pinned = 0;
+
+ if (task_running(rq, p))
+ return 0;
/*
* Aggressive migration if:
return 1;
if (task_hot(p, rq->timestamp_last_tick, sd))
- return 0;
+ return 0;
return 1;
}
*/
static int move_tasks(runqueue_t *this_rq, int this_cpu, runqueue_t *busiest,
unsigned long max_nr_move, struct sched_domain *sd,
- enum idle_type idle)
+ enum idle_type idle, int *all_pinned)
{
prio_array_t *array, *dst_array;
struct list_head *head, *curr;
- int idx, pulled = 0;
+ int idx, pulled = 0, pinned = 0;
task_t *tmp;
- if (max_nr_move <= 0 || busiest->nr_running <= 1)
+ if (max_nr_move == 0)
goto out;
+ pinned = 1;
+
/*
* We first consider expired tasks. Those will likely not be
* executed in the near future, and they are most likely to
curr = curr->prev;
- if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle)) {
+ if (!can_migrate_task(tmp, busiest, this_cpu, sd, idle, &pinned)) {
if (curr != head)
goto skip_queue;
idx++;
* inside pull_task().
*/
schedstat_add(sd, lb_gained[idle], pulled);
+
+ if (all_pinned)
+ *all_pinned = pinned;
return pulled;
}
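The pinned flag starts at 1 and can_migrate_task() clears it as soon as any candidate task is at least allowed on the destination CPU, even if that task is then rejected for another reason. A reduced user-space sketch of that contract follows; mock_task and the field names are mocked stand-ins, not kernel code.

/* sketch only: the checks mirror can_migrate_task()'s ordering */
#include <stdio.h>

struct mock_task {
	int allowed_on_dst;	/* stands in for cpu_isset(this_cpu, p->cpus_allowed) */
	int running;		/* stands in for task_running(rq, p) */
};

static int can_migrate(struct mock_task *p, int *all_pinned)
{
	if (!p->allowed_on_dst)
		return 0;	/* affinity-pinned: leave *all_pinned alone */

	*all_pinned = 0;	/* at least one task could legally move here */

	if (p->running)
		return 0;	/* rejected, but not because of affinity */
	return 1;
}

int main(void)
{
	struct mock_task queue[] = {
		{ 0, 0 },	/* pinned elsewhere */
		{ 0, 1 },	/* pinned elsewhere and running */
	};
	int pinned = 1, moved = 0;
	unsigned int i;

	for (i = 0; i < sizeof(queue)/sizeof(queue[0]); i++)
		moved += can_migrate(&queue[i], &pinned);

	/* the caller uses pinned==1 to bail out instead of counting this
	 * as a balance failure that could escalate to active balancing */
	printf("moved=%d all_pinned=%d\n", moved, pinned);
	return 0;
}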
{
struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+ int load_idx;
max_load = this_load = total_load = total_pwr = 0;
+ if (idle == NOT_IDLE)
+ load_idx = sd->busy_idx;
+ else if (idle == NEWLY_IDLE)
+ load_idx = sd->newidle_idx;
+ else
+ load_idx = sd->idle_idx;
do {
unsigned long load;
for_each_cpu_mask(i, group->cpumask) {
/* Bias balancing toward cpus of our domain */
if (local_group)
- load = target_load(i);
+ load = target_load(i, load_idx);
else
- load = source_load(i);
+ load = source_load(i, load_idx);
avg_load += load;
}
/* Get rid of the scaling factor, rounding down as we divide */
*imbalance = *imbalance / SCHED_LOAD_SCALE;
-
return busiest;
out_balanced:
- if (busiest && (idle == NEWLY_IDLE ||
- (idle == SCHED_IDLE && max_load > SCHED_LOAD_SCALE)) ) {
- *imbalance = 1;
- return busiest;
- }
*imbalance = 0;
return NULL;
int i;
for_each_cpu_mask(i, group->cpumask) {
- load = source_load(i);
+ load = source_load(i, 0);
if (load > max_load) {
max_load = load;
struct sched_group *group;
runqueue_t *busiest;
unsigned long imbalance;
- int nr_moved;
+ int nr_moved, all_pinned;
+ int active_balance = 0;
spin_lock(&this_rq->lock);
schedstat_inc(sd, lb_cnt[idle]);
goto out_balanced;
}
- /*
- * This should be "impossible", but since load
- * balancing is inherently racy and statistical,
- * it could happen in theory.
- */
- if (unlikely(busiest == this_rq)) {
- WARN_ON(1);
- goto out_balanced;
- }
+ BUG_ON(busiest == this_rq);
schedstat_add(sd, lb_imbalance[idle], imbalance);
*/
double_lock_balance(this_rq, busiest);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, idle);
+ imbalance, sd, idle,
+ &all_pinned);
spin_unlock(&busiest->lock);
+
+ /* All tasks on this runqueue were pinned by CPU affinity */
+ if (unlikely(all_pinned))
+ goto out_balanced;
}
+
spin_unlock(&this_rq->lock);
if (!nr_moved) {
sd->nr_balance_failed++;
if (unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2)) {
- int wake = 0;
spin_lock(&busiest->lock);
if (!busiest->active_balance) {
busiest->active_balance = 1;
busiest->push_cpu = this_cpu;
- wake = 1;
+ active_balance = 1;
}
spin_unlock(&busiest->lock);
- if (wake)
+ if (active_balance)
wake_up_process(busiest->migration_thread);
/*
* We've kicked active balancing, reset the failure
* counter.
*/
- sd->nr_balance_failed = sd->cache_nice_tries;
+ sd->nr_balance_failed = sd->cache_nice_tries+1;
}
-
- /*
- * We were unbalanced, but unsuccessful in move_tasks(),
- * so bump the balance_interval to lessen the lock contention.
- */
- if (sd->balance_interval < sd->max_interval)
- sd->balance_interval++;
- } else {
+ } else
sd->nr_balance_failed = 0;
+ if (likely(!active_balance)) {
/* We were unbalanced, so reset the balancing interval */
sd->balance_interval = sd->min_interval;
+ } else {
+ /*
+ * If we've begun active balancing, start to back off. This
+ * case may not be covered by the all_pinned logic if there
+ * is only 1 task on the busy runqueue (because we don't call
+ * move_tasks).
+ */
+ if (sd->balance_interval < sd->max_interval)
+ sd->balance_interval *= 2;
}
return nr_moved;
schedstat_inc(sd, lb_balanced[idle]);
+ sd->nr_balance_failed = 0;
/* tune up the balancing interval */
if (sd->balance_interval < sd->max_interval)
sd->balance_interval *= 2;
schedstat_inc(sd, lb_cnt[NEWLY_IDLE]);
group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE);
if (!group) {
- schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]);
- goto out;
+ goto out_balanced;
}
busiest = find_busiest_queue(group);
- if (!busiest || busiest == this_rq) {
- schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ if (!busiest) {
schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]);
- goto out;
+ goto out_balanced;
}
+ BUG_ON(busiest == this_rq);
+
/* Attempt to move tasks */
double_lock_balance(this_rq, busiest);
schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance);
nr_moved = move_tasks(this_rq, this_cpu, busiest,
- imbalance, sd, NEWLY_IDLE);
+ imbalance, sd, NEWLY_IDLE, NULL);
if (!nr_moved)
schedstat_inc(sd, lb_failed[NEWLY_IDLE]);
+ else
+ sd->nr_balance_failed = 0;
spin_unlock(&busiest->lock);
-
-out:
return nr_moved;
+
+out_balanced:
+ schedstat_inc(sd, lb_balanced[NEWLY_IDLE]);
+ sd->nr_balance_failed = 0;
+ return 0;
}
/*
static void active_load_balance(runqueue_t *busiest_rq, int busiest_cpu)
{
struct sched_domain *sd;
- struct sched_group *cpu_group;
runqueue_t *target_rq;
- cpumask_t visited_cpus;
- int cpu;
+ int target_cpu = busiest_rq->push_cpu;
+
+ if (busiest_rq->nr_running <= 1)
+ /* no task to move */
+ return;
+
+ target_rq = cpu_rq(target_cpu);
/*
- * Search for suitable CPUs to push tasks to in successively higher
- * domains with SD_LOAD_BALANCE set.
+ * This condition is "impossible"; if it occurs
+ * we need to fix it. Originally reported by
+ * Bjorn Helgaas on a 128-cpu setup.
*/
- visited_cpus = CPU_MASK_NONE;
- for_each_domain(busiest_cpu, sd) {
- if (!(sd->flags & SD_LOAD_BALANCE))
- /* no more domains to search */
- break;
+ BUG_ON(busiest_rq == target_rq);
- schedstat_inc(sd, alb_cnt);
+ /* move a task from busiest_rq to target_rq */
+ double_lock_balance(busiest_rq, target_rq);
- cpu_group = sd->groups;
- do {
- for_each_cpu_mask(cpu, cpu_group->cpumask) {
- if (busiest_rq->nr_running <= 1)
- /* no more tasks left to move */
- return;
- if (cpu_isset(cpu, visited_cpus))
- continue;
- cpu_set(cpu, visited_cpus);
- if (!cpu_and_siblings_are_idle(cpu) || cpu == busiest_cpu)
- continue;
-
- target_rq = cpu_rq(cpu);
- /*
- * This condition is "impossible", if it occurs
- * we need to fix it. Originally reported by
- * Bjorn Helgaas on a 128-cpu setup.
- */
- BUG_ON(busiest_rq == target_rq);
-
- /* move a task from busiest_rq to target_rq */
- double_lock_balance(busiest_rq, target_rq);
- if (move_tasks(target_rq, cpu, busiest_rq,
- 1, sd, SCHED_IDLE)) {
- schedstat_inc(sd, alb_pushed);
- } else {
- schedstat_inc(sd, alb_failed);
- }
- spin_unlock(&target_rq->lock);
- }
- cpu_group = cpu_group->next;
- } while (cpu_group != sd->groups);
- }
+ /* Search for an sd spanning us and the target CPU. */
+ for_each_domain(target_cpu, sd)
+ if ((sd->flags & SD_LOAD_BALANCE) &&
+ cpu_isset(busiest_cpu, sd->span))
+ break;
+
+ if (unlikely(sd == NULL))
+ goto out;
+
+ schedstat_inc(sd, alb_cnt);
+
+ if (move_tasks(target_rq, target_cpu, busiest_rq,
+ 1, sd, SCHED_IDLE, NULL))
+ schedstat_inc(sd, alb_pushed);
+ else
+ schedstat_inc(sd, alb_failed);
+out:
+ spin_unlock(&target_rq->lock);
}
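The rewritten active_load_balance() walks the target CPU's domain hierarchy upward until it finds a balancing domain whose span also covers the busiest CPU. A toy illustration of that walk is below; the bitmask cpumasks, the two-level tree and the flag value are assumptions for the sketch, not kernel definitions.

/* sketch only: two-level domain tree for CPU 2, plain bitmask spans */
#include <stdio.h>

#define SD_LOAD_BALANCE 0x01	/* assumed flag value for the sketch */

struct mock_domain {
	unsigned long span;		/* stands in for sd->span */
	int flags;
	struct mock_domain *parent;
};

int main(void)
{
	struct mock_domain node = { 0x0f, SD_LOAD_BALANCE, NULL };	/* CPUs 0-3 */
	struct mock_domain core = { 0x0c, SD_LOAD_BALANCE, &node };	/* CPUs 2-3 */
	struct mock_domain *sd;
	int busiest_cpu = 0;	/* the overloaded CPU we are pulling from */

	/* same idea as above: lowest balancing domain spanning both CPUs */
	for (sd = &core; sd; sd = sd->parent)
		if ((sd->flags & SD_LOAD_BALANCE) && (sd->span & (1UL << busiest_cpu)))
			break;

	printf("balancing domain span: 0x%02lx\n", sd ? sd->span : 0UL);
	return 0;
}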
/*
unsigned long old_load, this_load;
unsigned long j = jiffies + CPU_OFFSET(this_cpu);
struct sched_domain *sd;
+ int i;
- /* Update our load */
- old_load = this_rq->cpu_load;
this_load = this_rq->nr_running * SCHED_LOAD_SCALE;
- /*
- * Round up the averaging division if load is increasing. This
- * prevents us from getting stuck on 9 if the load is 10, for
- * example.
- */
- if (this_load > old_load)
- old_load++;
- this_rq->cpu_load = (old_load + this_load) / 2;
+ /* Update our load */
+ for (i = 0; i < 3; i++) {
+ unsigned long new_load = this_load;
+ int scale = 1 << i;
+ old_load = this_rq->cpu_load[i];
+ /*
+ * Round up the averaging division if load is increasing. This
+ * prevents us from getting stuck on 9 if the load is 10, for
+ * example.
+ */
+ if (new_load > old_load)
+ new_load += scale-1;
+ this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
+ }
for_each_domain(this_cpu, sd) {
unsigned long interval;
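Index i of cpu_load[] decays with scale 1 << i, so cpu_load[0] is effectively the instantaneous load while cpu_load[2] is the slowest-moving average. The standalone sketch below replays the averaging loop above in user space to show the convergence when the load jumps; SCHED_LOAD_SCALE and the scenario are assumed values for illustration.

/* sketch only: replays the cpu_load[] averaging loop in user space */
#include <stdio.h>

#define SCHED_LOAD_SCALE 128UL	/* assumed for the sketch */

int main(void)
{
	unsigned long cpu_load[3] = { 0, 0, 0 };
	unsigned long this_load = 4 * SCHED_LOAD_SCALE;	/* load jumps to 4 tasks */
	int tick, i;

	for (tick = 0; tick < 8; tick++) {
		for (i = 0; i < 3; i++) {
			unsigned long old_load = cpu_load[i];
			unsigned long new_load = this_load;
			int scale = 1 << i;

			/* round up when rising, as in the patch */
			if (new_load > old_load)
				new_load += scale - 1;
			cpu_load[i] = (old_load*(scale-1) + new_load) / scale;
		}
		printf("tick %d: %4lu %4lu %4lu\n",
		       tick, cpu_load[0], cpu_load[1], cpu_load[2]);
	}
	return 0;
}

cpu_load[0] reaches the new load immediately while cpu_load[2] approaches it over several ticks, which is what lets source_load()/target_load() pick between a reactive and a conservative view of the same CPU.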
int default_wake_function(wait_queue_t *curr, unsigned mode, int sync, void *key)
{
- task_t *p = curr->task;
+ task_t *p = curr->private;
return try_to_wake_up(p, mode, sync);
}
rq = cpu_rq(i);
spin_lock_init(&rq->lock);
+ rq->nr_running = 0;
rq->active = rq->arrays;
rq->expired = rq->arrays + 1;
rq->best_expired_prio = MAX_PRIO;
#ifdef CONFIG_SMP
rq->sd = &sched_domain_dummy;
- rq->cpu_load = 0;
+ for (j = 0; j < 3; j++)
+ rq->cpu_load[j] = 0;
rq->active_balance = 0;
rq->push_cpu = 0;
rq->migration_thread = NULL;