X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched.c;h=08f86178aa34da228080edbdd8649a2a5b3a9d71;hb=ac98695d6c1508b724f246f38ce57fb4e3cec356;hp=d327511d268e062458372ae543b45a7fd9290f2b;hpb=fe2eea3fafb3df2f5b8a55a48bcbb0d23b3b5618;p=linux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index d327511d26..08f86178aa 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -56,6 +56,16 @@ #include +/* + * Scheduler clock - returns current time in nanosec units. + * This is default implementation. + * Architectures and sub-architectures can override this. + */ +unsigned long long __attribute__((weak)) sched_clock(void) +{ + return (unsigned long long)jiffies * (1000000000 / HZ); +} + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -225,8 +235,10 @@ struct rq { unsigned long nr_uninterruptible; unsigned long expired_timestamp; - unsigned long long timestamp_last_tick; + /* Cached timestamp set by update_cpu_clock() */ + unsigned long long most_recent_timestamp; struct task_struct *curr, *idle; + unsigned long next_balance; struct mm_struct *prev_mm; struct prio_array *active, *expired, arrays[2]; int best_expired_prio; @@ -426,7 +438,7 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) * bump this up when changing the output format or the meaning of an existing * format, so that tools can adapt (or abort) */ -#define SCHEDSTAT_VERSION 12 +#define SCHEDSTAT_VERSION 14 static int show_schedstat(struct seq_file *seq, void *v) { @@ -464,7 +476,8 @@ static int show_schedstat(struct seq_file *seq, void *v) seq_printf(seq, "domain%d %s", dcnt++, mask_str); for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " + "%lu", sd->lb_cnt[itype], sd->lb_balanced[itype], sd->lb_failed[itype], @@ -474,11 +487,13 @@ static int show_schedstat(struct seq_file *seq, void *v) sd->lb_nobusyq[itype], sd->lb_nobusyg[itype]); } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", + seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" + " %lu %lu %lu\n", sd->alb_cnt, sd->alb_failed, sd->alb_pushed, sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, sd->ttwu_move_balance); + sd->ttwu_wake_remote, sd->ttwu_move_affine, + sd->ttwu_move_balance); } preempt_enable(); #endif @@ -938,13 +953,16 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) { unsigned long long now; + if (rt_task(p)) + goto out; + now = sched_clock(); #ifdef CONFIG_SMP if (!local) { /* Compensate for drifting sched_clock */ struct rq *this_rq = this_rq(); - now = (now - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; + now = (now - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; } #endif @@ -959,8 +977,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) (now - p->timestamp) >> 20); } - if (!rt_task(p)) - p->prio = recalc_task_prio(p, now); + p->prio = recalc_task_prio(p, now); /* * This checks to make sure it's not an uninterruptible task @@ -985,7 +1002,7 @@ static void activate_task(struct task_struct *p, struct rq *rq, int local) } } p->timestamp = now; - +out: __activate_task(p, rq); } @@ -1450,7 +1467,9 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync) if (this_sd->flags & SD_WAKE_AFFINE) { unsigned long tl = this_load; - unsigned long tl_per_task = cpu_avg_load_per_task(this_cpu); + unsigned long tl_per_task; + + tl_per_task = cpu_avg_load_per_task(this_cpu); /* * If sync wakeup then subtract the (maximum possible) @@ -1558,6 +1577,7 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) return try_to_wake_up(p, state, 0); } +static void task_running_tick(struct rq *rq, struct task_struct *p); /* * Perform scheduler related setup for a newly forked process p. * p is forked by current. @@ -1618,7 +1638,7 @@ void fastcall sched_fork(struct task_struct *p, int clone_flags) * runqueue lock is not a problem. */ current->time_slice = 1; - scheduler_tick(); + task_running_tick(cpu_rq(cpu), current); } local_irq_enable(); put_cpu(); @@ -1688,8 +1708,8 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * Not the local CPU - must adjust timestamp. This should * get optimised away in the !CONFIG_SMP case. */ - p->timestamp = (p->timestamp - this_rq->timestamp_last_tick) - + rq->timestamp_last_tick; + p->timestamp = (p->timestamp - this_rq->most_recent_timestamp) + + rq->most_recent_timestamp; __activate_task(p, rq); if (TASK_PREEMPTS_CURR(p, rq)) resched_task(rq->curr); @@ -2067,8 +2087,8 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array, set_task_cpu(p, this_cpu); inc_nr_running(p, this_rq); enqueue_task(p, this_array); - p->timestamp = (p->timestamp - src_rq->timestamp_last_tick) - + this_rq->timestamp_last_tick; + p->timestamp = (p->timestamp - src_rq->most_recent_timestamp) + + this_rq->most_recent_timestamp; /* * Note that idle threads have a prio of MAX_PRIO, for this test * to be always true for them. @@ -2104,10 +2124,15 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, * 2) too many balance attempts have failed. */ - if (sd->nr_balance_failed > sd->cache_nice_tries) + if (sd->nr_balance_failed > sd->cache_nice_tries) { +#ifdef CONFIG_SCHEDSTATS + if (task_hot(p, rq->most_recent_timestamp, sd)) + schedstat_inc(sd, lb_hot_gained[idle]); +#endif return 1; + } - if (task_hot(p, rq->timestamp_last_tick, sd)) + if (task_hot(p, rq->most_recent_timestamp, sd)) return 0; return 1; } @@ -2205,11 +2230,6 @@ skip_queue: goto skip_bitmap; } -#ifdef CONFIG_SCHEDSTATS - if (task_hot(tmp, busiest->timestamp_last_tick, sd)) - schedstat_inc(sd, lb_hot_gained[idle]); -#endif - pull_task(busiest, array, tmp, this_rq, dst_array, this_cpu); pulled++; rem_load_move -= tmp->load_weight; @@ -2247,7 +2267,7 @@ out: static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long *imbalance, enum idle_type idle, int *sd_idle, - cpumask_t *cpus) + cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; unsigned long max_load, avg_load, total_load, this_load, total_pwr; @@ -2276,10 +2296,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, unsigned long load, group_capacity; int local_group; int i; + unsigned int balance_cpu = -1, first_idle_cpu = 0; unsigned long sum_nr_running, sum_weighted_load; local_group = cpu_isset(this_cpu, group->cpumask); + if (local_group) + balance_cpu = first_cpu(group->cpumask); + /* Tally up the load of all CPUs in the group */ sum_weighted_load = sum_nr_running = avg_load = 0; @@ -2295,9 +2319,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, *sd_idle = 0; /* Bias balancing toward cpus of our domain */ - if (local_group) + if (local_group) { + if (idle_cpu(i) && !first_idle_cpu) { + first_idle_cpu = 1; + balance_cpu = i; + } + load = target_load(i, load_idx); - else + } else load = source_load(i, load_idx); avg_load += load; @@ -2305,6 +2334,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, sum_weighted_load += rq->raw_weighted_load; } + /* + * First idle cpu or the first cpu(busiest) in this sched group + * is eligible for doing load balancing at this and above + * domains. + */ + if (local_group && balance_cpu != this_cpu && balance) { + *balance = 0; + goto ret; + } + total_load += avg_load; total_pwr += group->cpu_power; @@ -2464,18 +2503,21 @@ small_imbalance: pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = busiest_load_per_task*SCHED_LOAD_SCALE/busiest->cpu_power; + tmp = busiest_load_per_task * SCHED_LOAD_SCALE / + busiest->cpu_power; if (max_load > tmp) pwr_move += busiest->cpu_power * min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ - if (max_load*busiest->cpu_power < - busiest_load_per_task*SCHED_LOAD_SCALE) - tmp = max_load*busiest->cpu_power/this->cpu_power; + if (max_load * busiest->cpu_power < + busiest_load_per_task * SCHED_LOAD_SCALE) + tmp = max_load * busiest->cpu_power / this->cpu_power; else - tmp = busiest_load_per_task*SCHED_LOAD_SCALE/this->cpu_power; - pwr_move += this->cpu_power*min(this_load_per_task, this_load + tmp); + tmp = busiest_load_per_task * SCHED_LOAD_SCALE / + this->cpu_power; + pwr_move += this->cpu_power * + min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ @@ -2496,8 +2538,8 @@ out_balanced: *imbalance = min_load_per_task; return group_min; } -ret: #endif +ret: *imbalance = 0; return NULL; } @@ -2548,7 +2590,8 @@ static inline unsigned long minus_1_or_zero(unsigned long n) * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum idle_type idle) + struct sched_domain *sd, enum idle_type idle, + int *balance) { int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; @@ -2571,7 +2614,11 @@ static int load_balance(int this_cpu, struct rq *this_rq, redo: group = find_busiest_group(sd, this_cpu, &imbalance, idle, &sd_idle, - &cpus); + &cpus, balance); + + if (*balance == 0) + goto out_balanced; + if (!group) { schedstat_inc(sd, lb_nobusyg[idle]); goto out_balanced; @@ -2713,7 +2760,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); redo: group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, - &sd_idle, &cpus); + &sd_idle, &cpus, NULL); if (!group) { schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); goto out_balanced; @@ -2773,14 +2820,28 @@ out_balanced: static void idle_balance(int this_cpu, struct rq *this_rq) { struct sched_domain *sd; + int pulled_task = 0; + unsigned long next_balance = jiffies + 60 * HZ; for_each_domain(this_cpu, sd) { if (sd->flags & SD_BALANCE_NEWIDLE) { /* If we've pulled tasks over stop searching: */ - if (load_balance_newidle(this_cpu, this_rq, sd)) + pulled_task = load_balance_newidle(this_cpu, + this_rq, sd); + if (time_after(next_balance, + sd->last_balance + sd->balance_interval)) + next_balance = sd->last_balance + + sd->balance_interval; + if (pulled_task) break; } } + if (!pulled_task) + /* + * We are going idle. next_balance may be set based on + * a busy processor. So reset next_balance. + */ + this_rq->next_balance = next_balance; } /* @@ -2833,28 +2894,19 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) spin_unlock(&target_rq->lock); } -/* - * rebalance_tick will get called every timer tick, on every CPU. - * - * It checks each scheduling domain to see if it is due to be balanced, - * and initiates a balancing operation if so. - * - * Balancing parameters are set up in arch_init_sched_domains. - */ - -static void -rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) +static void update_load(struct rq *this_rq) { - unsigned long this_load, interval; - struct sched_domain *sd; - int i, scale; + unsigned long this_load; + unsigned int i, scale; this_load = this_rq->raw_weighted_load; /* Update our load: */ - for (i = 0, scale = 1; i < 3; i++, scale <<= 1) { + for (i = 0, scale = 1; i < 3; i++, scale += scale) { unsigned long old_load, new_load; + /* scale is effectively 1 << i now, and >> i divides by scale */ + old_load = this_rq->cpu_load[i]; new_load = this_load; /* @@ -2864,8 +2916,34 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) */ if (new_load > old_load) new_load += scale-1; - this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) / scale; + this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * + * It checks each scheduling domain to see if it is due to be balanced, + * and initiates a balancing operation if so. + * + * Balancing parameters are set up in arch_init_sched_domains. + */ +static DEFINE_SPINLOCK(balancing); + +static void run_rebalance_domains(struct softirq_action *h) +{ + int this_cpu = smp_processor_id(), balance = 1; + struct rq *this_rq = cpu_rq(this_cpu); + unsigned long interval; + struct sched_domain *sd; + /* + * We are idle if there are no processes running. This + * is valid even if we are the idle process (SMT). + */ + enum idle_type idle = !this_rq->nr_running ? + SCHED_IDLE : NOT_IDLE; + /* Earliest time when we have to call run_rebalance_domains again */ + unsigned long next_balance = jiffies + 60*HZ; for_each_domain(this_cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -2880,8 +2958,13 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) if (unlikely(!interval)) interval = 1; - if (jiffies - sd->last_balance >= interval) { - if (load_balance(this_cpu, this_rq, sd, idle)) { + if (sd->flags & SD_SERIALIZE) { + if (!spin_trylock(&balancing)) + goto out; + } + + if (time_after_eq(jiffies, sd->last_balance + interval)) { + if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is @@ -2889,42 +2972,48 @@ rebalance_tick(int this_cpu, struct rq *this_rq, enum idle_type idle) */ idle = NOT_IDLE; } - sd->last_balance += interval; + sd->last_balance = jiffies; } + if (sd->flags & SD_SERIALIZE) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!balance) + break; } + this_rq->next_balance = next_balance; } #else /* * on UP we do not need to balance between CPUs: */ -static inline void rebalance_tick(int cpu, struct rq *rq, enum idle_type idle) -{ -} static inline void idle_balance(int cpu, struct rq *rq) { } #endif -static inline int wake_priority_sleeper(struct rq *rq) +static inline void wake_priority_sleeper(struct rq *rq) { - int ret = 0; - #ifdef CONFIG_SCHED_SMT if (!rq->nr_running) - return 0; + return; spin_lock(&rq->lock); /* * If an SMT sibling task has been put to sleep for priority * reasons reschedule the idle task to see if it can now run. */ - if (rq->nr_running) { + if (rq->nr_running) resched_task(rq->idle); - ret = 1; - } spin_unlock(&rq->lock); #endif - return ret; } DEFINE_PER_CPU(struct kernel_stat, kstat); @@ -2938,7 +3027,8 @@ EXPORT_PER_CPU_SYMBOL(kstat); static inline void update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) { - p->sched_time += now - max(p->timestamp, rq->timestamp_last_tick); + p->sched_time += now - p->last_ran; + p->last_ran = rq->most_recent_timestamp = now; } /* @@ -2951,8 +3041,7 @@ unsigned long long current_sched_time(const struct task_struct *p) unsigned long flags; local_irq_save(flags); - ns = max(p->timestamp, task_rq(p)->timestamp_last_tick); - ns = p->sched_time + sched_clock() - ns; + ns = p->sched_time + sched_clock() - p->last_ran; local_irq_restore(flags); return ns; @@ -3052,35 +3141,12 @@ void account_steal_time(struct task_struct *p, cputime_t steal) cpustat->steal = cputime64_add(cpustat->steal, tmp); } -/* - * This function gets called by the timer code, with HZ frequency. - * We call it with interrupts disabled. - * - * It also gets called by the fork code, when changing the parent's - * timeslices. - */ -void scheduler_tick(void) +static void task_running_tick(struct rq *rq, struct task_struct *p) { - unsigned long long now = sched_clock(); - struct task_struct *p = current; - int cpu = smp_processor_id(); - struct rq *rq = cpu_rq(cpu); - - update_cpu_clock(p, rq, now); - - rq->timestamp_last_tick = now; - - if (p == rq->idle) { - if (wake_priority_sleeper(rq)) - goto out; - rebalance_tick(cpu, rq, SCHED_IDLE); - return; - } - - /* Task might have expired already, but not scheduled off yet */ if (p->array != rq->active) { + /* Task has expired but was not scheduled yet */ set_tsk_need_resched(p); - goto out; + return; } spin_lock(&rq->lock); /* @@ -3148,8 +3214,34 @@ void scheduler_tick(void) } out_unlock: spin_unlock(&rq->lock); -out: - rebalance_tick(cpu, rq, NOT_IDLE); +} + +/* + * This function gets called by the timer code, with HZ frequency. + * We call it with interrupts disabled. + * + * It also gets called by the fork code, when changing the parent's + * timeslices. + */ +void scheduler_tick(void) +{ + unsigned long long now = sched_clock(); + struct task_struct *p = current; + int cpu = smp_processor_id(); + struct rq *rq = cpu_rq(cpu); + + update_cpu_clock(p, rq, now); + + if (p == rq->idle) + /* Task on the idle queue */ + wake_priority_sleeper(rq); + else + task_running_tick(rq, p); +#ifdef CONFIG_SMP + update_load(rq); + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); +#endif } #ifdef CONFIG_SCHED_SMT @@ -3295,7 +3387,8 @@ void fastcall add_preempt_count(int val) /* * Spinlock count overflowing soon? */ - DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= PREEMPT_MASK-10); + DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= + PREEMPT_MASK - 10); } EXPORT_SYMBOL(add_preempt_count); @@ -3349,6 +3442,8 @@ asmlinkage void __sched schedule(void) "%s/0x%08x/%d\n", current->comm, preempt_count(), current->pid); debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); dump_stack(); } profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -4110,13 +4205,12 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) } /** - * sched_setscheduler - change the scheduling policy and/or RT priority of - * a thread. + * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. * @p: the task in question. * @policy: new policy. * @param: structure containing the new RT priority. * - * NOTE: the task may be already dead + * NOTE that the task may be already dead. */ int sched_setscheduler(struct task_struct *p, int policy, struct sched_param *param) @@ -4484,7 +4578,7 @@ asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len, /** * sys_sched_yield - yield the current processor to other threads. * - * this function yields the current CPU by moving the calling thread + * This function yields the current CPU by moving the calling thread * to the expired array. If there are no other threads running on this * CPU then this function will return. */ @@ -4534,15 +4628,6 @@ asmlinkage long sys_sched_yield(void) return 0; } -static inline int __resched_legal(int expected_preempt_count) -{ - if (unlikely(preempt_count() != expected_preempt_count)) - return 0; - if (unlikely(system_state != SYSTEM_RUNNING)) - return 0; - return 1; -} - static void __cond_resched(void) { #ifdef CONFIG_DEBUG_SPINLOCK_SLEEP @@ -4562,7 +4647,8 @@ static void __cond_resched(void) int __sched cond_resched(void) { - if (need_resched() && __resched_legal(0)) { + if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && + system_state == SYSTEM_RUNNING) { __cond_resched(); return 1; } @@ -4588,7 +4674,7 @@ int cond_resched_lock(spinlock_t *lock) ret = 1; spin_lock(lock); } - if (need_resched() && __resched_legal(1)) { + if (need_resched() && system_state == SYSTEM_RUNNING) { spin_release(&lock->dep_map, 1, _THIS_IP_); _raw_spin_unlock(lock); preempt_enable_no_resched(); @@ -4604,7 +4690,7 @@ int __sched cond_resched_softirq(void) { BUG_ON(!in_softirq()); - if (need_resched() && __resched_legal(0)) { + if (need_resched() && system_state == SYSTEM_RUNNING) { raw_local_irq_disable(); _local_bh_enable(); raw_local_irq_enable(); @@ -4619,7 +4705,7 @@ EXPORT_SYMBOL(cond_resched_softirq); /** * yield - yield the current processor to other threads. * - * this is a shortcut for kernel-space yielding - it marks the + * This is a shortcut for kernel-space yielding - it marks the * thread runnable and calls sys_sched_yield(). */ void __sched yield(void) @@ -4994,8 +5080,8 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) * afterwards, and pretending it was a local activate. * This way is cleaner and logically correct. */ - p->timestamp = p->timestamp - rq_src->timestamp_last_tick - + rq_dest->timestamp_last_tick; + p->timestamp = p->timestamp - rq_src->most_recent_timestamp + + rq_dest->most_recent_timestamp; deactivate_task(p, rq_src); __activate_task(p, rq_dest); if (TASK_PREEMPTS_CURR(p, rq_dest)) @@ -5368,16 +5454,19 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!(sd->flags & SD_LOAD_BALANCE)) { printk("does not load-balance\n"); if (sd->parent) - printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain has parent"); + printk(KERN_ERR "ERROR: !SD_LOAD_BALANCE domain" + " has parent"); break; } printk("span %s\n", str); if (!cpu_isset(cpu, sd->span)) - printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->span does not contain " + "CPU%d\n", cpu); if (!cpu_isset(cpu, group->cpumask)) - printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu); + printk(KERN_ERR "ERROR: domain->groups does not contain" + " CPU%d\n", cpu); printk(KERN_DEBUG); for (i = 0; i < level + 2; i++) @@ -5392,7 +5481,8 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) if (!group->cpu_power) { printk("\n"); - printk(KERN_ERR "ERROR: domain->cpu_power not set\n"); + printk(KERN_ERR "ERROR: domain->cpu_power not " + "set\n"); } if (!cpus_weight(group->cpumask)) { @@ -5415,15 +5505,17 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) printk("\n"); if (!cpus_equal(sd->span, groupmask)) - printk(KERN_ERR "ERROR: groups don't span domain->span\n"); + printk(KERN_ERR "ERROR: groups don't span " + "domain->span\n"); level++; sd = sd->parent; + if (!sd) + continue; - if (sd) { - if (!cpus_subset(groupmask, sd->span)) - printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n"); - } + if (!cpus_subset(groupmask, sd->span)) + printk(KERN_ERR "ERROR: parent span is not a superset " + "of domain->span\n"); } while (sd); } @@ -5519,7 +5611,7 @@ static void cpu_attach_domain(struct sched_domain *sd, int cpu) } /* cpus with isolated domains */ -static cpumask_t __cpuinitdata cpu_isolated_map = CPU_MASK_NONE; +static cpumask_t cpu_isolated_map = CPU_MASK_NONE; /* Setup the mask of cpus configured for isolated domains */ static int __init isolated_cpu_setup(char *str) @@ -5741,8 +5833,9 @@ __setup("max_cache_size=", setup_max_cache_size); */ static void touch_cache(void *__cache, unsigned long __size) { - unsigned long size = __size/sizeof(long), chunk1 = size/3, - chunk2 = 2*size/3; + unsigned long size = __size / sizeof(long); + unsigned long chunk1 = size / 3; + unsigned long chunk2 = 2 * size / 3; unsigned long *cache = __cache; int i; @@ -5851,11 +5944,11 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) */ measure_one(cache, size, cpu1, cpu2); for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i*1024, cpu1, cpu2); + cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); measure_one(cache, size, cpu2, cpu1); for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i*1024, cpu2, cpu1); + cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); /* * (We measure the non-migrating [cached] cost on both @@ -5865,17 +5958,17 @@ measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) measure_one(cache, size, cpu1, cpu1); for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i*1024, cpu1, cpu1); + cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); measure_one(cache, size, cpu2, cpu2); for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i*1024, cpu2, cpu2); + cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); /* * Get the per-iteration migration cost: */ - do_div(cost1, 2*ITERATIONS); - do_div(cost2, 2*ITERATIONS); + do_div(cost1, 2 * ITERATIONS); + do_div(cost2, 2 * ITERATIONS); return cost1 - cost2; } @@ -5913,7 +6006,7 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) */ cache = vmalloc(max_size); if (!cache) { - printk("could not vmalloc %d bytes for cache!\n", 2*max_size); + printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); return 1000000; /* return 1 msec on very small boxen */ } @@ -5938,7 +6031,8 @@ static unsigned long long measure_migration_cost(int cpu1, int cpu2) avg_fluct = (avg_fluct + fluct)/2; if (migration_debug) - printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): (%8Ld %8Ld)\n", + printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " + "(%8Ld %8Ld)\n", cpu1, cpu2, size, (long)cost / 1000000, ((long)cost / 100000) % 10, @@ -6033,20 +6127,18 @@ static void calibrate_migration_costs(const cpumask_t *cpu_map) -1 #endif ); - if (system_state == SYSTEM_BOOTING) { - if (num_online_cpus() > 1) { - printk("migration_cost="); - for (distance = 0; distance <= max_distance; distance++) { - if (distance) - printk(","); - printk("%ld", (long)migration_cost[distance] / 1000); - } - printk("\n"); + if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { + printk("migration_cost="); + for (distance = 0; distance <= max_distance; distance++) { + if (distance) + printk(","); + printk("%ld", (long)migration_cost[distance] / 1000); } + printk("\n"); } j1 = jiffies; if (migration_debug) - printk("migration: %ld seconds\n", (j1-j0)/HZ); + printk("migration: %ld seconds\n", (j1-j0) / HZ); /* * Move back to the original CPU. NUMA-Q gets confused @@ -6784,7 +6876,7 @@ void __init sched_init_smp(void) lock_cpu_hotplug(); arch_init_sched_domains(&cpu_online_map); - cpus_andnot(non_isolated_cpus, cpu_online_map, cpu_isolated_map); + cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); unlock_cpu_hotplug(); @@ -6852,6 +6944,10 @@ void __init sched_init(void) set_load_weight(&init_task); +#ifdef CONFIG_SMP + open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); +#endif + #ifdef CONFIG_RT_MUTEXES plist_head_init(&init_task.pi_waiters, &init_task.pi_lock); #endif @@ -6887,6 +6983,8 @@ void __might_sleep(char *file, int line) printk("in_atomic():%d, irqs_disabled():%d\n", in_atomic(), irqs_disabled()); debug_show_held_locks(current); + if (irqs_disabled()) + print_irqtrace_events(current); dump_stack(); } #endif