X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched.c;h=b533d6db78aab0afd5840627e0b568797fb12fbd;hb=2f81eccbd7a5440b43ff874c8f02d6143f41ba4f;hp=0559665a3a0b14e485edccf09282511e0b1fa60a;hpb=aba2da66cfbf7790ad79d4dee95871127d5ddf5e;p=linux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index 0559665a3a..b533d6db78 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -261,9 +262,9 @@ struct rq { s64 clock_max_delta; unsigned int clock_warps, clock_overflows; - unsigned int clock_unstable_events; - - struct sched_class *load_balance_class; + u64 idle_clock; + unsigned int clock_deep_idle_events; + u64 tick_timestamp; atomic_t nr_iowait; @@ -301,7 +302,7 @@ struct rq { struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); static DEFINE_MUTEX(sched_hotcpu_mutex); static inline void check_preempt_curr(struct rq *rq, struct task_struct *p) @@ -319,15 +320,19 @@ static inline int cpu_of(struct rq *rq) } /* - * Per-runqueue clock, as finegrained as the platform can give us: + * Update the per-runqueue clock, as finegrained as the platform can give + * us, but without assuming monotonicity, etc.: */ -static unsigned long long __rq_clock(struct rq *rq) +static void __update_rq_clock(struct rq *rq) { u64 prev_raw = rq->prev_clock_raw; u64 now = sched_clock(); s64 delta = now - prev_raw; u64 clock = rq->clock; +#ifdef CONFIG_SCHED_DEBUG + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); +#endif /* * Protect against sched_clock() occasionally going backwards: */ @@ -338,8 +343,11 @@ static unsigned long long __rq_clock(struct rq *rq) /* * Catch too large forward jumps too: */ - if (unlikely(delta > 2*TICK_NSEC)) { - clock++; + if (unlikely(clock + delta > rq->tick_timestamp + TICK_NSEC)) { + if (clock < rq->tick_timestamp + TICK_NSEC) + clock = rq->tick_timestamp + TICK_NSEC; + else + clock++; rq->clock_overflows++; } else { if (unlikely(delta > rq->clock_max_delta)) @@ -350,18 +358,12 @@ static unsigned long long __rq_clock(struct rq *rq) rq->prev_clock_raw = now; rq->clock = clock; - - return clock; } -static inline unsigned long long rq_clock(struct rq *rq) +static void update_rq_clock(struct rq *rq) { - int this_cpu = smp_processor_id(); - - if (this_cpu == cpu_of(rq)) - return __rq_clock(rq); - - return rq->clock; + if (likely(smp_processor_id() == cpu_of(rq))) + __update_rq_clock(rq); } /* @@ -379,6 +381,25 @@ static inline unsigned long long rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +/* + * For kernel-internal use: high-speed (but slightly incorrect) per-cpu + * clock constructed from sched_clock(): + */ +unsigned long long cpu_clock(int cpu) +{ + unsigned long long now; + unsigned long flags; + struct rq *rq; + + local_irq_save(flags); + rq = cpu_rq(cpu); + update_rq_clock(rq); + now = rq->clock; + local_irq_restore(flags); + + return now; +} + #ifdef CONFIG_FAIR_GROUP_SCHED /* Change a task's ->cfs_rq if it moves across CPUs */ static inline void set_task_cfs_rq(struct task_struct *p) @@ -536,18 +557,40 @@ static inline struct rq *this_rq_lock(void) } /* - * CPU frequency is/was unstable - start new by setting prev_clock_raw: + * We are going deep-idle (irqs are disabled): */ -void sched_clock_unstable_event(void) +void sched_clock_idle_sleep_event(void) { - unsigned long flags; - struct rq *rq; + struct rq 
*rq = cpu_rq(smp_processor_id()); - rq = task_rq_lock(current, &flags); - rq->prev_clock_raw = sched_clock(); - rq->clock_unstable_events++; - task_rq_unlock(rq, &flags); + spin_lock(&rq->lock); + __update_rq_clock(rq); + spin_unlock(&rq->lock); + rq->clock_deep_idle_events++; } +EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); + +/* + * We just idled delta nanoseconds (called with irqs disabled): + */ +void sched_clock_idle_wakeup_event(u64 delta_ns) +{ + struct rq *rq = cpu_rq(smp_processor_id()); + u64 now = sched_clock(); + + rq->idle_clock += delta_ns; + /* + * Override the previous timestamp and ignore all + * sched_clock() deltas that occured while we idled, + * and use the PM-provided delta_ns to advance the + * rq clock: + */ + spin_lock(&rq->lock); + rq->prev_clock_raw = now; + rq->clock += delta_ns; + spin_unlock(&rq->lock); +} +EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); /* * resched_task - mark a task 'to be rescheduled now'. @@ -622,27 +665,31 @@ static u64 div64_likely32(u64 divident, unsigned long divisor) #define WMULT_SHIFT 32 -static inline unsigned long +/* + * Shift right and round: + */ +#define RSR(x, y) (((x) + (1UL << ((y) - 1))) >> (y)) + +static unsigned long calc_delta_mine(unsigned long delta_exec, unsigned long weight, struct load_weight *lw) { u64 tmp; if (unlikely(!lw->inv_weight)) - lw->inv_weight = WMULT_CONST / lw->weight; + lw->inv_weight = (WMULT_CONST - lw->weight/2) / lw->weight + 1; tmp = (u64)delta_exec * weight; /* * Check whether we'd overflow the 64-bit multiplication: */ - if (unlikely(tmp > WMULT_CONST)) { - tmp = ((tmp >> WMULT_SHIFT/2) * lw->inv_weight) - >> (WMULT_SHIFT/2); - } else { - tmp = (tmp * lw->inv_weight) >> WMULT_SHIFT; - } + if (unlikely(tmp > WMULT_CONST)) + tmp = RSR(RSR(tmp, WMULT_SHIFT/2) * lw->inv_weight, + WMULT_SHIFT/2); + else + tmp = RSR(tmp * lw->inv_weight, WMULT_SHIFT); - return (unsigned long)min(tmp, (u64)sysctl_sched_runtime_limit); + return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX); } static inline unsigned long @@ -663,46 +710,6 @@ static void update_load_sub(struct load_weight *lw, unsigned long dec) lw->inv_weight = 0; } -static void __update_curr_load(struct rq *rq, struct load_stat *ls) -{ - if (rq->curr != rq->idle && ls->load.weight) { - ls->delta_exec += ls->delta_stat; - ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); - ls->delta_stat = 0; - } -} - -/* - * Update delta_exec, delta_fair fields for rq. - * - * delta_fair clock advances at a rate inversely proportional to - * total load (rq->ls.load.weight) on the runqueue, while - * delta_exec advances at the same rate as wall-clock (provided - * cpu is not idle). - * - * delta_exec / delta_fair is a measure of the (smoothened) load on this - * runqueue over any given interval. This (smoothened) load is used - * during load balance. - * - * This function is called /before/ updating rq->ls.load - * and when switching tasks. - */ -static void update_curr_load(struct rq *rq, u64 now) -{ - struct load_stat *ls = &rq->ls; - u64 start; - - start = ls->load_update_start; - ls->load_update_start = now; - ls->delta_stat += now - start; - /* - * Stagger updates to ls->delta_fair. Very frequent updates - * can be expensive. 
- */ - if (ls->delta_stat >= sysctl_sched_stat_granularity) - __update_curr_load(rq, ls); -} - /* * To aid in avoiding the subversion of "niceness" due to uneven distribution * of tasks with abnormal "nice" values across CPUs the contribution that @@ -712,19 +719,6 @@ static void update_curr_load(struct rq *rq, u64 now) * slice expiry etc. */ -/* - * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE - * If static_prio_timeslice() is ever changed to break this assumption then - * this code will need modification - */ -#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE -#define load_weight(lp) \ - (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) -#define PRIO_TO_LOAD_WEIGHT(prio) \ - load_weight(static_prio_timeslice(prio)) -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + load_weight(rp)) - #define WEIGHT_IDLEPRIO 2 #define WMULT_IDLEPRIO (1 << 31) @@ -736,53 +730,39 @@ static void update_curr_load(struct rq *rq, u64 now) * * The "10% effect" is relative and cumulative: from _any_ nice level, * if you go up 1 level, it's -10% CPU usage, if you go down 1 level - * it's +10% CPU usage. + * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25. + * If a task goes up by ~10% and another task goes down by ~10% then + * the relative distance between them is ~25%.) */ static const int prio_to_weight[40] = { -/* -20 */ 88818, 71054, 56843, 45475, 36380, 29104, 23283, 18626, 14901, 11921, -/* -10 */ 9537, 7629, 6103, 4883, 3906, 3125, 2500, 2000, 1600, 1280, -/* 0 */ NICE_0_LOAD /* 1024 */, -/* 1 */ 819, 655, 524, 419, 336, 268, 215, 172, 137, -/* 10 */ 110, 87, 70, 56, 45, 36, 29, 23, 18, 15, + /* -20 */ 88761, 71755, 56483, 46273, 36291, + /* -15 */ 29154, 23254, 18705, 14949, 11916, + /* -10 */ 9548, 7620, 6100, 4904, 3906, + /* -5 */ 3121, 2501, 1991, 1586, 1277, + /* 0 */ 1024, 820, 655, 526, 423, + /* 5 */ 335, 272, 215, 172, 137, + /* 10 */ 110, 87, 70, 56, 45, + /* 15 */ 36, 29, 23, 18, 15, }; +/* + * Inverse (2^32/x) values of the prio_to_weight[] array, precalculated. 
+ * + * In cases where the weight does not change often, we can use the + * precalculated inverse to speed up arithmetics by turning divisions + * into multiplications: + */ static const u32 prio_to_wmult[40] = { - 48356, 60446, 75558, 94446, 118058, 147573, - 184467, 230589, 288233, 360285, 450347, - 562979, 703746, 879575, 1099582, 1374389, - 717986, 2147483, 2684354, 3355443, 4194304, - 5244160, 6557201, 8196502, 10250518, 12782640, - 16025997, 19976592, 24970740, 31350126, 39045157, - 49367440, 61356675, 76695844, 95443717, 119304647, - 148102320, 186737708, 238609294, 286331153, + /* -20 */ 48388, 59856, 76040, 92818, 118348, + /* -15 */ 147320, 184698, 229616, 287308, 360437, + /* -10 */ 449829, 563644, 704093, 875809, 1099582, + /* -5 */ 1376151, 1717300, 2157191, 2708050, 3363326, + /* 0 */ 4194304, 5237765, 6557202, 8165337, 10153587, + /* 5 */ 12820798, 15790321, 19976592, 24970740, 31350126, + /* 10 */ 39045157, 49367440, 61356676, 76695844, 95443717, + /* 15 */ 119304647, 148102320, 186737708, 238609294, 286331153, }; -static inline void -inc_load(struct rq *rq, const struct task_struct *p, u64 now) -{ - update_curr_load(rq, now); - update_load_add(&rq->ls.load, p->se.load.weight); -} - -static inline void -dec_load(struct rq *rq, const struct task_struct *p, u64 now) -{ - update_curr_load(rq, now); - update_load_sub(&rq->ls.load, p->se.load.weight); -} - -static inline void inc_nr_running(struct task_struct *p, struct rq *rq, u64 now) -{ - rq->nr_running++; - inc_load(rq, p, now); -} - -static inline void dec_nr_running(struct task_struct *p, struct rq *rq, u64 now) -{ - rq->nr_running--; - dec_load(rq, p, now); -} - static void activate_task(struct rq *rq, struct task_struct *p, int wakeup); /* @@ -800,8 +780,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator); + int *this_best_prio, struct rq_iterator *iterator); #include "sched_stats.h" #include "sched_rt.c" @@ -813,6 +792,70 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, #define sched_class_highest (&rt_sched_class) +static void __update_curr_load(struct rq *rq, struct load_stat *ls) +{ + if (rq->curr != rq->idle && ls->load.weight) { + ls->delta_exec += ls->delta_stat; + ls->delta_fair += calc_delta_fair(ls->delta_stat, &ls->load); + ls->delta_stat = 0; + } +} + +/* + * Update delta_exec, delta_fair fields for rq. + * + * delta_fair clock advances at a rate inversely proportional to + * total load (rq->ls.load.weight) on the runqueue, while + * delta_exec advances at the same rate as wall-clock (provided + * cpu is not idle). + * + * delta_exec / delta_fair is a measure of the (smoothened) load on this + * runqueue over any given interval. This (smoothened) load is used + * during load balance. + * + * This function is called /before/ updating rq->ls.load + * and when switching tasks. + */ +static void update_curr_load(struct rq *rq) +{ + struct load_stat *ls = &rq->ls; + u64 start; + + start = ls->load_update_start; + ls->load_update_start = rq->clock; + ls->delta_stat += rq->clock - start; + /* + * Stagger updates to ls->delta_fair. Very frequent updates + * can be expensive. 
+ */ + if (ls->delta_stat >= sysctl_sched_stat_granularity) + __update_curr_load(rq, ls); +} + +static inline void inc_load(struct rq *rq, const struct task_struct *p) +{ + update_curr_load(rq); + update_load_add(&rq->ls.load, p->se.load.weight); +} + +static inline void dec_load(struct rq *rq, const struct task_struct *p) +{ + update_curr_load(rq); + update_load_sub(&rq->ls.load, p->se.load.weight); +} + +static void inc_nr_running(struct task_struct *p, struct rq *rq) +{ + rq->nr_running++; + inc_load(rq, p); +} + +static void dec_nr_running(struct task_struct *p, struct rq *rq) +{ + rq->nr_running--; + dec_load(rq, p); +} + static void set_load_weight(struct task_struct *p) { task_rq(p)->cfs.wait_runtime -= p->se.wait_runtime; @@ -837,18 +880,16 @@ static void set_load_weight(struct task_struct *p) p->se.load.inv_weight = prio_to_wmult[p->static_prio - MAX_RT_PRIO]; } -static void -enqueue_task(struct rq *rq, struct task_struct *p, int wakeup, u64 now) +static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup) { sched_info_queued(p); - p->sched_class->enqueue_task(rq, p, wakeup, now); + p->sched_class->enqueue_task(rq, p, wakeup); p->se.on_rq = 1; } -static void -dequeue_task(struct rq *rq, struct task_struct *p, int sleep, u64 now) +static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep) { - p->sched_class->dequeue_task(rq, p, sleep, now); + p->sched_class->dequeue_task(rq, p, sleep); p->se.on_rq = 0; } @@ -903,13 +944,11 @@ static int effective_prio(struct task_struct *p) */ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) { - u64 now = rq_clock(rq); - if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - enqueue_task(rq, p, wakeup, now); - inc_nr_running(p, rq, now); + enqueue_task(rq, p, wakeup); + inc_nr_running(p, rq); } /* @@ -917,13 +956,13 @@ static void activate_task(struct rq *rq, struct task_struct *p, int wakeup) */ static inline void activate_idle_task(struct task_struct *p, struct rq *rq) { - u64 now = rq_clock(rq); + update_rq_clock(rq); if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible--; - enqueue_task(rq, p, 0, now); - inc_nr_running(p, rq, now); + enqueue_task(rq, p, 0); + inc_nr_running(p, rq); } /* @@ -931,13 +970,11 @@ static inline void activate_idle_task(struct task_struct *p, struct rq *rq) */ static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep) { - u64 now = rq_clock(rq); - if (p->state == TASK_UNINTERRUPTIBLE) rq->nr_uninterruptible++; - dequeue_task(rq, p, sleep, now); - dec_nr_running(p, rq, now); + dequeue_task(rq, p, sleep); + dec_nr_running(p, rq); } /** @@ -972,18 +1009,21 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) u64 clock_offset, fair_clock_offset; clock_offset = old_rq->clock - new_rq->clock; - fair_clock_offset = old_rq->cfs.fair_clock - - new_rq->cfs.fair_clock; - if (p->se.wait_start) - p->se.wait_start -= clock_offset; + fair_clock_offset = old_rq->cfs.fair_clock - new_rq->cfs.fair_clock; + if (p->se.wait_start_fair) p->se.wait_start_fair -= fair_clock_offset; + if (p->se.sleep_start_fair) + p->se.sleep_start_fair -= fair_clock_offset; + +#ifdef CONFIG_SCHEDSTATS + if (p->se.wait_start) + p->se.wait_start -= clock_offset; if (p->se.sleep_start) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; - if (p->se.sleep_start_fair) - p->se.sleep_start_fair -= fair_clock_offset; +#endif __set_task_cpu(p, new_cpu); } @@ -1502,6 +1542,7 @@ out_set_cpu: out_activate: #endif /* 
CONFIG_SMP */ + update_rq_clock(rq); activate_task(rq, p, 1); /* * Sync wakeups (i.e. those types of wakeups where the waker @@ -1544,17 +1585,20 @@ int fastcall wake_up_state(struct task_struct *p, unsigned int state) static void __sched_fork(struct task_struct *p) { p->se.wait_start_fair = 0; - p->se.wait_start = 0; p->se.exec_start = 0; p->se.sum_exec_runtime = 0; + p->se.prev_sum_exec_runtime = 0; p->se.delta_exec = 0; p->se.delta_fair_run = 0; p->se.delta_fair_sleep = 0; p->se.wait_runtime = 0; + p->se.sleep_start_fair = 0; + +#ifdef CONFIG_SCHEDSTATS + p->se.wait_start = 0; p->se.sum_wait_runtime = 0; p->se.sum_sleep_runtime = 0; p->se.sleep_start = 0; - p->se.sleep_start_fair = 0; p->se.block_start = 0; p->se.sleep_max = 0; p->se.block_max = 0; @@ -1562,10 +1606,15 @@ static void __sched_fork(struct task_struct *p) p->se.wait_max = 0; p->se.wait_runtime_overruns = 0; p->se.wait_runtime_underruns = 0; +#endif INIT_LIST_HEAD(&p->run_list); p->se.on_rq = 0; +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&p->preempt_notifiers); +#endif + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1630,11 +1679,14 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); this_cpu = smp_processor_id(); /* parent's CPU */ + update_rq_clock(rq); p->prio = effective_prio(p); - if (!sysctl_sched_child_runs_first || (clone_flags & CLONE_VM) || - task_cpu(p) != this_cpu || !current->se.on_rq) { + if (!p->sched_class->task_new || !sysctl_sched_child_runs_first || + (clone_flags & CLONE_VM) || task_cpu(p) != this_cpu || + !current->se.on_rq) { + activate_task(rq, p, 0); } else { /* @@ -1642,14 +1694,74 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * management (if any): */ p->sched_class->task_new(rq, p); + inc_nr_running(p, rq); } check_preempt_curr(rq, p); task_rq_unlock(rq, &flags); } +#ifdef CONFIG_PREEMPT_NOTIFIERS + +/** + * preempt_notifier_register - tell me when current is being being preempted & rescheduled + * @notifier: notifier struct to register + */ +void preempt_notifier_register(struct preempt_notifier *notifier) +{ + hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); +} +EXPORT_SYMBOL_GPL(preempt_notifier_register); + +/** + * preempt_notifier_unregister - no longer interested in preemption notifications + * @notifier: notifier struct to unregister + * + * This is safe to call from within a preemption notifier. 
+ */ +void preempt_notifier_unregister(struct preempt_notifier *notifier) +{ + hlist_del(¬ifier->link); +} +EXPORT_SYMBOL_GPL(preempt_notifier_unregister); + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_in(notifier, raw_smp_processor_id()); +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ + struct preempt_notifier *notifier; + struct hlist_node *node; + + hlist_for_each_entry(notifier, node, &curr->preempt_notifiers, link) + notifier->ops->sched_out(notifier, next); +} + +#else + +static void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ +} + +#endif + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch + * @prev: the current task that is being switched out * @next: the task we are going to switch to. * * This is called with the rq lock held and interrupts off. It must @@ -1659,8 +1771,11 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) * prepare_task_switch sets up locking and calls architecture specific * hooks. */ -static inline void prepare_task_switch(struct rq *rq, struct task_struct *next) +static inline void +prepare_task_switch(struct rq *rq, struct task_struct *prev, + struct task_struct *next) { + fire_sched_out_preempt_notifiers(prev, next); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -1702,6 +1817,7 @@ static inline void finish_task_switch(struct rq *rq, struct task_struct *prev) prev_state = prev->state; finish_arch_switch(prev); finish_lock_switch(rq, prev); + fire_sched_in_preempt_notifiers(current); if (mm) mmdrop(mm); if (unlikely(prev_state == TASK_DEAD)) { @@ -1742,7 +1858,7 @@ context_switch(struct rq *rq, struct task_struct *prev, { struct mm_struct *mm, *oldmm; - prepare_task_switch(rq, next); + prepare_task_switch(rq, prev, next); mm = next->mm; oldmm = prev->active_mm; /* @@ -1865,7 +1981,6 @@ static void update_cpu_load(struct rq *this_rq) unsigned long total_load = this_rq->ls.load.weight; unsigned long this_load = total_load; struct load_stat *ls = &this_rq->ls; - u64 now = __rq_clock(this_rq); int i, scale; this_rq->nr_load_updates++; @@ -1873,7 +1988,7 @@ static void update_cpu_load(struct rq *this_rq) goto do_avg; /* Update delta_fair/delta_exec fields first */ - update_curr_load(this_rq, now); + update_curr_load(this_rq); fair_delta64 = ls->delta_fair + 1; ls->delta_fair = 0; @@ -1881,8 +1996,8 @@ static void update_cpu_load(struct rq *this_rq) exec_delta64 = ls->delta_exec + 1; ls->delta_exec = 0; - sample_interval64 = now - ls->load_update_last; - ls->load_update_last = now; + sample_interval64 = this_rq->clock - ls->load_update_last; + ls->load_update_last = this_rq->clock; if ((s64)sample_interval64 < (s64)TICK_NSEC) sample_interval64 = TICK_NSEC; @@ -1937,6 +2052,8 @@ static void double_rq_lock(struct rq *rq1, struct rq *rq2) spin_lock(&rq1->lock); } } + update_rq_clock(rq1); + update_rq_clock(rq2); } /* @@ -2064,12 +2181,6 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, if (task_running(rq, p)) return 0; - /* - * Aggressive migration if too many balance attempts have failed: - */ - if (sd->nr_balance_failed > sd->cache_nice_tries) - return 1; - return 1; } @@ -2077,8 
+2188,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator) + int *this_best_prio, struct rq_iterator *iterator) { int pulled = 0, pinned = 0, skip_for_load; struct task_struct *p; @@ -2103,12 +2213,8 @@ next: */ skip_for_load = (p->se.load.weight >> 1) > rem_load_move + SCHED_LOAD_SCALE_FUZZ; - if (skip_for_load && p->prio < this_best_prio) - skip_for_load = !best_prio_seen && p->prio == best_prio; - if (skip_for_load || + if ((skip_for_load && p->prio >= *this_best_prio) || !can_migrate_task(p, busiest, this_cpu, sd, idle, &pinned)) { - - best_prio_seen |= p->prio == best_prio; p = iterator->next(iterator->arg); goto next; } @@ -2122,8 +2228,8 @@ next: * and the prescribed amount of weighted load. */ if (pulled < max_nr_move && rem_load_move > 0) { - if (p->prio < this_best_prio) - this_best_prio = p->prio; + if (p->prio < *this_best_prio) + *this_best_prio = p->prio; p = iterator->next(iterator->arg); goto next; } @@ -2142,32 +2248,52 @@ out: } /* - * move_tasks tries to move up to max_nr_move tasks and max_load_move weighted - * load from busiest to this_rq, as part of a balancing operation within - * "domain". Returns the number of tasks moved. + * move_tasks tries to move up to max_load_move weighted load from busiest to + * this_rq, as part of a balancing operation within domain "sd". + * Returns 1 if successful and 0 otherwise. * * Called with both runqueues locked. */ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, - unsigned long max_nr_move, unsigned long max_load_move, + unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { struct sched_class *class = sched_class_highest; - unsigned long load_moved, total_nr_moved = 0, nr_moved; - long rem_load_move = max_load_move; + unsigned long total_load_moved = 0; + int this_best_prio = this_rq->curr->prio; do { - nr_moved = class->load_balance(this_rq, this_cpu, busiest, - max_nr_move, (unsigned long)rem_load_move, - sd, idle, all_pinned, &load_moved); - total_nr_moved += nr_moved; - max_nr_move -= nr_moved; - rem_load_move -= load_moved; + total_load_moved += + class->load_balance(this_rq, this_cpu, busiest, + ULONG_MAX, max_load_move - total_load_moved, + sd, idle, all_pinned, &this_best_prio); class = class->next; - } while (class && max_nr_move && rem_load_move > 0); + } while (class && max_load_move > total_load_moved); - return total_nr_moved; + return total_load_moved > 0; +} + +/* + * move_one_task tries to move exactly one task from busiest to this_rq, as + * part of active balancing operations within "domain". + * Returns 1 if successful and 0 otherwise. + * + * Called with both runqueues locked. 
+ */ +static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest, + struct sched_domain *sd, enum cpu_idle_type idle) +{ + struct sched_class *class; + int this_best_prio = MAX_PRIO; + + for (class = sched_class_highest; class; class = class->next) + if (class->load_balance(this_rq, this_cpu, busiest, + 1, ULONG_MAX, sd, idle, NULL, + &this_best_prio)) + return 1; + + return 0; } /* @@ -2226,7 +2352,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, rq = cpu_rq(i); - if (*sd_idle && !idle_cpu(i)) + if (*sd_idle && rq->nr_running) *sd_idle = 0; /* Bias balancing toward cpus of our domain */ @@ -2248,9 +2374,11 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, /* * First idle cpu or the first cpu(busiest) in this sched group * is eligible for doing load balancing at this and above - * domains. + * domains. In the newly idle case, we will allow all the cpu's + * to do the newly idle load balance. */ - if (local_group && balance_cpu != this_cpu && balance) { + if (idle != CPU_NEWLY_IDLE && local_group && + balance_cpu != this_cpu && balance) { *balance = 0; goto ret; } @@ -2384,7 +2512,7 @@ group_next: * a think about bumping its value to force at least one task to be * moved */ - if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task/2) { + if (*imbalance + SCHED_LOAD_SCALE_FUZZ < busiest_load_per_task) { unsigned long tmp, pwr_now, pwr_move; unsigned int imbn; @@ -2497,11 +2625,6 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, */ #define MAX_PINNED_INTERVAL 512 -static inline unsigned long minus_1_or_zero(unsigned long n) -{ - return n > 0 ? n - 1 : 0; -} - /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2510,7 +2633,7 @@ static int load_balance(int this_cpu, struct rq *this_rq, struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { - int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; + int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; struct sched_group *group; unsigned long imbalance; struct rq *busiest; @@ -2551,18 +2674,17 @@ redo: schedstat_add(sd, lb_imbalance[idle], imbalance); - nr_moved = 0; + ld_moved = 0; if (busiest->nr_running > 1) { /* * Attempt to move tasks. If find_busiest_group has found * an imbalance but busiest->nr_running <= 1, the group is - * still unbalanced. nr_moved simply stays zero, so it is + * still unbalanced. ld_moved simply stays zero, so it is * correctly treated as an imbalance. */ local_irq_save(flags); double_rq_lock(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - minus_1_or_zero(busiest->nr_running), + ld_moved = move_tasks(this_rq, this_cpu, busiest, imbalance, sd, idle, &all_pinned); double_rq_unlock(this_rq, busiest); local_irq_restore(flags); @@ -2570,7 +2692,7 @@ redo: /* * some other cpu did the load balance for us. 
*/ - if (nr_moved && this_cpu != smp_processor_id()) + if (ld_moved && this_cpu != smp_processor_id()) resched_cpu(this_cpu); /* All tasks on this runqueue were pinned by CPU affinity */ @@ -2582,7 +2704,7 @@ redo: } } - if (!nr_moved) { + if (!ld_moved) { schedstat_inc(sd, lb_failed[idle]); sd->nr_balance_failed++; @@ -2631,10 +2753,10 @@ redo: sd->balance_interval *= 2; } - if (!nr_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && + if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; - return nr_moved; + return ld_moved; out_balanced: schedstat_inc(sd, lb_balanced[idle]); @@ -2666,8 +2788,9 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) struct sched_group *group; struct rq *busiest = NULL; unsigned long imbalance; - int nr_moved = 0; + int ld_moved = 0; int sd_idle = 0; + int all_pinned = 0; cpumask_t cpus = CPU_MASK_ALL; /* @@ -2700,23 +2823,25 @@ redo: schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); - nr_moved = 0; + ld_moved = 0; if (busiest->nr_running > 1) { /* Attempt to move tasks */ double_lock_balance(this_rq, busiest); - nr_moved = move_tasks(this_rq, this_cpu, busiest, - minus_1_or_zero(busiest->nr_running), - imbalance, sd, CPU_NEWLY_IDLE, NULL); + /* this_rq->clock is already updated */ + update_rq_clock(busiest); + ld_moved = move_tasks(this_rq, this_cpu, busiest, + imbalance, sd, CPU_NEWLY_IDLE, + &all_pinned); spin_unlock(&busiest->lock); - if (!nr_moved) { + if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), cpus); if (!cpus_empty(cpus)) goto redo; } } - if (!nr_moved) { + if (!ld_moved) { schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) @@ -2724,7 +2849,7 @@ redo: } else sd->nr_balance_failed = 0; - return nr_moved; + return ld_moved; out_balanced: schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); @@ -2801,6 +2926,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) /* move a task from busiest_rq to target_rq */ double_lock_balance(busiest_rq, target_rq); + update_rq_clock(busiest_rq); + update_rq_clock(target_rq); /* Search for an sd spanning us and the target CPU. */ for_each_domain(target_cpu, sd) { @@ -2812,9 +2939,8 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) if (likely(sd)) { schedstat_inc(sd, alb_cnt); - if (move_tasks(target_rq, target_cpu, busiest_rq, 1, - RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE, - NULL)) + if (move_one_task(target_rq, target_cpu, busiest_rq, + sd, CPU_IDLE)) schedstat_inc(sd, alb_pushed); else schedstat_inc(sd, alb_failed); @@ -2912,6 +3038,7 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) struct sched_domain *sd; /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; + int update_next_balance = 0; for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) @@ -2948,8 +3075,10 @@ static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) if (sd->flags & SD_SERIALIZE) spin_unlock(&balancing); out: - if (time_after(next_balance, sd->last_balance + interval)) + if (time_after(next_balance, sd->last_balance + interval)) { next_balance = sd->last_balance + interval; + update_next_balance = 1; + } /* * Stop the load balance at this level. 
There is another @@ -2959,7 +3088,14 @@ out: if (!balance) break; } - rq->next_balance = next_balance; + + /* + * next_balance will be updated only when there is a need. + * When the cpu is attached to null domain for ex, it will not be + * updated. + */ + if (likely(update_next_balance)) + rq->next_balance = next_balance; } /* @@ -2998,7 +3134,7 @@ static void run_rebalance_domains(struct softirq_action *h) if (need_resched()) break; - rebalance_domains(balance_cpu, SCHED_IDLE); + rebalance_domains(balance_cpu, CPU_IDLE); rq = cpu_rq(balance_cpu); if (time_after(this_rq->next_balance, rq->next_balance)) @@ -3083,8 +3219,7 @@ static int balance_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, unsigned long *load_moved, - int this_best_prio, int best_prio, int best_prio_seen, - struct rq_iterator *iterator) + int *this_best_prio, struct rq_iterator *iterator) { *load_moved = 0; @@ -3110,7 +3245,8 @@ unsigned long long task_sched_runtime(struct task_struct *p) rq = task_rq_lock(p, &flags); ns = p->se.sum_exec_runtime; if (rq->curr == p) { - delta_exec = rq_clock(rq) - p->se.exec_start; + update_rq_clock(rq); + delta_exec = rq->clock - p->se.exec_start; if ((s64)delta_exec > 0) ns += delta_exec; } @@ -3204,11 +3340,19 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; + u64 next_tick = rq->tick_timestamp + TICK_NSEC; spin_lock(&rq->lock); + __update_rq_clock(rq); + /* + * Let rq->clock advance by at least TICK_NSEC: + */ + if (unlikely(rq->clock < next_tick)) + rq->clock = next_tick; + rq->tick_timestamp = rq->clock; + update_cpu_load(rq); if (curr != rq->idle) /* FIXME: needed? 
*/ curr->sched_class->task_tick(rq, curr); - update_cpu_load(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -3290,7 +3434,7 @@ static inline void schedule_debug(struct task_struct *prev) * Pick up the highest-prio task: */ static inline struct task_struct * -pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) +pick_next_task(struct rq *rq, struct task_struct *prev) { struct sched_class *class; struct task_struct *p; @@ -3300,14 +3444,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, u64 now) * the fair class we can call that function directly: */ if (likely(rq->nr_running == rq->cfs.nr_running)) { - p = fair_sched_class.pick_next_task(rq, now); + p = fair_sched_class.pick_next_task(rq); if (likely(p)) return p; } class = sched_class_highest; for ( ; ; ) { - p = class->pick_next_task(rq, now); + p = class->pick_next_task(rq); if (p) return p; /* @@ -3326,7 +3470,6 @@ asmlinkage void __sched schedule(void) struct task_struct *prev, *next; long *switch_count; struct rq *rq; - u64 now; int cpu; need_resched: @@ -3344,6 +3487,7 @@ need_resched_nonpreemptible: spin_lock_irq(&rq->lock); clear_tsk_need_resched(prev); + __update_rq_clock(rq); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { if (unlikely((prev->state & TASK_INTERRUPTIBLE) && @@ -3358,9 +3502,8 @@ need_resched_nonpreemptible: if (unlikely(!rq->nr_running)) idle_balance(cpu, rq); - now = __rq_clock(rq); - prev->sched_class->put_prev_task(rq, prev, now); - next = pick_next_task(rq, prev, now); + prev->sched_class->put_prev_task(rq, prev); + next = pick_next_task(rq, prev); sched_info_switch(prev, next); @@ -3803,17 +3946,16 @@ void rt_mutex_setprio(struct task_struct *p, int prio) unsigned long flags; int oldprio, on_rq; struct rq *rq; - u64 now; BUG_ON(prio < 0 || prio > MAX_PRIO); rq = task_rq_lock(p, &flags); - now = rq_clock(rq); + update_rq_clock(rq); oldprio = p->prio; on_rq = p->se.on_rq; if (on_rq) - dequeue_task(rq, p, 0, now); + dequeue_task(rq, p, 0); if (rt_prio(prio)) p->sched_class = &rt_sched_class; @@ -3823,7 +3965,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) p->prio = prio; if (on_rq) { - enqueue_task(rq, p, 0, now); + enqueue_task(rq, p, 0); /* * Reschedule if we are currently running on this runqueue and * our priority decreased, or if we are not currently running on @@ -3846,7 +3988,6 @@ void set_user_nice(struct task_struct *p, long nice) int old_prio, delta, on_rq; unsigned long flags; struct rq *rq; - u64 now; if (TASK_NICE(p) == nice || nice < -20 || nice > 19) return; @@ -3855,7 +3996,7 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. 
*/ rq = task_rq_lock(p, &flags); - now = rq_clock(rq); + update_rq_clock(rq); /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -3868,8 +4009,8 @@ void set_user_nice(struct task_struct *p, long nice) } on_rq = p->se.on_rq; if (on_rq) { - dequeue_task(rq, p, 0, now); - dec_load(rq, p, now); + dequeue_task(rq, p, 0); + dec_load(rq, p); } p->static_prio = NICE_TO_PRIO(nice); @@ -3879,8 +4020,8 @@ void set_user_nice(struct task_struct *p, long nice) delta = p->prio - old_prio; if (on_rq) { - enqueue_task(rq, p, 0, now); - inc_load(rq, p, now); + enqueue_task(rq, p, 0); + inc_load(rq, p); /* * If the task increased its priority or is running and * lowered its priority, then reschedule its CPU: @@ -4116,6 +4257,7 @@ recheck: spin_unlock_irqrestore(&p->pi_lock, flags); goto recheck; } + update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) deactivate_task(rq, p, 0); @@ -4371,10 +4513,8 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) out_unlock: read_unlock(&tasklist_lock); mutex_unlock(&sched_hotcpu_mutex); - if (retval) - return retval; - return 0; + return retval; } /** @@ -4772,14 +4912,18 @@ cpumask_t nohz_cpu_mask = CPU_MASK_NONE; static inline void sched_init_granularity(void) { unsigned int factor = 1 + ilog2(num_online_cpus()); - const unsigned long gran_limit = 100000000; + const unsigned long limit = 100000000; - sysctl_sched_granularity *= factor; - if (sysctl_sched_granularity > gran_limit) - sysctl_sched_granularity = gran_limit; + sysctl_sched_min_granularity *= factor; + if (sysctl_sched_min_granularity > limit) + sysctl_sched_min_granularity = limit; - sysctl_sched_runtime_limit = sysctl_sched_granularity * 4; - sysctl_sched_wakeup_granularity = sysctl_sched_granularity / 2; + sysctl_sched_latency *= factor; + if (sysctl_sched_latency > limit) + sysctl_sched_latency = limit; + + sysctl_sched_runtime_limit = sysctl_sched_latency; + sysctl_sched_wakeup_granularity = sysctl_sched_min_granularity / 2; } #ifdef CONFIG_SMP @@ -4874,6 +5018,7 @@ static int __migrate_task(struct task_struct *p, int src_cpu, int dest_cpu) on_rq = p->se.on_rq; if (on_rq) deactivate_task(rq_src, p, 0); + set_task_cpu(p, dest_cpu); if (on_rq) { activate_task(rq_dest, p, 0); @@ -4903,8 +5048,6 @@ static int migration_thread(void *data) struct migration_req *req; struct list_head *head; - try_to_freeze(); - spin_lock_irq(&rq->lock); if (cpu_is_offline(cpu)) { @@ -5108,14 +5251,137 @@ static void migrate_dead_tasks(unsigned int dead_cpu) for ( ; ; ) { if (!rq->nr_running) break; - next = pick_next_task(rq, rq->curr, rq_clock(rq)); + update_rq_clock(rq); + next = pick_next_task(rq, rq->curr); if (!next) break; migrate_dead(dead_cpu, next); + } } #endif /* CONFIG_HOTPLUG_CPU */ +#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) + +static struct ctl_table sd_ctl_dir[] = { + { + .procname = "sched_domain", + .mode = 0555, + }, + {0,}, +}; + +static struct ctl_table sd_ctl_root[] = { + { + .ctl_name = CTL_KERN, + .procname = "kernel", + .mode = 0555, + .child = sd_ctl_dir, + }, + {0,}, +}; + +static struct ctl_table *sd_alloc_ctl_entry(int n) +{ + struct ctl_table *entry = + kmalloc(n * sizeof(struct ctl_table), GFP_KERNEL); + + BUG_ON(!entry); + memset(entry, 0, n * sizeof(struct ctl_table)); + + return entry; +} + +static void +set_table_entry(struct ctl_table *entry, + const char *procname, void *data, int maxlen, + mode_t mode, proc_handler *proc_handler) +{ + entry->procname = procname; + entry->data = data; + 
entry->maxlen = maxlen; + entry->mode = mode; + entry->proc_handler = proc_handler; +} + +static struct ctl_table * +sd_alloc_ctl_domain_table(struct sched_domain *sd) +{ + struct ctl_table *table = sd_alloc_ctl_entry(14); + + set_table_entry(&table[0], "min_interval", &sd->min_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[1], "max_interval", &sd->max_interval, + sizeof(long), 0644, proc_doulongvec_minmax); + set_table_entry(&table[2], "busy_idx", &sd->busy_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[3], "idle_idx", &sd->idle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[5], "wake_idx", &sd->wake_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[7], "busy_factor", &sd->busy_factor, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[10], "cache_nice_tries", + &sd->cache_nice_tries, + sizeof(int), 0644, proc_dointvec_minmax); + set_table_entry(&table[12], "flags", &sd->flags, + sizeof(int), 0644, proc_dointvec_minmax); + + return table; +} + +static ctl_table *sd_alloc_ctl_cpu_table(int cpu) +{ + struct ctl_table *entry, *table; + struct sched_domain *sd; + int domain_num = 0, i; + char buf[32]; + + for_each_domain(cpu, sd) + domain_num++; + entry = table = sd_alloc_ctl_entry(domain_num + 1); + + i = 0; + for_each_domain(cpu, sd) { + snprintf(buf, 32, "domain%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_domain_table(sd); + entry++; + i++; + } + return table; +} + +static struct ctl_table_header *sd_sysctl_header; +static void init_sched_domain_sysctl(void) +{ + int i, cpu_num = num_online_cpus(); + struct ctl_table *entry = sd_alloc_ctl_entry(cpu_num + 1); + char buf[32]; + + sd_ctl_dir[0].child = entry; + + for (i = 0; i < cpu_num; i++, entry++) { + snprintf(buf, 32, "cpu%d", i); + entry->procname = kstrdup(buf, GFP_KERNEL); + entry->mode = 0555; + entry->child = sd_alloc_ctl_cpu_table(i); + } + sd_sysctl_header = register_sysctl_table(sd_ctl_root); +} +#else +static void init_sched_domain_sysctl(void) +{ +} +#endif + /* * migration_call - callback that gets triggered when a CPU is added. * Here we can start up the necessary migration thread for the new CPU. @@ -5138,7 +5404,6 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) p = kthread_create(migration_thread, hcpu, "migration/%d", cpu); if (IS_ERR(p)) return NOTIFY_BAD; - p->flags |= PF_NOFREEZE; kthread_bind(p, cpu); /* Must be high prio: stop_machine expects to yield to it. 
*/ rq = task_rq_lock(p, &flags); @@ -5173,6 +5438,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) rq->migration_thread = NULL; /* Idle task back to normal (off runqueue, low prio) */ rq = task_rq_lock(rq->idle, &flags); + update_rq_clock(rq); deactivate_task(rq, rq->idle, 0); rq->idle->static_prio = MAX_PRIO; __setscheduler(rq, rq->idle, SCHED_NORMAL, 0); @@ -6095,7 +6361,7 @@ int partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) -int arch_reinit_sched_domains(void) +static int arch_reinit_sched_domains(void) { int err; @@ -6124,24 +6390,6 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt) return ret ? ret : count; } -int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) -{ - int err = 0; - -#ifdef CONFIG_SCHED_SMT - if (smt_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_smt_power_savings.attr); -#endif -#ifdef CONFIG_SCHED_MC - if (!err && mc_capable()) - err = sysfs_create_file(&cls->kset.kobj, - &attr_sched_mc_power_savings.attr); -#endif - return err; -} -#endif - #ifdef CONFIG_SCHED_MC static ssize_t sched_mc_power_savings_show(struct sys_device *dev, char *page) { @@ -6152,8 +6400,8 @@ static ssize_t sched_mc_power_savings_store(struct sys_device *dev, { return sched_power_savings_store(buf, count, 0); } -SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, - sched_mc_power_savings_store); +static SYSDEV_ATTR(sched_mc_power_savings, 0644, sched_mc_power_savings_show, + sched_mc_power_savings_store); #endif #ifdef CONFIG_SCHED_SMT @@ -6166,8 +6414,26 @@ static ssize_t sched_smt_power_savings_store(struct sys_device *dev, { return sched_power_savings_store(buf, count, 1); } -SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, - sched_smt_power_savings_store); +static SYSDEV_ATTR(sched_smt_power_savings, 0644, sched_smt_power_savings_show, + sched_smt_power_savings_store); +#endif + +int sched_create_sysfs_power_savings_entries(struct sysdev_class *cls) +{ + int err = 0; + +#ifdef CONFIG_SCHED_SMT + if (smt_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_smt_power_savings.attr); +#endif +#ifdef CONFIG_SCHED_MC + if (!err && mc_capable()) + err = sysfs_create_file(&cls->kset.kobj, + &attr_sched_mc_power_savings.attr); +#endif + return err; +} #endif /* @@ -6222,6 +6488,8 @@ void __init sched_init_smp(void) /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + init_sched_domain_sysctl(); + /* Move init over to a non-isolated CPU */ if (set_cpus_allowed(current, non_isolated_cpus) < 0) BUG(); @@ -6308,6 +6576,10 @@ void __init sched_init(void) set_load_weight(&init_task); +#ifdef CONFIG_PREEMPT_NOTIFIERS + INIT_HLIST_HEAD(&init_task.preempt_notifiers); +#endif + #ifdef CONFIG_SMP nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); @@ -6373,12 +6645,14 @@ void normalize_rt_tasks(void) do_each_thread(g, p) { p->se.fair_key = 0; p->se.wait_runtime = 0; + p->se.exec_start = 0; p->se.wait_start_fair = 0; + p->se.sleep_start_fair = 0; +#ifdef CONFIG_SCHEDSTATS p->se.wait_start = 0; - p->se.exec_start = 0; p->se.sleep_start = 0; - p->se.sleep_start_fair = 0; p->se.block_start = 0; +#endif task_rq(p)->cfs.fair_clock = 0; task_rq(p)->clock = 0; @@ -6402,12 +6676,13 @@ void normalize_rt_tasks(void) goto out_unlock; #endif + update_rq_clock(rq); on_rq = p->se.on_rq; if 
(on_rq) - deactivate_task(task_rq(p), p, 0); + deactivate_task(rq, p, 0); __setscheduler(rq, p, SCHED_NORMAL, 0); if (on_rq) { - activate_task(task_rq(p), p, 0); + activate_task(rq, p, 0); resched_task(rq->curr); } #ifdef CONFIG_SMP
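
The clock handling rewritten above deliberately distrusts sched_clock(): in __update_rq_clock(), a backward warp only nudges rq->clock forward by one nanosecond, and a large forward jump is clamped to at most one tick past rq->tick_timestamp (which scheduler_tick() refreshes on every tick). The userspace model below illustrates those rules; it is a sketch that assumes a 1 ms tick, not kernel code.

/*
 * Userspace model of the __update_rq_clock() rules above; illustrative
 * sketch only. Backward sched_clock() warps are ignored, forward jumps
 * are clamped to one tick past the last scheduler-tick timestamp.
 */
#include <stdio.h>
#include <stdint.h>

#define TICK_NSEC 1000000ULL    /* 1 ms tick, an assumed value here */

struct model_rq {
    uint64_t prev_clock_raw;    /* last sched_clock() sample */
    uint64_t clock;             /* filtered per-runqueue clock */
    uint64_t tick_timestamp;    /* rq->clock at the last tick */
};

static void model_update_clock(struct model_rq *rq, uint64_t now)
{
    int64_t delta = (int64_t)(now - rq->prev_clock_raw);
    uint64_t clock = rq->clock;

    if (delta < 0) {
        clock++;                /* clock warp: barely advance */
    } else if (clock + delta > rq->tick_timestamp + TICK_NSEC) {
        if (clock < rq->tick_timestamp + TICK_NSEC)
            clock = rq->tick_timestamp + TICK_NSEC;
        else
            clock++;            /* overflow: clamp the jump */
    } else {
        clock += delta;         /* normal monotone advance */
    }
    rq->prev_clock_raw = now;
    rq->clock = clock;
}

int main(void)
{
    struct model_rq rq = { 0, 0, 0 };
    /* monotone samples, a backward warp, then a huge forward jump: */
    uint64_t samples[] = { 100, 200, 150, 5000000 };

    for (int i = 0; i < 4; i++) {
        model_update_clock(&rq, samples[i]);
        printf("raw=%llu -> rq->clock=%llu\n",
               (unsigned long long)samples[i],
               (unsigned long long)rq.clock);
    }
    return 0;
}

Fed those samples, the model advances normally, then by a single nanosecond on the warp, then clamps to tick_timestamp + TICK_NSEC on the jump; these are exactly the events the clock_warps and clock_overflows counters above account for.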
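
The prio_to_weight[]/prio_to_wmult[] comments above compress a fair amount of fixed-point arithmetic. The plain userspace sketch below (WMULT_CONST is assumed to be 2^32, and the overflow guard for very large products is omitted) reproduces the rounded-inverse setup and the RSR() path of calc_delta_mine() and checks them against two table entries.

/*
 * Userspace sanity check, not kernel code: prio_to_wmult[] stores
 * 2^32/weight so that calc_delta_mine() can replace a division by
 * 'weight' with a multiply and a rounded 32-bit shift.
 */
#include <stdio.h>
#include <stdint.h>

#define WMULT_CONST (1ULL << 32)    /* assumed value of the kernel macro */
#define WMULT_SHIFT 32

/* Shift right and round (64-bit variant of the kernel's RSR()): */
#define RSR(x, y) (((x) + (1ULL << ((y) - 1))) >> (y))

int main(void)
{
    /* nice 0 and nice -20 entries of prio_to_weight[] */
    const unsigned long weight[] = { 1024, 88761 };
    const uint64_t delta_exec = 1000000;    /* 1 ms in ns */

    for (int i = 0; i < 2; i++) {
        unsigned long w = weight[i];
        /* the rounded inverse that calc_delta_mine() sets up: */
        uint64_t inv = (WMULT_CONST - w / 2) / w + 1;
        /* "delta_exec * NICE_0_LOAD / w", exact and via the inverse: */
        uint64_t exact = delta_exec * 1024 / w;
        uint64_t fast = RSR(delta_exec * 1024 * inv, WMULT_SHIFT);

        printf("weight=%5lu inv_weight=%7llu exact=%7llu fast=%7llu\n",
               w, (unsigned long long)inv,
               (unsigned long long)exact, (unsigned long long)fast);
    }
    /*
     * Adjacent prio_to_weight[] entries differ by roughly 1.25x,
     * which is what makes one nice level worth about +/-10% CPU:
     */
    printf("88761 / 71755 = %.3f\n", 88761.0 / 71755.0);
    return 0;
}

The computed inverses come out as 4194304 and 48388, matching the nice-0 and nice -20 entries of prio_to_wmult[], and the multiply-and-shift path agrees with the exact division for these inputs.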
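
The preempt-notifier hooks added above are driven through an ops table, but the header side of the interface (struct preempt_ops, struct preempt_notifier, preempt_notifier_init()) lives in the companion include/linux/preempt.h change, not in this file; its use below is an assumption. A minimal client sketch:

/*
 * Hedged sketch of a preempt-notifier client. The declarations of
 * struct preempt_ops, struct preempt_notifier and preempt_notifier_init()
 * are assumed to come from the companion <linux/preempt.h> change.
 */
#include <linux/preempt.h>
#include <linux/sched.h>

static void my_sched_in(struct preempt_notifier *pn, int cpu)
{
    /* current has just been (re)scheduled on 'cpu' */
}

static void my_sched_out(struct preempt_notifier *pn,
                         struct task_struct *next)
{
    /* current is about to be switched out in favour of 'next' */
}

static struct preempt_ops my_preempt_ops = {
    .sched_in  = my_sched_in,
    .sched_out = my_sched_out,
};

static struct preempt_notifier my_notifier;

/* Call from the task that wants notifications about itself: */
static void my_attach(void)
{
    preempt_notifier_init(&my_notifier, &my_preempt_ops);
    preempt_notifier_register(&my_notifier);
}

static void my_detach(void)
{
    preempt_notifier_unregister(&my_notifier);
}

Registration links the notifier into current->preempt_notifiers only, and both callbacks run on the context-switch path, so they must not sleep; per the comment above, unregistering from within a notifier callback is safe.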