X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched.c;h=5cd069b77fd748ba04f70e94926c506024a2467d;hb=14531189f0a1071b928586e9e1a89eceac91d95f;hp=960d7c5fca39753b61a404bbbdbef1543f5807cc;hpb=b7405e16435f710edfae6ba32bef4ca20d3de145;p=linux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index 960d7c5fca..5cd069b77f 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -52,8 +52,9 @@ #include #include #include -#include +#include +#include #include /* @@ -90,6 +91,9 @@ unsigned long long __attribute__((weak)) sched_clock(void) #define NS_TO_JIFFIES(TIME) ((TIME) / (1000000000 / HZ)) #define JIFFIES_TO_NS(TIME) ((TIME) * (1000000000 / HZ)) +#define NICE_0_LOAD SCHED_LOAD_SCALE +#define NICE_0_SHIFT SCHED_LOAD_SHIFT + /* * These are the 'tuning knobs' of the scheduler: * @@ -181,6 +185,27 @@ static unsigned int static_prio_timeslice(int static_prio) return SCALE_PRIO(DEF_TIMESLICE, static_prio); } +#ifdef CONFIG_SMP +/* + * Divide a load by a sched group cpu_power : (load / sg->__cpu_power) + * Since cpu_power is a 'constant', we can use a reciprocal divide. + */ +static inline u32 sg_div_cpu_power(const struct sched_group *sg, u32 load) +{ + return reciprocal_divide(load, sg->reciprocal_cpu_power); +} + +/* + * Each time a sched group cpu_power is changed, + * we must compute its reciprocal value + */ +static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) +{ + sg->__cpu_power += val; + sg->reciprocal_cpu_power = reciprocal_value(sg->__cpu_power); +} +#endif + /* * task_timeslice() scales user-nice values [ -20 ... 0 ... 19 ] * to time slice values: [800ms ... 100ms ... 5ms] @@ -195,10 +220,74 @@ static inline unsigned int task_timeslice(struct task_struct *p) return static_prio_timeslice(p->static_prio); } +static inline int rt_policy(int policy) +{ + if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) + return 1; + return 0; +} + +static inline int task_has_rt_policy(struct task_struct *p) +{ + return rt_policy(p->policy); +} + /* - * These are the runqueue data structures: + * This is the priority-queue data structure of the RT scheduling class: */ +struct rt_prio_array { + DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1); /* include 1 bit for delimiter */ + struct list_head queue[MAX_RT_PRIO]; +}; + +struct load_stat { + struct load_weight load; + u64 load_update_start, load_update_last; + unsigned long delta_fair, delta_exec, delta_stat; +}; + +/* CFS-related fields in a runqueue */ +struct cfs_rq { + struct load_weight load; + unsigned long nr_running; + + s64 fair_clock; + u64 exec_clock; + s64 wait_runtime; + u64 sleeper_bonus; + unsigned long wait_runtime_overruns, wait_runtime_underruns; + + struct rb_root tasks_timeline; + struct rb_node *rb_leftmost; + struct rb_node *rb_load_balance_curr; +#ifdef CONFIG_FAIR_GROUP_SCHED + /* 'curr' points to currently running entity on this cfs_rq. + * It is set to NULL otherwise (i.e when none are currently running). + */ + struct sched_entity *curr; + struct rq *rq; /* cpu runqueue to which this cfs_rq is attached */ + /* leaf cfs_rqs are those that hold tasks (lowest schedulable entity in + * a hierarchy). Non-leaf lrqs hold other higher schedulable entities + * (like users, containers etc.) + * + * leaf_cfs_rq_list ties together list of leaf cfs_rq's in a cpu. This + * list is used during load balance. + */ + struct list_head leaf_cfs_rq_list; /* Better name : task_cfs_rq_list? */ +#endif +}; + +/* Real-Time classes' related field in a runqueue: */ +struct rt_rq { + struct rt_prio_array active; + int rt_load_balance_idx; + struct list_head *rt_load_balance_head, *rt_load_balance_curr; +}; + +/* + * The prio-array type of the old scheduler: + */ struct prio_array { unsigned int nr_active; DECLARE_BITMAP(bitmap, MAX_PRIO+1); /* include 1 bit for delimiter */ @@ -213,7 +302,7 @@ struct prio_array { * acquire operations must be ordered by ascending &runqueue. */ struct rq { - spinlock_t lock; + spinlock_t lock; /* runqueue lock */ /* * nr_running and cpu_load should be in the same cacheline because @@ -221,10 +310,21 @@ struct rq { */ unsigned long nr_running; unsigned long raw_weighted_load; -#ifdef CONFIG_SMP - unsigned long cpu_load[3]; + #define CPU_LOAD_IDX_MAX 5 + unsigned long cpu_load[CPU_LOAD_IDX_MAX]; + unsigned char idle_at_tick; +#ifdef CONFIG_NO_HZ + unsigned char in_nohz_recently; +#endif + struct load_stat ls; /* capture load from *all* tasks on this cpu */ + unsigned long nr_load_updates; + u64 nr_switches; + + struct cfs_rq cfs; +#ifdef CONFIG_FAIR_GROUP_SCHED + struct list_head leaf_cfs_rq_list; /* list of leaf cfs_rq on this cpu */ #endif - unsigned long long nr_switches; + struct rt_rq rt; /* * This is part of a global counter where only the total sum @@ -235,13 +335,23 @@ struct rq { unsigned long nr_uninterruptible; unsigned long expired_timestamp; - /* Cached timestamp set by update_cpu_clock() */ unsigned long long most_recent_timestamp; + struct task_struct *curr, *idle; unsigned long next_balance; struct mm_struct *prev_mm; + struct prio_array *active, *expired, arrays[2]; int best_expired_prio; + + u64 clock, prev_clock_raw; + s64 clock_max_delta; + + unsigned int clock_warps, clock_overflows; + unsigned int clock_unstable_events; + + struct sched_class *load_balance_class; + atomic_t nr_iowait; #ifdef CONFIG_SMP @@ -278,7 +388,8 @@ struct rq { struct lock_class_key rq_lock_key; }; -static DEFINE_PER_CPU(struct rq, runqueues); +static DEFINE_PER_CPU(struct rq, runqueues) ____cacheline_aligned_in_smp; +static DEFINE_MUTEX(sched_hotcpu_mutex); static inline int cpu_of(struct rq *rq) { @@ -289,6 +400,52 @@ static inline int cpu_of(struct rq *rq) #endif } +/* + * Per-runqueue clock, as finegrained as the platform can give us: + */ +static unsigned long long __rq_clock(struct rq *rq) +{ + u64 prev_raw = rq->prev_clock_raw; + u64 now = sched_clock(); + s64 delta = now - prev_raw; + u64 clock = rq->clock; + + /* + * Protect against sched_clock() occasionally going backwards: + */ + if (unlikely(delta < 0)) { + clock++; + rq->clock_warps++; + } else { + /* + * Catch too large forward jumps too: + */ + if (unlikely(delta > 2*TICK_NSEC)) { + clock++; + rq->clock_overflows++; + } else { + if (unlikely(delta > rq->clock_max_delta)) + rq->clock_max_delta = delta; + clock += delta; + } + } + + rq->prev_clock_raw = now; + rq->clock = clock; + + return clock; +} + +static inline unsigned long long rq_clock(struct rq *rq) +{ + int this_cpu = smp_processor_id(); + + if (this_cpu == cpu_of(rq)) + return __rq_clock(rq); + + return rq->clock; +} + /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. @@ -304,6 +461,18 @@ static inline int cpu_of(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +#ifdef CONFIG_FAIR_GROUP_SCHED +/* Change a task's ->cfs_rq if it moves across CPUs */ +static inline void set_task_cfs_rq(struct task_struct *p) +{ + p->se.cfs_rq = &task_rq(p)->cfs; +} +#else +static inline void set_task_cfs_rq(struct task_struct *p) +{ +} +#endif + #ifndef prepare_arch_switch # define prepare_arch_switch(next) do { } while (0) #endif @@ -433,134 +602,6 @@ static inline void task_rq_unlock(struct rq *rq, unsigned long *flags) spin_unlock_irqrestore(&rq->lock, *flags); } -#ifdef CONFIG_SCHEDSTATS -/* - * bump this up when changing the output format or the meaning of an existing - * format, so that tools can adapt (or abort) - */ -#define SCHEDSTAT_VERSION 14 - -static int show_schedstat(struct seq_file *seq, void *v) -{ - int cpu; - - seq_printf(seq, "version %d\n", SCHEDSTAT_VERSION); - seq_printf(seq, "timestamp %lu\n", jiffies); - for_each_online_cpu(cpu) { - struct rq *rq = cpu_rq(cpu); -#ifdef CONFIG_SMP - struct sched_domain *sd; - int dcnt = 0; -#endif - - /* runqueue-specific stats */ - seq_printf(seq, - "cpu%d %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu %lu", - cpu, rq->yld_both_empty, - rq->yld_act_empty, rq->yld_exp_empty, rq->yld_cnt, - rq->sched_switch, rq->sched_cnt, rq->sched_goidle, - rq->ttwu_cnt, rq->ttwu_local, - rq->rq_sched_info.cpu_time, - rq->rq_sched_info.run_delay, rq->rq_sched_info.pcnt); - - seq_printf(seq, "\n"); - -#ifdef CONFIG_SMP - /* domain-specific stats */ - preempt_disable(); - for_each_domain(cpu, sd) { - enum idle_type itype; - char mask_str[NR_CPUS]; - - cpumask_scnprintf(mask_str, NR_CPUS, sd->span); - seq_printf(seq, "domain%d %s", dcnt++, mask_str); - for (itype = SCHED_IDLE; itype < MAX_IDLE_TYPES; - itype++) { - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu " - "%lu", - sd->lb_cnt[itype], - sd->lb_balanced[itype], - sd->lb_failed[itype], - sd->lb_imbalance[itype], - sd->lb_gained[itype], - sd->lb_hot_gained[itype], - sd->lb_nobusyq[itype], - sd->lb_nobusyg[itype]); - } - seq_printf(seq, " %lu %lu %lu %lu %lu %lu %lu %lu %lu" - " %lu %lu %lu\n", - sd->alb_cnt, sd->alb_failed, sd->alb_pushed, - sd->sbe_cnt, sd->sbe_balanced, sd->sbe_pushed, - sd->sbf_cnt, sd->sbf_balanced, sd->sbf_pushed, - sd->ttwu_wake_remote, sd->ttwu_move_affine, - sd->ttwu_move_balance); - } - preempt_enable(); -#endif - } - return 0; -} - -static int schedstat_open(struct inode *inode, struct file *file) -{ - unsigned int size = PAGE_SIZE * (1 + num_online_cpus() / 32); - char *buf = kmalloc(size, GFP_KERNEL); - struct seq_file *m; - int res; - - if (!buf) - return -ENOMEM; - res = single_open(file, show_schedstat, NULL); - if (!res) { - m = file->private_data; - m->buf = buf; - m->size = size; - } else - kfree(buf); - return res; -} - -const struct file_operations proc_schedstat_operations = { - .open = schedstat_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; - -/* - * Expects runqueue lock to be held for atomicity of update - */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) -{ - if (rq) { - rq->rq_sched_info.run_delay += delta_jiffies; - rq->rq_sched_info.pcnt++; - } -} - -/* - * Expects runqueue lock to be held for atomicity of update - */ -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) -{ - if (rq) - rq->rq_sched_info.cpu_time += delta_jiffies; -} -# define schedstat_inc(rq, field) do { (rq)->field++; } while (0) -# define schedstat_add(rq, field, amt) do { (rq)->field += (amt); } while (0) -#else /* !CONFIG_SCHEDSTATS */ -static inline void -rq_sched_info_arrive(struct rq *rq, unsigned long delta_jiffies) -{} -static inline void -rq_sched_info_depart(struct rq *rq, unsigned long delta_jiffies) -{} -# define schedstat_inc(rq, field) do { } while (0) -# define schedstat_add(rq, field, amt) do { } while (0) -#endif - /* * this_rq_lock - lock this runqueue and disable interrupts. */ @@ -576,111 +617,123 @@ static inline struct rq *this_rq_lock(void) return rq; } -#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) /* - * Called when a process is dequeued from the active array and given - * the cpu. We should note that with the exception of interactive - * tasks, the expired queue will become the active queue after the active - * queue is empty, without explicitly dequeuing and requeuing tasks in the - * expired queue. (Interactive tasks may be requeued directly to the - * active queue, thus delaying tasks in the expired queue from running; - * see scheduler_tick()). + * resched_task - mark a task 'to be rescheduled now'. * - * This function is only called from sched_info_arrive(), rather than - * dequeue_task(). Even though a task may be queued and dequeued multiple - * times as it is shuffled about, we're really interested in knowing how - * long it was from the *first* time it was queued to the time that it - * finally hit a cpu. + * On UP this means the setting of the need_resched flag, on SMP it + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. */ -static inline void sched_info_dequeued(struct task_struct *t) +#ifdef CONFIG_SMP + +#ifndef tsk_is_polling +#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) +#endif + +static void resched_task(struct task_struct *p) { - t->sched_info.last_queued = 0; + int cpu; + + assert_spin_locked(&task_rq(p)->lock); + + if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + return; + + set_tsk_thread_flag(p, TIF_NEED_RESCHED); + + cpu = task_cpu(p); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(p)) + smp_send_reschedule(cpu); } -/* - * Called when a task finally hits the cpu. We can now calculate how - * long it was waiting to run. We also note when it began so that we - * can keep stats on how long its timeslice is. - */ -static void sched_info_arrive(struct task_struct *t) +static void resched_cpu(int cpu) { - unsigned long now = jiffies, delta_jiffies = 0; - - if (t->sched_info.last_queued) - delta_jiffies = now - t->sched_info.last_queued; - sched_info_dequeued(t); - t->sched_info.run_delay += delta_jiffies; - t->sched_info.last_arrival = now; - t->sched_info.pcnt++; + struct rq *rq = cpu_rq(cpu); + unsigned long flags; - rq_sched_info_arrive(task_rq(t), delta_jiffies); + if (!spin_trylock_irqsave(&rq->lock, flags)) + return; + resched_task(cpu_curr(cpu)); + spin_unlock_irqrestore(&rq->lock, flags); } +#else +static inline void resched_task(struct task_struct *p) +{ + assert_spin_locked(&task_rq(p)->lock); + set_tsk_need_resched(p); +} +#endif + +#include "sched_stats.h" /* - * Called when a process is queued into either the active or expired - * array. The time is noted and later used to determine how long we - * had to wait for us to reach the cpu. Since the expired queue will - * become the active queue after active queue is empty, without dequeuing - * and requeuing any tasks, we are interested in queuing to either. It - * is unusual but not impossible for tasks to be dequeued and immediately - * requeued in the same or another array: this can happen in sched_yield(), - * set_user_nice(), and even load_balance() as it moves tasks from runqueue - * to runqueue. - * - * This function is only called from enqueue_task(), but also only updates - * the timestamp if it is already not set. It's assumed that - * sched_info_dequeued() will clear that stamp when appropriate. + * To aid in avoiding the subversion of "niceness" due to uneven distribution + * of tasks with abnormal "nice" values across CPUs the contribution that + * each task makes to its run queue's load is weighted according to its + * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a + * scaled version of the new time slice allocation that they receive on time + * slice expiry etc. */ -static inline void sched_info_queued(struct task_struct *t) -{ - if (unlikely(sched_info_on())) - if (!t->sched_info.last_queued) - t->sched_info.last_queued = jiffies; -} /* - * Called when a process ceases being the active-running process, either - * voluntarily or involuntarily. Now we can calculate how long we ran. + * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE + * If static_prio_timeslice() is ever changed to break this assumption then + * this code will need modification */ -static inline void sched_info_depart(struct task_struct *t) -{ - unsigned long delta_jiffies = jiffies - t->sched_info.last_arrival; +#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE +#define LOAD_WEIGHT(lp) \ + (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) +#define PRIO_TO_LOAD_WEIGHT(prio) \ + LOAD_WEIGHT(static_prio_timeslice(prio)) +#define RTPRIO_TO_LOAD_WEIGHT(rp) \ + (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) - t->sched_info.cpu_time += delta_jiffies; - rq_sched_info_depart(task_rq(t), delta_jiffies); +static void set_load_weight(struct task_struct *p) +{ + if (task_has_rt_policy(p)) { +#ifdef CONFIG_SMP + if (p == task_rq(p)->migration_thread) + /* + * The migration thread does the actual balancing. + * Giving its load any weight will skew balancing + * adversely. + */ + p->load_weight = 0; + else +#endif + p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); + } else + p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); } -/* - * Called when tasks are switched involuntarily due, typically, to expiring - * their time slice. (This may also be called when switching to or from - * the idle task.) We are only called when prev != next. - */ static inline void -__sched_info_switch(struct task_struct *prev, struct task_struct *next) +inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) { - struct rq *rq = task_rq(prev); + rq->raw_weighted_load += p->load_weight; +} - /* - * prev now departs the cpu. It's not interesting to record - * stats about how efficient we were at scheduling the idle - * process, however. - */ - if (prev != rq->idle) - sched_info_depart(prev); +static inline void +dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) +{ + rq->raw_weighted_load -= p->load_weight; +} - if (next != rq->idle) - sched_info_arrive(next); +static inline void inc_nr_running(struct task_struct *p, struct rq *rq) +{ + rq->nr_running++; + inc_raw_weighted_load(rq, p); } -static inline void -sched_info_switch(struct task_struct *prev, struct task_struct *next) + +static inline void dec_nr_running(struct task_struct *p, struct rq *rq) { - if (unlikely(sched_info_on())) - __sched_info_switch(prev, next); + rq->nr_running--; + dec_raw_weighted_load(rq, p); } -#else -#define sched_info_queued(t) do { } while (0) -#define sched_info_switch(t, next) do { } while (0) -#endif /* CONFIG_SCHEDSTATS || CONFIG_TASK_DELAY_ACCT */ /* * Adding/removing a task to/from a priority array: @@ -749,70 +802,6 @@ static inline int __normal_prio(struct task_struct *p) return prio; } -/* - * To aid in avoiding the subversion of "niceness" due to uneven distribution - * of tasks with abnormal "nice" values across CPUs the contribution that - * each task makes to its run queue's load is weighted according to its - * scheduling class and "nice" value. For SCHED_NORMAL tasks this is just a - * scaled version of the new time slice allocation that they receive on time - * slice expiry etc. - */ - -/* - * Assume: static_prio_timeslice(NICE_TO_PRIO(0)) == DEF_TIMESLICE - * If static_prio_timeslice() is ever changed to break this assumption then - * this code will need modification - */ -#define TIME_SLICE_NICE_ZERO DEF_TIMESLICE -#define LOAD_WEIGHT(lp) \ - (((lp) * SCHED_LOAD_SCALE) / TIME_SLICE_NICE_ZERO) -#define PRIO_TO_LOAD_WEIGHT(prio) \ - LOAD_WEIGHT(static_prio_timeslice(prio)) -#define RTPRIO_TO_LOAD_WEIGHT(rp) \ - (PRIO_TO_LOAD_WEIGHT(MAX_RT_PRIO) + LOAD_WEIGHT(rp)) - -static void set_load_weight(struct task_struct *p) -{ - if (has_rt_policy(p)) { -#ifdef CONFIG_SMP - if (p == task_rq(p)->migration_thread) - /* - * The migration thread does the actual balancing. - * Giving its load any weight will skew balancing - * adversely. - */ - p->load_weight = 0; - else -#endif - p->load_weight = RTPRIO_TO_LOAD_WEIGHT(p->rt_priority); - } else - p->load_weight = PRIO_TO_LOAD_WEIGHT(p->static_prio); -} - -static inline void -inc_raw_weighted_load(struct rq *rq, const struct task_struct *p) -{ - rq->raw_weighted_load += p->load_weight; -} - -static inline void -dec_raw_weighted_load(struct rq *rq, const struct task_struct *p) -{ - rq->raw_weighted_load -= p->load_weight; -} - -static inline void inc_nr_running(struct task_struct *p, struct rq *rq) -{ - rq->nr_running++; - inc_raw_weighted_load(rq, p); -} - -static inline void dec_nr_running(struct task_struct *p, struct rq *rq) -{ - rq->nr_running--; - dec_raw_weighted_load(rq, p); -} - /* * Calculate the expected normal priority: i.e. priority * without taking RT-inheritance into account. Might be @@ -824,7 +813,7 @@ static inline int normal_prio(struct task_struct *p) { int prio; - if (has_rt_policy(p)) + if (task_has_rt_policy(p)) prio = MAX_RT_PRIO-1 - p->rt_priority; else prio = __normal_prio(p); @@ -1016,47 +1005,6 @@ static void deactivate_task(struct task_struct *p, struct rq *rq) p->array = NULL; } -/* - * resched_task - mark a task 'to be rescheduled now'. - * - * On UP this means the setting of the need_resched flag, on SMP it - * might also involve a cross-CPU call to trigger the scheduler on - * the target CPU. - */ -#ifdef CONFIG_SMP - -#ifndef tsk_is_polling -#define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) -#endif - -static void resched_task(struct task_struct *p) -{ - int cpu; - - assert_spin_locked(&task_rq(p)->lock); - - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) - return; - - set_tsk_thread_flag(p, TIF_NEED_RESCHED); - - cpu = task_cpu(p); - if (cpu == smp_processor_id()) - return; - - /* NEED_RESCHED must be visible before we test polling */ - smp_mb(); - if (!tsk_is_polling(p)) - smp_send_reschedule(cpu); -} -#else -static inline void resched_task(struct task_struct *p) -{ - assert_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); -} -#endif - /** * task_curr - is this task currently executing on a CPU? * @p: the task in question. @@ -1073,6 +1021,12 @@ unsigned long weighted_cpuload(const int cpu) } #ifdef CONFIG_SMP + +void set_task_cpu(struct task_struct *p, unsigned int cpu) +{ + task_thread_info(p)->cpu = cpu; +} + struct migration_req { struct list_head list; @@ -1121,21 +1075,72 @@ void wait_task_inactive(struct task_struct *p) { unsigned long flags; struct rq *rq; - int preempted; + struct prio_array *array; + int running; repeat: + /* + * We do the initial early heuristics without holding + * any task-queue locks at all. We'll only try to get + * the runqueue lock when things look like they will + * work out! + */ + rq = task_rq(p); + + /* + * If the task is actively running on another CPU + * still, just relax and busy-wait without holding + * any locks. + * + * NOTE! Since we don't hold any locks, it's not + * even sure that "rq" stays as the right runqueue! + * But we don't care, since "task_running()" will + * return false if the runqueue has changed and p + * is actually now running somewhere else! + */ + while (task_running(rq, p)) + cpu_relax(); + + /* + * Ok, time to look more closely! We need the rq + * lock now, to be *sure*. If we're wrong, we'll + * just go back and repeat. + */ rq = task_rq_lock(p, &flags); - /* Must be off runqueue entirely, not preempted. */ - if (unlikely(p->array || task_running(rq, p))) { - /* If it's preempted, we yield. It could be a while. */ - preempted = !task_running(rq, p); - task_rq_unlock(rq, &flags); + running = task_running(rq, p); + array = p->array; + task_rq_unlock(rq, &flags); + + /* + * Was it really running after all now that we + * checked with the proper locks actually held? + * + * Oops. Go back and try again.. + */ + if (unlikely(running)) { cpu_relax(); - if (preempted) - yield(); goto repeat; } - task_rq_unlock(rq, &flags); + + /* + * It's not enough that it's not actively running, + * it must be off the runqueue _entirely_, and not + * preempted! + * + * So if it wa still runnable (but just not actively + * running right now), it's preempted, and we should + * yield - it could be a while. + */ + if (unlikely(array)) { + yield(); + goto repeat; + } + + /* + * Ahh, all good. It wasn't running, and it wasn't + * runnable, which means that it will never become + * running in the future either. We're all done! + */ } /*** @@ -1241,7 +1246,8 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, int this_cpu) } /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); if (local_group) { this_load = avg_load; @@ -1368,7 +1374,16 @@ static int wake_idle(int cpu, struct task_struct *p) struct sched_domain *sd; int i; - if (idle_cpu(cpu)) + /* + * If it is idle, then it is the best cpu to run this task. + * + * This cpu is also the best, if it has more than one task already. + * Siblings must be also busy(in most cases) as they didn't already + * pickup the extra load from this cpu and hence we need not check + * sibling runqueue info. This will avoid the checks and cache miss + * penalities associated with that. + */ + if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) return cpu; for_each_domain(cpu, sd) { @@ -1726,37 +1741,6 @@ void fastcall wake_up_new_task(struct task_struct *p, unsigned long clone_flags) task_rq_unlock(this_rq, &flags); } -/* - * Potentially available exiting-child timeslices are - * retrieved here - this way the parent does not get - * penalized for creating too many threads. - * - * (this cannot be used to 'generate' timeslices - * artificially, because any timeslice recovered here - * was given away by the parent in the first place.) - */ -void fastcall sched_exit(struct task_struct *p) -{ - unsigned long flags; - struct rq *rq; - - /* - * If the child was a (relative-) CPU hog then decrease - * the sleep_avg of the parent as well. - */ - rq = task_rq_lock(p->parent, &flags); - if (p->first_time_slice && task_cpu(p) == task_cpu(p->parent)) { - p->parent->time_slice += p->time_slice; - if (unlikely(p->parent->time_slice > task_timeslice(p))) - p->parent->time_slice = task_timeslice(p); - } - if (p->sleep_avg < p->parent->sleep_avg) - p->parent->sleep_avg = p->parent->sleep_avg / - (EXIT_WEIGHT + 1) * EXIT_WEIGHT + p->sleep_avg / - (EXIT_WEIGHT + 1); - task_rq_unlock(rq, &flags); -} - /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -2109,7 +2093,7 @@ static void pull_task(struct rq *src_rq, struct prio_array *src_array, */ static int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, - struct sched_domain *sd, enum idle_type idle, + struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { /* @@ -2155,7 +2139,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu, */ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_nr_move, unsigned long max_load_move, - struct sched_domain *sd, enum idle_type idle, + struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned) { int idx, pulled = 0, pinned = 0, this_best_prio, best_prio, @@ -2273,7 +2257,7 @@ out: */ static struct sched_group * find_busiest_group(struct sched_domain *sd, int this_cpu, - unsigned long *imbalance, enum idle_type idle, int *sd_idle, + unsigned long *imbalance, enum cpu_idle_type idle, int *sd_idle, cpumask_t *cpus, int *balance) { struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups; @@ -2292,9 +2276,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, max_load = this_load = total_load = total_pwr = 0; busiest_load_per_task = busiest_nr_running = 0; this_load_per_task = this_nr_running = 0; - if (idle == NOT_IDLE) + if (idle == CPU_NOT_IDLE) load_idx = sd->busy_idx; - else if (idle == NEWLY_IDLE) + else if (idle == CPU_NEWLY_IDLE) load_idx = sd->newidle_idx; else load_idx = sd->idle_idx; @@ -2352,12 +2336,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, } total_load += avg_load; - total_pwr += group->cpu_power; + total_pwr += group->__cpu_power; /* Adjust by relative CPU power of the group */ - avg_load = (avg_load * SCHED_LOAD_SCALE) / group->cpu_power; + avg_load = sg_div_cpu_power(group, + avg_load * SCHED_LOAD_SCALE); - group_capacity = group->cpu_power / SCHED_LOAD_SCALE; + group_capacity = group->__cpu_power / SCHED_LOAD_SCALE; if (local_group) { this_load = avg_load; @@ -2377,7 +2362,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu, * Busy processors will not participate in power savings * balance. */ - if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) goto group_next; /* @@ -2468,8 +2453,8 @@ group_next: max_pull = min(max_load - avg_load, max_load - busiest_load_per_task); /* How much load to actually move to equalise the imbalance */ - *imbalance = min(max_pull * busiest->cpu_power, - (avg_load - this_load) * this->cpu_power) + *imbalance = min(max_pull * busiest->__cpu_power, + (avg_load - this_load) * this->__cpu_power) / SCHED_LOAD_SCALE; /* @@ -2503,28 +2488,29 @@ small_imbalance: * moving them. */ - pwr_now += busiest->cpu_power * - min(busiest_load_per_task, max_load); - pwr_now += this->cpu_power * - min(this_load_per_task, this_load); + pwr_now += busiest->__cpu_power * + min(busiest_load_per_task, max_load); + pwr_now += this->__cpu_power * + min(this_load_per_task, this_load); pwr_now /= SCHED_LOAD_SCALE; /* Amount of load we'd subtract */ - tmp = busiest_load_per_task * SCHED_LOAD_SCALE / - busiest->cpu_power; + tmp = sg_div_cpu_power(busiest, + busiest_load_per_task * SCHED_LOAD_SCALE); if (max_load > tmp) - pwr_move += busiest->cpu_power * + pwr_move += busiest->__cpu_power * min(busiest_load_per_task, max_load - tmp); /* Amount of load we'd add */ - if (max_load * busiest->cpu_power < + if (max_load * busiest->__cpu_power < busiest_load_per_task * SCHED_LOAD_SCALE) - tmp = max_load * busiest->cpu_power / this->cpu_power; + tmp = sg_div_cpu_power(this, + max_load * busiest->__cpu_power); else - tmp = busiest_load_per_task * SCHED_LOAD_SCALE / - this->cpu_power; - pwr_move += this->cpu_power * - min(this_load_per_task, this_load + tmp); + tmp = sg_div_cpu_power(this, + busiest_load_per_task * SCHED_LOAD_SCALE); + pwr_move += this->__cpu_power * + min(this_load_per_task, this_load + tmp); pwr_move /= SCHED_LOAD_SCALE; /* Move if we gain throughput */ @@ -2538,7 +2524,7 @@ small_imbalance: out_balanced: #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) - if (idle == NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) + if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE)) goto ret; if (this == group_leader && group_leader != group_min) { @@ -2555,7 +2541,7 @@ ret: * find_busiest_queue - find the busiest runqueue among the cpus in group. */ static struct rq * -find_busiest_queue(struct sched_group *group, enum idle_type idle, +find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle, unsigned long imbalance, cpumask_t *cpus) { struct rq *busiest = NULL, *rq; @@ -2597,7 +2583,7 @@ static inline unsigned long minus_1_or_zero(unsigned long n) * tasks if there is an imbalance. */ static int load_balance(int this_cpu, struct rq *this_rq, - struct sched_domain *sd, enum idle_type idle, + struct sched_domain *sd, enum cpu_idle_type idle, int *balance) { int nr_moved, all_pinned = 0, active_balance = 0, sd_idle = 0; @@ -2611,9 +2597,9 @@ static int load_balance(int this_cpu, struct rq *this_rq, * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, * let the state of idle sibling percolate up as IDLE, instead of - * portraying it as NOT_IDLE. + * portraying it as CPU_NOT_IDLE. */ - if (idle != NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && + if (idle != CPU_NOT_IDLE && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; @@ -2657,6 +2643,12 @@ redo: double_rq_unlock(this_rq, busiest); local_irq_restore(flags); + /* + * some other cpu did the load balance for us. + */ + if (nr_moved && this_cpu != smp_processor_id()) + resched_cpu(this_cpu); + /* All tasks on this runqueue were pinned by CPU affinity */ if (unlikely(all_pinned)) { cpu_clear(cpu_of(busiest), cpus); @@ -2741,7 +2733,7 @@ out_one_pinned: * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. * - * Called from schedule when this_rq is about to become idle (NEWLY_IDLE). + * Called from schedule when this_rq is about to become idle (CPU_NEWLY_IDLE). * this_rq is locked. */ static int @@ -2758,31 +2750,31 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd) * When power savings policy is enabled for the parent domain, idle * sibling can pick up load irrespective of busy siblings. In this case, * let the state of idle sibling percolate up as IDLE, instead of - * portraying it as NOT_IDLE. + * portraying it as CPU_NOT_IDLE. */ if (sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) sd_idle = 1; - schedstat_inc(sd, lb_cnt[NEWLY_IDLE]); + schedstat_inc(sd, lb_cnt[CPU_NEWLY_IDLE]); redo: - group = find_busiest_group(sd, this_cpu, &imbalance, NEWLY_IDLE, + group = find_busiest_group(sd, this_cpu, &imbalance, CPU_NEWLY_IDLE, &sd_idle, &cpus, NULL); if (!group) { - schedstat_inc(sd, lb_nobusyg[NEWLY_IDLE]); + schedstat_inc(sd, lb_nobusyg[CPU_NEWLY_IDLE]); goto out_balanced; } - busiest = find_busiest_queue(group, NEWLY_IDLE, imbalance, + busiest = find_busiest_queue(group, CPU_NEWLY_IDLE, imbalance, &cpus); if (!busiest) { - schedstat_inc(sd, lb_nobusyq[NEWLY_IDLE]); + schedstat_inc(sd, lb_nobusyq[CPU_NEWLY_IDLE]); goto out_balanced; } BUG_ON(busiest == this_rq); - schedstat_add(sd, lb_imbalance[NEWLY_IDLE], imbalance); + schedstat_add(sd, lb_imbalance[CPU_NEWLY_IDLE], imbalance); nr_moved = 0; if (busiest->nr_running > 1) { @@ -2790,7 +2782,7 @@ redo: double_lock_balance(this_rq, busiest); nr_moved = move_tasks(this_rq, this_cpu, busiest, minus_1_or_zero(busiest->nr_running), - imbalance, sd, NEWLY_IDLE, NULL); + imbalance, sd, CPU_NEWLY_IDLE, NULL); spin_unlock(&busiest->lock); if (!nr_moved) { @@ -2801,7 +2793,7 @@ redo: } if (!nr_moved) { - schedstat_inc(sd, lb_failed[NEWLY_IDLE]); + schedstat_inc(sd, lb_failed[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; @@ -2811,7 +2803,7 @@ redo: return nr_moved; out_balanced: - schedstat_inc(sd, lb_balanced[NEWLY_IDLE]); + schedstat_inc(sd, lb_balanced[CPU_NEWLY_IDLE]); if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER && !test_sd_parent(sd, SD_POWERSAVINGS_BALANCE)) return -1; @@ -2831,17 +2823,21 @@ static void idle_balance(int this_cpu, struct rq *this_rq) unsigned long next_balance = jiffies + 60 * HZ; for_each_domain(this_cpu, sd) { - if (sd->flags & SD_BALANCE_NEWIDLE) { + unsigned long interval; + + if (!(sd->flags & SD_LOAD_BALANCE)) + continue; + + if (sd->flags & SD_BALANCE_NEWIDLE) /* If we've pulled tasks over stop searching: */ pulled_task = load_balance_newidle(this_cpu, - this_rq, sd); - if (time_after(next_balance, - sd->last_balance + sd->balance_interval)) - next_balance = sd->last_balance - + sd->balance_interval; - if (pulled_task) - break; - } + this_rq, sd); + + interval = msecs_to_jiffies(sd->balance_interval); + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; + if (pulled_task) + break; } if (!pulled_task) /* @@ -2892,7 +2888,7 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) schedstat_inc(sd, alb_cnt); if (move_tasks(target_rq, target_cpu, busiest_rq, 1, - RTPRIO_TO_LOAD_WEIGHT(100), sd, SCHED_IDLE, + RTPRIO_TO_LOAD_WEIGHT(100), sd, CPU_IDLE, NULL)) schedstat_inc(sd, alb_pushed); else @@ -2927,37 +2923,103 @@ static void update_load(struct rq *this_rq) } } +#ifdef CONFIG_NO_HZ +static struct { + atomic_t load_balancer; + cpumask_t cpu_mask; +} nohz ____cacheline_aligned = { + .load_balancer = ATOMIC_INIT(-1), + .cpu_mask = CPU_MASK_NONE, +}; + /* - * run_rebalance_domains is triggered when needed from the scheduler tick. + * This routine will try to nominate the ilb (idle load balancing) + * owner among the cpus whose ticks are stopped. ilb owner will do the idle + * load balancing on behalf of all those cpus. If all the cpus in the system + * go into this tickless mode, then there will be no ilb owner (as there is + * no need for one) and all the cpus will sleep till the next wakeup event + * arrives... + * + * For the ilb owner, tick is not stopped. And this tick will be used + * for idle load balancing. ilb owner will still be part of + * nohz.cpu_mask.. + * + * While stopping the tick, this cpu will become the ilb owner if there + * is no other owner. And will be the owner till that cpu becomes busy + * or if all cpus in the system stop their ticks at which point + * there is no need for ilb owner. * + * When the ilb owner becomes busy, it nominates another owner, during the + * next busy scheduler_tick() + */ +int select_nohz_load_balancer(int stop_tick) +{ + int cpu = smp_processor_id(); + + if (stop_tick) { + cpu_set(cpu, nohz.cpu_mask); + cpu_rq(cpu)->in_nohz_recently = 1; + + /* + * If we are going offline and still the leader, give up! + */ + if (cpu_is_offline(cpu) && + atomic_read(&nohz.load_balancer) == cpu) { + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + return 0; + } + + /* time for ilb owner also to sleep */ + if (cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + if (atomic_read(&nohz.load_balancer) == cpu) + atomic_set(&nohz.load_balancer, -1); + return 0; + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* make me the ilb owner */ + if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) + return 1; + } else if (atomic_read(&nohz.load_balancer) == cpu) + return 1; + } else { + if (!cpu_isset(cpu, nohz.cpu_mask)) + return 0; + + cpu_clear(cpu, nohz.cpu_mask); + + if (atomic_read(&nohz.load_balancer) == cpu) + if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu) + BUG(); + } + return 0; +} +#endif + +static DEFINE_SPINLOCK(balancing); + +/* * It checks each scheduling domain to see if it is due to be balanced, * and initiates a balancing operation if so. * * Balancing parameters are set up in arch_init_sched_domains. */ -static DEFINE_SPINLOCK(balancing); - -static void run_rebalance_domains(struct softirq_action *h) +static inline void rebalance_domains(int cpu, enum cpu_idle_type idle) { - int this_cpu = smp_processor_id(), balance = 1; - struct rq *this_rq = cpu_rq(this_cpu); + int balance = 1; + struct rq *rq = cpu_rq(cpu); unsigned long interval; struct sched_domain *sd; - /* - * We are idle if there are no processes running. This - * is valid even if we are the idle process (SMT). - */ - enum idle_type idle = !this_rq->nr_running ? - SCHED_IDLE : NOT_IDLE; - /* Earliest time when we have to call run_rebalance_domains again */ + /* Earliest time when we have to do rebalance again */ unsigned long next_balance = jiffies + 60*HZ; - for_each_domain(this_cpu, sd) { + for_each_domain(cpu, sd) { if (!(sd->flags & SD_LOAD_BALANCE)) continue; interval = sd->balance_interval; - if (idle != SCHED_IDLE) + if (idle != CPU_IDLE) interval *= sd->busy_factor; /* scale ms to jiffies */ @@ -2971,31 +3033,138 @@ static void run_rebalance_domains(struct softirq_action *h) } if (time_after_eq(jiffies, sd->last_balance + interval)) { - if (load_balance(this_cpu, this_rq, sd, idle, &balance)) { + if (load_balance(cpu, rq, sd, idle, &balance)) { /* * We've pulled tasks over so either we're no * longer idle, or one of our SMT siblings is * not idle. */ - idle = NOT_IDLE; + idle = CPU_NOT_IDLE; } sd->last_balance = jiffies; } - if (sd->flags & SD_SERIALIZE) - spin_unlock(&balancing); -out: - if (time_after(next_balance, sd->last_balance + interval)) - next_balance = sd->last_balance + interval; + if (sd->flags & SD_SERIALIZE) + spin_unlock(&balancing); +out: + if (time_after(next_balance, sd->last_balance + interval)) + next_balance = sd->last_balance + interval; + + /* + * Stop the load balance at this level. There is another + * CPU in our sched group which is doing load balancing more + * actively. + */ + if (!balance) + break; + } + rq->next_balance = next_balance; +} + +/* + * run_rebalance_domains is triggered when needed from the scheduler tick. + * In CONFIG_NO_HZ case, the idle load balance owner will do the + * rebalancing for all the cpus for whom scheduler ticks are stopped. + */ +static void run_rebalance_domains(struct softirq_action *h) +{ + int local_cpu = smp_processor_id(); + struct rq *local_rq = cpu_rq(local_cpu); + enum cpu_idle_type idle = local_rq->idle_at_tick ? CPU_IDLE : CPU_NOT_IDLE; + + rebalance_domains(local_cpu, idle); + +#ifdef CONFIG_NO_HZ + /* + * If this cpu is the owner for idle load balancing, then do the + * balancing on behalf of the other idle cpus whose ticks are + * stopped. + */ + if (local_rq->idle_at_tick && + atomic_read(&nohz.load_balancer) == local_cpu) { + cpumask_t cpus = nohz.cpu_mask; + struct rq *rq; + int balance_cpu; + + cpu_clear(local_cpu, cpus); + for_each_cpu_mask(balance_cpu, cpus) { + /* + * If this cpu gets work to do, stop the load balancing + * work being done for other cpus. Next load + * balancing owner will pick it up. + */ + if (need_resched()) + break; + + rebalance_domains(balance_cpu, CPU_IDLE); + + rq = cpu_rq(balance_cpu); + if (time_after(local_rq->next_balance, rq->next_balance)) + local_rq->next_balance = rq->next_balance; + } + } +#endif +} + +/* + * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing. + * + * In case of CONFIG_NO_HZ, this is the place where we nominate a new + * idle load balancing owner or decide to stop the periodic load balancing, + * if the whole system is idle. + */ +static inline void trigger_load_balance(int cpu) +{ + struct rq *rq = cpu_rq(cpu); +#ifdef CONFIG_NO_HZ + /* + * If we were in the nohz mode recently and busy at the current + * scheduler tick, then check if we need to nominate new idle + * load balancer. + */ + if (rq->in_nohz_recently && !rq->idle_at_tick) { + rq->in_nohz_recently = 0; + + if (atomic_read(&nohz.load_balancer) == cpu) { + cpu_clear(cpu, nohz.cpu_mask); + atomic_set(&nohz.load_balancer, -1); + } + + if (atomic_read(&nohz.load_balancer) == -1) { + /* + * simple selection for now: Nominate the + * first cpu in the nohz list to be the next + * ilb owner. + * + * TBD: Traverse the sched domains and nominate + * the nearest cpu in the nohz.cpu_mask. + */ + int ilb = first_cpu(nohz.cpu_mask); + + if (ilb != NR_CPUS) + resched_cpu(ilb); + } + } - /* - * Stop the load balance at this level. There is another - * CPU in our sched group which is doing load balancing more - * actively. - */ - if (!balance) - break; + /* + * If this cpu is idle and doing idle load balancing for all the + * cpus with ticks stopped, is it time for that to stop? + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) == cpu && + cpus_weight(nohz.cpu_mask) == num_online_cpus()) { + resched_cpu(cpu); + return; } - this_rq->next_balance = next_balance; + + /* + * If this cpu is idle and the idle load balancing is done by + * someone else, then no need raise the SCHED_SOFTIRQ + */ + if (rq->idle_at_tick && atomic_read(&nohz.load_balancer) != cpu && + cpu_isset(cpu, nohz.cpu_mask)) + return; +#endif + if (time_after_eq(jiffies, rq->next_balance)) + raise_softirq(SCHED_SOFTIRQ); } #else /* @@ -3011,28 +3180,23 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); /* - * This is called on clock ticks and on context switches. - * Bank in p->sched_time the ns elapsed since the last tick or switch. - */ -static inline void -update_cpu_clock(struct task_struct *p, struct rq *rq, unsigned long long now) -{ - p->sched_time += now - p->last_ran; - p->last_ran = rq->most_recent_timestamp = now; -} - -/* - * Return current->sched_time plus any more ns on the sched_clock - * that have not yet been banked. + * Return p->sum_exec_runtime plus any more ns on the sched_clock + * that have not yet been banked in case the task is currently running. */ -unsigned long long current_sched_time(const struct task_struct *p) +unsigned long long task_sched_runtime(struct task_struct *p) { - unsigned long long ns; unsigned long flags; + u64 ns, delta_exec; + struct rq *rq; - local_irq_save(flags); - ns = p->sched_time + sched_clock() - p->last_ran; - local_irq_restore(flags); + rq = task_rq_lock(p, &flags); + ns = p->se.sum_exec_runtime; + if (rq->curr == p) { + delta_exec = rq_clock(rq) - p->se.exec_start; + if ((s64)delta_exec > 0) + ns += delta_exec; + } + task_rq_unlock(rq, &flags); return ns; } @@ -3215,19 +3379,17 @@ out_unlock: */ void scheduler_tick(void) { - unsigned long long now = sched_clock(); struct task_struct *p = current; int cpu = smp_processor_id(); + int idle_at_tick = idle_cpu(cpu); struct rq *rq = cpu_rq(cpu); - update_cpu_clock(p, rq, now); - - if (p != rq->idle) + if (!idle_at_tick) task_running_tick(rq, p); #ifdef CONFIG_SMP update_load(rq); - if (time_after_eq(jiffies, rq->next_balance)) - raise_softirq(SCHED_SOFTIRQ); + rq->idle_at_tick = idle_at_tick; + trigger_load_balance(cpu); #endif } @@ -3404,8 +3566,6 @@ switch_tasks: clear_tsk_need_resched(prev); rcu_qsctr_inc(task_cpu(prev)); - update_cpu_clock(prev, rq, now); - prev->sleep_avg -= run_time; if ((long)prev->sleep_avg <= 0) prev->sleep_avg = 0; @@ -3903,7 +4063,7 @@ void set_user_nice(struct task_struct *p, long nice) * it wont have any effect on scheduling until the task is * not SCHED_NORMAL/SCHED_BATCH: */ - if (has_rt_policy(p)) { + if (task_has_rt_policy(p)) { p->static_prio = NICE_TO_PRIO(nice); goto out_unlock; } @@ -4092,14 +4252,14 @@ recheck: (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; - if (is_rt_policy(policy) != (param->sched_priority != 0)) + if (rt_policy(policy) != (param->sched_priority != 0)) return -EINVAL; /* * Allow unprivileged RT tasks to decrease priority: */ if (!capable(CAP_SYS_NICE)) { - if (is_rt_policy(policy)) { + if (rt_policy(policy)) { unsigned long rlim_rtprio; unsigned long flags; @@ -4291,13 +4451,13 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) struct task_struct *p; int retval; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); read_lock(&tasklist_lock); p = find_process_by_pid(pid); if (!p) { read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); return -ESRCH; } @@ -4324,7 +4484,7 @@ long sched_setaffinity(pid_t pid, cpumask_t new_mask) out_unlock: put_task_struct(p); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); return retval; } @@ -4381,7 +4541,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) struct task_struct *p; int retval; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); read_lock(&tasklist_lock); retval = -ESRCH; @@ -4397,7 +4557,7 @@ long sched_getaffinity(pid_t pid, cpumask_t *mask) out_unlock: read_unlock(&tasklist_lock); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); if (retval) return retval; @@ -4545,9 +4705,7 @@ int __sched cond_resched_softirq(void) BUG_ON(!in_softirq()); if (need_resched() && system_state == SYSTEM_RUNNING) { - raw_local_irq_disable(); - _local_bh_enable(); - raw_local_irq_enable(); + local_bh_enable(); __cond_resched(); local_bh_disable(); return 1; @@ -4750,6 +4908,8 @@ void show_state_filter(unsigned long state_filter) show_task(p); } while_each_thread(g, p); + touch_all_softlockup_watchdogs(); + read_unlock(&tasklist_lock); /* * Only show locks if all tasks are dumped: @@ -4758,6 +4918,11 @@ void show_state_filter(unsigned long state_filter) debug_show_all_locks(); } +void __cpuinit init_idle_bootup_task(struct task_struct *idle) +{ + /* nothing yet */ +} + /** * init_idle - set up an idle thread for a given CPU * @idle: task in question @@ -5157,7 +5322,12 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) struct rq *rq; switch (action) { + case CPU_LOCK_ACQUIRE: + mutex_lock(&sched_hotcpu_mutex); + break; + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: p = kthread_create(migration_thread, hcpu, "migration/%d",cpu); if (IS_ERR(p)) return NOTIFY_BAD; @@ -5171,12 +5341,14 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_ONLINE: + case CPU_ONLINE_FROZEN: /* Strictly unneccessary, as first user will wake it. */ wake_up_process(cpu_rq(cpu)->migration_thread); break; #ifdef CONFIG_HOTPLUG_CPU case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: if (!cpu_rq(cpu)->migration_thread) break; /* Unbind it from offline cpu so it can run. Fall thru. */ @@ -5187,6 +5359,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) break; case CPU_DEAD: + case CPU_DEAD_FROZEN: migrate_live_tasks(cpu); rq = cpu_rq(cpu); kthread_stop(rq->migration_thread); @@ -5202,7 +5375,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) BUG_ON(rq->nr_running != 0); /* No need to migrate the tasks: it was best-effort if - * they didn't do lock_cpu_hotplug(). Just wake up + * they didn't take sched_hotcpu_mutex. Just wake up * the requestors. */ spin_lock_irq(&rq->lock); while (!list_empty(&rq->migration_queue)) { @@ -5216,6 +5389,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) spin_unlock_irq(&rq->lock); break; #endif + case CPU_LOCK_RELEASE: + mutex_unlock(&sched_hotcpu_mutex); + break; } return NOTIFY_OK; } @@ -5244,6 +5420,11 @@ int __init migration_init(void) #endif #ifdef CONFIG_SMP + +/* Number of possible processor ids */ +int nr_cpu_ids __read_mostly = NR_CPUS; +EXPORT_SYMBOL(nr_cpu_ids); + #undef SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG static void sched_domain_debug(struct sched_domain *sd, int cpu) @@ -5299,7 +5480,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) break; } - if (!group->cpu_power) { + if (!group->__cpu_power) { printk("\n"); printk(KERN_ERR "ERROR: domain->cpu_power not " "set\n"); @@ -5476,7 +5657,7 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, continue; sg->cpumask = CPU_MASK_NONE; - sg->cpu_power = 0; + sg->__cpu_power = 0; for_each_cpu_mask(j, span) { if (group_fn(j, cpu_map, NULL) != group) @@ -5496,483 +5677,6 @@ init_sched_build_groups(cpumask_t span, const cpumask_t *cpu_map, #define SD_NODES_PER_DOMAIN 16 -/* - * Self-tuning task migration cost measurement between source and target CPUs. - * - * This is done by measuring the cost of manipulating buffers of varying - * sizes. For a given buffer-size here are the steps that are taken: - * - * 1) the source CPU reads+dirties a shared buffer - * 2) the target CPU reads+dirties the same shared buffer - * - * We measure how long they take, in the following 4 scenarios: - * - * - source: CPU1, target: CPU2 | cost1 - * - source: CPU2, target: CPU1 | cost2 - * - source: CPU1, target: CPU1 | cost3 - * - source: CPU2, target: CPU2 | cost4 - * - * We then calculate the cost3+cost4-cost1-cost2 difference - this is - * the cost of migration. - * - * We then start off from a small buffer-size and iterate up to larger - * buffer sizes, in 5% steps - measuring each buffer-size separately, and - * doing a maximum search for the cost. (The maximum cost for a migration - * normally occurs when the working set size is around the effective cache - * size.) - */ -#define SEARCH_SCOPE 2 -#define MIN_CACHE_SIZE (64*1024U) -#define DEFAULT_CACHE_SIZE (5*1024*1024U) -#define ITERATIONS 1 -#define SIZE_THRESH 130 -#define COST_THRESH 130 - -/* - * The migration cost is a function of 'domain distance'. Domain - * distance is the number of steps a CPU has to iterate down its - * domain tree to share a domain with the other CPU. The farther - * two CPUs are from each other, the larger the distance gets. - * - * Note that we use the distance only to cache measurement results, - * the distance value is not used numerically otherwise. When two - * CPUs have the same distance it is assumed that the migration - * cost is the same. (this is a simplification but quite practical) - */ -#define MAX_DOMAIN_DISTANCE 32 - -static unsigned long long migration_cost[MAX_DOMAIN_DISTANCE] = - { [ 0 ... MAX_DOMAIN_DISTANCE-1 ] = -/* - * Architectures may override the migration cost and thus avoid - * boot-time calibration. Unit is nanoseconds. Mostly useful for - * virtualized hardware: - */ -#ifdef CONFIG_DEFAULT_MIGRATION_COST - CONFIG_DEFAULT_MIGRATION_COST -#else - -1LL -#endif -}; - -/* - * Allow override of migration cost - in units of microseconds. - * E.g. migration_cost=1000,2000,3000 will set up a level-1 cost - * of 1 msec, level-2 cost of 2 msecs and level3 cost of 3 msecs: - */ -static int __init migration_cost_setup(char *str) -{ - int ints[MAX_DOMAIN_DISTANCE+1], i; - - str = get_options(str, ARRAY_SIZE(ints), ints); - - printk("#ints: %d\n", ints[0]); - for (i = 1; i <= ints[0]; i++) { - migration_cost[i-1] = (unsigned long long)ints[i]*1000; - printk("migration_cost[%d]: %Ld\n", i-1, migration_cost[i-1]); - } - return 1; -} - -__setup ("migration_cost=", migration_cost_setup); - -/* - * Global multiplier (divisor) for migration-cutoff values, - * in percentiles. E.g. use a value of 150 to get 1.5 times - * longer cache-hot cutoff times. - * - * (We scale it from 100 to 128 to long long handling easier.) - */ - -#define MIGRATION_FACTOR_SCALE 128 - -static unsigned int migration_factor = MIGRATION_FACTOR_SCALE; - -static int __init setup_migration_factor(char *str) -{ - get_option(&str, &migration_factor); - migration_factor = migration_factor * MIGRATION_FACTOR_SCALE / 100; - return 1; -} - -__setup("migration_factor=", setup_migration_factor); - -/* - * Estimated distance of two CPUs, measured via the number of domains - * we have to pass for the two CPUs to be in the same span: - */ -static unsigned long domain_distance(int cpu1, int cpu2) -{ - unsigned long distance = 0; - struct sched_domain *sd; - - for_each_domain(cpu1, sd) { - WARN_ON(!cpu_isset(cpu1, sd->span)); - if (cpu_isset(cpu2, sd->span)) - return distance; - distance++; - } - if (distance >= MAX_DOMAIN_DISTANCE) { - WARN_ON(1); - distance = MAX_DOMAIN_DISTANCE-1; - } - - return distance; -} - -static unsigned int migration_debug; - -static int __init setup_migration_debug(char *str) -{ - get_option(&str, &migration_debug); - return 1; -} - -__setup("migration_debug=", setup_migration_debug); - -/* - * Maximum cache-size that the scheduler should try to measure. - * Architectures with larger caches should tune this up during - * bootup. Gets used in the domain-setup code (i.e. during SMP - * bootup). - */ -unsigned int max_cache_size; - -static int __init setup_max_cache_size(char *str) -{ - get_option(&str, &max_cache_size); - return 1; -} - -__setup("max_cache_size=", setup_max_cache_size); - -/* - * Dirty a big buffer in a hard-to-predict (for the L2 cache) way. This - * is the operation that is timed, so we try to generate unpredictable - * cachemisses that still end up filling the L2 cache: - */ -static void touch_cache(void *__cache, unsigned long __size) -{ - unsigned long size = __size / sizeof(long); - unsigned long chunk1 = size / 3; - unsigned long chunk2 = 2 * size / 3; - unsigned long *cache = __cache; - int i; - - for (i = 0; i < size/6; i += 8) { - switch (i % 6) { - case 0: cache[i]++; - case 1: cache[size-1-i]++; - case 2: cache[chunk1-i]++; - case 3: cache[chunk1+i]++; - case 4: cache[chunk2-i]++; - case 5: cache[chunk2+i]++; - } - } -} - -/* - * Measure the cache-cost of one task migration. Returns in units of nsec. - */ -static unsigned long long -measure_one(void *cache, unsigned long size, int source, int target) -{ - cpumask_t mask, saved_mask; - unsigned long long t0, t1, t2, t3, cost; - - saved_mask = current->cpus_allowed; - - /* - * Flush source caches to RAM and invalidate them: - */ - sched_cacheflush(); - - /* - * Migrate to the source CPU: - */ - mask = cpumask_of_cpu(source); - set_cpus_allowed(current, mask); - WARN_ON(smp_processor_id() != source); - - /* - * Dirty the working set: - */ - t0 = sched_clock(); - touch_cache(cache, size); - t1 = sched_clock(); - - /* - * Migrate to the target CPU, dirty the L2 cache and access - * the shared buffer. (which represents the working set - * of a migrated task.) - */ - mask = cpumask_of_cpu(target); - set_cpus_allowed(current, mask); - WARN_ON(smp_processor_id() != target); - - t2 = sched_clock(); - touch_cache(cache, size); - t3 = sched_clock(); - - cost = t1-t0 + t3-t2; - - if (migration_debug >= 2) - printk("[%d->%d]: %8Ld %8Ld %8Ld => %10Ld.\n", - source, target, t1-t0, t1-t0, t3-t2, cost); - /* - * Flush target caches to RAM and invalidate them: - */ - sched_cacheflush(); - - set_cpus_allowed(current, saved_mask); - - return cost; -} - -/* - * Measure a series of task migrations and return the average - * result. Since this code runs early during bootup the system - * is 'undisturbed' and the average latency makes sense. - * - * The algorithm in essence auto-detects the relevant cache-size, - * so it will properly detect different cachesizes for different - * cache-hierarchies, depending on how the CPUs are connected. - * - * Architectures can prime the upper limit of the search range via - * max_cache_size, otherwise the search range defaults to 20MB...64K. - */ -static unsigned long long -measure_cost(int cpu1, int cpu2, void *cache, unsigned int size) -{ - unsigned long long cost1, cost2; - int i; - - /* - * Measure the migration cost of 'size' bytes, over an - * average of 10 runs: - * - * (We perturb the cache size by a small (0..4k) - * value to compensate size/alignment related artifacts. - * We also subtract the cost of the operation done on - * the same CPU.) - */ - cost1 = 0; - - /* - * dry run, to make sure we start off cache-cold on cpu1, - * and to get any vmalloc pagefaults in advance: - */ - measure_one(cache, size, cpu1, cpu2); - for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i * 1024, cpu1, cpu2); - - measure_one(cache, size, cpu2, cpu1); - for (i = 0; i < ITERATIONS; i++) - cost1 += measure_one(cache, size - i * 1024, cpu2, cpu1); - - /* - * (We measure the non-migrating [cached] cost on both - * cpu1 and cpu2, to handle CPUs with different speeds) - */ - cost2 = 0; - - measure_one(cache, size, cpu1, cpu1); - for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i * 1024, cpu1, cpu1); - - measure_one(cache, size, cpu2, cpu2); - for (i = 0; i < ITERATIONS; i++) - cost2 += measure_one(cache, size - i * 1024, cpu2, cpu2); - - /* - * Get the per-iteration migration cost: - */ - do_div(cost1, 2 * ITERATIONS); - do_div(cost2, 2 * ITERATIONS); - - return cost1 - cost2; -} - -static unsigned long long measure_migration_cost(int cpu1, int cpu2) -{ - unsigned long long max_cost = 0, fluct = 0, avg_fluct = 0; - unsigned int max_size, size, size_found = 0; - long long cost = 0, prev_cost; - void *cache; - - /* - * Search from max_cache_size*5 down to 64K - the real relevant - * cachesize has to lie somewhere inbetween. - */ - if (max_cache_size) { - max_size = max(max_cache_size * SEARCH_SCOPE, MIN_CACHE_SIZE); - size = max(max_cache_size / SEARCH_SCOPE, MIN_CACHE_SIZE); - } else { - /* - * Since we have no estimation about the relevant - * search range - */ - max_size = DEFAULT_CACHE_SIZE * SEARCH_SCOPE; - size = MIN_CACHE_SIZE; - } - - if (!cpu_online(cpu1) || !cpu_online(cpu2)) { - printk("cpu %d and %d not both online!\n", cpu1, cpu2); - return 0; - } - - /* - * Allocate the working set: - */ - cache = vmalloc(max_size); - if (!cache) { - printk("could not vmalloc %d bytes for cache!\n", 2 * max_size); - return 1000000; /* return 1 msec on very small boxen */ - } - - while (size <= max_size) { - prev_cost = cost; - cost = measure_cost(cpu1, cpu2, cache, size); - - /* - * Update the max: - */ - if (cost > 0) { - if (max_cost < cost) { - max_cost = cost; - size_found = size; - } - } - /* - * Calculate average fluctuation, we use this to prevent - * noise from triggering an early break out of the loop: - */ - fluct = abs(cost - prev_cost); - avg_fluct = (avg_fluct + fluct)/2; - - if (migration_debug) - printk("-> [%d][%d][%7d] %3ld.%ld [%3ld.%ld] (%ld): " - "(%8Ld %8Ld)\n", - cpu1, cpu2, size, - (long)cost / 1000000, - ((long)cost / 100000) % 10, - (long)max_cost / 1000000, - ((long)max_cost / 100000) % 10, - domain_distance(cpu1, cpu2), - cost, avg_fluct); - - /* - * If we iterated at least 20% past the previous maximum, - * and the cost has dropped by more than 20% already, - * (taking fluctuations into account) then we assume to - * have found the maximum and break out of the loop early: - */ - if (size_found && (size*100 > size_found*SIZE_THRESH)) - if (cost+avg_fluct <= 0 || - max_cost*100 > (cost+avg_fluct)*COST_THRESH) { - - if (migration_debug) - printk("-> found max.\n"); - break; - } - /* - * Increase the cachesize in 10% steps: - */ - size = size * 10 / 9; - } - - if (migration_debug) - printk("[%d][%d] working set size found: %d, cost: %Ld\n", - cpu1, cpu2, size_found, max_cost); - - vfree(cache); - - /* - * A task is considered 'cache cold' if at least 2 times - * the worst-case cost of migration has passed. - * - * (this limit is only listened to if the load-balancing - * situation is 'nice' - if there is a large imbalance we - * ignore it for the sake of CPU utilization and - * processing fairness.) - */ - return 2 * max_cost * migration_factor / MIGRATION_FACTOR_SCALE; -} - -static void calibrate_migration_costs(const cpumask_t *cpu_map) -{ - int cpu1 = -1, cpu2 = -1, cpu, orig_cpu = raw_smp_processor_id(); - unsigned long j0, j1, distance, max_distance = 0; - struct sched_domain *sd; - - j0 = jiffies; - - /* - * First pass - calculate the cacheflush times: - */ - for_each_cpu_mask(cpu1, *cpu_map) { - for_each_cpu_mask(cpu2, *cpu_map) { - if (cpu1 == cpu2) - continue; - distance = domain_distance(cpu1, cpu2); - max_distance = max(max_distance, distance); - /* - * No result cached yet? - */ - if (migration_cost[distance] == -1LL) - migration_cost[distance] = - measure_migration_cost(cpu1, cpu2); - } - } - /* - * Second pass - update the sched domain hierarchy with - * the new cache-hot-time estimations: - */ - for_each_cpu_mask(cpu, *cpu_map) { - distance = 0; - for_each_domain(cpu, sd) { - sd->cache_hot_time = migration_cost[distance]; - distance++; - } - } - /* - * Print the matrix: - */ - if (migration_debug) - printk("migration: max_cache_size: %d, cpu: %d MHz:\n", - max_cache_size, -#ifdef CONFIG_X86 - cpu_khz/1000 -#else - -1 -#endif - ); - if (system_state == SYSTEM_BOOTING && num_online_cpus() > 1) { - printk("migration_cost="); - for (distance = 0; distance <= max_distance; distance++) { - if (distance) - printk(","); - printk("%ld", (long)migration_cost[distance] / 1000); - } - printk("\n"); - } - j1 = jiffies; - if (migration_debug) - printk("migration: %ld seconds\n", (j1-j0) / HZ); - - /* - * Move back to the original CPU. NUMA-Q gets confused - * if we migrate to another quad during bootup. - */ - if (raw_smp_processor_id() != orig_cpu) { - cpumask_t mask = cpumask_of_cpu(orig_cpu), - saved_mask = current->cpus_allowed; - - set_cpus_allowed(current, mask); - set_cpus_allowed(current, saved_mask); - } -} - #ifdef CONFIG_NUMA /** @@ -6165,7 +5869,7 @@ next_sg: continue; } - sg->cpu_power += sd->groups->cpu_power; + sg_inc_cpu_power(sg, sd->groups->__cpu_power); } sg = sg->next; if (sg != group_head) @@ -6240,6 +5944,8 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) child = sd->child; + sd->groups->__cpu_power = 0; + /* * For perf policy, if the groups in child domain share resources * (for example cores sharing some portions of the cache hierarchy @@ -6250,18 +5956,16 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) if (!child || (!(sd->flags & SD_POWERSAVINGS_BALANCE) && (child->flags & (SD_SHARE_CPUPOWER | SD_SHARE_PKG_RESOURCES)))) { - sd->groups->cpu_power = SCHED_LOAD_SCALE; + sg_inc_cpu_power(sd->groups, SCHED_LOAD_SCALE); return; } - sd->groups->cpu_power = 0; - /* * add cpu_power of each child group to this groups cpu_power */ group = child->groups; do { - sd->groups->cpu_power += group->cpu_power; + sg_inc_cpu_power(sd->groups, group->__cpu_power); group = group->next; } while (group != child->groups); } @@ -6421,7 +6125,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) sd = &per_cpu(node_domains, j); sd->groups = sg; } - sg->cpu_power = 0; + sg->__cpu_power = 0; sg->cpumask = nodemask; sg->next = sg; cpus_or(covered, covered, nodemask); @@ -6449,7 +6153,7 @@ static int build_sched_domains(const cpumask_t *cpu_map) "Can not alloc domain group for node %d\n", j); goto error; } - sg->cpu_power = 0; + sg->__cpu_power = 0; sg->cpumask = tmp; sg->next = prev->next; cpus_or(covered, covered, tmp); @@ -6502,10 +6206,6 @@ static int build_sched_domains(const cpumask_t *cpu_map) #endif cpu_attach_domain(sd, i); } - /* - * Tune cache-hot values: - */ - calibrate_migration_costs(cpu_map); return 0; @@ -6586,10 +6286,10 @@ int arch_reinit_sched_domains(void) { int err; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); detach_destroy_domains(&cpu_online_map); err = arch_init_sched_domains(&cpu_online_map); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); return err; } @@ -6668,14 +6368,20 @@ static int update_sched_domains(struct notifier_block *nfb, { switch (action) { case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: detach_destroy_domains(&cpu_online_map); return NOTIFY_OK; case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: + case CPU_ONLINE_FROZEN: case CPU_DEAD: + case CPU_DEAD_FROZEN: /* * Fall through and re-initialise the domains. */ @@ -6694,12 +6400,12 @@ void __init sched_init_smp(void) { cpumask_t non_isolated_cpus; - lock_cpu_hotplug(); + mutex_lock(&sched_hotcpu_mutex); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); - unlock_cpu_hotplug(); + mutex_unlock(&sched_hotcpu_mutex); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); @@ -6726,6 +6432,7 @@ int in_sched_functions(unsigned long addr) void __init sched_init(void) { int i, j, k; + int highest_cpu = 0; for_each_possible_cpu(i) { struct prio_array *array; @@ -6760,11 +6467,13 @@ void __init sched_init(void) // delimiter for bitsearch __set_bit(MAX_PRIO, array->bitmap); } + highest_cpu = i; } set_load_weight(&init_task); #ifdef CONFIG_SMP + nr_cpu_ids = highest_cpu + 1; open_softirq(SCHED_SOFTIRQ, run_rebalance_domains, NULL); #endif @@ -6816,12 +6525,13 @@ EXPORT_SYMBOL(__might_sleep); void normalize_rt_tasks(void) { struct prio_array *array; - struct task_struct *p; + struct task_struct *g, *p; unsigned long flags; struct rq *rq; read_lock_irq(&tasklist_lock); - for_each_process(p) { + + do_each_thread(g, p) { if (!rt_task(p)) continue; @@ -6839,7 +6549,8 @@ void normalize_rt_tasks(void) __task_rq_unlock(rq); spin_unlock_irqrestore(&p->pi_lock, flags); - } + } while_each_thread(g, p); + read_unlock_irq(&tasklist_lock); }