X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched_fair.c;h=86a93376282c2e43773dba5d6a11762e111a5eb6;hb=320787c75ccac3189a1b7aae81f0efc1055f6d3a;hp=d3c03070872d9f335d3997e6e542ceb5006f2951;hpb=d06fc1d9b5518582f4982f908e820f2303d1eece;p=linux-2.6 diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index d3c0307087..86a9337628 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -20,9 +20,11 @@ * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra */ +#include + /* * Targeted preemption latency for CPU-bound tasks: - * (default: 20ms * ilog(ncpus), units: nanoseconds) + * (default: 20ms * (1 + ilog(ncpus)), units: nanoseconds) * * NOTE: this latency value is not the same as the concept of * 'timeslice length' - timeslices in CFS are of variable length @@ -36,14 +38,14 @@ unsigned int sysctl_sched_latency = 20000000ULL; /* * Minimal preemption granularity for CPU-bound tasks: - * (default: 1 msec * ilog(ncpus), units: nanoseconds) + * (default: 4 msec * (1 + ilog(ncpus)), units: nanoseconds) */ -unsigned int sysctl_sched_min_granularity = 1000000ULL; +unsigned int sysctl_sched_min_granularity = 4000000ULL; /* * is kept at sysctl_sched_latency / sysctl_sched_min_granularity */ -unsigned int sched_nr_latency = 20; +static unsigned int sched_nr_latency = 5; /* * After fork, child runs first. (default) If set to 0 then @@ -61,7 +63,7 @@ unsigned int __read_mostly sysctl_sched_compat_yield; /* * SCHED_BATCH wake-up granularity. - * (default: 10 msec * ilog(ncpus), units: nanoseconds) + * (default: 10 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still @@ -71,13 +73,13 @@ unsigned int sysctl_sched_batch_wakeup_granularity = 10000000UL; /* * SCHED_OTHER wake-up granularity. - * (default: 10 msec * ilog(ncpus), units: nanoseconds) + * (default: 5 msec * (1 + ilog(ncpus)), units: nanoseconds) * * This option delays the preemption effects of decoupled workloads * and reduces their over-scheduling. Synchronous workloads will still * have immediate wakeup/sleep latencies. */ -unsigned int sysctl_sched_wakeup_granularity = 10000000UL; +unsigned int sysctl_sched_wakeup_granularity = 5000000UL; const_debug unsigned int sysctl_sched_migration_cost = 500000UL; @@ -173,8 +175,15 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) * Maintain a cache of leftmost tree entries (it is frequently * used): */ - if (leftmost) + if (leftmost) { cfs_rq->rb_leftmost = &se->run_node; + /* + * maintain cfs_rq->min_vruntime to be a monotonic increasing + * value tracking the leftmost vruntime in the tree. 
+ */ + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, se->vruntime); + } rb_link_node(&se->run_node, parent, link); rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline); @@ -182,8 +191,24 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) { - if (cfs_rq->rb_leftmost == &se->run_node) - cfs_rq->rb_leftmost = rb_next(&se->run_node); + if (cfs_rq->rb_leftmost == &se->run_node) { + struct rb_node *next_node; + struct sched_entity *next; + + next_node = rb_next(&se->run_node); + cfs_rq->rb_leftmost = next_node; + + if (next_node) { + next = rb_entry(next_node, + struct sched_entity, run_node); + cfs_rq->min_vruntime = + max_vruntime(cfs_rq->min_vruntime, + next->vruntime); + } + } + + if (cfs_rq->next == se) + cfs_rq->next = NULL; rb_erase(&se->run_node, &cfs_rq->tasks_timeline); } @@ -200,17 +225,12 @@ static struct sched_entity *__pick_next_entity(struct cfs_rq *cfs_rq) static inline struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) { - struct rb_node **link = &cfs_rq->tasks_timeline.rb_node; - struct sched_entity *se = NULL; - struct rb_node *parent; + struct rb_node *last = rb_last(&cfs_rq->tasks_timeline); - while (*link) { - parent = *link; - se = rb_entry(parent, struct sched_entity, run_node); - link = &parent->rb_right; - } + if (!last) + return NULL; - return se; + return rb_entry(last, struct sched_entity, run_node); } /************************************************************** @@ -248,8 +268,8 @@ static u64 __sched_period(unsigned long nr_running) unsigned long nr_latency = sched_nr_latency; if (unlikely(nr_running > nr_latency)) { + period = sysctl_sched_min_granularity; period *= nr_running; - do_div(period, nr_latency); } return period; @@ -263,12 +283,8 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 slice = __sched_period(cfs_rq->nr_running); - - slice *= se->load.weight; - do_div(slice, cfs_rq->load.weight); - - return slice; + return calc_delta_mine(__sched_period(cfs_rq->nr_running), + se->load.weight, &cfs_rq->load); } /* @@ -286,11 +302,6 @@ static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) return vslice; } -static u64 sched_vslice(struct cfs_rq *cfs_rq) -{ - return __sched_vslice(cfs_rq->load.weight, cfs_rq->nr_running); -} - static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { return __sched_vslice(cfs_rq->load.weight + se->load.weight, @@ -306,7 +317,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, unsigned long delta_exec) { unsigned long delta_exec_weighted; - u64 vruntime; schedstat_set(curr->exec_max, max((u64)delta_exec, curr->exec_max)); @@ -318,19 +328,6 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, &curr->load); } curr->vruntime += delta_exec_weighted; - - /* - * maintain cfs_rq->min_vruntime to be a monotonic increasing - * value tracking the leftmost vruntime in the tree. 
- */ - if (first_fair(cfs_rq)) { - vruntime = min_vruntime(curr->vruntime, - __pick_next_entity(cfs_rq)->vruntime); - } else - vruntime = curr->vruntime; - - cfs_rq->min_vruntime = - max_vruntime(cfs_rq->min_vruntime, vruntime); } static void update_curr(struct cfs_rq *cfs_rq) @@ -351,6 +348,12 @@ static void update_curr(struct cfs_rq *cfs_rq) __update_curr(cfs_rq, curr, delta_exec); curr->exec_start = now; + + if (entity_is_task(curr)) { + struct task_struct *curtask = task_of(curr); + + cpuacct_charge(curtask, delta_exec); + } } static inline void @@ -377,6 +380,9 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) { schedstat_set(se->wait_max, max(se->wait_max, rq_of(cfs_rq)->clock - se->wait_start)); + schedstat_set(se->wait_count, se->wait_count + 1); + schedstat_set(se->wait_sum, se->wait_sum + + rq_of(cfs_rq)->clock - se->wait_start); schedstat_set(se->wait_start, 0); } @@ -428,6 +434,7 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) #ifdef CONFIG_SCHEDSTATS if (se->sleep_start) { u64 delta = rq_of(cfs_rq)->clock - se->sleep_start; + struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -437,9 +444,12 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) se->sleep_start = 0; se->sum_sleep_runtime += delta; + + account_scheduler_latency(tsk, delta >> 10, 1); } if (se->block_start) { u64 delta = rq_of(cfs_rq)->clock - se->block_start; + struct task_struct *tsk = task_of(se); if ((s64)delta < 0) delta = 0; @@ -456,11 +466,11 @@ static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) * time that the task spent sleeping: */ if (unlikely(prof_on == SLEEP_PROFILING)) { - struct task_struct *tsk = task_of(se); profile_hits(SLEEP_PROFILING, (void *)get_wchan(tsk), delta >> 20); } + account_scheduler_latency(tsk, delta >> 10, 0); } #endif } @@ -483,16 +493,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) { u64 vruntime; - vruntime = cfs_rq->min_vruntime; - - if (sched_feat(TREE_AVG)) { - struct sched_entity *last = __pick_last_entity(cfs_rq); - if (last) { - vruntime += last->vruntime; - vruntime >>= 1; - } - } else if (sched_feat(APPROX_AVG) && cfs_rq->nr_running) - vruntime += sched_vslice(cfs_rq)/2; + if (first_fair(cfs_rq)) { + vruntime = min_vruntime(cfs_rq->min_vruntime, + __pick_next_entity(cfs_rq)->vruntime); + } else + vruntime = cfs_rq->min_vruntime; /* * The 'current' period is already promised to the current tasks, @@ -505,9 +510,10 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ - if (sched_feat(NEW_FAIR_SLEEPERS) && entity_is_task(se) && - task_of(se)->policy != SCHED_BATCH) - vruntime -= sysctl_sched_latency; + if (sched_feat(NEW_FAIR_SLEEPERS)) { + vruntime -= calc_delta_fair(sysctl_sched_latency, + &cfs_rq->load); + } /* ensure we never gain time by being placed backwards. 
*/ vruntime = max_vruntime(se->vruntime, vruntime); @@ -536,6 +542,21 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) account_entity_enqueue(cfs_rq, se); } +static void update_avg(u64 *avg, u64 sample) +{ + s64 diff = sample - *avg; + *avg += diff >> 3; +} + +static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + if (!se->last_wakeup) + return; + + update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); + se->last_wakeup = 0; +} + static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -546,6 +567,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { + update_avg_stats(cfs_rq, se); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -607,12 +629,32 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) se->prev_sum_exec_runtime = se->sum_exec_runtime; } +static struct sched_entity * +pick_next(struct cfs_rq *cfs_rq, struct sched_entity *se) +{ + s64 diff, gran; + + if (!cfs_rq->next) + return se; + + diff = cfs_rq->next->vruntime - se->vruntime; + if (diff < 0) + return se; + + gran = calc_delta_fair(sysctl_sched_wakeup_granularity, &cfs_rq->load); + if (diff > gran) + return se; + + return cfs_rq->next; +} + static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq) { struct sched_entity *se = NULL; if (first_fair(cfs_rq)) { se = __pick_next_entity(cfs_rq); + se = pick_next(cfs_rq, se); set_next_entity(cfs_rq, se); } @@ -637,13 +679,29 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) cfs_rq->curr = NULL; } -static void entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) +static void +entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) { /* * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); +#ifdef CONFIG_SCHED_HRTICK + /* + * queued ticks are scheduled to match the slice, so don't bother + * validating it and just reschedule. + */ + if (queued) + return resched_task(rq_of(cfs_rq)->curr); + /* + * don't let the period tick interfere with the hrtick preemption + */ + if (!sched_feat(DOUBLE_TICK) && + hrtimer_active(&rq_of(cfs_rq)->hrtick_timer)) + return; +#endif + if (cfs_rq->nr_running > 1 || !sched_feat(WAKEUP_PREEMPT)) check_preempt_tick(cfs_rq, curr); } @@ -685,7 +743,7 @@ static inline struct cfs_rq *cpu_cfs_rq(struct cfs_rq *cfs_rq, int this_cpu) /* Iterate thr' all leaf cfs_rq's on a runqueue */ #define for_each_leaf_cfs_rq(rq, cfs_rq) \ - list_for_each_entry(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) + list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list) /* Do the two (enqueued) entities belong to the same group ? 
*/ static inline int @@ -747,6 +805,43 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se) #endif /* CONFIG_FAIR_GROUP_SCHED */ +#ifdef CONFIG_SCHED_HRTICK +static void hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ + int requeue = rq->curr == p; + struct sched_entity *se = &p->se; + struct cfs_rq *cfs_rq = cfs_rq_of(se); + + WARN_ON(task_rq(p) != rq); + + if (hrtick_enabled(rq) && cfs_rq->nr_running > 1) { + u64 slice = sched_slice(cfs_rq, se); + u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime; + s64 delta = slice - ran; + + if (delta < 0) { + if (rq->curr == p) + resched_task(p); + return; + } + + /* + * Don't schedule slices shorter than 10000ns, that just + * doesn't make sense. Rely on vruntime for fairness. + */ + if (!requeue) + delta = max(10000LL, delta); + + hrtick_start(rq, delta, requeue); + } +} +#else +static inline void +hrtick_start_fair(struct rq *rq, struct task_struct *p) +{ +} +#endif + /* * The enqueue_task method is called before nr_running is * increased. Here we update the fair scheduling stats and @@ -764,6 +859,8 @@ static void enqueue_task_fair(struct rq *rq, struct task_struct *p, int wakeup) enqueue_entity(cfs_rq, se, wakeup); wakeup = 1; } + + hrtick_start_fair(rq, rq->curr); } /* @@ -784,6 +881,8 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) break; sleep = 1; } + + hrtick_start_fair(rq, rq->curr); } /* @@ -793,8 +892,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int sleep) */ static void yield_task_fair(struct rq *rq) { - struct cfs_rq *cfs_rq = task_cfs_rq(rq->curr); - struct sched_entity *rightmost, *se = &rq->curr->se; + struct task_struct *curr = rq->curr; + struct cfs_rq *cfs_rq = task_cfs_rq(curr); + struct sched_entity *rightmost, *se = &curr->se; /* * Are we the only task in the tree? @@ -802,7 +902,7 @@ static void yield_task_fair(struct rq *rq) if (unlikely(cfs_rq->nr_running == 1)) return; - if (likely(!sysctl_sched_compat_yield)) { + if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { __update_rq_clock(rq); /* * Update run-time statistics of the 'current'. @@ -829,6 +929,179 @@ static void yield_task_fair(struct rq *rq) se->vruntime = rightmost->vruntime + 1; } +/* + * wake_idle() will wake a task on an idle cpu if task->cpu is + * not idle and an idle cpu is available. The span of cpus to + * search starts with cpus closest then further out as needed, + * so we always favor a closer, idle cpu. + * + * Returns the CPU we should wake onto. + */ +#if defined(ARCH_HAS_SCHED_WAKE_IDLE) +static int wake_idle(int cpu, struct task_struct *p) +{ + cpumask_t tmp; + struct sched_domain *sd; + int i; + + /* + * If it is idle, then it is the best cpu to run this task. + * + * This cpu is also the best, if it has more than one task already. + * Siblings must be also busy(in most cases) as they didn't already + * pickup the extra load from this cpu and hence we need not check + * sibling runqueue info. This will avoid the checks and cache miss + * penalities associated with that. 
+ */ + if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) + return cpu; + + for_each_domain(cpu, sd) { + if (sd->flags & SD_WAKE_IDLE) { + cpus_and(tmp, sd->span, p->cpus_allowed); + for_each_cpu_mask(i, tmp) { + if (idle_cpu(i)) { + if (i != task_cpu(p)) { + schedstat_inc(p, + se.nr_wakeups_idle); + } + return i; + } + } + } else { + break; + } + } + return cpu; +} +#else +static inline int wake_idle(int cpu, struct task_struct *p) +{ + return cpu; +} +#endif + +#ifdef CONFIG_SMP + +static const struct sched_class fair_sched_class; + +static int +wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, + struct task_struct *p, int prev_cpu, int this_cpu, int sync, + int idx, unsigned long load, unsigned long this_load, + unsigned int imbalance) +{ + struct task_struct *curr = this_rq->curr; + unsigned long tl = this_load; + unsigned long tl_per_task; + + if (!(this_sd->flags & SD_WAKE_AFFINE)) + return 0; + + /* + * If the currently running task will sleep within + * a reasonable amount of time then attract this newly + * woken task: + */ + if (sync && curr->sched_class == &fair_sched_class) { + if (curr->se.avg_overlap < sysctl_sched_migration_cost && + p->se.avg_overlap < sysctl_sched_migration_cost) + return 1; + } + + schedstat_inc(p, se.nr_wakeups_affine_attempts); + tl_per_task = cpu_avg_load_per_task(this_cpu); + + /* + * If sync wakeup then subtract the (maximum possible) + * effect of the currently running task from the load + * of the current CPU: + */ + if (sync) + tl -= current->se.load.weight; + + if ((tl <= load && tl + target_load(prev_cpu, idx) <= tl_per_task) || + 100*(tl + p->se.load.weight) <= imbalance*load) { + /* + * This domain has SD_WAKE_AFFINE and + * p is cache cold in this domain, and + * there is no bad imbalance. + */ + schedstat_inc(this_sd, ttwu_move_affine); + schedstat_inc(p, se.nr_wakeups_affine); + + return 1; + } + return 0; +} + +static int select_task_rq_fair(struct task_struct *p, int sync) +{ + struct sched_domain *sd, *this_sd = NULL; + int prev_cpu, this_cpu, new_cpu; + unsigned long load, this_load; + struct rq *rq, *this_rq; + unsigned int imbalance; + int idx; + + prev_cpu = task_cpu(p); + rq = task_rq(p); + this_cpu = smp_processor_id(); + this_rq = cpu_rq(this_cpu); + new_cpu = prev_cpu; + + /* + * 'this_sd' is the first domain that both + * this_cpu and prev_cpu are present in: + */ + for_each_domain(this_cpu, sd) { + if (cpu_isset(prev_cpu, sd->span)) { + this_sd = sd; + break; + } + } + + if (unlikely(!cpu_isset(this_cpu, p->cpus_allowed))) + goto out; + + /* + * Check for affine wakeup and passive balancing possibilities. + */ + if (!this_sd) + goto out; + + idx = this_sd->wake_idx; + + imbalance = 100 + (this_sd->imbalance_pct - 100) / 2; + + load = source_load(prev_cpu, idx); + this_load = target_load(this_cpu, idx); + + if (wake_affine(rq, this_sd, this_rq, p, prev_cpu, this_cpu, sync, idx, + load, this_load, imbalance)) + return this_cpu; + + if (prev_cpu == this_cpu) + goto out; + + /* + * Start passive balancing when half the imbalance_pct + * limit is reached. 
+ */ + if (this_sd->flags & SD_WAKE_BALANCE) { + if (imbalance*this_load <= 100*load) { + schedstat_inc(this_sd, ttwu_move_balance); + schedstat_inc(p, se.nr_wakeups_passive); + return this_cpu; + } + } + +out: + return wake_idle(new_cpu, p); +} +#endif /* CONFIG_SMP */ + + /* * Preempt the current task with a newly woken task if needed: */ @@ -845,6 +1118,13 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) resched_task(curr); return; } + + se->last_wakeup = se->sum_exec_runtime; + if (unlikely(se == pse)) + return; + + cfs_rq_of(pse)->next = pse; + /* * Batch tasks do not preempt (their preemption is driven by * the tick): @@ -861,7 +1141,11 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) } gran = sysctl_sched_wakeup_granularity; - if (unlikely(se->load.weight != NICE_0_LOAD)) + /* + * More easily preempt - nice tasks, while not making + * it harder for + nice tasks. + */ + if (unlikely(se->load.weight > NICE_0_LOAD)) gran = calc_delta_fair(gran, &se->load); if (pse->vruntime + gran < se->vruntime) @@ -870,6 +1154,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) static struct task_struct *pick_next_task_fair(struct rq *rq) { + struct task_struct *p; struct cfs_rq *cfs_rq = &rq->cfs; struct sched_entity *se; @@ -881,7 +1166,10 @@ static struct task_struct *pick_next_task_fair(struct rq *rq) cfs_rq = group_cfs_rq(se); } while (cfs_rq); - return task_of(se); + p = task_of(se); + hrtick_start_fair(rq, p); + + return p; } /* @@ -944,7 +1232,7 @@ static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) struct sched_entity *curr; struct task_struct *p; - if (!cfs_rq->nr_running) + if (!cfs_rq->nr_running || !first_fair(cfs_rq)) return MAX_PRIO; curr = cfs_rq->curr; @@ -1036,14 +1324,14 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, /* * scheduler tick hitting a task of our scheduling class: */ -static void task_tick_fair(struct rq *rq, struct task_struct *curr) +static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) { struct cfs_rq *cfs_rq; struct sched_entity *se = &curr->se; for_each_sched_entity(se) { cfs_rq = cfs_rq_of(se); - entity_tick(cfs_rq, se); + entity_tick(cfs_rq, se, queued); } } @@ -1081,6 +1369,42 @@ static void task_new_fair(struct rq *rq, struct task_struct *p) resched_task(rq->curr); } +/* + * Priority of the task has changed. Check to see if we preempt + * the current task. + */ +static void prio_changed_fair(struct rq *rq, struct task_struct *p, + int oldprio, int running) +{ + /* + * Reschedule if we are currently running on this runqueue and + * our priority decreased, or if we are not currently running on + * this runqueue and our priority is higher than the current's + */ + if (running) { + if (p->prio > oldprio) + resched_task(rq->curr); + } else + check_preempt_curr(rq, p); +} + +/* + * We switched to the sched_fair class. + */ +static void switched_to_fair(struct rq *rq, struct task_struct *p, + int running) +{ + /* + * We were most likely switched from sched_rt, so + * kick off the schedule if running, otherwise just see + * if we can still preempt the current task. + */ + if (running) + resched_task(rq->curr); + else + check_preempt_curr(rq, p); +} + /* Account for a task changing its policy or group. 
* * This routine is mostly called to set cfs_rq->curr field when a task @@ -1094,6 +1418,16 @@ static void set_curr_task_fair(struct rq *rq) set_next_entity(cfs_rq_of(se), se); } +#ifdef CONFIG_FAIR_GROUP_SCHED +static void moved_group_fair(struct task_struct *p) +{ + struct cfs_rq *cfs_rq = task_cfs_rq(p); + + update_curr(cfs_rq); + place_entity(cfs_rq, &p->se, 1); +} +#endif + /* * All the scheduling class methods: */ @@ -1102,6 +1436,9 @@ static const struct sched_class fair_sched_class = { .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, +#ifdef CONFIG_SMP + .select_task_rq = select_task_rq_fair, +#endif /* CONFIG_SMP */ .check_preempt_curr = check_preempt_wakeup, @@ -1116,6 +1453,13 @@ static const struct sched_class fair_sched_class = { .set_curr_task = set_curr_task_fair, .task_tick = task_tick_fair, .task_new = task_new_fair, + + .prio_changed = prio_changed_fair, + .switched_to = switched_to_fair, + +#ifdef CONFIG_FAIR_GROUP_SCHED + .moved_group = moved_group_fair, +#endif }; #ifdef CONFIG_SCHED_DEBUG @@ -1126,7 +1470,9 @@ static void print_cfs_stats(struct seq_file *m, int cpu) #ifdef CONFIG_FAIR_GROUP_SCHED print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); #endif + rcu_read_lock(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); + rcu_read_unlock(); } #endif
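
Note for reviewers: the arithmetic behind three of the behavioural changes in this patch can be modelled outside the kernel. The sketch below is illustrative only and makes simplifying assumptions (plain 64-bit integers instead of struct load_weight, no calc_delta_mine()/calc_delta_fair() weight scaling, constants taken from the post-patch defaults); it is not kernel code and the helper names are made up for the model. It shows how __sched_period() now stretches the period to min_granularity * nr_running once nr_running exceeds sched_nr_latency, how update_avg() keeps a 1/8-weight running average of wakeup overlap via diff >> 3, and how pick_next() prefers the cfs_rq->next wakeup buddy only while it trails the leftmost entity by no more than the wakeup granularity.

/*
 * Userspace model of three helpers touched by this patch. Illustrative
 * only: the real kernel code scales by struct load_weight fixed-point
 * factors (calc_delta_mine()/calc_delta_fair()) and operates on
 * sched_entity state; plain integers and the post-patch default
 * constants are used here instead.
 */
#include <stdio.h>
#include <stdint.h>

#define SYSCTL_SCHED_LATENCY            20000000ULL	/* 20 ms */
#define SYSCTL_SCHED_MIN_GRANULARITY     4000000ULL	/*  4 ms */
#define SYSCTL_SCHED_WAKEUP_GRANULARITY  5000000ULL	/*  5 ms */
#define SCHED_NR_LATENCY \
	(SYSCTL_SCHED_LATENCY / SYSCTL_SCHED_MIN_GRANULARITY)	/* = 5 */

/* __sched_period(): stretch the period instead of shrinking the slice. */
static uint64_t sched_period(unsigned long nr_running)
{
	if (nr_running > SCHED_NR_LATENCY)
		return SYSCTL_SCHED_MIN_GRANULARITY * nr_running;
	return SYSCTL_SCHED_LATENCY;
}

/* update_avg(): 1/8-weight exponential average, as used for avg_overlap. */
static void update_avg(uint64_t *avg, uint64_t sample)
{
	int64_t diff = (int64_t)(sample - *avg);

	*avg += diff >> 3;
}

/*
 * pick_next() buddy check: run the wakeup buddy ("next") instead of the
 * leftmost entity only while the buddy's vruntime exceeds the leftmost
 * vruntime by no more than the wakeup granularity (unscaled here).
 */
static int run_buddy(int64_t leftmost_vruntime, int64_t buddy_vruntime)
{
	int64_t diff = buddy_vruntime - leftmost_vruntime;

	return diff >= 0 && diff <= (int64_t)SYSCTL_SCHED_WAKEUP_GRANULARITY;
}

int main(void)
{
	uint64_t avg = 0;
	unsigned long n;
	int i;

	for (n = 4; n <= 8; n++)
		printf("nr_running=%lu period=%llu ns\n",
		       n, (unsigned long long)sched_period(n));

	/* Feed constant 1 ms overlap samples; the average converges on 1 ms. */
	for (i = 0; i < 4; i++) {
		update_avg(&avg, 1000000ULL);
		printf("avg_overlap after sample %d: %llu ns\n",
		       i + 1, (unsigned long long)avg);
	}

	printf("buddy vruntime +3 ms vs leftmost -> run buddy? %d\n",
	       run_buddy(100000000LL, 103000000LL));
	printf("buddy vruntime +8 ms vs leftmost -> run buddy? %d\n",
	       run_buddy(100000000LL, 108000000LL));
	return 0;
}

Under these simplified assumptions, nr_running=4 or 5 still gets the full 20 ms period while nr_running=6 gets 24 ms, the overlap average climbs toward 1 ms (roughly 0.41 ms after four samples), and only the first buddy example, which is within the 5 ms wakeup granularity, would be picked ahead of the leftmost entity. In the kernel the granularity is additionally scaled by the runqueue load, so a busier cfs_rq tolerates a smaller vruntime lead before falling back to the leftmost entity.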