X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched_fair.c;h=e24ecd39c4b8aec9786d0ab3df0a01ad6dcba08d;hb=424de91dd6163808729d7082de55c319e1096bee;hp=b43748efaa7f2246c22df70fd976599bdddb3fa7;hpb=1d3504fcf5606579d60b649d19f44b3871c1ddae;p=linux-2.6 diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index b43748efaa..e24ecd39c4 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -333,6 +333,34 @@ int sched_nr_latency_handler(struct ctl_table *table, int write, } #endif +/* + * delta *= w / rw + */ +static inline unsigned long +calc_delta_weight(unsigned long delta, struct sched_entity *se) +{ + for_each_sched_entity(se) { + delta = calc_delta_mine(delta, + se->load.weight, &cfs_rq_of(se)->load); + } + + return delta; +} + +/* + * delta *= rw / w + */ +static inline unsigned long +calc_delta_fair(unsigned long delta, struct sched_entity *se) +{ + for_each_sched_entity(se) { + delta = calc_delta_mine(delta, + cfs_rq_of(se)->load.weight, &se->load); + } + + return delta; +} + /* * The idea is to set a period in which each task runs once. * @@ -362,29 +390,54 @@ static u64 __sched_period(unsigned long nr_running) */ static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) { - return calc_delta_mine(__sched_period(cfs_rq->nr_running), - se->load.weight, &cfs_rq->load); + return calc_delta_weight(__sched_period(cfs_rq->nr_running), se); } /* - * We calculate the vruntime slice. + * We calculate the vruntime slice of a to be inserted task * - * vs = s/w = p/rw + * vs = s*rw/w = p */ -static u64 __sched_vslice(unsigned long rq_weight, unsigned long nr_running) +static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) { - u64 vslice = __sched_period(nr_running); + unsigned long nr_running = cfs_rq->nr_running; - vslice *= NICE_0_LOAD; - do_div(vslice, rq_weight); + if (!se->on_rq) + nr_running++; - return vslice; + return __sched_period(nr_running); } -static u64 sched_vslice_add(struct cfs_rq *cfs_rq, struct sched_entity *se) +/* + * The goal of calc_delta_asym() is to be asymmetrically around NICE_0_LOAD, in + * that it favours >=0 over <0. + * + * -20 | + * | + * 0 --------+------- + * .' + * 19 .' + * + */ +static unsigned long +calc_delta_asym(unsigned long delta, struct sched_entity *se) { - return __sched_vslice(cfs_rq->load.weight + se->load.weight, - cfs_rq->nr_running + 1); + struct load_weight lw = { + .weight = NICE_0_LOAD, + .inv_weight = 1UL << (WMULT_SHIFT-NICE_0_SHIFT) + }; + + for_each_sched_entity(se) { + struct load_weight *se_lw = &se->load; + + if (se->load.weight < NICE_0_LOAD) + se_lw = &lw; + + delta = calc_delta_mine(delta, + cfs_rq_of(se)->load.weight, se_lw); + } + + return delta; } /* @@ -401,11 +454,7 @@ __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr, curr->sum_exec_runtime += delta_exec; schedstat_add(cfs_rq, exec_clock, delta_exec); - delta_exec_weighted = delta_exec; - if (unlikely(curr->load.weight != NICE_0_LOAD)) { - delta_exec_weighted = calc_delta_fair(delta_exec_weighted, - &curr->load); - } + delta_exec_weighted = calc_delta_fair(delta_exec, curr); curr->vruntime += delta_exec_weighted; } @@ -492,20 +541,43 @@ update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se) * Scheduling class queueing methods: */ +#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED +static void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ + cfs_rq->task_weight += weight; +} +#else +static inline void +add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight) +{ +} +#endif + static void account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_add(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + inc_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, se->load.weight); cfs_rq->nr_running++; se->on_rq = 1; + list_add(&se->group_node, &cfs_rq->tasks); } static void account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) { update_load_sub(&cfs_rq->load, se->load.weight); + if (!parent_entity(se)) + dec_cpu_load(rq_of(cfs_rq), se->load.weight); + if (entity_is_task(se)) + add_cfs_task_weight(cfs_rq, -se->load.weight); cfs_rq->nr_running--; se->on_rq = 0; + list_del_init(&se->group_node); } static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se) @@ -590,11 +662,15 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial) if (!initial) { /* sleeps upto a single latency don't count. */ if (sched_feat(NEW_FAIR_SLEEPERS)) { + unsigned long thresh = sysctl_sched_latency; + + /* + * convert the sleeper threshold into virtual time + */ if (sched_feat(NORMALIZED_SLEEPER)) - vruntime -= calc_delta_fair(sysctl_sched_latency, - &cfs_rq->load); - else - vruntime -= sysctl_sched_latency; + thresh = calc_delta_fair(thresh, se); + + vruntime -= thresh; } /* ensure we never gain time by being placed backwards. */ @@ -611,6 +687,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) * Update run-time statistics of the 'current'. */ update_curr(cfs_rq); + account_entity_enqueue(cfs_rq, se); if (wakeup) { place_entity(cfs_rq, se, 0); @@ -621,7 +698,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) check_spread(cfs_rq, se); if (se != cfs_rq->curr) __enqueue_entity(cfs_rq, se); - account_entity_enqueue(cfs_rq, se); } static void update_avg(u64 *avg, u64 sample) @@ -770,8 +846,10 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * queued ticks are scheduled to match the slice, so don't bother * validating it and just reschedule. */ - if (queued) - return resched_task(rq_of(cfs_rq)->curr); + if (queued) { + resched_task(rq_of(cfs_rq)->curr); + return; + } /* * don't let the period tick interfere with the hrtick preemption */ @@ -886,7 +964,7 @@ static void yield_task_fair(struct rq *rq) return; if (likely(!sysctl_sched_compat_yield) && curr->policy != SCHED_BATCH) { - __update_rq_clock(rq); + update_rq_clock(rq); /* * Update run-time statistics of the 'current'. */ @@ -936,7 +1014,7 @@ static int wake_idle(int cpu, struct task_struct *p) * sibling runqueue info. This will avoid the checks and cache miss * penalities associated with that. */ - if (idle_cpu(cpu) || cpu_rq(cpu)->nr_running > 1) + if (idle_cpu(cpu) || cpu_rq(cpu)->cfs.nr_running > 1) return cpu; for_each_domain(cpu, sd) { @@ -1091,11 +1169,10 @@ static unsigned long wakeup_gran(struct sched_entity *se) unsigned long gran = sysctl_sched_wakeup_granularity; /* - * More easily preempt - nice tasks, while not making - * it harder for + nice tasks. + * More easily preempt - nice tasks, while not making it harder for + * + nice tasks. */ - if (unlikely(se->load.weight > NICE_0_LOAD)) - gran = calc_delta_fair(gran, &se->load); + gran = calc_delta_asym(sysctl_sched_wakeup_granularity, se); return gran; } @@ -1250,21 +1327,24 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) * the current task: */ static struct task_struct * -__load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) +__load_balance_iterator(struct cfs_rq *cfs_rq, struct list_head *next) { struct task_struct *p = NULL; struct sched_entity *se; - if (!curr) + if (next == &cfs_rq->tasks) return NULL; /* Skip over entities that are not tasks */ do { - se = rb_entry(curr, struct sched_entity, run_node); - curr = rb_next(curr); - } while (curr && !entity_is_task(se)); + se = list_entry(next, struct sched_entity, group_node); + next = next->next; + } while (next != &cfs_rq->tasks && !entity_is_task(se)); + + if (next == &cfs_rq->tasks) + return NULL; - cfs_rq->rb_load_balance_curr = curr; + cfs_rq->balance_iterator = next; if (entity_is_task(se)) p = task_of(se); @@ -1276,85 +1356,100 @@ static struct task_struct *load_balance_start_fair(void *arg) { struct cfs_rq *cfs_rq = arg; - return __load_balance_iterator(cfs_rq, first_fair(cfs_rq)); + return __load_balance_iterator(cfs_rq, cfs_rq->tasks.next); } static struct task_struct *load_balance_next_fair(void *arg) { struct cfs_rq *cfs_rq = arg; - return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr); + return __load_balance_iterator(cfs_rq, cfs_rq->balance_iterator); } -#ifdef CONFIG_FAIR_GROUP_SCHED -static int cfs_rq_best_prio(struct cfs_rq *cfs_rq) +static unsigned long +__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, struct sched_domain *sd, + enum cpu_idle_type idle, int *all_pinned, int *this_best_prio, + struct cfs_rq *cfs_rq) { - struct sched_entity *curr; - struct task_struct *p; - - if (!cfs_rq->nr_running || !first_fair(cfs_rq)) - return MAX_PRIO; - - curr = cfs_rq->curr; - if (!curr) - curr = __pick_next_entity(cfs_rq); + struct rq_iterator cfs_rq_iterator; - p = task_of(curr); + cfs_rq_iterator.start = load_balance_start_fair; + cfs_rq_iterator.next = load_balance_next_fair; + cfs_rq_iterator.arg = cfs_rq; - return p->prio; + return balance_tasks(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &cfs_rq_iterator); } -#endif +#ifdef CONFIG_FAIR_GROUP_SCHED static unsigned long load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, unsigned long max_load_move, struct sched_domain *sd, enum cpu_idle_type idle, int *all_pinned, int *this_best_prio) { - struct cfs_rq *busy_cfs_rq; long rem_load_move = max_load_move; - struct rq_iterator cfs_rq_iterator; - - cfs_rq_iterator.start = load_balance_start_fair; - cfs_rq_iterator.next = load_balance_next_fair; + int busiest_cpu = cpu_of(busiest); + struct task_group *tg; - for_each_leaf_cfs_rq(busiest, busy_cfs_rq) { -#ifdef CONFIG_FAIR_GROUP_SCHED - struct cfs_rq *this_cfs_rq; + rcu_read_lock(); + list_for_each_entry(tg, &task_groups, list) { long imbalance; - unsigned long maxload; + unsigned long this_weight, busiest_weight; + long rem_load, max_load, moved_load; - this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu); + /* + * empty group + */ + if (!aggregate(tg, sd)->task_weight) + continue; + + rem_load = rem_load_move * aggregate(tg, sd)->rq_weight; + rem_load /= aggregate(tg, sd)->load + 1; + + this_weight = tg->cfs_rq[this_cpu]->task_weight; + busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight; + + imbalance = (busiest_weight - this_weight) / 2; + + if (imbalance < 0) + imbalance = busiest_weight; + + max_load = max(rem_load, imbalance); + moved_load = __load_balance_fair(this_rq, this_cpu, busiest, + max_load, sd, idle, all_pinned, this_best_prio, + tg->cfs_rq[busiest_cpu]); - imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight; - /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */ - if (imbalance <= 0) + if (!moved_load) continue; - /* Don't pull more than imbalance/2 */ - imbalance /= 2; - maxload = min(rem_load_move, imbalance); + move_group_shares(tg, sd, busiest_cpu, this_cpu); - *this_best_prio = cfs_rq_best_prio(this_cfs_rq); -#else -# define maxload rem_load_move -#endif - /* - * pass busy_cfs_rq argument into - * load_balance_[start|next]_fair iterators - */ - cfs_rq_iterator.arg = busy_cfs_rq; - rem_load_move -= balance_tasks(this_rq, this_cpu, busiest, - maxload, sd, idle, all_pinned, - this_best_prio, - &cfs_rq_iterator); + moved_load *= aggregate(tg, sd)->load; + moved_load /= aggregate(tg, sd)->rq_weight + 1; - if (rem_load_move <= 0) + rem_load_move -= moved_load; + if (rem_load_move < 0) break; } + rcu_read_unlock(); return max_load_move - rem_load_move; } +#else +static unsigned long +load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, + unsigned long max_load_move, + struct sched_domain *sd, enum cpu_idle_type idle, + int *all_pinned, int *this_best_prio) +{ + return __load_balance_fair(this_rq, this_cpu, busiest, + max_load_move, sd, idle, all_pinned, + this_best_prio, &busiest->cfs); +} +#endif static int move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,