X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched_fair.c;h=cf2cd6ce4cb25ad2bedc59b94205b33b24f8a9e9;hb=82736f4d1d2b7063b829cc93171a6e5aea8a9c49;hp=7b8d664d6f221c512a5f14ab35ae22deee735b91;hpb=c8cba857b4997d5b00451d01474638f6a153f713;p=linux-2.6 diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 7b8d664d6f..cf2cd6ce4c 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -726,21 +726,6 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup) __enqueue_entity(cfs_rq, se); } -static void update_avg(u64 *avg, u64 sample) -{ - s64 diff = sample - *avg; - *avg += diff >> 3; -} - -static void update_avg_stats(struct cfs_rq *cfs_rq, struct sched_entity *se) -{ - if (!se->last_wakeup) - return; - - update_avg(&se->avg_overlap, se->sum_exec_runtime - se->last_wakeup); - se->last_wakeup = 0; -} - static void dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) { @@ -751,7 +736,6 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep) update_stats_dequeue(cfs_rq, se); if (sleep) { - update_avg_stats(cfs_rq, se); #ifdef CONFIG_SCHEDSTATS if (entity_is_task(se)) { struct task_struct *tsk = task_of(se); @@ -894,7 +878,6 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) #ifdef CONFIG_SCHED_HRTICK static void hrtick_start_fair(struct rq *rq, struct task_struct *p) { - int requeue = rq->curr == p; struct sched_entity *se = &p->se; struct cfs_rq *cfs_rq = cfs_rq_of(se); @@ -915,13 +898,13 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) * Don't schedule slices shorter than 10000ns, that just * doesn't make sense. Rely on vruntime for fairness. */ - if (!requeue) + if (rq->curr != p) delta = max(10000LL, delta); - hrtick_start(rq, delta, requeue); + hrtick_start(rq, delta); } } -#else +#else /* !CONFIG_SCHED_HRTICK */ static inline void hrtick_start_fair(struct rq *rq, struct task_struct *p) { @@ -1020,6 +1003,8 @@ static void yield_task_fair(struct rq *rq) * not idle and an idle cpu is available. The span of cpus to * search starts with cpus closest then further out as needed, * so we always favor a closer, idle cpu. + * Domains may include CPUs that are not usable for migration, + * hence we need to mask them out (cpu_active_map) * * Returns the CPU we should wake onto. */ @@ -1047,7 +1032,8 @@ static int wake_idle(int cpu, struct task_struct *p) || ((sd->flags & SD_WAKE_IDLE_FAR) && !task_hot(p, task_rq(p)->clock, sd))) { cpus_and(tmp, sd->span, p->cpus_allowed); - for_each_cpu_mask(i, tmp) { + cpus_and(tmp, tmp, cpu_active_map); + for_each_cpu_mask_nr(i, tmp) { if (idle_cpu(i)) { if (i != task_cpu(p)) { schedstat_inc(p, @@ -1062,7 +1048,7 @@ static int wake_idle(int cpu, struct task_struct *p) } return cpu; } -#else +#else /* !ARCH_HAS_SCHED_WAKE_IDLE*/ static inline int wake_idle(int cpu, struct task_struct *p) { return cpu; @@ -1073,6 +1059,89 @@ static inline int wake_idle(int cpu, struct task_struct *p) static const struct sched_class fair_sched_class; +#ifdef CONFIG_FAIR_GROUP_SCHED +/* + * effective_load() calculates the load change as seen from the root_task_group + * + * Adding load to a group doesn't make a group heavier, but can cause movement + * of group shares between cpus. Assuming the shares were perfectly aligned one + * can calculate the shift in shares. + * + * The problem is that perfectly aligning the shares is rather expensive, hence + * we try to avoid doing that too often - see update_shares(), which ratelimits + * this change. + * + * We compensate this by not only taking the current delta into account, but + * also considering the delta between when the shares were last adjusted and + * now. + * + * We still saw a performance dip, some tracing learned us that between + * cgroup:/ and cgroup:/foo balancing the number of affine wakeups increased + * significantly. Therefore try to bias the error in direction of failing + * the affine wakeup. + * + */ +static long effective_load(struct task_group *tg, int cpu, + long wl, long wg) +{ + struct sched_entity *se = tg->se[cpu]; + long more_w; + + if (!tg->parent) + return wl; + + /* + * By not taking the decrease of shares on the other cpu into + * account our error leans towards reducing the affine wakeups. + */ + if (!wl && sched_feat(ASYM_EFF_LOAD)) + return wl; + + /* + * Instead of using this increment, also add the difference + * between when the shares were last updated and now. + */ + more_w = se->my_q->load.weight - se->my_q->rq_weight; + wl += more_w; + wg += more_w; + + for_each_sched_entity(se) { +#define D(n) (likely(n) ? (n) : 1) + + long S, rw, s, a, b; + + S = se->my_q->tg->shares; + s = se->my_q->shares; + rw = se->my_q->rq_weight; + + a = S*(rw + wl); + b = S*rw + s*wg; + + wl = s*(a-b)/D(b); + /* + * Assume the group is already running and will + * thus already be accounted for in the weight. + * + * That is, moving shares between CPUs, does not + * alter the group weight. + */ + wg = 0; +#undef D + } + + return wl; +} + +#else + +static inline unsigned long effective_load(struct task_group *tg, int cpu, + unsigned long wl, unsigned long wg) +{ + return wl; +} + +#endif + static int wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, struct task_struct *p, int prev_cpu, int this_cpu, int sync, @@ -1080,8 +1149,10 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, unsigned int imbalance) { struct task_struct *curr = this_rq->curr; + struct task_group *tg; unsigned long tl = this_load; unsigned long tl_per_task; + unsigned long weight; int balanced; if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS)) @@ -1092,19 +1163,28 @@ wake_affine(struct rq *rq, struct sched_domain *this_sd, struct rq *this_rq, * effect of the currently running task from the load * of the current CPU: */ - if (sync) - tl -= current->se.load.weight; + if (sync) { + tg = task_group(current); + weight = current->se.load.weight; + + tl += effective_load(tg, this_cpu, -weight, -weight); + load += effective_load(tg, prev_cpu, 0, -weight); + } - balanced = 100*(tl + p->se.load.weight) <= imbalance*load; + tg = task_group(p); + weight = p->se.load.weight; + + balanced = 100*(tl + effective_load(tg, this_cpu, weight, weight)) <= + imbalance*(load + effective_load(tg, prev_cpu, 0, weight)); /* * If the currently running task will sleep within * a reasonable amount of time then attract this newly * woken task: */ - if (sync && balanced && curr->sched_class == &fair_sched_class) { + if (sync && balanced) { if (curr->se.avg_overlap < sysctl_sched_migration_cost && - p->se.avg_overlap < sysctl_sched_migration_cost) + p->se.avg_overlap < sysctl_sched_migration_cost) return 1; } @@ -1265,7 +1345,6 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) return; } - se->last_wakeup = se->sum_exec_runtime; if (unlikely(se == pse)) return; @@ -1425,7 +1504,9 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, list_for_each_entry(tg, &task_groups, list) { struct cfs_rq *busiest_cfs_rq = tg->cfs_rq[busiest_cpu]; - long rem_load, moved_load; + unsigned long busiest_h_load = busiest_cfs_rq->h_load; + unsigned long busiest_weight = busiest_cfs_rq->load.weight; + u64 rem_load, moved_load; /* * empty group @@ -1433,8 +1514,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!busiest_cfs_rq->task_weight) continue; - rem_load = rem_load_move * busiest_cfs_rq->load.weight; - rem_load /= busiest_cfs_rq->h_load + 1; + rem_load = (u64)rem_load_move * busiest_weight; + rem_load = div_u64(rem_load, busiest_h_load + 1); moved_load = __load_balance_fair(this_rq, this_cpu, busiest, rem_load, sd, idle, all_pinned, this_best_prio, @@ -1443,8 +1524,8 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, if (!moved_load) continue; - moved_load *= busiest_cfs_rq->h_load; - moved_load /= busiest_cfs_rq->load.weight + 1; + moved_load *= busiest_h_load; + moved_load = div_u64(moved_load, busiest_weight + 1); rem_load_move -= moved_load; if (rem_load_move < 0) @@ -1490,7 +1571,7 @@ move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest, return 0; } -#endif +#endif /* CONFIG_SMP */ /* * scheduler tick hitting a task of our scheduling class: