From: Dhaval Giani Date: Sat, 19 Apr 2008 17:44:59 +0000 (+0200) Subject: sched: mix tasks and groups X-Git-Tag: v2.6.26-rc1~1132^2~19 X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=354d60c2ff72d86627dfe2089d186824abf4bb8e;p=linux-2.6 sched: mix tasks and groups This patch allows tasks and groups to exist in the same cfs_rq. With this change the CFS group scheduling follows a 1/(M+N) model from a 1/(1+N) fairness model where M tasks and N groups exist at the cfs_rq level. [a.p.zijlstra@chello.nl: rt bits and assorted fixes] Signed-off-by: Dhaval Giani Signed-off-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- diff --git a/kernel/sched.c b/kernel/sched.c index 62830eaec5..1b7399dfa3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -273,6 +273,7 @@ struct task_group { struct list_head list; }; +#ifdef CONFIG_USER_SCHED #ifdef CONFIG_FAIR_GROUP_SCHED /* Default task group's sched entity on each cpu */ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); @@ -284,6 +285,7 @@ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; #endif +#endif /* task_group_lock serializes add/remove of task groups and also changes to * a task group's cpu shares. @@ -7447,6 +7449,10 @@ static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); tg->se[cpu] = se; + /* se could be NULL for init_task_group */ + if (!se) + return; + se->cfs_rq = &rq->cfs; se->my_q = cfs_rq; se->load.weight = tg->shares; @@ -7469,6 +7475,9 @@ static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); tg->rt_se[cpu] = rt_se; + if (!rt_se) + return; + rt_se->rt_rq = &rq->rt; rt_se->my_q = rt_rq; rt_se->parent = NULL; @@ -7539,18 +7548,56 @@ void __init sched_init(void) #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.shares = init_task_group_load; INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); +#ifdef CONFIG_CGROUP_SCHED + /* + * How much cpu bandwidth does init_task_group get? + * + * In case of task-groups formed thr' the cgroup filesystem, it + * gets 100% of the cpu resources in the system. This overall + * system cpu resource is divided among the tasks of + * init_task_group and its child task-groups in a fair manner, + * based on each entity's (task or task-group's) weight + * (se->load.weight). + * + * In other words, if init_task_group has 10 tasks of weight + * 1024) and two child groups A0 and A1 (of weight 1024 each), + * then A0's share of the cpu resource is: + * + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * + * We achieve this by letting init_task_group's tasks sit + * directly in rq->cfs (i.e init_task_group->se[] = NULL). + */ + init_tg_cfs_entry(rq, &init_task_group, &rq->cfs, NULL, i, 1); +#elif defined CONFIG_USER_SCHED + /* + * In case of task-groups formed thr' the user id of tasks, + * init_task_group represents tasks belonging to root user. + * Hence it forms a sibling of all subsequent groups formed. + * In this case, init_task_group gets only a fraction of overall + * system cpu resource, based on the weight assigned to root + * user's cpu share (INIT_TASK_GROUP_LOAD). This is accomplished + * by letting tasks of init_task_group sit in a separate cfs_rq + * (init_cfs_rq) and having one entity represent this group of + * tasks in rq->cfs (i.e init_task_group->se[] != NULL). + */ init_tg_cfs_entry(rq, &init_task_group, &per_cpu(init_cfs_rq, i), &per_cpu(init_sched_entity, i), i, 1); #endif +#endif /* CONFIG_FAIR_GROUP_SCHED */ + + rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; #ifdef CONFIG_RT_GROUP_SCHED INIT_LIST_HEAD(&rq->leaf_rt_rq_list); +#ifdef CONFIG_CGROUP_SCHED + init_tg_rt_entry(rq, &init_task_group, &rq->rt, NULL, i, 1); +#elif defined CONFIG_USER_SCHED init_tg_rt_entry(rq, &init_task_group, &per_cpu(init_rt_rq, i), &per_cpu(init_sched_rt_entity, i), i, 1); -#else - rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; +#endif #endif for (j = 0; j < CPU_LOAD_IDX_MAX; j++) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 022e036f2c..3dde0f0ec9 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1133,6 +1133,17 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se) return 0; } +/* return depth at which a sched entity is present in the hierarchy */ +static inline int depth_se(struct sched_entity *se) +{ + int depth = 0; + + for_each_sched_entity(se) + depth++; + + return depth; +} + /* * Preempt the current task with a newly woken task if needed: */ @@ -1141,6 +1152,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) struct task_struct *curr = rq->curr; struct cfs_rq *cfs_rq = task_cfs_rq(curr); struct sched_entity *se = &curr->se, *pse = &p->se; + int se_depth, pse_depth; if (unlikely(rt_prio(p->prio))) { update_rq_clock(rq); @@ -1165,6 +1177,27 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p) if (!sched_feat(WAKEUP_PREEMPT)) return; + /* + * preemption test can be made between sibling entities who are in the + * same cfs_rq i.e who have a common parent. Walk up the hierarchy of + * both tasks until we find their ancestors who are siblings of common + * parent. + */ + + /* First walk up until both entities are at same depth */ + se_depth = depth_se(se); + pse_depth = depth_se(pse); + + while (se_depth > pse_depth) { + se_depth--; + se = parent_entity(se); + } + + while (pse_depth > se_depth) { + pse_depth--; + pse = parent_entity(pse); + } + while (!is_same_group(se, pse)) { se = parent_entity(se); pse = parent_entity(pse); @@ -1223,13 +1256,22 @@ static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) static struct task_struct * __load_balance_iterator(struct cfs_rq *cfs_rq, struct rb_node *curr) { - struct task_struct *p; + struct task_struct *p = NULL; + struct sched_entity *se; if (!curr) return NULL; - p = rb_entry(curr, struct task_struct, se.run_node); - cfs_rq->rb_load_balance_curr = rb_next(curr); + /* Skip over entities that are not tasks */ + do { + se = rb_entry(curr, struct sched_entity, run_node); + curr = rb_next(curr); + } while (curr && !entity_is_task(se)); + + cfs_rq->rb_load_balance_curr = curr; + + if (entity_is_task(se)) + p = task_of(se); return p; } @@ -1489,9 +1531,6 @@ static void print_cfs_stats(struct seq_file *m, int cpu) { struct cfs_rq *cfs_rq; -#ifdef CONFIG_FAIR_GROUP_SCHED - print_cfs_rq(m, cpu, &cpu_rq(cpu)->cfs); -#endif rcu_read_lock(); for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq) print_cfs_rq(m, cpu, cfs_rq); diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c index 8ff824565e..201a69382a 100644 --- a/kernel/sched_rt.c +++ b/kernel/sched_rt.c @@ -374,11 +374,15 @@ static void update_curr_rt(struct rq *rq) curr->se.exec_start = rq->clock; cpuacct_charge(curr, delta_exec); - spin_lock(&rt_rq->rt_runtime_lock); - rt_rq->rt_time += delta_exec; - if (sched_rt_runtime_exceeded(rt_rq)) - resched_task(curr); - spin_unlock(&rt_rq->rt_runtime_lock); + for_each_sched_rt_entity(rt_se) { + rt_rq = rt_rq_of_se(rt_se); + + spin_lock(&rt_rq->rt_runtime_lock); + rt_rq->rt_time += delta_exec; + if (sched_rt_runtime_exceeded(rt_rq)) + resched_task(curr); + spin_unlock(&rt_rq->rt_runtime_lock); + } } static inline @@ -477,7 +481,6 @@ static void dequeue_rt_entity(struct sched_rt_entity *rt_se) * entries, we must remove entries top - down. * * XXX: O(1/2 h^2) because we can only walk up, not down the chain. - * doesn't matter much for now, as h=2 for GROUP_SCHED. */ static void dequeue_rt_stack(struct task_struct *p) {