X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched.c;h=22712b2e058a45f82f01f65ccb89c61117ccaa37;hb=1020387f5f3b52929b387103cf976321981f8e26;hp=02d468844a91b79beb4975499a80d9703eaee9e2;hpb=dc938520d2bf343b239795cfa24e4f44649358dc;p=linux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index 02d468844a..22712b2e05 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -65,6 +65,7 @@ #include #include #include +#include #include #include @@ -160,6 +161,8 @@ struct rt_prio_array { struct cfs_rq; +static LIST_HEAD(task_groups); + /* task group related information */ struct task_group { #ifdef CONFIG_FAIR_CGROUP_SCHED @@ -170,6 +173,11 @@ struct task_group { /* runqueue "owned" by this group on each cpu */ struct cfs_rq **cfs_rq; + struct sched_rt_entity **rt_se; + struct rt_rq **rt_rq; + + unsigned int rt_ratio; + /* * shares assigned to a task group governs how much of cpu bandwidth * is allocated to the group. The more shares a group has, the more is @@ -182,7 +190,7 @@ struct task_group { * * Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66% * Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33% - * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% + * Bw(C) = 3000/(1000+2000+3000) * 100 = 50% * * The weight assigned to a task group's schedulable entities on every * cpu (task_group.se[a_cpu]->load.weight) is derived from the task @@ -192,9 +200,9 @@ struct task_group { * tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000; * * Note: It's not necessary that each of a task's group schedulable - * entity have the same weight on all CPUs. If the group - * has 2 of its tasks on CPU0 and 1 task on CPU1, then a - * better distribution of weight could be: + * entity have the same weight on all CPUs. If the group + * has 2 of its tasks on CPU0 and 1 task on CPU1, then a + * better distribution of weight could be: * * tg_A->se[0]->load.weight = 2/3 * 2000 = 1333 * tg_A->se[1]->load.weight = 1/2 * 2000 = 667 @@ -207,6 +215,7 @@ struct task_group { unsigned long shares; struct rcu_head rcu; + struct list_head list; }; /* Default task group's sched entity on each cpu */ @@ -214,9 +223,15 @@ static DEFINE_PER_CPU(struct sched_entity, init_sched_entity); /* Default task group's cfs_rq on each cpu */ static DEFINE_PER_CPU(struct cfs_rq, init_cfs_rq) ____cacheline_aligned_in_smp; +static DEFINE_PER_CPU(struct sched_rt_entity, init_sched_rt_entity); +static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; + static struct sched_entity *init_sched_entity_p[NR_CPUS]; static struct cfs_rq *init_cfs_rq_p[NR_CPUS]; +static struct sched_rt_entity *init_sched_rt_entity_p[NR_CPUS]; +static struct rt_rq *init_rt_rq_p[NR_CPUS]; + /* task_group_mutex serializes add/remove of task groups and also changes to * a task group's cpu shares. */ @@ -239,6 +254,9 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares); struct task_group init_task_group = { .se = init_sched_entity_p, .cfs_rq = init_cfs_rq_p, + + .rt_se = init_sched_rt_entity_p, + .rt_rq = init_rt_rq_p, }; #ifdef CONFIG_FAIR_USER_SCHED @@ -268,10 +286,13 @@ static inline struct task_group *task_group(struct task_struct *p) } /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ -static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; + + p->rt.rt_rq = task_group(p)->rt_rq[cpu]; + p->rt.parent = task_group(p)->rt_se[cpu]; } static inline void lock_task_group_list(void) @@ -296,7 +317,7 @@ static inline void unlock_doms_cur(void) #else -static inline void set_task_cfs_rq(struct task_struct *p, unsigned int cpu) { } +static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } static inline void lock_task_group_list(void) { } static inline void unlock_task_group_list(void) { } static inline void lock_doms_cur(void) { } @@ -341,13 +362,23 @@ struct cfs_rq { /* Real-Time classes' related field in a runqueue: */ struct rt_rq { struct rt_prio_array active; - int rt_load_balance_idx; - struct list_head *rt_load_balance_head, *rt_load_balance_curr; unsigned long rt_nr_running; +#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED + int highest_prio; /* highest queued rt task prio */ +#endif +#ifdef CONFIG_SMP unsigned long rt_nr_migratory; - /* highest queued rt task prio */ - int highest_prio; int overloaded; +#endif + int rt_throttled; + u64 rt_time; + +#ifdef CONFIG_FAIR_GROUP_SCHED + struct rq *rq; + struct list_head leaf_rt_rq_list; + struct task_group *tg; + struct sched_rt_entity *rt_se; +#endif }; #ifdef CONFIG_SMP @@ -409,11 +440,15 @@ struct rq { u64 nr_switches; struct cfs_rq cfs; + struct rt_rq rt; + u64 rt_period_expire; + int rt_throttled; + #ifdef CONFIG_FAIR_GROUP_SCHED /* list of leaf cfs_rq on this cpu: */ struct list_head leaf_cfs_rq_list; + struct list_head leaf_rt_rq_list; #endif - struct rt_rq rt; /* * This is part of a global counter where only the total sum @@ -451,6 +486,12 @@ struct rq { struct list_head migration_queue; #endif +#ifdef CONFIG_SCHED_HRTICK + unsigned long hrtick_flags; + ktime_t hrtick_expire; + struct hrtimer hrtick_timer; +#endif + #ifdef CONFIG_SCHEDSTATS /* latency stats */ struct sched_info rq_sched_info; @@ -554,6 +595,23 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +unsigned long rt_needs_cpu(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + u64 delta; + + if (!rq->rt_throttled) + return 0; + + if (rq->clock > rq->rt_period_expire) + return 1; + + delta = rq->rt_period_expire - rq->clock; + do_div(delta, NSEC_PER_SEC / HZ); + + return (unsigned long)delta; +} + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -572,6 +630,8 @@ enum { SCHED_FEAT_START_DEBIT = 4, SCHED_FEAT_TREE_AVG = 8, SCHED_FEAT_APPROX_AVG = 16, + SCHED_FEAT_HRTICK = 32, + SCHED_FEAT_DOUBLE_TICK = 64, }; const_debug unsigned int sysctl_sched_features = @@ -579,7 +639,9 @@ const_debug unsigned int sysctl_sched_features = SCHED_FEAT_WAKEUP_PREEMPT * 1 | SCHED_FEAT_START_DEBIT * 1 | SCHED_FEAT_TREE_AVG * 0 | - SCHED_FEAT_APPROX_AVG * 0; + SCHED_FEAT_APPROX_AVG * 0 | + SCHED_FEAT_HRTICK * 1 | + SCHED_FEAT_DOUBLE_TICK * 0; #define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) @@ -589,6 +651,21 @@ const_debug unsigned int sysctl_sched_features = */ const_debug unsigned int sysctl_sched_nr_migrate = 32; +/* + * period over which we measure -rt task cpu usage in ms. + * default: 1s + */ +const_debug unsigned int sysctl_sched_rt_period = 1000; + +#define SCHED_RT_FRAC_SHIFT 16 +#define SCHED_RT_FRAC (1UL << SCHED_RT_FRAC_SHIFT) + +/* + * ratio of time -rt tasks may consume. + * default: 95% + */ +const_debug unsigned int sysctl_sched_rt_ratio = 62259; + /* * For kernel-internal use: high-speed (but slightly incorrect) per-cpu * clock constructed from sched_clock(): @@ -796,6 +873,173 @@ void sched_clock_idle_wakeup_event(u64 delta_ns) } EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); +static void __resched_task(struct task_struct *p, int tif_bit); + +static inline void resched_task(struct task_struct *p) +{ + __resched_task(p, TIF_NEED_RESCHED); +} + +#ifdef CONFIG_SCHED_HRTICK +/* + * Use HR-timers to deliver accurate preemption points. + * + * Its all a bit involved since we cannot program an hrt while holding the + * rq->lock. So what we do is store a state in in rq->hrtick_* and ask for a + * reschedule event. + * + * When we get rescheduled we reprogram the hrtick_timer outside of the + * rq->lock. + */ +static inline void resched_hrt(struct task_struct *p) +{ + __resched_task(p, TIF_HRTICK_RESCHED); +} + +static inline void resched_rq(struct rq *rq) +{ + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + resched_task(rq->curr); + spin_unlock_irqrestore(&rq->lock, flags); +} + +enum { + HRTICK_SET, /* re-programm hrtick_timer */ + HRTICK_RESET, /* not a new slice */ +}; + +/* + * Use hrtick when: + * - enabled by features + * - hrtimer is actually high res + */ +static inline int hrtick_enabled(struct rq *rq) +{ + if (!sched_feat(HRTICK)) + return 0; + return hrtimer_is_hres_active(&rq->hrtick_timer); +} + +/* + * Called to set the hrtick timer state. + * + * called with rq->lock held and irqs disabled + */ +static void hrtick_start(struct rq *rq, u64 delay, int reset) +{ + assert_spin_locked(&rq->lock); + + /* + * preempt at: now + delay + */ + rq->hrtick_expire = + ktime_add_ns(rq->hrtick_timer.base->get_time(), delay); + /* + * indicate we need to program the timer + */ + __set_bit(HRTICK_SET, &rq->hrtick_flags); + if (reset) + __set_bit(HRTICK_RESET, &rq->hrtick_flags); + + /* + * New slices are called from the schedule path and don't need a + * forced reschedule. + */ + if (reset) + resched_hrt(rq->curr); +} + +static void hrtick_clear(struct rq *rq) +{ + if (hrtimer_active(&rq->hrtick_timer)) + hrtimer_cancel(&rq->hrtick_timer); +} + +/* + * Update the timer from the possible pending state. + */ +static void hrtick_set(struct rq *rq) +{ + ktime_t time; + int set, reset; + unsigned long flags; + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + spin_lock_irqsave(&rq->lock, flags); + set = __test_and_clear_bit(HRTICK_SET, &rq->hrtick_flags); + reset = __test_and_clear_bit(HRTICK_RESET, &rq->hrtick_flags); + time = rq->hrtick_expire; + clear_thread_flag(TIF_HRTICK_RESCHED); + spin_unlock_irqrestore(&rq->lock, flags); + + if (set) { + hrtimer_start(&rq->hrtick_timer, time, HRTIMER_MODE_ABS); + if (reset && !hrtimer_active(&rq->hrtick_timer)) + resched_rq(rq); + } else + hrtick_clear(rq); +} + +/* + * High-resolution timer tick. + * Runs from hardirq context with interrupts disabled. + */ +static enum hrtimer_restart hrtick(struct hrtimer *timer) +{ + struct rq *rq = container_of(timer, struct rq, hrtick_timer); + + WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); + + spin_lock(&rq->lock); + __update_rq_clock(rq); + rq->curr->sched_class->task_tick(rq, rq->curr, 1); + spin_unlock(&rq->lock); + + return HRTIMER_NORESTART; +} + +static inline void init_rq_hrtick(struct rq *rq) +{ + rq->hrtick_flags = 0; + hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + rq->hrtick_timer.function = hrtick; + rq->hrtick_timer.cb_mode = HRTIMER_CB_IRQSAFE_NO_SOFTIRQ; +} + +void hrtick_resched(void) +{ + struct rq *rq; + unsigned long flags; + + if (!test_thread_flag(TIF_HRTICK_RESCHED)) + return; + + local_irq_save(flags); + rq = cpu_rq(smp_processor_id()); + hrtick_set(rq); + local_irq_restore(flags); +} +#else +static inline void hrtick_clear(struct rq *rq) +{ +} + +static inline void hrtick_set(struct rq *rq) +{ +} + +static inline void init_rq_hrtick(struct rq *rq) +{ +} + +void hrtick_resched(void) +{ +} +#endif + /* * resched_task - mark a task 'to be rescheduled now'. * @@ -809,16 +1053,16 @@ EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); #define tsk_is_polling(t) test_tsk_thread_flag(t, TIF_POLLING_NRFLAG) #endif -static void resched_task(struct task_struct *p) +static void __resched_task(struct task_struct *p, int tif_bit) { int cpu; assert_spin_locked(&task_rq(p)->lock); - if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED))) + if (unlikely(test_tsk_thread_flag(p, tif_bit))) return; - set_tsk_thread_flag(p, TIF_NEED_RESCHED); + set_tsk_thread_flag(p, tif_bit); cpu = task_cpu(p); if (cpu == smp_processor_id()) @@ -841,10 +1085,10 @@ static void resched_cpu(int cpu) spin_unlock_irqrestore(&rq->lock, flags); } #else -static inline void resched_task(struct task_struct *p) +static void __resched_task(struct task_struct *p, int tif_bit) { assert_spin_locked(&task_rq(p)->lock); - set_tsk_need_resched(p); + set_tsk_thread_flag(p, tif_bit); } #endif @@ -1142,7 +1386,7 @@ unsigned long weighted_cpuload(const int cpu) static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu) { - set_task_cfs_rq(p, cpu); + set_task_rq(p, cpu); #ifdef CONFIG_SMP /* * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be @@ -1685,7 +1929,7 @@ static void __sched_fork(struct task_struct *p) p->se.wait_max = 0; #endif - INIT_LIST_HEAD(&p->run_list); + INIT_LIST_HEAD(&p->rt.run_list); p->se.on_rq = 0; #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -3496,8 +3740,8 @@ void scheduler_tick(void) rq->clock = next_tick; rq->tick_timestamp = rq->clock; update_cpu_load(rq); - if (curr != rq->idle) /* FIXME: needed? */ - curr->sched_class->task_tick(rq, curr); + curr->sched_class->task_tick(rq, curr, 0); + update_sched_rt_period(rq); spin_unlock(&rq->lock); #ifdef CONFIG_SMP @@ -3643,6 +3887,8 @@ need_resched_nonpreemptible: schedule_debug(prev); + hrtick_clear(rq); + /* * Do the rq-clock update outside the rq lock: */ @@ -3680,14 +3926,20 @@ need_resched_nonpreemptible: ++*switch_count; context_switch(rq, prev, next); /* unlocks the rq */ + /* + * the context switch might have flipped the stack from under + * us, hence refresh the local variables. + */ + cpu = smp_processor_id(); + rq = cpu_rq(cpu); } else spin_unlock_irq(&rq->lock); - if (unlikely(reacquire_kernel_lock(current) < 0)) { - cpu = smp_processor_id(); - rq = cpu_rq(cpu); + hrtick_set(rq); + + if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; - } + preempt_enable_no_resched(); if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) goto need_resched; @@ -4678,7 +4930,8 @@ static void __cond_resched(void) } while (need_resched()); } -int __sched cond_resched(void) +#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) +int __sched _cond_resched(void) { if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && system_state == SYSTEM_RUNNING) { @@ -4687,7 +4940,8 @@ int __sched cond_resched(void) } return 0; } -EXPORT_SYMBOL(cond_resched); +EXPORT_SYMBOL(_cond_resched); +#endif /* * cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -5076,7 +5330,7 @@ int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) p->sched_class->set_cpus_allowed(p, &new_mask); else { p->cpus_allowed = new_mask; - p->nr_cpus_allowed = cpus_weight(new_mask); + p->rt.nr_cpus_allowed = cpus_weight(new_mask); } /* Can the task run on the task's current CPU? If so, we're done */ @@ -6853,6 +7107,73 @@ static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) cfs_rq->min_vruntime = (u64)(-(1LL << 20)); } +static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq) +{ + struct rt_prio_array *array; + int i; + + array = &rt_rq->active; + for (i = 0; i < MAX_RT_PRIO; i++) { + INIT_LIST_HEAD(array->queue + i); + __clear_bit(i, array->bitmap); + } + /* delimiter for bitsearch: */ + __set_bit(MAX_RT_PRIO, array->bitmap); + +#if defined CONFIG_SMP || defined CONFIG_FAIR_GROUP_SCHED + rt_rq->highest_prio = MAX_RT_PRIO; +#endif +#ifdef CONFIG_SMP + rt_rq->rt_nr_migratory = 0; + rt_rq->overloaded = 0; +#endif + + rt_rq->rt_time = 0; + rt_rq->rt_throttled = 0; + +#ifdef CONFIG_FAIR_GROUP_SCHED + rt_rq->rq = rq; +#endif +} + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void init_tg_cfs_entry(struct rq *rq, struct task_group *tg, + struct cfs_rq *cfs_rq, struct sched_entity *se, + int cpu, int add) +{ + tg->cfs_rq[cpu] = cfs_rq; + init_cfs_rq(cfs_rq, rq); + cfs_rq->tg = tg; + if (add) + list_add(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + + tg->se[cpu] = se; + se->cfs_rq = &rq->cfs; + se->my_q = cfs_rq; + se->load.weight = tg->shares; + se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); + se->parent = NULL; +} + +static void init_tg_rt_entry(struct rq *rq, struct task_group *tg, + struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, + int cpu, int add) +{ + tg->rt_rq[cpu] = rt_rq; + init_rt_rq(rt_rq, rq); + rt_rq->tg = tg; + rt_rq->rt_se = rt_se; + if (add) + list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); + + tg->rt_se[cpu] = rt_se; + rt_se->rt_rq = &rq->rt; + rt_se->my_q = rt_rq; + rt_se->parent = NULL; + INIT_LIST_HEAD(&rt_se->run_list); +} +#endif + void __init sched_init(void) { int highest_cpu = 0; @@ -6862,8 +7183,11 @@ void __init sched_init(void) init_defrootdomain(); #endif +#ifdef CONFIG_FAIR_GROUP_SCHED + list_add(&init_task_group.list, &task_groups); +#endif + for_each_possible_cpu(i) { - struct rt_prio_array *array; struct rq *rq; rq = cpu_rq(i); @@ -6872,29 +7196,22 @@ void __init sched_init(void) rq->nr_running = 0; rq->clock = 1; init_cfs_rq(&rq->cfs, rq); + init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED - INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); - { - struct cfs_rq *cfs_rq = &per_cpu(init_cfs_rq, i); - struct sched_entity *se = - &per_cpu(init_sched_entity, i); - - init_cfs_rq_p[i] = cfs_rq; - init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = &init_task_group; - list_add(&cfs_rq->leaf_cfs_rq_list, - &rq->leaf_cfs_rq_list); - - init_sched_entity_p[i] = se; - se->cfs_rq = &rq->cfs; - se->my_q = cfs_rq; - se->load.weight = init_task_group_load; - se->load.inv_weight = - div64_64(1ULL<<32, init_task_group_load); - se->parent = NULL; - } init_task_group.shares = init_task_group_load; + INIT_LIST_HEAD(&rq->leaf_cfs_rq_list); + init_tg_cfs_entry(rq, &init_task_group, + &per_cpu(init_cfs_rq, i), + &per_cpu(init_sched_entity, i), i, 1); + + init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */ + INIT_LIST_HEAD(&rq->leaf_rt_rq_list); + init_tg_rt_entry(rq, &init_task_group, + &per_cpu(init_rt_rq, i), + &per_cpu(init_sched_rt_entity, i), i, 1); #endif + rq->rt_period_expire = 0; + rq->rt_throttled = 0; for (j = 0; j < CPU_LOAD_IDX_MAX; j++) rq->cpu_load[j] = 0; @@ -6907,20 +7224,11 @@ void __init sched_init(void) rq->cpu = i; rq->migration_thread = NULL; INIT_LIST_HEAD(&rq->migration_queue); - rq->rt.highest_prio = MAX_RT_PRIO; - rq->rt.overloaded = 0; rq_attach_root(rq, &def_root_domain); #endif + init_rq_hrtick(rq); atomic_set(&rq->nr_iowait, 0); - - array = &rq->rt.active; - for (j = 0; j < MAX_RT_PRIO; j++) { - INIT_LIST_HEAD(array->queue + j); - __clear_bit(j, array->bitmap); - } highest_cpu = i; - /* delimiter for bitsearch: */ - __set_bit(MAX_RT_PRIO, array->bitmap); } set_load_weight(&init_task); @@ -7092,7 +7400,7 @@ void set_curr_task(int cpu, struct task_struct *p) #ifdef CONFIG_SMP /* * distribute shares of all task groups among their schedulable entities, - * to reflect load distrbution across cpus. + * to reflect load distribution across cpus. */ static int rebalance_shares(struct sched_domain *sd, int this_cpu) { @@ -7159,7 +7467,7 @@ static int rebalance_shares(struct sched_domain *sd, int this_cpu) * sysctl_sched_max_bal_int_shares represents the maximum interval between * consecutive calls to rebalance_shares() in the same sched domain. * - * These settings allows for the appropriate tradeoff between accuracy of + * These settings allows for the appropriate trade-off between accuracy of * fairness and the associated overhead. * */ @@ -7240,12 +7548,36 @@ static int load_balance_monitor(void *unused) } #endif /* CONFIG_SMP */ +static void free_sched_group(struct task_group *tg) +{ + int i; + + for_each_possible_cpu(i) { + if (tg->cfs_rq) + kfree(tg->cfs_rq[i]); + if (tg->se) + kfree(tg->se[i]); + if (tg->rt_rq) + kfree(tg->rt_rq[i]); + if (tg->rt_se) + kfree(tg->rt_se[i]); + } + + kfree(tg->cfs_rq); + kfree(tg->se); + kfree(tg->rt_rq); + kfree(tg->rt_se); + kfree(tg); +} + /* allocate runqueue etc for a new task group */ struct task_group *sched_create_group(void) { struct task_group *tg; struct cfs_rq *cfs_rq; struct sched_entity *se; + struct rt_rq *rt_rq; + struct sched_rt_entity *rt_se; struct rq *rq; int i; @@ -7259,100 +7591,89 @@ struct task_group *sched_create_group(void) tg->se = kzalloc(sizeof(se) * NR_CPUS, GFP_KERNEL); if (!tg->se) goto err; + tg->rt_rq = kzalloc(sizeof(rt_rq) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_rq) + goto err; + tg->rt_se = kzalloc(sizeof(rt_se) * NR_CPUS, GFP_KERNEL); + if (!tg->rt_se) + goto err; + + tg->shares = NICE_0_LOAD; + tg->rt_ratio = 0; /* XXX */ for_each_possible_cpu(i) { rq = cpu_rq(i); - cfs_rq = kmalloc_node(sizeof(struct cfs_rq), GFP_KERNEL, - cpu_to_node(i)); + cfs_rq = kmalloc_node(sizeof(struct cfs_rq), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!cfs_rq) goto err; - se = kmalloc_node(sizeof(struct sched_entity), GFP_KERNEL, - cpu_to_node(i)); + se = kmalloc_node(sizeof(struct sched_entity), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); if (!se) goto err; - memset(cfs_rq, 0, sizeof(struct cfs_rq)); - memset(se, 0, sizeof(struct sched_entity)); + rt_rq = kmalloc_node(sizeof(struct rt_rq), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + if (!rt_rq) + goto err; - tg->cfs_rq[i] = cfs_rq; - init_cfs_rq(cfs_rq, rq); - cfs_rq->tg = tg; + rt_se = kmalloc_node(sizeof(struct sched_rt_entity), + GFP_KERNEL|__GFP_ZERO, cpu_to_node(i)); + if (!rt_se) + goto err; - tg->se[i] = se; - se->cfs_rq = &rq->cfs; - se->my_q = cfs_rq; - se->load.weight = NICE_0_LOAD; - se->load.inv_weight = div64_64(1ULL<<32, NICE_0_LOAD); - se->parent = NULL; + init_tg_cfs_entry(rq, tg, cfs_rq, se, i, 0); + init_tg_rt_entry(rq, tg, rt_rq, rt_se, i, 0); } - tg->shares = NICE_0_LOAD; - lock_task_group_list(); for_each_possible_cpu(i) { rq = cpu_rq(i); cfs_rq = tg->cfs_rq[i]; list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list); + rt_rq = tg->rt_rq[i]; + list_add_rcu(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list); } + list_add_rcu(&tg->list, &task_groups); unlock_task_group_list(); return tg; err: - for_each_possible_cpu(i) { - if (tg->cfs_rq) - kfree(tg->cfs_rq[i]); - if (tg->se) - kfree(tg->se[i]); - } - kfree(tg->cfs_rq); - kfree(tg->se); - kfree(tg); - + free_sched_group(tg); return ERR_PTR(-ENOMEM); } /* rcu callback to free various structures associated with a task group */ -static void free_sched_group(struct rcu_head *rhp) +static void free_sched_group_rcu(struct rcu_head *rhp) { - struct task_group *tg = container_of(rhp, struct task_group, rcu); - struct cfs_rq *cfs_rq; - struct sched_entity *se; - int i; - /* now it should be safe to free those cfs_rqs */ - for_each_possible_cpu(i) { - cfs_rq = tg->cfs_rq[i]; - kfree(cfs_rq); - - se = tg->se[i]; - kfree(se); - } - - kfree(tg->cfs_rq); - kfree(tg->se); - kfree(tg); + free_sched_group(container_of(rhp, struct task_group, rcu)); } /* Destroy runqueue etc associated with a task group */ void sched_destroy_group(struct task_group *tg) { struct cfs_rq *cfs_rq = NULL; + struct rt_rq *rt_rq = NULL; int i; lock_task_group_list(); for_each_possible_cpu(i) { cfs_rq = tg->cfs_rq[i]; list_del_rcu(&cfs_rq->leaf_cfs_rq_list); + rt_rq = tg->rt_rq[i]; + list_del_rcu(&rt_rq->leaf_rt_rq_list); } + list_del_rcu(&tg->list); unlock_task_group_list(); BUG_ON(!cfs_rq); /* wait for possible concurrent references to cfs_rqs complete */ - call_rcu(&tg->rcu, free_sched_group); + call_rcu(&tg->rcu, free_sched_group_rcu); } /* change task's runqueue when it moves between groups. @@ -7368,11 +7689,6 @@ void sched_move_task(struct task_struct *tsk) rq = task_rq_lock(tsk, &flags); - if (tsk->sched_class != &fair_sched_class) { - set_task_cfs_rq(tsk, task_cpu(tsk)); - goto done; - } - update_rq_clock(rq); running = task_current(rq, tsk); @@ -7384,7 +7700,7 @@ void sched_move_task(struct task_struct *tsk) tsk->sched_class->put_prev_task(rq, tsk); } - set_task_cfs_rq(tsk, task_cpu(tsk)); + set_task_rq(tsk, task_cpu(tsk)); if (on_rq) { if (unlikely(running)) @@ -7392,7 +7708,6 @@ void sched_move_task(struct task_struct *tsk) enqueue_task(rq, tsk, 0); } -done: task_rq_unlock(rq, &flags); } @@ -7477,6 +7792,31 @@ unsigned long sched_group_shares(struct task_group *tg) return tg->shares; } +/* + * Ensure the total rt_ratio <= sysctl_sched_rt_ratio + */ +int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio) +{ + struct task_group *tgi; + unsigned long total = 0; + + rcu_read_lock(); + list_for_each_entry_rcu(tgi, &task_groups, list) + total += tgi->rt_ratio; + rcu_read_unlock(); + + if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio) + return -EINVAL; + + tg->rt_ratio = rt_ratio; + return 0; +} + +unsigned long sched_group_rt_ratio(struct task_group *tg) +{ + return tg->rt_ratio; +} + #endif /* CONFIG_FAIR_GROUP_SCHED */ #ifdef CONFIG_FAIR_CGROUP_SCHED @@ -7552,12 +7892,30 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) return (u64) tg->shares; } +static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype, + u64 rt_ratio_val) +{ + return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val); +} + +static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft) +{ + struct task_group *tg = cgroup_tg(cgrp); + + return (u64) tg->rt_ratio; +} + static struct cftype cpu_files[] = { { .name = "shares", .read_uint = cpu_shares_read_uint, .write_uint = cpu_shares_write_uint, }, + { + .name = "rt_ratio", + .read_uint = cpu_rt_ratio_read_uint, + .write_uint = cpu_rt_ratio_write_uint, + }, }; static int cpu_cgroup_populate(struct cgroup_subsys *ss, struct cgroup *cont)