X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched.c;h=04228524d160e3f45e718377b9c0a77f1cf98405;hb=95e904c7da715aa2dbfb595da66b63de37a0bb04;hp=62d7481caca53314d45f139440c33d138e74b5cd;hpb=1d3504fcf5606579d60b649d19f44b3871c1ddae;p=linux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index 62d7481cac..04228524d1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -68,20 +68,12 @@ #include #include #include +#include +#include #include #include -/* - * Scheduler clock - returns current time in nanosec units. - * This is default implementation. - * Architectures and sub-architectures can override this. - */ -unsigned long long __attribute__((weak)) sched_clock(void) -{ - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); -} - /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -144,7 +136,7 @@ static inline void sg_inc_cpu_power(struct sched_group *sg, u32 val) static inline int rt_policy(int policy) { - if (unlikely(policy == SCHED_FIFO) || unlikely(policy == SCHED_RR)) + if (unlikely(policy == SCHED_FIFO || policy == SCHED_RR)) return 1; return 0; } @@ -240,6 +232,12 @@ static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) } #endif +/* + * sched_domains_mutex serializes calls to arch_init_sched_domains, + * detach_destroy_domains and partition_sched_domains. + */ +static DEFINE_MUTEX(sched_domains_mutex); + #ifdef CONFIG_GROUP_SCHED #include @@ -306,9 +304,6 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp; */ static DEFINE_SPINLOCK(task_group_lock); -/* doms_cur_mutex serializes access to doms_cur[] array */ -static DEFINE_MUTEX(doms_cur_mutex); - #ifdef CONFIG_FAIR_GROUP_SCHED #ifdef CONFIG_USER_SCHED # define INIT_TASK_GROUP_LOAD (2*NICE_0_LOAD) @@ -316,6 +311,17 @@ static DEFINE_MUTEX(doms_cur_mutex); # define INIT_TASK_GROUP_LOAD NICE_0_LOAD #endif +/* + * A weight of 0 or 1 can cause arithmetics problems. + * A weight of a cfs_rq is the sum of weights of which entities + * are queued on this cfs_rq, so a weight of a entity should not be + * too large, so as the shares value of a task group. + * (The default weight is 1024 - so there's no practical + * limitation from this.) + */ +#define MIN_SHARES 2 +#define MAX_SHARES (1UL << 18) + static int init_task_group_load = INIT_TASK_GROUP_LOAD; #endif @@ -354,21 +360,9 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) #endif } -static inline void lock_doms_cur(void) -{ - mutex_lock(&doms_cur_mutex); -} - -static inline void unlock_doms_cur(void) -{ - mutex_unlock(&doms_cur_mutex); -} - #else static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { } -static inline void lock_doms_cur(void) { } -static inline void unlock_doms_cur(void) { } #endif /* CONFIG_GROUP_SCHED */ @@ -382,8 +376,12 @@ struct cfs_rq { struct rb_root tasks_timeline; struct rb_node *rb_leftmost; - struct rb_node *rb_load_balance_curr; - /* 'curr' points to currently running entity on this cfs_rq. + + struct list_head tasks; + struct list_head *balance_iterator; + + /* + * 'curr' points to currently running entity on this cfs_rq. * It is set to NULL otherwise (i.e when none are currently running). */ struct sched_entity *curr, *next; @@ -515,13 +513,7 @@ struct rq { unsigned long next_balance; struct mm_struct *prev_mm; - u64 clock, prev_clock_raw; - s64 clock_max_delta; - - unsigned int clock_warps, clock_overflows, clock_underflows; - u64 idle_clock; - unsigned int clock_deep_idle_events; - u64 tick_timestamp; + u64 clock; atomic_t nr_iowait; @@ -586,82 +578,6 @@ static inline int cpu_of(struct rq *rq) #endif } -#ifdef CONFIG_NO_HZ -static inline bool nohz_on(int cpu) -{ - return tick_get_tick_sched(cpu)->nohz_mode != NOHZ_MODE_INACTIVE; -} - -static inline u64 max_skipped_ticks(struct rq *rq) -{ - return nohz_on(cpu_of(rq)) ? jiffies - rq->last_tick_seen + 2 : 1; -} - -static inline void update_last_tick_seen(struct rq *rq) -{ - rq->last_tick_seen = jiffies; -} -#else -static inline u64 max_skipped_ticks(struct rq *rq) -{ - return 1; -} - -static inline void update_last_tick_seen(struct rq *rq) -{ -} -#endif - -/* - * Update the per-runqueue clock, as finegrained as the platform can give - * us, but without assuming monotonicity, etc.: - */ -static void __update_rq_clock(struct rq *rq) -{ - u64 prev_raw = rq->prev_clock_raw; - u64 now = sched_clock(); - s64 delta = now - prev_raw; - u64 clock = rq->clock; - -#ifdef CONFIG_SCHED_DEBUG - WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); -#endif - /* - * Protect against sched_clock() occasionally going backwards: - */ - if (unlikely(delta < 0)) { - clock++; - rq->clock_warps++; - } else { - /* - * Catch too large forward jumps too: - */ - u64 max_jump = max_skipped_ticks(rq) * TICK_NSEC; - u64 max_time = rq->tick_timestamp + max_jump; - - if (unlikely(clock + delta > max_time)) { - if (clock < max_time) - clock = max_time; - else - clock++; - rq->clock_overflows++; - } else { - if (unlikely(delta > rq->clock_max_delta)) - rq->clock_max_delta = delta; - clock += delta; - } - } - - rq->prev_clock_raw = now; - rq->clock = clock; -} - -static void update_rq_clock(struct rq *rq) -{ - if (likely(smp_processor_id() == cpu_of(rq))) - __update_rq_clock(rq); -} - /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. @@ -677,6 +593,11 @@ static void update_rq_clock(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) +static inline void update_rq_clock(struct rq *rq) +{ + rq->clock = sched_clock_cpu(cpu_of(rq)); +} + /* * Tunables that become constants when CONFIG_SCHED_DEBUG is off: */ @@ -689,30 +610,137 @@ static void update_rq_clock(struct rq *rq) /* * Debugging: various feature bits */ + +#define SCHED_FEAT(name, enabled) \ + __SCHED_FEAT_##name , + enum { - SCHED_FEAT_NEW_FAIR_SLEEPERS = 1, - SCHED_FEAT_WAKEUP_PREEMPT = 2, - SCHED_FEAT_START_DEBIT = 4, - SCHED_FEAT_AFFINE_WAKEUPS = 8, - SCHED_FEAT_CACHE_HOT_BUDDY = 16, - SCHED_FEAT_SYNC_WAKEUPS = 32, - SCHED_FEAT_HRTICK = 64, - SCHED_FEAT_DOUBLE_TICK = 128, - SCHED_FEAT_NORMALIZED_SLEEPER = 256, +#include "sched_features.h" }; +#undef SCHED_FEAT + +#define SCHED_FEAT(name, enabled) \ + (1UL << __SCHED_FEAT_##name) * enabled | + const_debug unsigned int sysctl_sched_features = - SCHED_FEAT_NEW_FAIR_SLEEPERS * 1 | - SCHED_FEAT_WAKEUP_PREEMPT * 1 | - SCHED_FEAT_START_DEBIT * 1 | - SCHED_FEAT_AFFINE_WAKEUPS * 1 | - SCHED_FEAT_CACHE_HOT_BUDDY * 1 | - SCHED_FEAT_SYNC_WAKEUPS * 1 | - SCHED_FEAT_HRTICK * 1 | - SCHED_FEAT_DOUBLE_TICK * 0 | - SCHED_FEAT_NORMALIZED_SLEEPER * 1; +#include "sched_features.h" + 0; + +#undef SCHED_FEAT + +#ifdef CONFIG_SCHED_DEBUG +#define SCHED_FEAT(name, enabled) \ + #name , + +static __read_mostly char *sched_feat_names[] = { +#include "sched_features.h" + NULL +}; + +#undef SCHED_FEAT + +static int sched_feat_open(struct inode *inode, struct file *filp) +{ + filp->private_data = inode->i_private; + return 0; +} + +static ssize_t +sched_feat_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf; + int r = 0; + int len = 0; + int i; + + for (i = 0; sched_feat_names[i]; i++) { + len += strlen(sched_feat_names[i]); + len += 4; + } + + buf = kmalloc(len + 2, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + for (i = 0; sched_feat_names[i]; i++) { + if (sysctl_sched_features & (1UL << i)) + r += sprintf(buf + r, "%s ", sched_feat_names[i]); + else + r += sprintf(buf + r, "NO_%s ", sched_feat_names[i]); + } + + r += sprintf(buf + r, "\n"); + WARN_ON(r >= len + 2); + + r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r); + + kfree(buf); + + return r; +} + +static ssize_t +sched_feat_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + char *cmp = buf; + int neg = 0; + int i; + + if (cnt > 63) + cnt = 63; -#define sched_feat(x) (sysctl_sched_features & SCHED_FEAT_##x) + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + if (strncmp(buf, "NO_", 3) == 0) { + neg = 1; + cmp += 3; + } + + for (i = 0; sched_feat_names[i]; i++) { + int len = strlen(sched_feat_names[i]); + + if (strncmp(cmp, sched_feat_names[i], len) == 0) { + if (neg) + sysctl_sched_features &= ~(1UL << i); + else + sysctl_sched_features |= (1UL << i); + break; + } + } + + if (!sched_feat_names[i]) + return -EINVAL; + + filp->f_pos += cnt; + + return cnt; +} + +static struct file_operations sched_feat_fops = { + .open = sched_feat_open, + .read = sched_feat_read, + .write = sched_feat_write, +}; + +static __init int sched_init_debug(void) +{ + debugfs_create_file("sched_features", 0644, NULL, NULL, + &sched_feat_fops); + + return 0; +} +late_initcall(sched_init_debug); + +#endif + +#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x)) /* * Number of tasks to iterate in a single balance run. @@ -747,7 +775,7 @@ static inline u64 global_rt_runtime(void) return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC; } -static const unsigned long long time_sync_thresh = 100000; +unsigned long long time_sync_thresh = 100000; static DEFINE_PER_CPU(unsigned long long, time_offset); static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); @@ -761,11 +789,14 @@ static DEFINE_PER_CPU(unsigned long long, prev_cpu_time); static DEFINE_SPINLOCK(time_sync_lock); static unsigned long long prev_global_time; -static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) +static unsigned long long __sync_cpu_clock(unsigned long long time, int cpu) { - unsigned long flags; - - spin_lock_irqsave(&time_sync_lock, flags); + /* + * We want this inlined, to not get tracer function calls + * in this critical section: + */ + spin_acquire(&time_sync_lock.dep_map, 0, 0, _THIS_IP_); + __raw_spin_lock(&time_sync_lock.raw_lock); if (time < prev_global_time) { per_cpu(time_offset, cpu) += prev_global_time - time; @@ -774,7 +805,8 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) prev_global_time = time; } - spin_unlock_irqrestore(&time_sync_lock, flags); + __raw_spin_unlock(&time_sync_lock.raw_lock); + spin_release(&time_sync_lock.dep_map, 1, _THIS_IP_); return time; } @@ -782,8 +814,6 @@ static unsigned long long __sync_cpu_clock(cycles_t time, int cpu) static unsigned long long __cpu_clock(int cpu) { unsigned long long now; - unsigned long flags; - struct rq *rq; /* * Only call sched_clock() if the scheduler has already been @@ -792,11 +822,7 @@ static unsigned long long __cpu_clock(int cpu) if (unlikely(!scheduler_running)) return 0; - local_irq_save(flags); - rq = cpu_rq(cpu); - update_rq_clock(rq); - now = rq->clock; - local_irq_restore(flags); + now = sched_clock_cpu(cpu); return now; } @@ -808,13 +834,18 @@ static unsigned long long __cpu_clock(int cpu) unsigned long long cpu_clock(int cpu) { unsigned long long prev_cpu_time, time, delta_time; + unsigned long flags; + local_irq_save(flags); prev_cpu_time = per_cpu(prev_cpu_time, cpu); time = __cpu_clock(cpu) + per_cpu(time_offset, cpu); delta_time = time-prev_cpu_time; - if (unlikely(delta_time > time_sync_thresh)) + if (unlikely(delta_time > time_sync_thresh)) { time = __sync_cpu_clock(time, cpu); + per_cpu(prev_cpu_time, cpu) = time; + } + local_irq_restore(flags); return time; } @@ -965,43 +996,6 @@ static struct rq *this_rq_lock(void) return rq; } -/* - * We are going deep-idle (irqs are disabled): - */ -void sched_clock_idle_sleep_event(void) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - - spin_lock(&rq->lock); - __update_rq_clock(rq); - spin_unlock(&rq->lock); - rq->clock_deep_idle_events++; -} -EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event); - -/* - * We just idled delta nanoseconds (called with irqs disabled): - */ -void sched_clock_idle_wakeup_event(u64 delta_ns) -{ - struct rq *rq = cpu_rq(smp_processor_id()); - u64 now = sched_clock(); - - rq->idle_clock += delta_ns; - /* - * Override the previous timestamp and ignore all - * sched_clock() deltas that occured while we idled, - * and use the PM-provided delta_ns to advance the - * rq clock: - */ - spin_lock(&rq->lock); - rq->prev_clock_raw = now; - rq->clock += delta_ns; - spin_unlock(&rq->lock); - touch_softlockup_watchdog(); -} -EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event); - static void __resched_task(struct task_struct *p, int tif_bit); static inline void resched_task(struct task_struct *p) @@ -1037,6 +1031,7 @@ static inline void resched_rq(struct rq *rq) enum { HRTICK_SET, /* re-programm hrtick_timer */ HRTICK_RESET, /* not a new slice */ + HRTICK_BLOCK, /* stop hrtick operations */ }; /* @@ -1048,6 +1043,8 @@ static inline int hrtick_enabled(struct rq *rq) { if (!sched_feat(HRTICK)) return 0; + if (unlikely(test_bit(HRTICK_BLOCK, &rq->hrtick_flags))) + return 0; return hrtimer_is_hres_active(&rq->hrtick_timer); } @@ -1123,14 +1120,72 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer) WARN_ON_ONCE(cpu_of(rq) != smp_processor_id()); spin_lock(&rq->lock); - __update_rq_clock(rq); + update_rq_clock(rq); rq->curr->sched_class->task_tick(rq, rq->curr, 1); spin_unlock(&rq->lock); return HRTIMER_NORESTART; } -static inline void init_rq_hrtick(struct rq *rq) +#ifdef CONFIG_SMP +static void hotplug_hrtick_disable(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + rq->hrtick_flags = 0; + __set_bit(HRTICK_BLOCK, &rq->hrtick_flags); + spin_unlock_irqrestore(&rq->lock, flags); + + hrtick_clear(rq); +} + +static void hotplug_hrtick_enable(int cpu) +{ + struct rq *rq = cpu_rq(cpu); + unsigned long flags; + + spin_lock_irqsave(&rq->lock, flags); + __clear_bit(HRTICK_BLOCK, &rq->hrtick_flags); + spin_unlock_irqrestore(&rq->lock, flags); +} + +static int +hotplug_hrtick(struct notifier_block *nfb, unsigned long action, void *hcpu) +{ + int cpu = (int)(long)hcpu; + + switch (action) { + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + case CPU_DEAD: + case CPU_DEAD_FROZEN: + hotplug_hrtick_disable(cpu); + return NOTIFY_OK; + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + hotplug_hrtick_enable(cpu); + return NOTIFY_OK; + } + + return NOTIFY_DONE; +} + +static void init_hrtick(void) +{ + hotcpu_notifier(hotplug_hrtick, 0); +} +#endif /* CONFIG_SMP */ + +static void init_rq_hrtick(struct rq *rq) { rq->hrtick_flags = 0; hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); @@ -1167,6 +1222,10 @@ static inline void init_rq_hrtick(struct rq *rq) void hrtick_resched(void) { } + +static inline void init_hrtick(void) +{ +} #endif /* @@ -1283,8 +1342,13 @@ calc_delta_mine(unsigned long delta_exec, unsigned long weight, { u64 tmp; - if (unlikely(!lw->inv_weight)) - lw->inv_weight = (WMULT_CONST-lw->weight/2) / (lw->weight+1); + if (!lw->inv_weight) { + if (BITS_PER_LONG > 32 && unlikely(lw->weight >= WMULT_CONST)) + lw->inv_weight = 1; + else + lw->inv_weight = 1 + (WMULT_CONST-lw->weight/2) + / (lw->weight+1); + } tmp = (u64)delta_exec * weight; /* @@ -1402,11 +1466,29 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime); static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {} #endif +static inline void inc_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_add(&rq->load, load); +} + +static inline void dec_cpu_load(struct rq *rq, unsigned long load) +{ + update_load_sub(&rq->load, load); +} + #ifdef CONFIG_SMP static unsigned long source_load(int cpu, int type); static unsigned long target_load(int cpu, int type); static unsigned long cpu_avg_load_per_task(int cpu); static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd); +#else /* CONFIG_SMP */ + +#ifdef CONFIG_FAIR_GROUP_SCHED +static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) +{ +} +#endif + #endif /* CONFIG_SMP */ #include "sched_stats.h" @@ -2119,6 +2201,7 @@ static void __sched_fork(struct task_struct *p) INIT_LIST_HEAD(&p->rt.run_list); p->se.on_rq = 0; + INIT_LIST_HEAD(&p->se.group_node); #ifdef CONFIG_PREEMPT_NOTIFIERS INIT_HLIST_HEAD(&p->preempt_notifiers); @@ -3856,8 +3939,10 @@ void account_system_time(struct task_struct *p, int hardirq_offset, struct rq *rq = this_rq(); cputime64_t tmp; - if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) - return account_guest_time(p, cputime); + if ((p->flags & PF_VCPU) && (irq_count() - hardirq_offset == 0)) { + account_guest_time(p, cputime); + return; + } p->stime = cputime_add(p->stime, cputime); @@ -3921,19 +4006,11 @@ void scheduler_tick(void) int cpu = smp_processor_id(); struct rq *rq = cpu_rq(cpu); struct task_struct *curr = rq->curr; - u64 next_tick = rq->tick_timestamp + TICK_NSEC; + + sched_clock_tick(); spin_lock(&rq->lock); - __update_rq_clock(rq); - /* - * Let rq->clock advance by at least TICK_NSEC: - */ - if (unlikely(rq->clock < next_tick)) { - rq->clock = next_tick; - rq->clock_underflows++; - } - rq->tick_timestamp = rq->clock; - update_last_tick_seen(rq); + update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); spin_unlock(&rq->lock); @@ -4012,7 +4089,7 @@ static inline void schedule_debug(struct task_struct *prev) * schedule() atomically, we ignore that path for now. * Otherwise, whine if we are scheduling when we should not be. */ - if (unlikely(in_atomic_preempt_off()) && unlikely(!prev->exit_state)) + if (unlikely(in_atomic_preempt_off() && !prev->exit_state)) __schedule_bug(prev); profile_hit(SCHED_PROFILING, __builtin_return_address(0)); @@ -4087,17 +4164,15 @@ need_resched_nonpreemptible: * Do the rq-clock update outside the rq lock: */ local_irq_disable(); - __update_rq_clock(rq); + update_rq_clock(rq); spin_lock(&rq->lock); clear_tsk_need_resched(prev); if (prev->state && !(preempt_count() & PREEMPT_ACTIVE)) { - if (unlikely((prev->state & TASK_INTERRUPTIBLE) && - signal_pending(prev))) { + if (unlikely(signal_pending_state(prev->state, prev))) prev->state = TASK_RUNNING; - } else { + else deactivate_task(rq, prev, 1); - } switch_count = &prev->nvcsw; } @@ -4112,9 +4187,9 @@ need_resched_nonpreemptible: prev->sched_class->put_prev_task(rq, prev); next = pick_next_task(rq, prev); - sched_info_switch(prev, next); - if (likely(prev != next)) { + sched_info_switch(prev, next); + rq->nr_switches++; rq->curr = next; ++*switch_count; @@ -4149,8 +4224,6 @@ EXPORT_SYMBOL(schedule); asmlinkage void __sched preempt_schedule(void) { struct thread_info *ti = current_thread_info(); - struct task_struct *task = current; - int saved_lock_depth; /* * If there is a non-zero preempt_count or interrupts are disabled, @@ -4161,16 +4234,7 @@ asmlinkage void __sched preempt_schedule(void) do { add_preempt_count(PREEMPT_ACTIVE); - - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; schedule(); - task->lock_depth = saved_lock_depth; sub_preempt_count(PREEMPT_ACTIVE); /* @@ -4191,26 +4255,15 @@ EXPORT_SYMBOL(preempt_schedule); asmlinkage void __sched preempt_schedule_irq(void) { struct thread_info *ti = current_thread_info(); - struct task_struct *task = current; - int saved_lock_depth; /* Catch callers which need to be fixed */ BUG_ON(ti->preempt_count || !irqs_disabled()); do { add_preempt_count(PREEMPT_ACTIVE); - - /* - * We keep the big kernel semaphore locked, but we - * clear ->lock_depth so that schedule() doesnt - * auto-release the semaphore: - */ - saved_lock_depth = task->lock_depth; - task->lock_depth = -1; local_irq_enable(); schedule(); local_irq_disable(); - task->lock_depth = saved_lock_depth; sub_preempt_count(PREEMPT_ACTIVE); /* @@ -5132,7 +5185,6 @@ static void __cond_resched(void) } while (need_resched()); } -#if !defined(CONFIG_PREEMPT) || defined(CONFIG_PREEMPT_VOLUNTARY) int __sched _cond_resched(void) { if (need_resched() && !(preempt_count() & PREEMPT_ACTIVE) && @@ -5143,7 +5195,6 @@ int __sched _cond_resched(void) return 0; } EXPORT_SYMBOL(_cond_resched); -#endif /* * cond_resched_lock() - if a reschedule is pending, drop the given lock, @@ -5438,8 +5489,11 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ +#if defined(CONFIG_PREEMPT) + task_thread_info(idle)->preempt_count = (idle->lock_depth >= 0); +#else task_thread_info(idle)->preempt_count = 0; - +#endif /* * The idle tasks have their own, simple scheduling class: */ @@ -6512,6 +6566,7 @@ static int find_next_best_node(int node, nodemask_t *used_nodes) /** * sched_domain_node_span - get a cpumask for a node's sched_domain * @node: node whose cpumask we're constructing + * @span: resulting cpumask * * Given a node, construct a good cpumask for its sched_domain to span. It * should be one that prevents unnecessary balancing, but also spreads tasks @@ -7168,8 +7223,8 @@ static int build_sched_domains(const cpumask_t *cpu_map) static cpumask_t *doms_cur; /* current sched domains */ static int ndoms_cur; /* number of sched domains in 'doms_cur' */ -static struct sched_domain_attr *dattr_cur; /* attribues of custom domains - in 'doms_cur' */ +static struct sched_domain_attr *dattr_cur; + /* attribues of custom domains in 'doms_cur' */ /* * Special case: If a kmalloc of a doms_cur partition (array of @@ -7269,7 +7324,7 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, { int i, j; - lock_doms_cur(); + mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); @@ -7318,7 +7373,7 @@ match2: register_sched_domain_sysctl(); - unlock_doms_cur(); + mutex_unlock(&sched_domains_mutex); } #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) @@ -7327,8 +7382,10 @@ int arch_reinit_sched_domains(void) int err; get_online_cpus(); + mutex_lock(&sched_domains_mutex); detach_destroy_domains(&cpu_online_map); err = arch_init_sched_domains(&cpu_online_map); + mutex_unlock(&sched_domains_mutex); put_online_cpus(); return err; @@ -7446,13 +7503,16 @@ void __init sched_init_smp(void) BUG_ON(sched_group_nodes_bycpu == NULL); #endif get_online_cpus(); + mutex_lock(&sched_domains_mutex); arch_init_sched_domains(&cpu_online_map); cpus_andnot(non_isolated_cpus, cpu_possible_map, cpu_isolated_map); if (cpus_empty(non_isolated_cpus)) cpu_set(smp_processor_id(), non_isolated_cpus); + mutex_unlock(&sched_domains_mutex); put_online_cpus(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0); + init_hrtick(); /* Move init over to a non-isolated CPU */ if (set_cpus_allowed_ptr(current, &non_isolated_cpus) < 0) @@ -7462,11 +7522,6 @@ void __init sched_init_smp(void) #else void __init sched_init_smp(void) { -#if defined(CONFIG_NUMA) - sched_group_nodes_bycpu = kzalloc(nr_cpu_ids * sizeof(void **), - GFP_KERNEL); - BUG_ON(sched_group_nodes_bycpu == NULL); -#endif sched_init_granularity(); } #endif /* CONFIG_SMP */ @@ -7481,6 +7536,7 @@ int in_sched_functions(unsigned long addr) static void init_cfs_rq(struct cfs_rq *cfs_rq, struct rq *rq) { cfs_rq->tasks_timeline = RB_ROOT; + INIT_LIST_HEAD(&cfs_rq->tasks); #ifdef CONFIG_FAIR_GROUP_SCHED cfs_rq->rq = rq; #endif @@ -7543,7 +7599,7 @@ static void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq, se->my_q = cfs_rq; se->load.weight = tg->shares; - se->load.inv_weight = div64_64(1ULL<<32, se->load.weight); + se->load.inv_weight = 0; se->parent = parent; } #endif @@ -7598,7 +7654,7 @@ void __init sched_init(void) * we use alloc_bootmem(). */ if (alloc_size) { - ptr = (unsigned long)alloc_bootmem_low(alloc_size); + ptr = (unsigned long)alloc_bootmem(alloc_size); #ifdef CONFIG_FAIR_GROUP_SCHED init_task_group.se = (struct sched_entity **)ptr; @@ -7666,8 +7722,6 @@ void __init sched_init(void) spin_lock_init(&rq->lock); lockdep_set_class(&rq->lock, &rq->rq_lock_key); rq->nr_running = 0; - rq->clock = 1; - update_last_tick_seen(rq); init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -7811,6 +7865,7 @@ EXPORT_SYMBOL(__might_sleep); static void normalize_task(struct rq *rq, struct task_struct *p) { int on_rq; + update_rq_clock(rq); on_rq = p->se.on_rq; if (on_rq) @@ -7842,7 +7897,6 @@ void normalize_rt_tasks(void) p->se.sleep_start = 0; p->se.block_start = 0; #endif - task_rq(p)->clock = 0; if (!rt_task(p)) { /* @@ -8212,7 +8266,7 @@ static void set_se_shares(struct sched_entity *se, unsigned long shares) dequeue_entity(cfs_rq, se, 0); se->load.weight = shares; - se->load.inv_weight = div64_64((1ULL<<32), shares); + se->load.inv_weight = 0; if (on_rq) enqueue_entity(cfs_rq, se, 0); @@ -8233,13 +8287,10 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares) if (!tg->se[0]) return -EINVAL; - /* - * A weight of 0 or 1 can cause arithmetics problems. - * (The default weight is 1024 - so there's no practical - * limitation from this.) - */ - if (shares < 2) - shares = 2; + if (shares < MIN_SHARES) + shares = MIN_SHARES; + else if (shares > MAX_SHARES) + shares = MAX_SHARES; mutex_lock(&shares_mutex); if (tg->shares == shares) @@ -8293,7 +8344,7 @@ static unsigned long to_ratio(u64 period, u64 runtime) if (runtime == RUNTIME_INF) return 1ULL << 16; - return div64_64(runtime << 16, period); + return div64_u64(runtime << 16, period); } #ifdef CONFIG_CGROUP_SCHED @@ -8563,13 +8614,13 @@ cpu_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, } #ifdef CONFIG_FAIR_GROUP_SCHED -static int cpu_shares_write_uint(struct cgroup *cgrp, struct cftype *cftype, +static int cpu_shares_write_u64(struct cgroup *cgrp, struct cftype *cftype, u64 shareval) { return sched_group_set_shares(cgroup_tg(cgrp), shareval); } -static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) +static u64 cpu_shares_read_u64(struct cgroup *cgrp, struct cftype *cft) { struct task_group *tg = cgroup_tg(cgrp); @@ -8578,49 +8629,15 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft) #endif #ifdef CONFIG_RT_GROUP_SCHED -static ssize_t cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - const char __user *userbuf, - size_t nbytes, loff_t *unused_ppos) +static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft, + s64 val) { - char buffer[64]; - int retval = 0; - s64 val; - char *end; - - if (!nbytes) - return -EINVAL; - if (nbytes >= sizeof(buffer)) - return -E2BIG; - if (copy_from_user(buffer, userbuf, nbytes)) - return -EFAULT; - - buffer[nbytes] = 0; /* nul-terminate */ - - /* strip newline if necessary */ - if (nbytes && (buffer[nbytes-1] == '\n')) - buffer[nbytes-1] = 0; - val = simple_strtoll(buffer, &end, 0); - if (*end) - return -EINVAL; - - /* Pass to subsystem */ - retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val); - if (!retval) - retval = nbytes; - return retval; + return sched_group_set_rt_runtime(cgroup_tg(cgrp), val); } -static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft, - struct file *file, - char __user *buf, size_t nbytes, - loff_t *ppos) +static s64 cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft) { - char tmp[64]; - long val = sched_group_rt_runtime(cgroup_tg(cgrp)); - int len = sprintf(tmp, "%ld\n", val); - - return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); + return sched_group_rt_runtime(cgroup_tg(cgrp)); } static int cpu_rt_period_write_uint(struct cgroup *cgrp, struct cftype *cftype, @@ -8639,20 +8656,20 @@ static struct cftype cpu_files[] = { #ifdef CONFIG_FAIR_GROUP_SCHED { .name = "shares", - .read_uint = cpu_shares_read_uint, - .write_uint = cpu_shares_write_uint, + .read_u64 = cpu_shares_read_u64, + .write_u64 = cpu_shares_write_u64, }, #endif #ifdef CONFIG_RT_GROUP_SCHED { .name = "rt_runtime_us", - .read = cpu_rt_runtime_read, - .write = cpu_rt_runtime_write, + .read_s64 = cpu_rt_runtime_read, + .write_s64 = cpu_rt_runtime_write, }, { .name = "rt_period_us", - .read_uint = cpu_rt_period_read_uint, - .write_uint = cpu_rt_period_write_uint, + .read_u64 = cpu_rt_period_read_uint, + .write_u64 = cpu_rt_period_write_uint, }, #endif }; @@ -8783,8 +8800,8 @@ out: static struct cftype files[] = { { .name = "usage", - .read_uint = cpuusage_read, - .write_uint = cpuusage_write, + .read_u64 = cpuusage_read, + .write_u64 = cpuusage_write, }, };