X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fsched.c;h=5f889d0cbfcc2e614aef87963ec0bea5ea6a5f21;hb=9d8f53f2bba3c2c06e1e78126222aecf91f8ecdd;hp=54ce787b6207ff8077b1ab68308e2bedf5fcb9e4;hpb=674311d5b411e9042df4fdf7aef0b3c8217b6240;p=linux-2.6 diff --git a/kernel/sched.c b/kernel/sched.c index 54ce787b62..5f889d0cbf 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -262,7 +262,7 @@ static DEFINE_PER_CPU(struct runqueue, runqueues); /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. - * See update_sched_domains: synchronize_kernel for details. + * See detach_destroy_domains: synchronize_sched for details. * * The domain tree of any CPU may only be accessed from within * preempt-disabled sections. @@ -673,7 +673,7 @@ static inline void __activate_idle_task(task_t *p, runqueue_t *rq) rq->nr_running++; } -static void recalc_task_prio(task_t *p, unsigned long long now) +static int recalc_task_prio(task_t *p, unsigned long long now) { /* Caller must always ensure 'now >= p->timestamp' */ unsigned long long __sleep_time = now - p->timestamp; @@ -732,7 +732,7 @@ static void recalc_task_prio(task_t *p, unsigned long long now) } } - p->prio = effective_prio(p); + return effective_prio(p); } /* @@ -755,7 +755,7 @@ static void activate_task(task_t *p, runqueue_t *rq, int local) } #endif - recalc_task_prio(p, now); + p->prio = recalc_task_prio(p, now); /* * This checks to make sure it's not an uninterruptible task @@ -1021,8 +1021,59 @@ static int find_idlest_cpu(struct sched_group *group, int this_cpu) return idlest; } +/* + * sched_balance_self: balance the current task (running on cpu) in domains + * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and + * SD_BALANCE_EXEC. + * + * Balance, ie. select the least loaded group. + * + * Returns the target CPU number, or the same CPU if no balancing is needed. + * + * preempt must be disabled. + */ +static int sched_balance_self(int cpu, int flag) +{ + struct task_struct *t = current; + struct sched_domain *tmp, *sd = NULL; -#endif + for_each_domain(cpu, tmp) + if (tmp->flags & flag) + sd = tmp; + + while (sd) { + cpumask_t span; + struct sched_group *group; + int new_cpu; + int weight; + + span = sd->span; + group = find_idlest_group(sd, t, cpu); + if (!group) + goto nextlevel; + + new_cpu = find_idlest_cpu(group, cpu); + if (new_cpu == -1 || new_cpu == cpu) + goto nextlevel; + + /* Now try balancing at a lower domain level */ + cpu = new_cpu; +nextlevel: + sd = NULL; + weight = cpus_weight(span); + for_each_domain(cpu, tmp) { + if (weight <= cpus_weight(tmp->span)) + break; + if (tmp->flags & flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + return cpu; +} + +#endif /* CONFIG_SMP */ /* * wake_idle() will wake a task on an idle cpu if task->cpu is @@ -1240,8 +1291,15 @@ int fastcall wake_up_state(task_t *p, unsigned int state) * Perform scheduler related setup for a newly forked process p. * p is forked by current. */ -void fastcall sched_fork(task_t *p) +void fastcall sched_fork(task_t *p, int clone_flags) { + int cpu = get_cpu(); + +#ifdef CONFIG_SMP + cpu = sched_balance_self(cpu, SD_BALANCE_FORK); +#endif + set_task_cpu(p, cpu); + /* * We mark the process as running here, but have not actually * inserted it onto the runqueue yet. This guarantees that @@ -1282,12 +1340,10 @@ void fastcall sched_fork(task_t *p) * runqueue lock is not a problem. */ current->time_slice = 1; - preempt_disable(); scheduler_tick(); - local_irq_enable(); - preempt_enable(); - } else - local_irq_enable(); + } + local_irq_enable(); + put_cpu(); } /* @@ -1302,64 +1358,12 @@ void fastcall wake_up_new_task(task_t * p, unsigned long clone_flags) unsigned long flags; int this_cpu, cpu; runqueue_t *rq, *this_rq; -#ifdef CONFIG_SMP - struct sched_domain *tmp, *sd = NULL; -#endif rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_RUNNING); this_cpu = smp_processor_id(); cpu = task_cpu(p); -#ifdef CONFIG_SMP - for_each_domain(cpu, tmp) - if (tmp->flags & SD_BALANCE_FORK) - sd = tmp; - - if (sd) { - cpumask_t span; - int new_cpu; - struct sched_group *group; - -again: - schedstat_inc(sd, sbf_cnt); - span = sd->span; - cpu = task_cpu(p); - group = find_idlest_group(sd, p, cpu); - if (!group) { - schedstat_inc(sd, sbf_balanced); - goto nextlevel; - } - - new_cpu = find_idlest_cpu(group, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - schedstat_inc(sd, sbf_balanced); - goto nextlevel; - } - - if (cpu_isset(new_cpu, p->cpus_allowed)) { - schedstat_inc(sd, sbf_pushed); - set_task_cpu(p, new_cpu); - task_rq_unlock(rq, &flags); - rq = task_rq_lock(p, &flags); - cpu = task_cpu(p); - } - - /* Now try balancing at a lower domain level */ -nextlevel: - sd = NULL; - for_each_domain(cpu, tmp) { - if (cpus_subset(span, tmp->span)) - break; - if (tmp->flags & SD_BALANCE_FORK) - sd = tmp; - } - - if (sd) - goto again; - } - -#endif /* * We decrease the sleep average of forking parents * and children as well, to keep max-interactive tasks @@ -1708,58 +1712,16 @@ out: } /* - * sched_exec(): find the highest-level, exec-balance-capable - * domain and try to migrate the task to the least loaded CPU. - * - * execve() is a valuable balancing opportunity, because at this point - * the task has the smallest effective memory and cache footprint. + * sched_exec - execve() is a valuable balancing opportunity, because at + * this point the task has the smallest effective memory and cache footprint. */ void sched_exec(void) { - struct sched_domain *tmp, *sd = NULL; int new_cpu, this_cpu = get_cpu(); - - for_each_domain(this_cpu, tmp) - if (tmp->flags & SD_BALANCE_EXEC) - sd = tmp; - - if (sd) { - cpumask_t span; - struct sched_group *group; -again: - schedstat_inc(sd, sbe_cnt); - span = sd->span; - group = find_idlest_group(sd, current, this_cpu); - if (!group) { - schedstat_inc(sd, sbe_balanced); - goto nextlevel; - } - new_cpu = find_idlest_cpu(group, this_cpu); - if (new_cpu == -1 || new_cpu == this_cpu) { - schedstat_inc(sd, sbe_balanced); - goto nextlevel; - } - - schedstat_inc(sd, sbe_pushed); - put_cpu(); - sched_migrate_task(current, new_cpu); - - /* Now try balancing at a lower domain level */ - this_cpu = get_cpu(); -nextlevel: - sd = NULL; - for_each_domain(this_cpu, tmp) { - if (cpus_subset(span, tmp->span)) - break; - if (tmp->flags & SD_BALANCE_EXEC) - sd = tmp; - } - - if (sd) - goto again; - } - + new_cpu = sched_balance_self(this_cpu, SD_BALANCE_EXEC); put_cpu(); + if (new_cpu != this_cpu) + sched_migrate_task(current, new_cpu); } /* @@ -2068,6 +2030,12 @@ static runqueue_t *find_busiest_queue(struct sched_group *group) return busiest; } +/* + * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but + * so long as it is large enough. + */ +#define MAX_PINNED_INTERVAL 512 + /* * Check this_cpu to ensure it is balanced within domain. Attempt to move * tasks if there is an imbalance. @@ -2080,7 +2048,7 @@ static int load_balance(int this_cpu, runqueue_t *this_rq, struct sched_group *group; runqueue_t *busiest; unsigned long imbalance; - int nr_moved, all_pinned; + int nr_moved, all_pinned = 0; int active_balance = 0; spin_lock(&this_rq->lock); @@ -2171,7 +2139,8 @@ out_balanced: sd->nr_balance_failed = 0; /* tune up the balancing interval */ - if (sd->balance_interval < sd->max_interval) + if ((all_pinned && sd->balance_interval < MAX_PINNED_INTERVAL) || + (sd->balance_interval < sd->max_interval)) sd->balance_interval *= 2; return 0; @@ -2782,7 +2751,7 @@ asmlinkage void __sched schedule(void) struct list_head *queue; unsigned long long now; unsigned long run_time; - int cpu, idx; + int cpu, idx, new_prio; /* * Test if we are atomic. Since do_exit() needs to call into @@ -2904,9 +2873,14 @@ go_idle: delta = delta * (ON_RUNQUEUE_WEIGHT * 128 / 100) / 128; array = next->array; - dequeue_task(next, array); - recalc_task_prio(next, next->timestamp + delta); - enqueue_task(next, array); + new_prio = recalc_task_prio(next, next->timestamp + delta); + + if (unlikely(next->prio != new_prio)) { + dequeue_task(next, array); + next->prio = new_prio; + enqueue_task(next, array); + } else + requeue_task(next, array); } next->activated = 0; switch_tasks: @@ -3404,8 +3378,8 @@ EXPORT_SYMBOL(set_user_nice); */ int can_nice(const task_t *p, const int nice) { - /* convert nice value [19,-20] to rlimit style value [0,39] */ - int nice_rlim = 19 - nice; + /* convert nice value [19,-20] to rlimit style value [1,40] */ + int nice_rlim = 20 - nice; return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur || capable(CAP_SYS_NICE)); } @@ -3474,15 +3448,7 @@ int task_nice(const task_t *p) { return TASK_NICE(p); } - -/* - * The only users of task_nice are binfmt_elf and binfmt_elf32. - * binfmt_elf is no longer modular, but binfmt_elf32 still is. - * Therefore, task_nice is needed if there is a compat_mode. - */ -#ifdef CONFIG_COMPAT EXPORT_SYMBOL_GPL(task_nice); -#endif /** * idle_cpu - is a given cpu idle currently? @@ -3520,7 +3486,7 @@ static void __setscheduler(struct task_struct *p, int policy, int prio) p->policy = policy; p->rt_priority = prio; if (policy != SCHED_NORMAL) - p->prio = MAX_USER_RT_PRIO-1 - p->rt_priority; + p->prio = MAX_RT_PRIO-1 - p->rt_priority; else p->prio = p->static_prio; } @@ -3552,18 +3518,31 @@ recheck: * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL is 0. */ if (param->sched_priority < 0 || - param->sched_priority > MAX_USER_RT_PRIO-1) + (p->mm && param->sched_priority > MAX_USER_RT_PRIO-1) || + (!p->mm && param->sched_priority > MAX_RT_PRIO-1)) return -EINVAL; if ((policy == SCHED_NORMAL) != (param->sched_priority == 0)) return -EINVAL; - if ((policy == SCHED_FIFO || policy == SCHED_RR) && - param->sched_priority > p->signal->rlim[RLIMIT_RTPRIO].rlim_cur && - !capable(CAP_SYS_NICE)) - return -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_NICE)) - return -EPERM; + /* + * Allow unprivileged RT tasks to decrease priority: + */ + if (!capable(CAP_SYS_NICE)) { + /* can't change policy */ + if (policy != p->policy && + !p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) + return -EPERM; + /* can't increase priority */ + if (policy != SCHED_NORMAL && + param->sched_priority > p->rt_priority && + param->sched_priority > + p->signal->rlim[RLIMIT_RTPRIO].rlim_cur) + return -EPERM; + /* can't change other user's priorities */ + if ((current->euid != p->euid) && + (current->euid != p->uid)) + return -EPERM; + } retval = security_task_setscheduler(p, policy, param); if (retval) @@ -3900,6 +3879,13 @@ asmlinkage long sys_sched_yield(void) static inline void __cond_resched(void) { + /* + * The BKS might be reacquired before we have dropped + * PREEMPT_ACTIVE, which could trigger a second + * cond_resched() call. + */ + if (unlikely(preempt_count())) + return; do { add_preempt_count(PREEMPT_ACTIVE); schedule(); @@ -4189,6 +4175,14 @@ void show_state(void) read_unlock(&tasklist_lock); } +/** + * init_idle - set up an idle thread for a given CPU + * @idle: task in question + * @cpu: cpu the idle task belongs to + * + * NOTE: this function does not set the idle thread's NEED_RESCHED + * flag, to make booting more robust. + */ void __devinit init_idle(task_t *idle, int cpu) { runqueue_t *rq = cpu_rq(cpu); @@ -4206,7 +4200,6 @@ void __devinit init_idle(task_t *idle, int cpu) #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW) idle->oncpu = 1; #endif - set_tsk_need_resched(idle); spin_unlock_irqrestore(&rq->lock, flags); /* Set the preempt count _outside_ the spinlocks! */ @@ -4350,8 +4343,7 @@ static int migration_thread(void * data) struct list_head *head; migration_req_t *req; - if (current->flags & PF_FREEZE) - refrigerator(PF_FREEZE); + try_to_freeze(); spin_lock_irq(&rq->lock); @@ -4639,7 +4631,7 @@ int __init migration_init(void) #endif #ifdef CONFIG_SMP -#define SCHED_DOMAIN_DEBUG +#undef SCHED_DOMAIN_DEBUG #ifdef SCHED_DOMAIN_DEBUG static void sched_domain_debug(struct sched_domain *sd, int cpu) { @@ -4732,7 +4724,7 @@ static void sched_domain_debug(struct sched_domain *sd, int cpu) #define sched_domain_debug(sd, cpu) {} #endif -static int __devinit sd_degenerate(struct sched_domain *sd) +static int sd_degenerate(struct sched_domain *sd) { if (cpus_weight(sd->span) == 1) return 1; @@ -4755,7 +4747,7 @@ static int __devinit sd_degenerate(struct sched_domain *sd) return 1; } -static int __devinit sd_parent_degenerate(struct sched_domain *sd, +static int sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) { unsigned long cflags = sd->flags, pflags = parent->flags; @@ -4787,7 +4779,7 @@ static int __devinit sd_parent_degenerate(struct sched_domain *sd, * Attach the domain 'sd' to 'cpu' as its base domain. Callers must * hold the hotplug lock. */ -void __devinit cpu_attach_domain(struct sched_domain *sd, int cpu) +void cpu_attach_domain(struct sched_domain *sd, int cpu) { runqueue_t *rq = cpu_rq(cpu); struct sched_domain *tmp; @@ -4838,7 +4830,7 @@ __setup ("isolcpus=", isolated_cpu_setup); * covered by the given span, and will set each group's ->cpumask correctly, * and ->cpu_power to 0. */ -void __devinit init_sched_build_groups(struct sched_group groups[], +void init_sched_build_groups(struct sched_group groups[], cpumask_t span, int (*group_fn)(int cpu)) { struct sched_group *first = NULL, *last = NULL; @@ -4874,13 +4866,14 @@ void __devinit init_sched_build_groups(struct sched_group groups[], #ifdef ARCH_HAS_SCHED_DOMAIN -extern void __devinit arch_init_sched_domains(void); -extern void __devinit arch_destroy_sched_domains(void); +extern void build_sched_domains(const cpumask_t *cpu_map); +extern void arch_init_sched_domains(const cpumask_t *cpu_map); +extern void arch_destroy_sched_domains(const cpumask_t *cpu_map); #else #ifdef CONFIG_SCHED_SMT static DEFINE_PER_CPU(struct sched_domain, cpu_domains); static struct sched_group sched_group_cpus[NR_CPUS]; -static int __devinit cpu_to_cpu_group(int cpu) +static int cpu_to_cpu_group(int cpu) { return cpu; } @@ -4888,7 +4881,7 @@ static int __devinit cpu_to_cpu_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, phys_domains); static struct sched_group sched_group_phys[NR_CPUS]; -static int __devinit cpu_to_phys_group(int cpu) +static int cpu_to_phys_group(int cpu) { #ifdef CONFIG_SCHED_SMT return first_cpu(cpu_sibling_map[cpu]); @@ -4901,7 +4894,7 @@ static int __devinit cpu_to_phys_group(int cpu) static DEFINE_PER_CPU(struct sched_domain, node_domains); static struct sched_group sched_group_nodes[MAX_NUMNODES]; -static int __devinit cpu_to_node_group(int cpu) +static int cpu_to_node_group(int cpu) { return cpu_to_node(cpu); } @@ -4932,39 +4925,28 @@ static void check_sibling_maps(void) #endif /* - * Set up scheduler domains and groups. Callers must hold the hotplug lock. + * Build sched domains for a given set of cpus and attach the sched domains + * to the individual cpus */ -static void __devinit arch_init_sched_domains(void) +static void build_sched_domains(const cpumask_t *cpu_map) { int i; - cpumask_t cpu_default_map; - -#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) - check_sibling_maps(); -#endif - /* - * Setup mask for cpus without special case scheduling requirements. - * For now this just excludes isolated cpus, but could be used to - * exclude other special cases in the future. - */ - cpus_complement(cpu_default_map, cpu_isolated_map); - cpus_and(cpu_default_map, cpu_default_map, cpu_online_map); /* - * Set up domains. Isolated domains just stay on the NULL domain. + * Set up domains for cpus specified by the cpu_map. */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { int group; struct sched_domain *sd = NULL, *p; cpumask_t nodemask = node_to_cpumask(cpu_to_node(i)); - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); #ifdef CONFIG_NUMA sd = &per_cpu(node_domains, i); group = cpu_to_node_group(i); *sd = SD_NODE_INIT; - sd->span = cpu_default_map; + sd->span = *cpu_map; sd->groups = &sched_group_nodes[group]; #endif @@ -4982,7 +4964,7 @@ static void __devinit arch_init_sched_domains(void) group = cpu_to_cpu_group(i); *sd = SD_SIBLING_INIT; sd->span = cpu_sibling_map[i]; - cpus_and(sd->span, sd->span, cpu_default_map); + cpus_and(sd->span, sd->span, *cpu_map); sd->parent = p; sd->groups = &sched_group_cpus[group]; #endif @@ -4992,7 +4974,7 @@ static void __devinit arch_init_sched_domains(void) /* Set up CPU (sibling) groups */ for_each_online_cpu(i) { cpumask_t this_sibling_map = cpu_sibling_map[i]; - cpus_and(this_sibling_map, this_sibling_map, cpu_default_map); + cpus_and(this_sibling_map, this_sibling_map, *cpu_map); if (i != first_cpu(this_sibling_map)) continue; @@ -5005,7 +4987,7 @@ static void __devinit arch_init_sched_domains(void) for (i = 0; i < MAX_NUMNODES; i++) { cpumask_t nodemask = node_to_cpumask(i); - cpus_and(nodemask, nodemask, cpu_default_map); + cpus_and(nodemask, nodemask, *cpu_map); if (cpus_empty(nodemask)) continue; @@ -5015,12 +4997,12 @@ static void __devinit arch_init_sched_domains(void) #ifdef CONFIG_NUMA /* Set up node groups */ - init_sched_build_groups(sched_group_nodes, cpu_default_map, + init_sched_build_groups(sched_group_nodes, *cpu_map, &cpu_to_node_group); #endif /* Calculate CPU power for physical packages and nodes */ - for_each_cpu_mask(i, cpu_default_map) { + for_each_cpu_mask(i, *cpu_map) { int power; struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT @@ -5044,7 +5026,7 @@ static void __devinit arch_init_sched_domains(void) } /* Attach the domains */ - for_each_online_cpu(i) { + for_each_cpu_mask(i, *cpu_map) { struct sched_domain *sd; #ifdef CONFIG_SCHED_SMT sd = &per_cpu(cpu_domains, i); @@ -5054,16 +5036,71 @@ static void __devinit arch_init_sched_domains(void) cpu_attach_domain(sd, i); } } +/* + * Set up scheduler domains and groups. Callers must hold the hotplug lock. + */ +static void arch_init_sched_domains(cpumask_t *cpu_map) +{ + cpumask_t cpu_default_map; -#ifdef CONFIG_HOTPLUG_CPU -static void __devinit arch_destroy_sched_domains(void) +#if defined(CONFIG_SCHED_SMT) && defined(CONFIG_NUMA) + check_sibling_maps(); +#endif + /* + * Setup mask for cpus without special case scheduling requirements. + * For now this just excludes isolated cpus, but could be used to + * exclude other special cases in the future. + */ + cpus_andnot(cpu_default_map, *cpu_map, cpu_isolated_map); + + build_sched_domains(&cpu_default_map); +} + +static void arch_destroy_sched_domains(const cpumask_t *cpu_map) { /* Do nothing: everything is statically allocated. */ } -#endif #endif /* ARCH_HAS_SCHED_DOMAIN */ +/* + * Detach sched domains from a group of cpus specified in cpu_map + * These cpus will now be attached to the NULL domain + */ +static inline void detach_destroy_domains(const cpumask_t *cpu_map) +{ + int i; + + for_each_cpu_mask(i, *cpu_map) + cpu_attach_domain(NULL, i); + synchronize_sched(); + arch_destroy_sched_domains(cpu_map); +} + +/* + * Partition sched domains as specified by the cpumasks below. + * This attaches all cpus from the cpumasks to the NULL domain, + * waits for a RCU quiescent period, recalculates sched + * domain information and then attaches them back to the + * correct sched domains + * Call with hotplug lock held + */ +void partition_sched_domains(cpumask_t *partition1, cpumask_t *partition2) +{ + cpumask_t change_map; + + cpus_and(*partition1, *partition1, cpu_online_map); + cpus_and(*partition2, *partition2, cpu_online_map); + cpus_or(change_map, *partition1, *partition2); + + /* Detach sched domains from all of the affected cpus */ + detach_destroy_domains(&change_map); + if (!cpus_empty(*partition1)) + build_sched_domains(partition1); + if (!cpus_empty(*partition2)) + build_sched_domains(partition2); +} + #ifdef CONFIG_HOTPLUG_CPU /* * Force a reinitialization of the sched domains hierarchy. The domains @@ -5074,15 +5111,10 @@ static void __devinit arch_destroy_sched_domains(void) static int update_sched_domains(struct notifier_block *nfb, unsigned long action, void *hcpu) { - int i; - switch (action) { case CPU_UP_PREPARE: case CPU_DOWN_PREPARE: - for_each_online_cpu(i) - cpu_attach_domain(NULL, i); - synchronize_kernel(); - arch_destroy_sched_domains(); + detach_destroy_domains(&cpu_online_map); return NOTIFY_OK; case CPU_UP_CANCELED: @@ -5098,7 +5130,7 @@ static int update_sched_domains(struct notifier_block *nfb, } /* The hotplug lock is already held by cpu_up/cpu_down */ - arch_init_sched_domains(); + arch_init_sched_domains(&cpu_online_map); return NOTIFY_OK; } @@ -5107,7 +5139,7 @@ static int update_sched_domains(struct notifier_block *nfb, void __init sched_init_smp(void) { lock_cpu_hotplug(); - arch_init_sched_domains(); + arch_init_sched_domains(&cpu_online_map); unlock_cpu_hotplug(); /* XXX: Theoretical race here - CPU may be hotplugged now */ hotcpu_notifier(update_sched_domains, 0);