X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fcpuset.c;h=d5ab79cf516d7edf77f8e45afe15c04cdae2cda4;hb=73909f7a665991013dcff42a815fda76d3a7300a;hp=7326d51eefe13169254a14edd4e05e77bb108fa1;hpb=0b2f630a28d53b5a2082a5275bc3334b10373508;p=linux-2.6

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 7326d51eef..d5ab79cf51 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -54,7 +54,6 @@
 #include <asm/uaccess.h>
 #include <asm/atomic.h>
 #include <linux/mutex.h>
-#include <linux/kfifo.h>
 #include <linux/workqueue.h>
 #include <linux/cgroup.h>
 
@@ -365,7 +364,7 @@ void cpuset_update_task_memory_state(void)
 		my_cpusets_mem_gen = top_cpuset.mems_generation;
 	} else {
 		rcu_read_lock();
-		my_cpusets_mem_gen = task_cs(current)->mems_generation;
+		my_cpusets_mem_gen = task_cs(tsk)->mems_generation;
 		rcu_read_unlock();
 	}
 
@@ -486,21 +485,51 @@ static int cpusets_overlap(struct cpuset *a, struct cpuset *b)
 static void
 update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
 {
-	if (!dattr)
-		return;
 	if (dattr->relax_domain_level < c->relax_domain_level)
 		dattr->relax_domain_level = c->relax_domain_level;
 	return;
 }
 
+static void
+update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c)
+{
+	LIST_HEAD(q);
+
+	list_add(&c->stack_list, &q);
+	while (!list_empty(&q)) {
+		struct cpuset *cp;
+		struct cgroup *cont;
+		struct cpuset *child;
+
+		cp = list_first_entry(&q, struct cpuset, stack_list);
+		list_del(q.next);
+
+		if (cpus_empty(cp->cpus_allowed))
+			continue;
+
+		if (is_sched_load_balance(cp))
+			update_domain_attr(dattr, cp);
+
+		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
+			child = cgroup_cs(cont);
+			list_add_tail(&child->stack_list, &q);
+		}
+	}
+}
+
 /*
  * rebuild_sched_domains()
  *
- * If the flag 'sched_load_balance' of any cpuset with non-empty
- * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset
- * which has that flag enabled, or if any cpuset with a non-empty
- * 'cpus' is removed, then call this routine to rebuild the
- * scheduler's dynamic sched domains.
+ * This routine will be called to rebuild the scheduler's dynamic
+ * sched domains:
+ * - if the flag 'sched_load_balance' of any cpuset with non-empty
+ *   'cpus' changes,
+ * - or if the 'cpus' allowed changes in any cpuset which has that
+ *   flag enabled,
+ * - or if the 'sched_relax_domain_level' of any cpuset which has
+ *   that flag enabled and with non-empty 'cpus' changes,
+ * - or if any cpuset with non-empty 'cpus' is removed,
+ * - or if a cpu gets offlined.
  *
  * This routine builds a partial partition of the system's CPUs
  * (the set of non-overlapping cpumask_t's in the array 'part'
@@ -527,7 +556,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  * So the reverse nesting would risk an ABBA deadlock.
  *
  * The three key local variables below are:
- *	q  - a kfifo queue of cpuset pointers, used to implement a
+ *	q  - a linked-list queue of cpuset pointers, used to implement a
  *	     top-down scan of all cpusets.  This scan loads a pointer
 	     to each cpuset marked is_sched_load_balance into the
 *	     array 'csa'.  For our purposes, rebuilding the scheduler's
@@ -562,7 +591,7 @@ update_domain_attr(struct sched_domain_attr *dattr, struct cpuset *c)
  */
 void rebuild_sched_domains(void)
 {
-	struct kfifo *q;	/* queue of cpusets to be scanned */
+	LIST_HEAD(q);		/* queue of cpusets to be scanned */
 	struct cpuset *cp;	/* scans q */
 	struct cpuset **csa;	/* array of all cpuset ptrs */
 	int csn;		/* how many cpuset ptrs in csa so far */
@@ -572,7 +601,6 @@ void rebuild_sched_domains(void)
 	int ndoms;		/* number of sched domains in result */
 	int nslot;		/* next empty doms[] cpumask_t slot */
 
-	q = NULL;
 	csa = NULL;
 	doms = NULL;
 	dattr = NULL;
@@ -586,30 +614,42 @@ void rebuild_sched_domains(void)
 		dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL);
 		if (dattr) {
 			*dattr = SD_ATTR_INIT;
-			update_domain_attr(dattr, &top_cpuset);
+			update_domain_attr_tree(dattr, &top_cpuset);
 		}
 		*doms = top_cpuset.cpus_allowed;
 		goto rebuild;
 	}
 
-	q = kfifo_alloc(number_of_cpusets * sizeof(cp), GFP_KERNEL, NULL);
-	if (IS_ERR(q))
-		goto done;
 	csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL);
 	if (!csa)
 		goto done;
 	csn = 0;
 
-	cp = &top_cpuset;
-	__kfifo_put(q, (void *)&cp, sizeof(cp));
-	while (__kfifo_get(q, (void *)&cp, sizeof(cp))) {
+	list_add(&top_cpuset.stack_list, &q);
+	while (!list_empty(&q)) {
 		struct cgroup *cont;
 		struct cpuset *child;   /* scans child cpusets of cp */
 
-		if (is_sched_load_balance(cp))
+		cp = list_first_entry(&q, struct cpuset, stack_list);
+		list_del(q.next);
+
+		if (cpus_empty(cp->cpus_allowed))
+			continue;
+
+		/*
+		 * All child cpusets contain a subset of the parent's cpus, so
+		 * just skip them, and then we call update_domain_attr_tree()
+		 * to calculate the relax_domain_level of the corresponding
+		 * sched domain.
+		 */
+		if (is_sched_load_balance(cp)) {
 			csa[csn++] = cp;
+			continue;
+		}
+
 		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
 			child = cgroup_cs(cont);
-			__kfifo_put(q, (void *)&child, sizeof(cp));
+			list_add_tail(&child->stack_list, &q);
 		}
 	}
@@ -676,7 +716,7 @@ restart:
 				cpus_or(*dp, *dp, b->cpus_allowed);
 				b->pn = -1;
 				if (dattr)
-					update_domain_attr(dattr
+					update_domain_attr_tree(dattr
 							   + nslot, b);
 			}
 		}
@@ -692,43 +732,11 @@ rebuild:
 	put_online_cpus();
 
 done:
-	if (q && !IS_ERR(q))
-		kfifo_free(q);
 	kfree(csa);
 	/* Don't kfree(doms) -- partition_sched_domains() does that. */
 	/* Don't kfree(dattr) -- partition_sched_domains() does that. */
 }
 
-static inline int started_after_time(struct task_struct *t1,
-				     struct timespec *time,
-				     struct task_struct *t2)
-{
-	int start_diff = timespec_compare(&t1->start_time, time);
-	if (start_diff > 0) {
-		return 1;
-	} else if (start_diff < 0) {
-		return 0;
-	} else {
-		/*
-		 * Arbitrarily, if two processes started at the same
-		 * time, we'll say that the lower pointer value
-		 * started first.  Note that t2 may have exited by now
-		 * so this may not be a valid pointer any longer, but
-		 * that's fine - it still serves to distinguish
-		 * between two tasks started (effectively)
-		 * simultaneously.
-		 */
-		return t1 > t2;
-	}
-}
-
-static inline int started_after(void *p1, void *p2)
-{
-	struct task_struct *t1 = p1;
-	struct task_struct *t2 = p2;
-	return started_after_time(t1, &t2->start_time, t2);
-}
-
 /**
  * cpuset_test_cpumask - test a task's cpus_allowed versus its cpuset's
  * @tsk: task to test
@@ -780,7 +788,12 @@ static int update_tasks_cpumask(struct cpuset *cs)
 	struct ptr_heap heap;
 	int retval;
 
-	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, &started_after);
+	/*
+	 * cgroup_scan_tasks() will initialize heap->gt for us.
+	 * heap_init() is still needed here; we should not change
+	 * cs->cpus_allowed when heap_init() fails.
+	 */
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
 	if (retval)
 		return retval;
 
@@ -1076,7 +1089,8 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val)
 
 	if (val != cs->relax_domain_level) {
 		cs->relax_domain_level = val;
-		rebuild_sched_domains();
+		if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs))
+			rebuild_sched_domains();
 	}
 
 	return 0;
@@ -1791,7 +1805,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 	scan.scan.heap = NULL;
 	scan.to = to->css.cgroup;
 
-	if (cgroup_scan_tasks((struct cgroup_scanner *)&scan))
+	if (cgroup_scan_tasks(&scan.scan))
 		printk(KERN_ERR "move_member_tasks_to_cpuset: "
 				"cgroup_scan_tasks failed\n");
 }
@@ -1847,29 +1861,29 @@ static void remove_tasks_in_empty_cpuset(struct cpuset *cs)
  */
 static void scan_for_empty_cpusets(const struct cpuset *root)
 {
+	LIST_HEAD(queue);
 	struct cpuset *cp;	/* scans cpusets being updated */
 	struct cpuset *child;	/* scans child cpusets of cp */
-	struct list_head queue;
 	struct cgroup *cont;
-
-	INIT_LIST_HEAD(&queue);
+	nodemask_t oldmems;
 
 	list_add_tail((struct list_head *)&root->stack_list, &queue);
 
 	while (!list_empty(&queue)) {
-		cp = container_of(queue.next, struct cpuset, stack_list);
+		cp = list_first_entry(&queue, struct cpuset, stack_list);
 		list_del(queue.next);
 		list_for_each_entry(cont, &cp->css.cgroup->children, sibling) {
 			child = cgroup_cs(cont);
 			list_add_tail(&child->stack_list, &queue);
 		}
-		cont = cp->css.cgroup;
 
 		/* Continue past cpusets with all cpus, mems online */
 		if (cpus_subset(cp->cpus_allowed, cpu_online_map) &&
 		    nodes_subset(cp->mems_allowed, node_states[N_HIGH_MEMORY]))
 			continue;
 
+		oldmems = cp->mems_allowed;
+
 		/* Remove offline cpus and mems from this cpuset. */
 		mutex_lock(&callback_mutex);
 		cpus_and(cp->cpus_allowed, cp->cpus_allowed, cpu_online_map);
@@ -1881,6 +1895,10 @@ static void scan_for_empty_cpusets(const struct cpuset *root)
 		if (cpus_empty(cp->cpus_allowed) ||
 		    nodes_empty(cp->mems_allowed))
 			remove_tasks_in_empty_cpuset(cp);
+		else {
+			update_tasks_cpumask(cp);
+			update_tasks_nodemask(cp, &oldmems);
+		}
 	}
 }
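
The central change in this patch swaps a heap-allocated kfifo for an intrusive list_head queue in the breadth-first cpuset walks (update_domain_attr_tree(), rebuild_sched_domains(), scan_for_empty_cpusets()). The following is a minimal, self-contained userspace sketch of that pattern, not kernel code: struct node and its stack_list member are hypothetical stand-ins for struct cpuset, and the tiny list implementation mirrors just enough of <linux/list.h> to compile and run.

/*
 * Standalone illustration of the traversal pattern the patch adopts:
 * a breadth-first tree walk using a list node embedded in each element,
 * replacing a separately allocated kfifo of pointers.
 */
#include <stdio.h>
#include <stddef.h>

struct list_head {
	struct list_head *next, *prev;
};

#define LIST_HEAD_INIT(name)	{ &(name), &(name) }
#define LIST_HEAD(name)		struct list_head name = LIST_HEAD_INIT(name)

static void list_add_tail(struct list_head *new, struct list_head *head)
{
	new->prev = head->prev;
	new->next = head;
	head->prev->next = new;
	head->prev = new;
}

static void list_del(struct list_head *entry)
{
	entry->prev->next = entry->next;
	entry->next->prev = entry->prev;
}

static int list_empty(const struct list_head *head)
{
	return head->next == head;
}

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))
#define list_first_entry(head, type, member) \
	container_of((head)->next, type, member)

/* Hypothetical tree node standing in for struct cpuset. */
struct node {
	const char *name;
	struct node *children[4];	/* NULL-terminated */
	struct list_head stack_list;	/* queue linkage, as in the patch */
};

/* Visit every node top-down, mirroring the loop in rebuild_sched_domains(). */
static void walk(struct node *root)
{
	LIST_HEAD(q);
	int i;

	list_add_tail(&root->stack_list, &q);
	while (!list_empty(&q)) {
		struct node *n = list_first_entry(&q, struct node, stack_list);

		list_del(q.next);		/* pop the head of the queue */
		printf("visit %s\n", n->name);

		for (i = 0; n->children[i]; i++)
			list_add_tail(&n->children[i]->stack_list, &q);
	}
}

int main(void)
{
	struct node c1 = { .name = "child1" };
	struct node c2 = { .name = "child2" };
	struct node top = { .name = "top", .children = { &c1, &c2 } };

	walk(&top);	/* prints: top, child1, child2 */
	return 0;
}

Because the queue linkage is embedded in each element, the walk needs no allocation at all, which is why the kfifo_alloc()/IS_ERR()/kfifo_free() setup and cleanup simply disappear from rebuild_sched_domains() above.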