X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fcpuset.c;h=1f06e76901067ffa1c08d586d0128f64ad6d6c65;hb=a57ebc90f1350296edded12d33d7c278831bc3bf;hp=805fb9097318b761b60b98e8ad8f11a7d3756fcf;hpb=49302d0c42592b37f49ae96e0f06a3599cf5a8a0;p=linux-2.6 diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 805fb90973..1f06e76901 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -398,21 +398,31 @@ static int cpuset_path(const struct cpuset *cs, char *buf, int buflen) * to continue to serve a useful existence. Next time it's released, * we will get notified again, if it still has 'notify_on_release' set. * - * Note final arg to call_usermodehelper() is 0 - that means - * don't wait. Since we are holding the global cpuset_sem here, - * and we are asking another thread (started from keventd) to rmdir a - * cpuset, we can't wait - or we'd deadlock with the removing thread - * on cpuset_sem. + * The final arg to call_usermodehelper() is 0, which means don't + * wait. The separate /sbin/cpuset_release_agent task is forked by + * call_usermodehelper(), then control in this thread returns here, + * without waiting for the release agent task. We don't bother to + * wait because the caller of this routine has no use for the exit + * status of the /sbin/cpuset_release_agent task, so no sense holding + * our caller up for that. + * + * The simple act of forking that task might require more memory, + * which might need cpuset_sem. So this routine must be called while + * cpuset_sem is not held, to avoid a possible deadlock. See also + * comments for check_for_release(), below. */ -static int cpuset_release_agent(char *cpuset_str) +static void cpuset_release_agent(const char *pathbuf) { char *argv[3], *envp[3]; int i; + if (!pathbuf) + return; + i = 0; argv[i++] = "/sbin/cpuset_release_agent"; - argv[i++] = cpuset_str; + argv[i++] = (char *)pathbuf; argv[i] = NULL; i = 0; @@ -421,17 +431,29 @@ static int cpuset_release_agent(char *cpuset_str) envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; envp[i] = NULL; - return call_usermodehelper(argv[0], argv, envp, 0); + call_usermodehelper(argv[0], argv, envp, 0); + kfree(pathbuf); } /* * Either cs->count of using tasks transitioned to zero, or the * cs->children list of child cpusets just became empty. If this * cs is notify_on_release() and now both the user count is zero and - * the list of children is empty, send notice to user land. + * the list of children is empty, prepare cpuset path in a kmalloc'd + * buffer, to be returned via ppathbuf, so that the caller can invoke + * cpuset_release_agent() with it later on, once cpuset_sem is dropped. + * Call here with cpuset_sem held. + * + * This check_for_release() routine is responsible for kmalloc'ing + * pathbuf. The above cpuset_release_agent() is responsible for + * kfree'ing pathbuf. The caller of these routines is responsible + * for providing a pathbuf pointer, initialized to NULL, then + * calling check_for_release() with cpuset_sem held and the address + * of the pathbuf pointer, then dropping cpuset_sem, then calling + * cpuset_release_agent() with pathbuf, as set by check_for_release(). */ -static void check_for_release(struct cpuset *cs) +static void check_for_release(struct cpuset *cs, char **ppathbuf) { if (notify_on_release(cs) && atomic_read(&cs->count) == 0 && list_empty(&cs->children)) { @@ -441,10 +463,9 @@ static void check_for_release(struct cpuset *cs) if (!buf) return; if (cpuset_path(cs, buf, PAGE_SIZE) < 0) - goto out; - cpuset_release_agent(buf); -out: - kfree(buf); + kfree(buf); + else + *ppathbuf = buf; } } @@ -606,6 +627,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) * Call with cpuset_sem held. May nest a call to the * lock_cpu_hotplug()/unlock_cpu_hotplug() pair. */ + static void update_cpu_domains(struct cpuset *cur) { struct cpuset *c, *par = cur->parent; @@ -727,14 +749,14 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, char *buf) return 0; } -static int attach_task(struct cpuset *cs, char *buf) +static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) { pid_t pid; struct task_struct *tsk; struct cpuset *oldcs; cpumask_t cpus; - if (sscanf(buf, "%d", &pid) != 1) + if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; if (cpus_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed)) return -ENOSPC; @@ -777,7 +799,7 @@ static int attach_task(struct cpuset *cs, char *buf) put_task_struct(tsk); if (atomic_dec_and_test(&oldcs->count)) - check_for_release(oldcs); + check_for_release(oldcs, ppathbuf); return 0; } @@ -801,6 +823,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us struct cftype *cft = __d_cft(file->f_dentry); cpuset_filetype_t type = cft->private; char *buffer; + char *pathbuf = NULL; int retval = 0; /* Crude upper limit on largest legitimate cpulist user might write. */ @@ -841,7 +864,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer); break; case FILE_TASKLIST: - retval = attach_task(cs, buffer); + retval = attach_task(cs, buffer, &pathbuf); break; default: retval = -EINVAL; @@ -852,6 +875,7 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us retval = nbytes; out2: up(&cpuset_sem); + cpuset_release_agent(pathbuf); out1: kfree(buffer); return retval; @@ -1357,6 +1381,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cpuset *cs = dentry->d_fsdata; struct dentry *d; struct cpuset *parent; + char *pathbuf = NULL; /* the vfs holds both inode->i_sem already */ @@ -1376,7 +1401,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) update_cpu_domains(cs); list_del(&cs->sibling); /* delete my sibling from parent->children */ if (list_empty(&parent->children)) - check_for_release(parent); + check_for_release(parent, &pathbuf); spin_lock(&cs->dentry->d_lock); d = dget(cs->dentry); cs->dentry = NULL; @@ -1384,6 +1409,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) cpuset_d_remove_dir(d); dput(d); up(&cpuset_sem); + cpuset_release_agent(pathbuf); return 0; } @@ -1483,10 +1509,13 @@ void cpuset_exit(struct task_struct *tsk) task_unlock(tsk); if (notify_on_release(cs)) { + char *pathbuf = NULL; + down(&cpuset_sem); if (atomic_dec_and_test(&cs->count)) - check_for_release(cs); + check_for_release(cs, &pathbuf); up(&cpuset_sem); + cpuset_release_agent(pathbuf); } else { atomic_dec(&cs->count); } @@ -1570,17 +1599,114 @@ int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl) return 0; } +/* + * nearest_exclusive_ancestor() - Returns the nearest mem_exclusive + * ancestor to the specified cpuset. Call while holding cpuset_sem. + * If no ancestor is mem_exclusive (an unusual configuration), then + * returns the root cpuset. + */ +static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs) +{ + while (!is_mem_exclusive(cs) && cs->parent) + cs = cs->parent; + return cs; +} + /** - * cpuset_zone_allowed - is zone z allowed in current->mems_allowed - * @z: zone in question + * cpuset_zone_allowed - Can we allocate memory on zone z's memory node? + * @z: is this zone on an allowed node? + * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL) * - * Is zone z allowed in current->mems_allowed, or is - * the CPU in interrupt context? (zone is always allowed in this case) - */ -int cpuset_zone_allowed(struct zone *z) + * If we're in interrupt, yes, we can always allocate. If zone + * z's node is in our tasks mems_allowed, yes. If it's not a + * __GFP_HARDWALL request and this zone's nodes is in the nearest + * mem_exclusive cpuset ancestor to this tasks cpuset, yes. + * Otherwise, no. + * + * GFP_USER allocations are marked with the __GFP_HARDWALL bit, + * and do not allow allocations outside the current tasks cpuset. + * GFP_KERNEL allocations are not so marked, so can escape to the + * nearest mem_exclusive ancestor cpuset. + * + * Scanning up parent cpusets requires cpuset_sem. The __alloc_pages() + * routine only calls here with __GFP_HARDWALL bit _not_ set if + * it's a GFP_KERNEL allocation, and all nodes in the current tasks + * mems_allowed came up empty on the first pass over the zonelist. + * So only GFP_KERNEL allocations, if all nodes in the cpuset are + * short of memory, might require taking the cpuset_sem semaphore. + * + * The first loop over the zonelist in mm/page_alloc.c:__alloc_pages() + * calls here with __GFP_HARDWALL always set in gfp_mask, enforcing + * hardwall cpusets - no allocation on a node outside the cpuset is + * allowed (unless in interrupt, of course). + * + * The second loop doesn't even call here for GFP_ATOMIC requests + * (if the __alloc_pages() local variable 'wait' is set). That check + * and the checks below have the combined affect in the second loop of + * the __alloc_pages() routine that: + * in_interrupt - any node ok (current task context irrelevant) + * GFP_ATOMIC - any node ok + * GFP_KERNEL - any node in enclosing mem_exclusive cpuset ok + * GFP_USER - only nodes in current tasks mems allowed ok. + **/ + +int cpuset_zone_allowed(struct zone *z, unsigned int __nocast gfp_mask) +{ + int node; /* node that zone z is on */ + const struct cpuset *cs; /* current cpuset ancestors */ + int allowed = 1; /* is allocation in zone z allowed? */ + + if (in_interrupt()) + return 1; + node = z->zone_pgdat->node_id; + if (node_isset(node, current->mems_allowed)) + return 1; + if (gfp_mask & __GFP_HARDWALL) /* If hardwall request, stop here */ + return 0; + + /* Not hardwall and node outside mems_allowed: scan up cpusets */ + down(&cpuset_sem); + cs = current->cpuset; + if (!cs) + goto done; /* current task exiting */ + cs = nearest_exclusive_ancestor(cs); + allowed = node_isset(node, cs->mems_allowed); +done: + up(&cpuset_sem); + return allowed; +} + +/** + * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? + * @p: pointer to task_struct of some other task. + * + * Description: Return true if the nearest mem_exclusive ancestor + * cpusets of tasks @p and current overlap. Used by oom killer to + * determine if task @p's memory usage might impact the memory + * available to the current task. + * + * Acquires cpuset_sem - not suitable for calling from a fast path. + **/ + +int cpuset_excl_nodes_overlap(const struct task_struct *p) { - return in_interrupt() || - node_isset(z->zone_pgdat->node_id, current->mems_allowed); + const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ + int overlap = 0; /* do cpusets overlap? */ + + down(&cpuset_sem); + cs1 = current->cpuset; + if (!cs1) + goto done; /* current task exiting */ + cs2 = p->cpuset; + if (!cs2) + goto done; /* task p is exiting */ + cs1 = nearest_exclusive_ancestor(cs1); + cs2 = nearest_exclusive_ancestor(cs2); + overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); +done: + up(&cpuset_sem); + + return overlap; } /*