X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=kernel%2Fcpuset.c;h=12815d3f1a05ee25efc0eb489ff9c6abc4e4cb98;hb=ca94f26d2b2ee8ad76be617b35f846444fedc07b;hp=6004719f26eef5b09770c006df3830fb62a340ea;hpb=202f72d5d1b5c2c084f63ef996c736d208b447b5;p=linux-2.6 diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 6004719f26..12815d3f1a 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -61,7 +62,7 @@ * When there is only one cpuset (the root cpuset) we can * short circuit some hooks. */ -int number_of_cpusets; +int number_of_cpusets __read_mostly; /* See "Frequency meter" comments, below. */ @@ -248,6 +249,11 @@ static struct super_block *cpuset_sb; * a tasks cpuset pointer we use task_lock(), which acts on a spinlock * (task->alloc_lock) already in the task_struct routinely used for * such matters. + * + * P.S. One more locking exception. RCU is used to guard the + * update of a tasks cpuset pointer by attach_task() and the + * access of task->cpuset->mems_generation via that pointer in + * the routine cpuset_update_task_memory_state(). */ static DECLARE_MUTEX(manage_sem); @@ -325,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) spin_lock(&dcache_lock); node = dentry->d_subdirs.next; while (node != &dentry->d_subdirs) { - struct dentry *d = list_entry(node, struct dentry, d_child); + struct dentry *d = list_entry(node, struct dentry, d_u.d_child); list_del_init(node); if (d->d_inode) { d = dget_locked(d); @@ -337,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry) } node = dentry->d_subdirs.next; } - list_del_init(&dentry->d_child); + list_del_init(&dentry->d_u.d_child); spin_unlock(&dcache_lock); remove_dir(dentry); } @@ -603,21 +609,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * Do not call this routine if in_interrupt(). * * Call without callback_sem or task_lock() held. 
May be called - * with or without manage_sem held. Except in early boot or - * an exiting task, when tsk->cpuset is NULL, this routine will - * acquire task_lock(). We don't need to use task_lock to guard + * with or without manage_sem held. Doesn't need task_lock to guard * against another task changing a non-NULL cpuset pointer to NULL, * as that is only done by a task on itself, and if the current task * is here, it is not simultaneously in the exit code NULL'ing its * cpuset pointer. This routine also might acquire callback_sem and * current->mm->mmap_sem during call. * - * The task_lock() is required to dereference current->cpuset safely. - * Without it, we could pick up the pointer value of current->cpuset - * in one instruction, and then attach_task could give us a different - * cpuset, and then the cpuset we had could be removed and freed, - * and then on our next instruction, we could dereference a no longer - * valid cpuset pointer to get its mems_generation field. + * Reading current->cpuset->mems_generation doesn't need task_lock + * to guard the current->cpuset dereference, because it is guarded + * from concurrent freeing of current->cpuset by attach_task(), + * using RCU. + * + * The rcu_dereference() is technically probably not needed, + * as I don't actually mind if I see a new cpuset pointer but + * an old value of mems_generation. However this really only + * matters on alpha systems using cpusets heavily. If I dropped + * that rcu_dereference(), it would save them a memory barrier. + * For all other arch's, rcu_dereference is a no-op anyway, and for + * alpha systems not using cpusets, another planned optimization, + * avoiding the rcu critical section for tasks in the root cpuset + * which is statically allocated, so can't vanish, will make this + * irrelevant. 
Better to use RCU as intended, than to engage in + * some cute trick to save a memory barrier that is impossible to + * test, for alpha systems using cpusets heavily, which might not + * even exist. * * This routine is needed to update the per-task mems_allowed data, * within the tasks context, when it is trying to allocate memory @@ -625,39 +641,31 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask) * task has been modifying its cpuset. */ -void cpuset_update_task_memory_state() +void cpuset_update_task_memory_state(void) { int my_cpusets_mem_gen; struct task_struct *tsk = current; - struct cpuset *cs = tsk->cpuset; - - if (unlikely(!cs)) - return; + struct cpuset *cs; - task_lock(tsk); - my_cpusets_mem_gen = cs->mems_generation; - task_unlock(tsk); + if (tsk->cpuset == &top_cpuset) { + /* Don't need rcu for top_cpuset. It's never freed. */ + my_cpusets_mem_gen = top_cpuset.mems_generation; + } else { + rcu_read_lock(); + cs = rcu_dereference(tsk->cpuset); + my_cpusets_mem_gen = cs->mems_generation; + rcu_read_unlock(); + } if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) { - nodemask_t oldmem = tsk->mems_allowed; - int migrate; - down(&callback_sem); task_lock(tsk); cs = tsk->cpuset; /* Maybe changed when task not locked */ - migrate = is_memory_migrate(cs); guarantee_online_mems(cs, &tsk->mems_allowed); tsk->cpuset_mems_generation = cs->mems_generation; task_unlock(tsk); up(&callback_sem); mpol_rebind_task(tsk, &tsk->mems_allowed); - if (!nodes_equal(oldmem, tsk->mems_allowed)) { - if (migrate) { - do_migrate_pages(tsk->mm, &oldmem, - &tsk->mems_allowed, - MPOL_MF_MOVE_ALL); - } - } } } @@ -812,12 +820,28 @@ static int update_cpumask(struct cpuset *cs, char *buf) } /* + * Handle user request to change the 'mems' memory placement + * of a cpuset. 
Needs to validate the request, update the + * cpusets mems_allowed and mems_generation, and for each + * task in the cpuset, rebind any vma mempolicies and if + * the cpuset is marked 'memory_migrate', migrate the tasks + * pages to the new memory. + * * Call with manage_sem held. May take callback_sem during call. + * Will take tasklist_lock, scan tasklist for tasks in cpuset cs, + * lock each such tasks mm->mmap_sem, scan its vma's and rebind + * their mempolicies to the cpusets new mems_allowed. */ static int update_nodemask(struct cpuset *cs, char *buf) { struct cpuset trialcs; + nodemask_t oldmem; + struct task_struct *g, *p; + struct mm_struct **mmarray; + int i, n, ntasks; + int migrate; + int fudge; int retval; trialcs = *cs; @@ -825,6 +849,11 @@ static int update_nodemask(struct cpuset *cs, char *buf) if (retval < 0) goto done; nodes_and(trialcs.mems_allowed, trialcs.mems_allowed, node_online_map); + oldmem = cs->mems_allowed; + if (nodes_equal(oldmem, trialcs.mems_allowed)) { + retval = 0; /* Too easy - nothing to do */ + goto done; + } if (nodes_empty(trialcs.mems_allowed)) { retval = -ENOSPC; goto done; @@ -839,6 +868,81 @@ static int update_nodemask(struct cpuset *cs, char *buf) cs->mems_generation = atomic_read(&cpuset_mems_generation); up(&callback_sem); + set_cpuset_being_rebound(cs); /* causes mpol_copy() rebind */ + + fudge = 10; /* spare mmarray[] slots */ + fudge += cpus_weight(cs->cpus_allowed); /* imagine one fork-bomb/cpu */ + retval = -ENOMEM; + + /* + * Allocate mmarray[] to hold mm reference for each task + * in cpuset cs. Can't kmalloc GFP_KERNEL while holding + * tasklist_lock. We could use GFP_ATOMIC, but with a + * few more lines of code, we can retry until we get a big + * enough mmarray[] w/o using GFP_ATOMIC. 
+ */ + while (1) { + ntasks = atomic_read(&cs->count); /* guess */ + ntasks += fudge; + mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL); + if (!mmarray) + goto done; + write_lock_irq(&tasklist_lock); /* block fork */ + if (atomic_read(&cs->count) <= ntasks) + break; /* got enough */ + write_unlock_irq(&tasklist_lock); /* try again */ + kfree(mmarray); + } + + n = 0; + + /* Load up mmarray[] with mm reference for each task in cpuset. */ + do_each_thread(g, p) { + struct mm_struct *mm; + + if (n >= ntasks) { + printk(KERN_WARNING + "Cpuset mempolicy rebind incomplete.\n"); + continue; + } + if (p->cpuset != cs) + continue; + mm = get_task_mm(p); + if (!mm) + continue; + mmarray[n++] = mm; + } while_each_thread(g, p); + write_unlock_irq(&tasklist_lock); + + /* + * Now that we've dropped the tasklist spinlock, we can + * rebind the vma mempolicies of each mm in mmarray[] to their + * new cpuset, and release that mm. The mpol_rebind_mm() + * call takes mmap_sem, which we couldn't take while holding + * tasklist_lock. Forks can happen again now - the mpol_copy() + * cpuset_being_rebound check will catch such forks, and rebind + * their vma mempolicies too. Because we still hold the global + * cpuset manage_sem, we know that no other rebind effort will + * be contending for the global variable cpuset_being_rebound. + * It's ok if we rebind the same mm twice; mpol_rebind_mm() + * is idempotent. Also migrate pages in each mm to new nodes. + */ + migrate = is_memory_migrate(cs); + for (i = 0; i < n; i++) { + struct mm_struct *mm = mmarray[i]; + + mpol_rebind_mm(mm, &cs->mems_allowed); + if (migrate) { + do_migrate_pages(mm, &oldmem, &cs->mems_allowed, + MPOL_MF_MOVE_ALL); + } + mmput(mm); + } + + /* We're done rebinding vma's to this cpusets new mems_allowed. 
*/ + kfree(mmarray); + set_cpuset_being_rebound(NULL); + retval = 0; done: return retval; } @@ -1011,6 +1115,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) struct cpuset *oldcs; cpumask_t cpus; nodemask_t from, to; + struct mm_struct *mm; if (sscanf(pidbuf, "%d", &pid) != 1) return -EIO; @@ -1050,7 +1155,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) return -ESRCH; } atomic_inc(&cs->count); - tsk->cpuset = cs; + rcu_assign_pointer(tsk->cpuset, cs); task_unlock(tsk); guarantee_online_cpus(cs, &cpus); @@ -1060,9 +1165,17 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf) to = cs->mems_allowed; up(&callback_sem); + + mm = get_task_mm(tsk); + if (mm) { + mpol_rebind_mm(mm, &to); + mmput(mm); + } + if (is_memory_migrate(cs)) do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL); put_task_struct(tsk); + synchronize_rcu(); if (atomic_dec_and_test(&oldcs->count)) check_for_release(oldcs, ppathbuf); return 0; @@ -1400,7 +1513,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) struct dentry *dentry; int error; - down(&dir->d_inode->i_sem); + mutex_lock(&dir->d_inode->i_mutex); dentry = cpuset_get_dentry(dir, cft->name); if (!IS_ERR(dentry)) { error = cpuset_create_file(dentry, 0644 | S_IFREG); @@ -1409,7 +1522,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft) dput(dentry); } else error = PTR_ERR(dentry); - up(&dir->d_inode->i_sem); + mutex_unlock(&dir->d_inode->i_mutex); return error; } @@ -1441,7 +1554,7 @@ struct ctr_struct { * when reading out p->cpuset, as we don't really care if it changes * on the next cycle, and we are not going to try to dereference it. 
*/ -static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) +static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs) { int n = 0; struct task_struct *g, *p; @@ -1680,7 +1793,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode) /* * Release manage_sem before cpuset_populate_dir() because it - * will down() this new directory's i_sem and if we race with + * will down() this new directory's i_mutex and if we race with * another mkdir, we might deadlock. */ up(&manage_sem); @@ -1699,7 +1812,7 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode) { struct cpuset *c_parent = dentry->d_parent->d_fsdata; - /* the vfs holds inode->i_sem already */ + /* the vfs holds inode->i_mutex already */ return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR); } @@ -1710,7 +1823,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) struct cpuset *parent; char *pathbuf = NULL; - /* the vfs holds both inode->i_sem already */ + /* the vfs holds both inode->i_mutex already */ down(&manage_sem); cpuset_update_task_memory_state(); @@ -1743,6 +1856,21 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry) return 0; } +/* + * cpuset_init_early - just enough so that the calls to + * cpuset_update_task_memory_state() in early init code + * are harmless. + */ + +int __init cpuset_init_early(void) +{ + struct task_struct *tsk = current; + + tsk->cpuset = &top_cpuset; + tsk->cpuset->mems_generation = atomic_read(&cpuset_mems_generation); + return 0; +} + /** * cpuset_init - initialize cpusets at system boot * @@ -1849,6 +1977,39 @@ void cpuset_fork(struct task_struct *child) * We don't need to task_lock() this reference to tsk->cpuset, * because tsk is already marked PF_EXITING, so attach_task() won't * mess with it, or task is a failed fork, never visible to attach_task. 
+ * + * Hack: + * + * Set the exiting tasks cpuset to the root cpuset (top_cpuset). + * + * Don't leave a task unable to allocate memory, as that is an + * accident waiting to happen should someone add a callout in + * do_exit() after the cpuset_exit() call that might allocate. + * If a task tries to allocate memory with an invalid cpuset, + * it will oops in cpuset_update_task_memory_state(). + * + * We call cpuset_exit() while the task is still competent to + * handle notify_on_release(), then leave the task attached to + * the root cpuset (top_cpuset) for the remainder of its exit. + * + * To do this properly, we would increment the reference count on + * top_cpuset, and near the very end of the kernel/exit.c do_exit() + * code we would add a second cpuset function call, to drop that + * reference. This would just create an unnecessary hot spot on + * the top_cpuset reference count, to no avail. + * + * Normally, holding a reference to a cpuset without bumping its + * count is unsafe. The cpuset could go away, or someone could + * attach us to a different cpuset, decrementing the count on + * the first cpuset that we never incremented. But in this case, + * top_cpuset isn't going away, and either task has PF_EXITING set, + * which wards off any attach_task() attempts, or task is a failed + * fork, never visible to attach_task. + * + * Another way to do this would be to set the cpuset pointer + * to NULL here, and check in cpuset_update_task_memory_state() + * for a NULL pointer. This hack avoids that NULL check, for no + * cost (other than this way too long comment ;). 
**/ void cpuset_exit(struct task_struct *tsk) @@ -1856,7 +2017,7 @@ void cpuset_exit(struct task_struct *tsk) struct cpuset *cs; cs = tsk->cpuset; - tsk->cpuset = NULL; + tsk->cpuset = &top_cpuset; /* Hack - see comment above */ if (notify_on_release(cs)) { char *pathbuf = NULL; @@ -2021,6 +2182,33 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) return allowed; } +/** + * cpuset_lock - lock out any changes to cpuset structures + * + * The out of memory (oom) code needs to lock down cpusets + * from being changed while it scans the tasklist looking for a + * task in an overlapping cpuset. Expose callback_sem via this + * cpuset_lock() routine, so the oom code can lock it, before + * locking the task list. The tasklist_lock is a spinlock, so + * must be taken inside callback_sem. + */ + +void cpuset_lock(void) +{ + down(&callback_sem); +} + +/** + * cpuset_unlock - release lock on cpuset changes + * + * Undo the lock taken in a previous cpuset_lock() call. + */ + +void cpuset_unlock(void) +{ + up(&callback_sem); +} + /** * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors? * @p: pointer to task_struct of some other task. @@ -2030,7 +2218,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) * determine if task @p's memory usage might impact the memory * available to the current task. * - * Acquires callback_sem - not suitable for calling from a fast path. + * Call while holding callback_sem. **/ int cpuset_excl_nodes_overlap(const struct task_struct *p) @@ -2038,8 +2226,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ int overlap = 0; /* do cpusets overlap? 
*/ - down(&callback_sem); - task_lock(current); if (current->flags & PF_EXITING) { task_unlock(current); @@ -2058,8 +2244,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p) overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed); done: - up(&callback_sem); - return overlap; }