[ARM] 3382/1: ixp2000: unify defconfigs

[linux-2.6] / kernel / cpuset.c
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index fc949e4a625c14cffc6865c6e5d98071fa2beebf..12815d3f1a05ee25efc0eb489ff9c6abc4e4cb98 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -39,6 +39,7 @@
  #include <linux/namei.h>
  #include <linux/pagemap.h>
  #include <linux/proc_fs.h>
+#include <linux/rcupdate.h>
  #include <linux/sched.h>
  #include <linux/seq_file.h>
  #include <linux/slab.h>
@@ -61,7 +62,7 @@
   * When there is only one cpuset (the root cpuset) we can
   * short circuit some hooks.
   */
-int number_of_cpusets;
+int number_of_cpusets __read_mostly;
  
  /* See "Frequency meter" comments, below. */
  
@@ -248,6 +249,11 @@ static struct super_block *cpuset_sb;
   * a tasks cpuset pointer we use task_lock(), which acts on a spinlock
   * (task->alloc_lock) already in the task_struct routinely used for
   * such matters.
+ *
+ * P.S.  One more locking exception.  RCU is used to guard the
+ * update of a task's cpuset pointer by attach_task() and the
+ * access of task->cpuset->mems_generation via that pointer in
+ * the routine cpuset_update_task_memory_state().
   */
  
  static DECLARE_MUTEX(manage_sem);
@@ -325,7 +331,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
         spin_lock(&dcache_lock);
         node = dentry->d_subdirs.next;
         while (node != &dentry->d_subdirs) {
-               struct dentry *d = list_entry(node, struct dentry, d_child);
+               struct dentry *d = list_entry(node, struct dentry, d_u.d_child);
                 list_del_init(node);
                 if (d->d_inode) {
                         d = dget_locked(d);
@@ -337,7 +343,7 @@ static void cpuset_d_remove_dir(struct dentry *dentry)
                 }
                 node = dentry->d_subdirs.next;
         }
-       list_del_init(&dentry->d_child);
+       list_del_init(&dentry->d_u.d_child);
         spin_unlock(&dcache_lock);
         remove_dir(dentry);
  }
@@ -610,12 +616,24 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
   * cpuset pointer.  This routine also might acquire callback_sem and
   * current->mm->mmap_sem during call.
   *
- * The task_lock() is required to dereference current->cpuset safely.
- * Without it, we could pick up the pointer value of current->cpuset
- * in one instruction, and then attach_task could give us a different
- * cpuset, and then the cpuset we had could be removed and freed,
- * and then on our next instruction, we could dereference a no longer
- * valid cpuset pointer to get its mems_generation field.
+ * Reading current->cpuset->mems_generation doesn't need task_lock
+ * to guard the current->cpuset dereference, because it is guarded
+ * from concurrent freeing of current->cpuset by attach_task(),
+ * using RCU.
+ *
+ * The rcu_dereference() is technically probably not needed,
+ * as I don't actually mind if I see a new cpuset pointer but
+ * an old value of mems_generation.  However this really only
+ * matters on alpha systems using cpusets heavily.  If I dropped
+ * that rcu_dereference(), it would save them a memory barrier.
+ * For all other archs, rcu_dereference() is a no-op anyway, and for
+ * alpha systems not using cpusets, another planned optimization,
+ * avoiding the rcu critical section for tasks in the root cpuset
+ * which is statically allocated, so can't vanish, will make this
+ * irrelevant.  Better to use RCU as intended, than to engage in
+ * some cute trick to save a memory barrier that is impossible to
+ * test, for alpha systems using cpusets heavily, which might not
+ * even exist.
   *
   * This routine is needed to update the per-task mems_allowed data,
   * within the tasks context, when it is trying to allocate memory
@@ -623,15 +641,21 @@ static void guarantee_online_mems(const struct cpuset *cs, nodemask_t *pmask)
   * task has been modifying its cpuset.
   */
  
-void cpuset_update_task_memory_state()
+void cpuset_update_task_memory_state(void)
  {
         int my_cpusets_mem_gen;
         struct task_struct *tsk = current;
-       struct cpuset *cs = tsk->cpuset;
+       struct cpuset *cs;
  
-       task_lock(tsk);
-       my_cpusets_mem_gen = cs->mems_generation;
-       task_unlock(tsk);
+       if (tsk->cpuset == &top_cpuset) {
+               /* Don't need rcu for top_cpuset.  It's never freed. */
+               my_cpusets_mem_gen = top_cpuset.mems_generation;
+       } else {
+               rcu_read_lock();
+               cs = rcu_dereference(tsk->cpuset);
+               my_cpusets_mem_gen = cs->mems_generation;
+               rcu_read_unlock();
+       }
  
         if (my_cpusets_mem_gen != tsk->cpuset_mems_generation) {
                 down(&callback_sem);
@@ -1131,7 +1155,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
                 return -ESRCH;
         }
         atomic_inc(&cs->count);
-       tsk->cpuset = cs;
+       rcu_assign_pointer(tsk->cpuset, cs);
         task_unlock(tsk);
  
         guarantee_online_cpus(cs, &cpus);
@@ -1151,6 +1175,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
         if (is_memory_migrate(cs))
                 do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
         put_task_struct(tsk);
+       synchronize_rcu();
         if (atomic_dec_and_test(&oldcs->count))
                 check_for_release(oldcs, ppathbuf);
         return 0;
@@ -1488,7 +1513,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
         struct dentry *dentry;
         int error;
  
-       down(&dir->d_inode->i_sem);
+       mutex_lock(&dir->d_inode->i_mutex);
         dentry = cpuset_get_dentry(dir, cft->name);
         if (!IS_ERR(dentry)) {
                 error = cpuset_create_file(dentry, 0644 | S_IFREG);
@@ -1497,7 +1522,7 @@ static int cpuset_add_file(struct dentry *dir, const struct cftype *cft)
                 dput(dentry);
         } else
                 error = PTR_ERR(dentry);
-       up(&dir->d_inode->i_sem);
+       mutex_unlock(&dir->d_inode->i_mutex);
         return error;
  }
  
@@ -1529,7 +1554,7 @@ struct ctr_struct {
   * when reading out p->cpuset, as we don't really care if it changes
   * on the next cycle, and we are not going to try to dereference it.
   */
-static inline int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
+static int pid_array_load(pid_t *pidarray, int npids, struct cpuset *cs)
  {
         int n = 0;
         struct task_struct *g, *p;
@@ -1768,7 +1793,7 @@ static long cpuset_create(struct cpuset *parent, const char *name, int mode)
  
         /*
          * Release manage_sem before cpuset_populate_dir() because it
-        * will down() this new directory's i_sem and if we race with
+        * will lock this new directory's i_mutex and if we race with
          * another mkdir, we might deadlock.
          */
         up(&manage_sem);
@@ -1787,7 +1812,7 @@ static int cpuset_mkdir(struct inode *dir, struct dentry *dentry, int mode)
  {
         struct cpuset *c_parent = dentry->d_parent->d_fsdata;
  
-       /* the vfs holds inode->i_sem already */
+       /* the vfs holds inode->i_mutex already */
         return cpuset_create(c_parent, dentry->d_name.name, mode | S_IFDIR);
  }
  
@@ -1798,7 +1823,7 @@ static int cpuset_rmdir(struct inode *unused_dir, struct dentry *dentry)
         struct cpuset *parent;
         char *pathbuf = NULL;
  
-       /* the vfs holds both inode->i_sem already */
+       /* the vfs holds both inode->i_mutex already */
  
         down(&manage_sem);
         cpuset_update_task_memory_state();
@@ -1952,6 +1977,39 @@ void cpuset_fork(struct task_struct *child)
   * We don't need to task_lock() this reference to tsk->cpuset,
   * because tsk is already marked PF_EXITING, so attach_task() won't
   * mess with it, or task is a failed fork, never visible to attach_task.
+ *
+ * Hack:
+ *
+ *    Set the exiting task's cpuset to the root cpuset (top_cpuset).
+ *
+ *    Don't leave a task unable to allocate memory, as that is an
+ *    accident waiting to happen should someone add a callout in
+ *    do_exit() after the cpuset_exit() call that might allocate.
+ *    If a task tries to allocate memory with an invalid cpuset,
+ *    it will oops in cpuset_update_task_memory_state().
+ *
+ *    We call cpuset_exit() while the task is still competent to
+ *    handle notify_on_release(), then leave the task attached to
+ *    the root cpuset (top_cpuset) for the remainder of its exit.
+ *
+ *    To do this properly, we would increment the reference count on
+ *    top_cpuset, and near the very end of the kernel/exit.c do_exit()
+ *    code we would add a second cpuset function call, to drop that
+ *    reference.  This would just create an unnecessary hot spot on
+ *    the top_cpuset reference count, to no avail.
+ *
+ *    Normally, holding a reference to a cpuset without bumping its
+ *    count is unsafe.   The cpuset could go away, or someone could
+ *    attach us to a different cpuset, decrementing the count on
+ *    the first cpuset that we never incremented.  But in this case,
+ *    top_cpuset isn't going away, and either task has PF_EXITING set,
+ *    which wards off any attach_task() attempts, or task is a failed
+ *    fork, never visible to attach_task.
+ *
+ *    Another way to do this would be to set the cpuset pointer
+ *    to NULL here, and check in cpuset_update_task_memory_state()
+ *    for a NULL pointer.  This hack avoids that NULL check, for no
+ *    cost (other than this way too long comment ;).
   **/
  
  void cpuset_exit(struct task_struct *tsk)
@@ -1959,7 +2017,7 @@ void cpuset_exit(struct task_struct *tsk)
         struct cpuset *cs;
  
         cs = tsk->cpuset;
-       tsk->cpuset = NULL;
+       tsk->cpuset = &top_cpuset;      /* Hack - see comment above */
  
         if (notify_on_release(cs)) {
                 char *pathbuf = NULL;
@@ -2124,6 +2182,33 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
         return allowed;
  }
  
+/**
+ * cpuset_lock - lock out any changes to cpuset structures
+ *
+ * The out of memory (oom) code needs to keep cpusets from being
+ * changed while it scans the tasklist looking for a task in an
+ * overlapping cpuset.  This routine exposes callback_sem so the
+ * oom code can take it before locking the task list.  The
+ * tasklist_lock is a spinlock, so it must be taken inside
+ * callback_sem.  Release with cpuset_unlock().
+ */
+
+void cpuset_lock(void)
+{
+       down(&callback_sem);
+}
+
+/**
+ * cpuset_unlock - release lock on cpuset changes
+ *
+ * Release callback_sem, taken by a previous cpuset_lock() call.
+ */
+
+void cpuset_unlock(void)
+{
+       up(&callback_sem);
+}
+
  /**
   * cpuset_excl_nodes_overlap - Do we overlap @p's mem_exclusive ancestors?
   * @p: pointer to task_struct of some other task.
@@ -2133,7 +2218,7 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
   * determine if task @p's memory usage might impact the memory
   * available to the current task.
   *
- * Acquires callback_sem - not suitable for calling from a fast path.
+ * Call while holding callback_sem.
   **/
  
  int cpuset_excl_nodes_overlap(const struct task_struct *p)
@@ -2141,8 +2226,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
         const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */
         int overlap = 0;                /* do cpusets overlap? */
  
-       down(&callback_sem);
-
         task_lock(current);
         if (current->flags & PF_EXITING) {
                 task_unlock(current);
@@ -2161,8 +2244,6 @@ int cpuset_excl_nodes_overlap(const struct task_struct *p)
  
         overlap = nodes_intersects(cs1->mems_allowed, cs2->mems_allowed);
  done:
-       up(&callback_sem);
-
         return overlap;
  }