[PATCH] proc: readdir race fix (take 3)

author Eric W. Biederman <ebiederm@xmission.com>

Mon, 2 Oct 2006 09:17:04 +0000 (02:17 -0700)

committer Linus Torvalds <torvalds@g5.osdl.org>

Mon, 2 Oct 2006 14:57:12 +0000 (07:57 -0700)
author Eric W. Biederman <ebiederm@xmission.com>
Mon, 2 Oct 2006 09:17:04 +0000 (02:17 -0700)
committer Linus Torvalds <torvalds@g5.osdl.org>
Mon, 2 Oct 2006 14:57:12 +0000 (07:57 -0700)
diff --git a/fs/proc/base.c b/fs/proc/base.c

index 89c20d9d50bfbf09c77aca8ca921885bb7eb0084..b18f3773dd4360e33cff99366794d0d3900c10f5 100644 (file)
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -2142,72 +2142,43 @@ out_no_task:
  }
  
  /*
- * Find the first tgid to return to user space.
+ * Find the first task whose tgid is >= the given tgid
   *
- * Usually this is just whatever follows &init_task, but if the users
- * buffer was too small to hold the full list or there was a seek into
- * the middle of the directory we have more work to do.
- *
- * In the case of a short read we start with find_task_by_pid.
- *
- * In the case of a seek we start with &init_task and walk nr
- * threads past it.
   */
-static struct task_struct *first_tgid(int tgid, unsigned int nr)
+static struct task_struct *next_tgid(unsigned int tgid)
  {
-       struct task_struct *pos;
-       rcu_read_lock();
-       if (tgid && nr) {
-               pos = find_task_by_pid(tgid);
-               if (pos && thread_group_leader(pos))
-                       goto found;
-       }
-       /* If nr exceeds the number of processes get out quickly */
-       pos = NULL;
-       if (nr && nr >= nr_processes())
-               goto done;
-
-       /* If we haven't found our starting place yet start with
-        * the init_task and walk nr tasks forward.
-        */
-       for (pos = next_task(&init_task); nr > 0; --nr) {
-               pos = next_task(pos);
-               if (pos == &init_task) {
-                       pos = NULL;
-                       goto done;
-               }
-       }
-found:
-       get_task_struct(pos);
-done:
-       rcu_read_unlock();
-       return pos;
-}
+       struct task_struct *task;
+       struct pid *pid;
  
-/*
- * Find the next task in the task list.
- * Return NULL if we loop or there is any error.
- *
- * The reference to the input task_struct is released.
- */
-static struct task_struct *next_tgid(struct task_struct *start)
-{
-       struct task_struct *pos;
         rcu_read_lock();
-       pos = start;
-       if (pid_alive(start))
-               pos = next_task(start);
-       if (pid_alive(pos) && (pos != &init_task)) {
-               get_task_struct(pos);
-               goto done;
+retry:
+       task = NULL;
+       pid = find_ge_pid(tgid);
+       if (pid) {
+               tgid = pid->nr + 1;
+               task = pid_task(pid, PIDTYPE_PID);
+               /* What we want to know is if the pid we have found is the
+                * pid of a thread_group_leader.  Testing for task
+                * being a thread_group_leader is the obvious thing
+                * to do but there is a window when it fails, due to
+                * the pid transfer logic in de_thread.
+                *
+                * So we perform the straight forward test of seeing
+                * if the pid we have found is the pid of a thread
+                * group leader, and don't worry if the task we have
+                * found doesn't happen to be a thread group leader.
+                * As we don't care in the case of readdir.
+                */
+               if (!task || !has_group_leader_pid(task))
+                       goto retry;
+               get_task_struct(task);
         }
-       pos = NULL;
-done:
         rcu_read_unlock();
-       put_task_struct(start);
-       return pos;
+       return task;
  }
  
+#define TGID_OFFSET (FIRST_PROCESS_ENTRY + (1 /* /proc/self */))
+
  /* for the /proc/ directory itself, after non-process stuff has been done */
  int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
  {
@@ -2223,29 +2194,24 @@ int proc_pid_readdir(struct file * filp, void * dirent, filldir_t filldir)
                 filp->f_pos++;
                 nr++;
         }
-       nr -= 1;
  
-       /* f_version caches the tgid value that the last readdir call couldn't
-        * return. lseek aka telldir automagically resets f_version to 0.
-        */
-       tgid = filp->f_version;
-       filp->f_version = 0;
-       for (task = first_tgid(tgid, nr);
+       tgid = filp->f_pos - TGID_OFFSET;
+       for (task = next_tgid(tgid);
              task;
-            task = next_tgid(task), filp->f_pos++) {
+            put_task_struct(task), task = next_tgid(tgid + 1)) {
                 int len;
                 ino_t ino;
                 tgid = task->pid;
+               filp->f_pos = tgid + TGID_OFFSET;
                 len = snprintf(buf, sizeof(buf), "%d", tgid);
                 ino = fake_ino(tgid, PROC_TGID_INO);
                 if (filldir(dirent, buf, len, filp->f_pos, ino, DT_DIR) < 0) {
-                       /* returning this tgid failed, save it as the first
-                        * pid for the next readir call */
-                       filp->f_version = tgid;
                         put_task_struct(task);
-                       break;
+                       goto out;
                 }
         }
+       filp->f_pos = PID_MAX_LIMIT + TGID_OFFSET;
+out:
         return 0;
  }
  
diff --git a/include/linux/pid.h b/include/linux/pid.h

index 93da7e2d9f30bdda2268d8163e3c9f29d73a4a4a..359121086de1891b6dde0e7eeea0e9328f7fbde7 100644 (file)
--- a/include/linux/pid.h
+++ b/include/linux/pid.h
@@ -89,6 +89,7 @@ extern struct pid *FASTCALL(find_pid(int nr));
   * Lookup a PID in the hash table, and return with it's count elevated.
   */
  extern struct pid *find_get_pid(int nr);
+extern struct pid *find_ge_pid(int nr);
  
  extern struct pid *alloc_pid(void);
  extern void FASTCALL(free_pid(struct pid *pid));
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 7ef899c47c29296b14f28030cab737fc6dc30700..be658e33bd26d3fba85a33f0d4e53e40018fd2d9 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1358,6 +1358,17 @@ extern void wait_task_inactive(struct task_struct * p);
  /* de_thread depends on thread_group_leader not being a pid based check */
  #define thread_group_leader(p) (p == p->group_leader)
  
+/* Due to the insanities of de_thread it is possible for a process
+ * to have the pid of the thread group leader without actually being
+ * the thread group leader.  For iteration through the pids in proc
+ * all we care about is that we have a task with the appropriate
+ * pid, we don't actually care if we have the right task.
+ */
+static inline int has_group_leader_pid(struct task_struct *p)
+{
+       return p->pid == p->tgid;
+}
+
  static inline struct task_struct *next_thread(const struct task_struct *p)
  {
         return list_entry(rcu_dereference(p->thread_group.next),
diff --git a/kernel/pid.c b/kernel/pid.c

index 8387e8c681938a968ae005f52c58fcdf11ab17c3..ed89a732432cd5e193dae3c09881d45c47a0607e 100644 (file)
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -145,6 +145,23 @@ static int alloc_pidmap(void)
         return -1;
  }
  
+static int next_pidmap(int last)
+{
+       int offset;
+       pidmap_t *map;
+
+       offset = (last + 1) & BITS_PER_PAGE_MASK;
+       map = &pidmap_array[(last + 1)/BITS_PER_PAGE];
+       for (; map < &pidmap_array[PIDMAP_ENTRIES]; map++, offset = 0) {
+               if (unlikely(!map->page))
+                       continue;
+               offset = find_next_bit((map)->page, BITS_PER_PAGE, offset);
+               if (offset < BITS_PER_PAGE)
+                       return mk_pid(map, offset);
+       }
+       return -1;
+}
+
  fastcall void put_pid(struct pid *pid)
  {
         if (!pid)
@@ -302,6 +319,25 @@ struct pid *find_get_pid(pid_t nr)
         return pid;
  }
  
+/*
+ * Used by proc to find the first pid that is greater than or equal to nr.
+ *
+ * If there is a pid at nr this function is exactly the same as find_pid.
+ */
+struct pid *find_ge_pid(int nr)
+{
+       struct pid *pid;
+
+       do {
+               pid = find_pid(nr);
+               if (pid)
+                       break;
+               nr = next_pidmap(nr);
+       } while (nr > 0);
+
+       return pid;
+}
+
  /*
   * The pid hash table is scaled according to the amount of memory in the
   * machine.  From a minimum of 16 slots up to 4096 slots at one gigabyte or
author	Eric W. Biederman <ebiederm@xmission.com>
	Mon, 2 Oct 2006 09:17:04 +0000 (02:17 -0700)
committer	Linus Torvalds <torvalds@g5.osdl.org>
	Mon, 2 Oct 2006 14:57:12 +0000 (07:57 -0700)
fs/proc/base.c		patch \| blob \| history
include/linux/pid.h		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
kernel/pid.c		patch \| blob \| history