[PATCH] cpuset: rework cpuset_zone_allowed api

author Paul Jackson <pj@sgi.com>

Wed, 13 Dec 2006 08:34:25 +0000 (00:34 -0800)

committer Linus Torvalds <torvalds@woody.osdl.org>

Wed, 13 Dec 2006 17:05:49 +0000 (09:05 -0800)
author Paul Jackson <pj@sgi.com>
Wed, 13 Dec 2006 08:34:25 +0000 (00:34 -0800)
committer Linus Torvalds <torvalds@woody.osdl.org>
Wed, 13 Dec 2006 17:05:49 +0000 (09:05 -0800)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h

index 8821e1f75b447856a041435d966eb12fb5c94941..826b15e914e2b9986787189dcba1d2efe15e15c3 100644 (file)
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -30,10 +30,19 @@ void cpuset_update_task_memory_state(void);
                 nodes_subset((nodes), current->mems_allowed)
  int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl);
  
-extern int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask);
-static int inline cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
+extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
+
+static int inline cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+{
+       return number_of_cpusets <= 1 ||
+               __cpuset_zone_allowed_softwall(z, gfp_mask);
+}
+
+static int inline cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
  {
-       return number_of_cpusets <= 1 || __cpuset_zone_allowed(z, gfp_mask);
+       return number_of_cpusets <= 1 ||
+               __cpuset_zone_allowed_hardwall(z, gfp_mask);
  }
  
  extern int cpuset_excl_nodes_overlap(const struct task_struct *p);
@@ -94,7 +103,12 @@ static inline int cpuset_zonelist_valid_mems_allowed(struct zonelist *zl)
         return 1;
  }
  
-static inline int cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+{
+       return 1;
+}
+
+static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
  {
         return 1;
  }
diff --git a/kernel/cpuset.c b/kernel/cpuset.c

index 2c3b4431472bec305dc41dec0c180c23d009ad58..232aed2b10f9a403346cc6e4a425c4ed8613a4d1 100644 (file)
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2342,32 +2342,48 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
  }
  
  /**
- * cpuset_zone_allowed - Can we allocate memory on zone z's memory node?
+ * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
   * @z: is this zone on an allowed node?
- * @gfp_mask: memory allocation flags (we use __GFP_HARDWALL)
+ * @gfp_mask: memory allocation flags
   *
- * If we're in interrupt, yes, we can always allocate.  If zone
+ * If we're in interrupt, yes, we can always allocate.  If
+ * __GFP_THISNODE is set, yes, we can always allocate.  If zone
   * z's node is in our tasks mems_allowed, yes.  If it's not a
   * __GFP_HARDWALL request and this zone's nodes is in the nearest
   * mem_exclusive cpuset ancestor to this tasks cpuset, yes.
   * Otherwise, no.
   *
+ * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
+ * reduces to cpuset_zone_allowed_hardwall().  Otherwise,
+ * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
+ * from an enclosing cpuset.
+ *
+ * cpuset_zone_allowed_hardwall() only handles the simpler case of
+ * hardwall cpusets, and never sleeps.
+ *
+ * The __GFP_THISNODE placement logic is really handled elsewhere,
+ * by forcibly using a zonelist starting at a specified node, and by
+ * (in get_page_from_freelist()) refusing to consider the zones for
+ * any node on the zonelist except the first.  By the time any such
+ * calls get to this routine, we should just shut up and say 'yes'.
+ *
   * GFP_USER allocations are marked with the __GFP_HARDWALL bit,
   * and do not allow allocations outside the current tasks cpuset.
   * GFP_KERNEL allocations are not so marked, so can escape to the
- * nearest mem_exclusive ancestor cpuset.
+ * nearest enclosing mem_exclusive ancestor cpuset.
   *
- * Scanning up parent cpusets requires callback_mutex.  The __alloc_pages()
- * routine only calls here with __GFP_HARDWALL bit _not_ set if
- * it's a GFP_KERNEL allocation, and all nodes in the current tasks
- * mems_allowed came up empty on the first pass over the zonelist.
- * So only GFP_KERNEL allocations, if all nodes in the cpuset are
- * short of memory, might require taking the callback_mutex mutex.
+ * Scanning up parent cpusets requires callback_mutex.  The
+ * __alloc_pages() routine only calls here with __GFP_HARDWALL bit
+ * _not_ set if it's a GFP_KERNEL allocation, and all nodes in the
+ * current task's mems_allowed came up empty on the first pass over
+ * the zonelist.  So only GFP_KERNEL allocations, if all nodes in the
+ * cpuset are short of memory, might require taking the
+ * callback_mutex.
   *
   * The first call here from mm/page_alloc:get_page_from_freelist()
- * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets, so
- * no allocation on a node outside the cpuset is allowed (unless in
- * interrupt, of course).
+ * has __GFP_HARDWALL set in gfp_mask, enforcing hardwall cpusets,
+ * so no allocation on a node outside the cpuset is allowed (unless
+ * in interrupt, of course).
   *
   * The second pass through get_page_from_freelist() doesn't even call
   * here for GFP_ATOMIC calls.  For those calls, the __alloc_pages()
@@ -2380,12 +2396,12 @@ static const struct cpuset *nearest_exclusive_ancestor(const struct cpuset *cs)
   *     GFP_USER     - only nodes in current tasks mems allowed ok.
   *
   * Rule:
- *    Don't call cpuset_zone_allowed() if you can't sleep, unless you
+ *    Don't call cpuset_zone_allowed_softwall() if you can't sleep, unless you
   *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
   *    the code that might scan up ancestor cpusets and sleep.
- **/
+ */
  
-int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
+int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
  {
         int node;                       /* node that zone z is on */
         const struct cpuset *cs;        /* current cpuset ancestors */
@@ -2415,6 +2431,40 @@ int __cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask)
         return allowed;
  }
  
+/*
+ * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
+ * @z: is this zone on an allowed node?
+ * @gfp_mask: memory allocation flags
+ *
+ * If we're in interrupt, yes, we can always allocate.
+ * If __GFP_THISNODE is set, yes, we can always allocate.  If zone
+ * z's node is in our task's mems_allowed, yes.  Otherwise, no.
+ *
+ * The __GFP_THISNODE placement logic is really handled elsewhere,
+ * by forcibly using a zonelist starting at a specified node, and by
+ * (in get_page_from_freelist()) refusing to consider the zones for
+ * any node on the zonelist except the first.  By the time any such
+ * calls get to this routine, we should just shut up and say 'yes'.
+ *
+ * Unlike the cpuset_zone_allowed_softwall() variant, above,
+ * this variant requires that the zone be in the current task's
+ * mems_allowed, that we're in interrupt, or that __GFP_THISNODE is
+ * set.  It does not scan up the cpuset hierarchy for the nearest
+ * enclosing mem_exclusive cpuset.  It never sleeps.
+ */
+
+int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+{
+       int node;                       /* node that zone z is on */
+
+       if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
+               return 1;
+       node = zone_to_nid(z);
+       if (node_isset(node, current->mems_allowed))
+               return 1;
+       return 0;
+}
+
  /**
   * cpuset_lock - lock out any changes to cpuset structures
   *
diff --git a/mm/hugetlb.c b/mm/hugetlb.c

index 0ccc7f2302529b0e3a237c556f895ab4ddfb3911..089092d152ab3ee80a0e1e17d67ff6bc923a6759 100644 (file)
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -73,7 +73,7 @@ static struct page *dequeue_huge_page(struct vm_area_struct *vma,
  
         for (z = zonelist->zones; *z; z++) {
                 nid = zone_to_nid(*z);
-               if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+               if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
                     !list_empty(&hugepage_freelists[nid]))
                         break;
         }
diff --git a/mm/oom_kill.c b/mm/oom_kill.c

index 223d9ccb7d64b6b28f5642ba3ab12629a170419a..64cf3c2146348b0e1cba3fd98d64354bcb466c4a 100644 (file)
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -177,7 +177,7 @@ static inline int constrained_alloc(struct zonelist *zonelist, gfp_t gfp_mask)
         nodemask_t nodes = node_online_map;
  
         for (z = zonelist->zones; *z; z++)
-               if (cpuset_zone_allowed(*z, gfp_mask))
+               if (cpuset_zone_allowed_softwall(*z, gfp_mask))
                         node_clear(zone_to_nid(*z), nodes);
                 else
                         return CONSTRAINT_CPUSET;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index e6b17b2989e099636dcb1ec64587f130b8ad27c0..8c1a116875bc71d520069686dcb2483a79659604 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1162,7 +1162,7 @@ zonelist_scan:
                         zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
                                 break;
                 if ((alloc_flags & ALLOC_CPUSET) &&
-                       !cpuset_zone_allowed(zone, gfp_mask))
+                       !cpuset_zone_allowed_softwall(zone, gfp_mask))
                                 goto try_next_zone;
  
                 if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
diff --git a/mm/slab.c b/mm/slab.c

index 9d3550086c93273fc848e70a9c403a674bf780f5..b856786a3a30ba2b563296717e0ad619b074be66 100644 (file)
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3262,7 +3262,7 @@ retry:
         for (z = zonelist->zones; *z && !obj; z++) {
                 nid = zone_to_nid(*z);
  
-               if (cpuset_zone_allowed(*z, flags | __GFP_HARDWALL) &&
+               if (cpuset_zone_allowed_hardwall(*z, flags) &&
                         cache->nodelists[nid] &&
                         cache->nodelists[nid]->free_objects)
                                 obj = ____cache_alloc_node(cache,
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 093f5fe6dd7795cc5d0ca15f5918eedc822b87f4..e9813b06c7a32f2be3b534722c521457a384e028 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -984,7 +984,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
                 if (!populated_zone(zone))
                         continue;
  
-               if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                         continue;
  
                 note_zone_scanning_priority(zone, priority);
@@ -1034,7 +1034,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
         for (i = 0; zones[i] != NULL; i++) {
                 struct zone *zone = zones[i];
  
-               if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                         continue;
  
                 lru_pages += zone->nr_active + zone->nr_inactive;
@@ -1089,7 +1089,7 @@ out:
         for (i = 0; zones[i] != 0; i++) {
                 struct zone *zone = zones[i];
  
-               if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                         continue;
  
                 zone->prev_priority = priority;
@@ -1354,7 +1354,7 @@ void wakeup_kswapd(struct zone *zone, int order)
                 return;
         if (pgdat->kswapd_max_order < order)
                 pgdat->kswapd_max_order = order;
-       if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
                 return;
         if (!waitqueue_active(&pgdat->kswapd_wait))
                 return;
author	Paul Jackson <pj@sgi.com>
	Wed, 13 Dec 2006 08:34:25 +0000 (00:34 -0800)
committer	Linus Torvalds <torvalds@woody.osdl.org>
	Wed, 13 Dec 2006 17:05:49 +0000 (09:05 -0800)
include/linux/cpuset.h		patch \| blob \| history
kernel/cpuset.c		patch \| blob \| history
mm/hugetlb.c		patch \| blob \| history
mm/oom_kill.c		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/slab.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history