*/
#define DEF_PRIORITY 12
+/* Maximum number of zones on a zonelist */
+#define MAX_ZONES_PER_ZONELIST (MAX_NUMNODES * MAX_NR_ZONES)
+
+#ifdef CONFIG_NUMA
+/*
+ * We cache key information from each zonelist for a smaller cache
+ * footprint when scanning for free pages in get_page_from_freelist().
+ *
+ * 1) The BITMAP fullzones tracks which zones in a zonelist have come
+ * up short of free memory since the last time (last_full_zap)
+ * we zeroed fullzones.
+ * 2) The array z_to_n[] maps each zone in the zonelist to its node
+ * id, so that we can efficiently evaluate whether that node is
+ * set in the current task's mems_allowed.
+ *
+ * Both fullzones and z_to_n[] are one-to-one with the zonelist,
+ * indexed by a zone's offset in the zonelist's zones[] array.
+ *
+ * The get_page_from_freelist() routine does two scans. During the
+ * first scan, we skip zones whose corresponding bit in 'fullzones'
+ * is set or whose corresponding node in current->mems_allowed (which
+ * comes from cpusets) is not set. During the second scan, we bypass
+ * this zonelist_cache, to ensure we look methodically at each zone.
+ *
+ * Once per second, we zero out (zap) fullzones, forcing us to
+ * reconsider nodes that might have regained more free memory.
+ * The field last_full_zap is the time we last zapped fullzones.
+ *
+ * This mechanism reduces the amount of time we waste repeatedly
+ * re-examining zones for free memory when they came up short
+ * only moments ago.
+ *
+ * The zonelist_cache struct members logically belong in struct
+ * zonelist. However, the mempolicy zonelists constructed for
+ * MPOL_BIND are intentionally variable length (and usually much
+ * shorter). A general purpose mechanism for handling structs with
+ * multiple variable length members is more mechanism than we want
+ * here. We resort to some special case hackery instead.
+ *
+ * The MPOL_BIND zonelists don't need this zonelist_cache (in good
+ * part because they are shorter), so we put the fixed length stuff
+ * at the front of the zonelist struct, ending in a variable length
+ * zones[], as is needed by MPOL_BIND.
+ *
+ * Then we put the optional zonelist cache on the end of the zonelist
+ * struct. This optional stuff is found by a 'zlcache_ptr' pointer in
+ * the fixed length portion at the front of the struct. This pointer
+ * both lets us find the zonelist cache and, in the case of MPOL_BIND
+ * zonelists (which simply set zlcache_ptr to NULL), tells us that
+ * the zonelist cache is not there.
+ *
+ * The end result is that struct zonelists come in two flavors:
+ * 1) The full, fixed length version, shown below, and
+ * 2) The custom zonelists for MPOL_BIND.
+ * The custom MPOL_BIND zonelists have a NULL zlcache_ptr and no zlcache.
+ *
+ * Even though there may be multiple CPU cores on a node modifying
+ * fullzones or last_full_zap in the same zonelist_cache at the same
+ * time, we don't lock it. This is just hint data - if it is wrong now
+ * and then, the allocator will still function, perhaps a bit slower.
+ */
+
+struct zonelist_cache {
+ DECLARE_BITMAP(fullzones, MAX_ZONES_PER_ZONELIST); /* zone full? */
+ unsigned short z_to_n[MAX_ZONES_PER_ZONELIST]; /* zone->nid */
+ unsigned long last_full_zap; /* when last zap'd (jiffies) */
+};
+#else
+struct zonelist_cache;
+#endif
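To make the footprint argument above concrete (the exact sizes depend on CONFIG_NODES_SHIFT and the zone configuration, so treat these numbers as assumptions): with MAX_NUMNODES = 64 and MAX_NR_ZONES = 4, MAX_ZONES_PER_ZONELIST is 256, so fullzones is a 256-bit (32 byte) bitmap and z_to_n[] is 512 bytes of node ids. A first-pass scan then touches only a few cachelines of hint data per zonelist instead of dereferencing every candidate struct zone.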
+
/*
* One allocation request operates on a zonelist. A zonelist
* is a list of zones, the first one is the 'goal' of the
* allocation, the other zones are fallback zones, in decreasing
* priority.
*
- * Right now a zonelist takes up less than a cacheline. We never
- * modify it apart from boot-up, and only a few indices are used,
- * so despite the zonelist table being relatively big, the cache
- * footprint of this construct is very small.
+ * If zlcache_ptr is not NULL, then it is just the address of zlcache,
+ * as explained above. If zlcache_ptr is NULL, there is no zlcache.
*/
+
struct zonelist {
- struct zone *zones[MAX_NUMNODES * MAX_NR_ZONES + 1]; // NULL delimited
+ struct zonelist_cache *zlcache_ptr; // NULL or &zlcache
+ struct zone *zones[MAX_ZONES_PER_ZONELIST + 1]; // NULL delimited
+#ifdef CONFIG_NUMA
+ struct zonelist_cache zlcache; // optional ...
+#endif
};
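The second flavor is not visible in this hunk: the MPOL_BIND constructor in mm/mempolicy.c needs only the fixed-length head plus however many zone pointers it actually uses, and it advertises the missing cache by clearing zlcache_ptr. A minimal sketch of that shape, assuming a bind_zonelist()-style helper (the name, sizing, and fill loop here are illustrative, not the patch's actual mempolicy.c hunk):

/*
 * Illustrative only: allocate a short, cache-less zonelist for
 * MPOL_BIND and mark the zonelist cache as absent.
 */
static struct zonelist *sketch_bind_zonelist(nodemask_t *nodes)
{
	int max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
	int num = 0;
	struct zonelist *zl;

	/* Only the fixed-length head and the zone pointers we need */
	zl = kmalloc(offsetof(struct zonelist, zones) +
			(max + 1) * sizeof(struct zone *), GFP_KERNEL);
	if (!zl)
		return NULL;
	zl->zlcache_ptr = NULL;		/* no zlcache for MPOL_BIND */
	/* ... fill zl->zones[0..num-1] from the nodes allowed by the policy ... */
	zl->zones[num] = NULL;		/* NULL delimited, as in the full flavor */
	return zl;
}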
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
return 1;
}
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in the last second) found to be nearly full. See
+ * further comments in mmzone.h. Reduces the cache footprint of zonelist
+ * scans that have to skip over a lot of full or disallowed zones.
+ *
+ * If the zonelist cache is present in the passed-in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * task's mems_allowed, or node_online_map).
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since it was last zapped), then we zap it out (clear its bits).
+ *
+ * We hold off even calling zlc_setup until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ nodemask_t *allowednodes; /* zonelist_cache approximation */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return NULL;
+
+ if (time_after(jiffies, zlc->last_full_zap + HZ)) {
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ zlc->last_full_zap = jiffies;
+ }
+
+ allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+ &cpuset_current_mems_allowed :
+ &node_online_map;
+ return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ * 1) Check that the zone isn't thought to be full (doesn't have its
+ * bit set in the zonelist_cache fullzones BITMAP).
+ * 2) Check that the zone's node (obtained from the zonelist_cache
+ * z_to_n[] mapping) is allowed in the passed-in allowednodes mask.
+ * Return true (non-zero) if the zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even if it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+ nodemask_t *allowednodes)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ int i; /* index of *z in zonelist zones */
+ int n; /* node that zone *z is on */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return 1;
+
+ i = z - zonelist->zones;
+ n = zlc->z_to_n[i];
+
+ /* This zone is worth trying if it is allowed but not full */
+ return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
+/*
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ int i; /* index of *z in zonelist zones */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return;
+
+ i = z - zonelist->zones;
+
+ set_bit(i, zlc->fullzones);
+}
+
+#else /* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+ return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+ nodemask_t *allowednodes)
+{
+ return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif /* CONFIG_NUMA */
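Because the real call sites are guarded by NUMA_BUILD and the non-NUMA stubs above are no-ops, the three helpers compose into a simple two-pass protocol. The get_page_from_freelist() hunk below shows it only in fragments, so here is a condensed sketch of the intended calling pattern; try_alloc_from() is a hypothetical stand-in for the watermark checks and buffered_rmqueue() call, and the NUMA_BUILD guards are omitted for brevity:

static struct page *sketch_scan(struct zonelist *zonelist, int alloc_flags)
{
	nodemask_t *allowednodes = NULL;
	int zlc_active = 0;	/* pass 1 consults the cache, pass 2 does not */
	int did_setup = 0;	/* zlc_setup() is deferred and done only once */
	struct zone **z;
	struct page *page;

restart:
	for (z = zonelist->zones; *z; z++) {
		if (zlc_active &&
		    !zlc_zone_worth_trying(zonelist, z, allowednodes))
			continue;	/* cached as full or on a disallowed node */
		page = try_alloc_from(*z);	/* hypothetical helper */
		if (page)
			return page;
		zlc_mark_zone_full(zonelist, z);	/* remember the failure */
		if (!did_setup) {
			/* deferred until the first zone has been tried */
			allowednodes = zlc_setup(zonelist, alloc_flags);
			zlc_active = 1;
			did_setup = 1;
		}
	}
	if (zlc_active) {
		zlc_active = 0;		/* second pass: reconsider every zone */
		goto restart;
	}
	return NULL;
}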
+
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, int alloc_flags)
{
- struct zone **z = zonelist->zones;
+ struct zone **z;
struct page *page = NULL;
- int classzone_idx = zone_idx(*z);
+ int classzone_idx = zone_idx(zonelist->zones[0]);
struct zone *zone;
+ nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+ int zlc_active = 0; /* set if using zonelist_cache */
+ int did_zlc_setup = 0; /* just call zlc_setup() one time */
+zonelist_scan:
/*
- * Go through the zonelist once, looking for a zone with enough free.
+ * Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
+ z = zonelist->zones;
+
do {
+ if (NUMA_BUILD && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
zone = *z;
if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
break;
if ((alloc_flags & ALLOC_CPUSET) &&
!cpuset_zone_allowed(zone, gfp_mask))
- continue;
+ goto try_next_zone;
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
classzone_idx, alloc_flags)) {
if (!zone_reclaim_mode ||
!zone_reclaim(zone, gfp_mask, order))
- continue;
+ goto this_zone_full;
}
}
page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
if (page)
break;
-
+this_zone_full:
+ if (NUMA_BUILD)
+ zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+ if (NUMA_BUILD && !did_zlc_setup) {
+ /* we do zlc_setup after the first zone is tried */
+ allowednodes = zlc_setup(zonelist, alloc_flags);
+ zlc_active = 1;
+ did_zlc_setup = 1;
+ }
} while (*(++z) != NULL);
+
+ if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+ /* Disable zlc cache for second zonelist scan */
+ zlc_active = 0;
+ goto zonelist_scan;
+ }
return page;
}
}
}
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zonelist *zonelist;
+ struct zonelist_cache *zlc;
+ struct zone **z;
+
+ zonelist = pgdat->node_zonelists + i;
+ zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ for (z = zonelist->zones; *z; z++)
+ zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+ }
+}
+
#else /* CONFIG_NUMA */
static void __meminit build_zonelists(pg_data_t *pgdat)
}
}
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
#endif /* CONFIG_NUMA */
/* return values int ....just for stop_machine_run() */
static int __meminit __build_all_zonelists(void *dummy)
{
int nid;
- for_each_online_node(nid)
+
+ for_each_online_node(nid) {
build_zonelists(NODE_DATA(nid));
+ build_zonelist_cache(NODE_DATA(nid));
+ }
return 0;
}