Lumpy Reclaim V4

author Andy Whitcroft <apw@shadowen.org>

Tue, 17 Jul 2007 11:03:16 +0000 (04:03 -0700)

committer Linus Torvalds <torvalds@woody.linux-foundation.org>

Tue, 17 Jul 2007 17:22:59 +0000 (10:22 -0700)
author Andy Whitcroft <apw@shadowen.org>
Tue, 17 Jul 2007 11:03:16 +0000 (04:03 -0700)
committer Linus Torvalds <torvalds@woody.linux-foundation.org>
Tue, 17 Jul 2007 17:22:59 +0000 (10:22 -0700)
diff --git a/fs/buffer.c b/fs/buffer.c

index 94344b2e0b46ccd3f4e19070e020be4ed6535bfd..d654a3b6209e30a0fb68d438bee7a40a9bfc54cd 100644 (file)
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -356,7 +356,7 @@ static void free_more_memory(void)
         for_each_online_pgdat(pgdat) {
                 zones = pgdat->node_zonelists[gfp_zone(GFP_NOFS)].zones;
                 if (*zones)
-                       try_to_free_pages(zones, GFP_NOFS);
+                       try_to_free_pages(zones, 0, GFP_NOFS);
         }
  }
  
diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h

index d71ff763c9df65e0132d27dc6442449b2a8b300b..da8eb8ad9e9b0a6b8578c845765170c24ad24e8c 100644 (file)
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -24,6 +24,14 @@
  #endif
  #define MAX_ORDER_NR_PAGES (1 << (MAX_ORDER - 1))
  
+/*
+ * PAGE_ALLOC_COSTLY_ORDER is the order at which allocations are deemed
+ * costly to service.  That is between allocation orders which should
+ * coalesce naturally under reasonable reclaim pressure and those which
+ * will not.
+ */
+#define PAGE_ALLOC_COSTLY_ORDER 3
+
  struct free_area {
         struct list_head        free_list;
         unsigned long           nr_free;
diff --git a/include/linux/swap.h b/include/linux/swap.h

index 006868881346b771eb471c91a4cde68047f3edc6..665f85f2a3afe61ee8be2ee49e29953b3b46bc4f 100644 (file)
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -188,7 +188,8 @@ extern int rotate_reclaimable_page(struct page *page);
  extern void swap_setup(void);
  
  /* linux/mm/vmscan.c */
-extern unsigned long try_to_free_pages(struct zone **, gfp_t);
+extern unsigned long try_to_free_pages(struct zone **zones, int order,
+                                       gfp_t gfp_mask);
  extern unsigned long shrink_all_memory(unsigned long nr_pages);
  extern int vm_swappiness;
  extern int remove_mapping(struct address_space *mapping, struct page *page);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index ac4f8c6b5c10771b71196c0af1238e894d225655..1a889c3fec59c0259496e4edb24ff929560c2b36 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1333,7 +1333,7 @@ nofail_alloc:
         reclaim_state.reclaimed_slab = 0;
         p->reclaim_state = &reclaim_state;
  
-       did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
+       did_some_progress = try_to_free_pages(zonelist->zones, order, gfp_mask);
  
         p->reclaim_state = NULL;
         p->flags &= ~PF_MEMALLOC;
@@ -1370,7 +1370,8 @@ nofail_alloc:
          */
         do_retry = 0;
         if (!(gfp_mask & __GFP_NORETRY)) {
-               if ((order <= 3) || (gfp_mask & __GFP_REPEAT))
+               if ((order <= PAGE_ALLOC_COSTLY_ORDER) ||
+                                               (gfp_mask & __GFP_REPEAT))
                         do_retry = 1;
                 if (gfp_mask & __GFP_NOFAIL)
                         do_retry = 1;
diff --git a/mm/vmscan.c b/mm/vmscan.c

index 1be5a6376ef0719b5e46937924a8a66cf59a35f1..1d9971d8924bb074b5a4204909e2a15d773bf371 100644 (file)
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -66,6 +66,8 @@ struct scan_control {
         int swappiness;
  
         int all_unreclaimable;
+
+       int order;
  };
  
  /*
@@ -481,7 +483,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
  
                 referenced = page_referenced(page, 1);
                 /* In active use or really unfreeable?  Activate it. */
-               if (referenced && page_mapping_inuse(page))
+               if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
+                                       referenced && page_mapping_inuse(page))
                         goto activate_locked;
  
  #ifdef CONFIG_SWAP
@@ -514,7 +517,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 }
  
                 if (PageDirty(page)) {
-                       if (referenced)
+                       if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
                                 goto keep_locked;
                         if (!may_enter_fs)
                                 goto keep_locked;
@@ -598,6 +601,51 @@ keep:
         return nr_reclaimed;
  }
  
+/* LRU Isolation modes. */
+#define ISOLATE_INACTIVE 0     /* Isolate inactive pages. */
+#define ISOLATE_ACTIVE 1       /* Isolate active pages. */
+#define ISOLATE_BOTH 2         /* Isolate both active and inactive pages. */
+
+/*
+ * Attempt to remove the specified page from its LRU.  Only take this page
+ * if it is of the appropriate PageActive status.  Pages which are being
+ * freed elsewhere are also ignored.
+ *
+ * page:       page to consider
+ * mode:       one of the LRU isolation modes defined above
+ *
+ * returns 0 on success, -ve errno on failure.
+ */
+static int __isolate_lru_page(struct page *page, int mode)
+{
+       int ret = -EINVAL;
+
+       /* Only take pages on the LRU. */
+       if (!PageLRU(page))
+               return ret;
+
+       /*
+        * When checking the active state, we need to be sure we are
+        * dealing with comparable boolean values.  Take the logical not
+        * of each.
+        */
+       if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode))
+               return ret;
+
+       ret = -EBUSY;
+       if (likely(get_page_unless_zero(page))) {
+               /*
+                * Be careful not to clear PageLRU until after we're
+                * sure the page is not being freed elsewhere -- the
+                * page release code relies on it.
+                */
+               ClearPageLRU(page);
+               ret = 0;
+       }
+
+       return ret;
+}
+
  /*
   * zone->lru_lock is heavily contended.  Some of the functions that
   * shrink the lists perform better by taking out a batch of pages
@@ -612,44 +660,114 @@ keep:
   * @src:       The LRU list to pull pages off.
   * @dst:       The temp list to put pages on to.
   * @scanned:   The number of pages that were scanned.
+ * @order:     The caller's attempted allocation order
+ * @mode:      One of the LRU isolation modes
   *
   * returns how many pages were moved onto *@dst.
   */
  static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
                 struct list_head *src, struct list_head *dst,
-               unsigned long *scanned)
+               unsigned long *scanned, int order, int mode)
  {
         unsigned long nr_taken = 0;
-       struct page *page;
         unsigned long scan;
  
         for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
-               struct list_head *target;
+               struct page *page;
+               unsigned long pfn;
+               unsigned long end_pfn;
+               unsigned long page_pfn;
+               int zone_id;
+
                 page = lru_to_page(src);
                 prefetchw_prev_lru_page(page, src, flags);
  
                 VM_BUG_ON(!PageLRU(page));
  
-               list_del(&page->lru);
-               target = src;
-               if (likely(get_page_unless_zero(page))) {
-                       /*
-                        * Be careful not to clear PageLRU until after we're
-                        * sure the page is not being freed elsewhere -- the
-                        * page release code relies on it.
-                        */
-                       ClearPageLRU(page);
-                       target = dst;
+               switch (__isolate_lru_page(page, mode)) {
+               case 0:
+                       list_move(&page->lru, dst);
                         nr_taken++;
-               } /* else it is being freed elsewhere */
+                       break;
  
-               list_add(&page->lru, target);
+               case -EBUSY:
+                       /* else it is being freed elsewhere */
+                       list_move(&page->lru, src);
+                       continue;
+
+               default:
+                       BUG();
+               }
+
+               if (!order)
+                       continue;
+
+               /*
+                * Attempt to take all pages in the order aligned region
+                * surrounding the tag page.  Only take those pages of
+                * the same active state as that tag page.  We may safely
+                * round the target page pfn down to the requested order
+                * as the mem_map is guaranteed valid out to MAX_ORDER,
+                * where that page is in a different zone we will detect
+                * it from its zone id and abort this block scan.
+                */
+               zone_id = page_zone_id(page);
+               page_pfn = page_to_pfn(page);
+               pfn = page_pfn & ~((1 << order) - 1);
+               end_pfn = pfn + (1 << order);
+               for (; pfn < end_pfn; pfn++) {
+                       struct page *cursor_page;
+
+                       /* The target page is in the block, ignore it. */
+                       if (unlikely(pfn == page_pfn))
+                               continue;
+
+                       /* Avoid holes within the zone. */
+                       if (unlikely(!pfn_valid_within(pfn)))
+                               break;
+
+                       cursor_page = pfn_to_page(pfn);
+                       /* Check that we have not crossed a zone boundary. */
+                       if (unlikely(page_zone_id(cursor_page) != zone_id))
+                               continue;
+                       switch (__isolate_lru_page(cursor_page, mode)) {
+                       case 0:
+                               list_move(&cursor_page->lru, dst);
+                               nr_taken++;
+                               scan++;
+                               break;
+
+                       case -EBUSY:
+                               /* else it is being freed elsewhere */
+                               list_move(&cursor_page->lru, src);
+                       default:
+                               break;
+                       }
+               }
         }
  
         *scanned = scan;
         return nr_taken;
  }
  
+/*
+ * clear_active_flags() is a helper for shrink_active_list(), clearing
+ * any active bits from the pages in the list.
+ */
+static unsigned long clear_active_flags(struct list_head *page_list)
+{
+       int nr_active = 0;
+       struct page *page;
+
+       list_for_each_entry(page, page_list, lru)
+               if (PageActive(page)) {
+                       ClearPageActive(page);
+                       nr_active++;
+               }
+
+       return nr_active;
+}
+
  /*
   * shrink_inactive_list() is a helper for shrink_zone().  It returns the number
   * of reclaimed pages
@@ -671,11 +789,18 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                 unsigned long nr_taken;
                 unsigned long nr_scan;
                 unsigned long nr_freed;
+               unsigned long nr_active;
  
                 nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-                                            &zone->inactive_list,
-                                            &page_list, &nr_scan);
-               __mod_zone_page_state(zone, NR_INACTIVE, -nr_taken);
+                            &zone->inactive_list,
+                            &page_list, &nr_scan, sc->order,
+                            (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
+                                            ISOLATE_BOTH : ISOLATE_INACTIVE);
+               nr_active = clear_active_flags(&page_list);
+
+               __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
+               __mod_zone_page_state(zone, NR_INACTIVE,
+                                               -(nr_taken - nr_active));
                 zone->pages_scanned += nr_scan;
                 spin_unlock_irq(&zone->lru_lock);
  
@@ -820,7 +945,7 @@ force_reclaim_mapped:
         lru_add_drain();
         spin_lock_irq(&zone->lru_lock);
         pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
-                                   &l_hold, &pgscanned);
+                           &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
         zone->pages_scanned += pgscanned;
         __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
         spin_unlock_irq(&zone->lru_lock);
@@ -1011,7 +1136,7 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
   * holds filesystem locks which prevent writeout this might not work, and the
   * allocation attempt will fail.
   */
-unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
  {
         int priority;
         int ret = 0;
@@ -1026,6 +1151,7 @@ unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
                 .swap_cluster_max = SWAP_CLUSTER_MAX,
                 .may_swap = 1,
                 .swappiness = vm_swappiness,
+               .order = order,
         };
  
         count_vm_event(ALLOCSTALL);
@@ -1131,6 +1257,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
                 .may_swap = 1,
                 .swap_cluster_max = SWAP_CLUSTER_MAX,
                 .swappiness = vm_swappiness,
+               .order = order,
         };
         /*
          * temp_priority is used to remember the scanning priority at which
author	Andy Whitcroft <apw@shadowen.org>
	Tue, 17 Jul 2007 11:03:16 +0000 (04:03 -0700)
committer	Linus Torvalds <torvalds@woody.linux-foundation.org>
	Tue, 17 Jul 2007 17:22:59 +0000 (10:22 -0700)
fs/buffer.c		patch \| blob \| history
include/linux/mmzone.h		patch \| blob \| history
include/linux/swap.h		patch \| blob \| history
mm/page_alloc.c		patch \| blob \| history
mm/vmscan.c		patch \| blob \| history