X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Fvmscan.c;h=440a733fe2e9ea3d1374d4fd72e7bba60e268e05;hb=40ef1d8d48e364dce689342adfdc475aa53f4808;hp=acb7611cd5257e5efba27e4a70526130bc436620;hpb=46453a6e194a8c55fe6cf3dc8e1c4f24e2abc013;p=linux-2.6 diff --git a/mm/vmscan.c b/mm/vmscan.c index acb7611cd5..440a733fe2 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -33,39 +33,21 @@ #include #include #include +#include #include #include #include -/* possible outcome of pageout() */ -typedef enum { - /* failed to write page out, page is locked */ - PAGE_KEEP, - /* move page to the active list, page is locked */ - PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ - PAGE_SUCCESS, - /* page is clean and locked */ - PAGE_CLEAN, -} pageout_t; +#include "internal.h" struct scan_control { - /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */ - unsigned long nr_to_scan; - /* Incremented by the number of inactive pages that were scanned */ unsigned long nr_scanned; - /* Incremented by the number of pages reclaimed */ - unsigned long nr_reclaimed; - unsigned long nr_mapped; /* From page_state */ - /* Ask shrink_caches, or shrink_zone to scan at this priority */ - unsigned int priority; - /* This context's GFP mask */ gfp_t gfp_mask; @@ -183,10 +165,11 @@ EXPORT_SYMBOL(remove_shrinker); * * Returns the number of slab objects which we shrunk. */ -int shrink_slab(unsigned long scanned, gfp_t gfp_mask, unsigned long lru_pages) +unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, + unsigned long lru_pages) { struct shrinker *shrinker; - int ret = 0; + unsigned long ret = 0; if (scanned == 0) scanned = SWAP_CLUSTER_MAX; @@ -306,9 +289,10 @@ static void handle_write_error(struct address_space *mapping, } /* - * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). + * pageout is called by shrink_page_list() for each dirty page. + * Calls ->writepage(). */ -static pageout_t pageout(struct page *page, struct address_space *mapping) +pageout_t pageout(struct page *page, struct address_space *mapping) { /* * If the page is dirty, only perform writeback if that write @@ -376,7 +360,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping) return PAGE_CLEAN; } -static int remove_mapping(struct address_space *mapping, struct page *page) +int remove_mapping(struct address_space *mapping, struct page *page) { if (!mapping) return 0; /* truncate got there first */ @@ -414,14 +398,15 @@ cannot_free: } /* - * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed + * shrink_page_list() returns the number of reclaimed pages */ -static int shrink_list(struct list_head *page_list, struct scan_control *sc) +static unsigned long shrink_page_list(struct list_head *page_list, + struct scan_control *sc) { LIST_HEAD(ret_pages); struct pagevec freed_pvec; int pgactivate = 0; - int reclaimed = 0; + unsigned long nr_reclaimed = 0; cond_resched(); @@ -464,12 +449,9 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!sc->may_swap) - goto keep_locked; + if (PageAnon(page) && !PageSwapCache(page)) if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; - } #endif /* CONFIG_SWAP */ mapping = page_mapping(page); @@ -481,12 +463,6 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) * processes. 
Try to unmap it here. */ if (page_mapped(page) && mapping) { - /* - * No unmapping if we do not swap - */ - if (!sc->may_swap) - goto keep_locked; - switch (try_to_unmap(page, 0)) { case SWAP_FAIL: goto activate_locked; @@ -561,7 +537,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) free_it: unlock_page(page); - reclaimed++; + nr_reclaimed++; if (!pagevec_add(&freed_pvec, page)) __pagevec_release_nonlru(&freed_pvec); continue; @@ -579,483 +555,8 @@ keep: if (pagevec_count(&freed_pvec)) __pagevec_release_nonlru(&freed_pvec); mod_page_state(pgactivate, pgactivate); - sc->nr_reclaimed += reclaimed; - return reclaimed; -} - -#ifdef CONFIG_MIGRATION -static inline void move_to_lru(struct page *page) -{ - list_del(&page->lru); - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - -/* - * Add isolated pages on the list back to the LRU. - * - * returns the number of pages put back. - */ -int putback_lru_pages(struct list_head *l) -{ - struct page *page; - struct page *page2; - int count = 0; - - list_for_each_entry_safe(page, page2, l, lru) { - move_to_lru(page); - count++; - } - return count; -} - -/* - * Non migratable page - */ -int fail_migrate_page(struct page *newpage, struct page *page) -{ - return -EIO; -} -EXPORT_SYMBOL(fail_migrate_page); - -/* - * swapout a single page - * page is locked upon entry, unlocked on exit - */ -static int swap_page(struct page *page) -{ - struct address_space *mapping = page_mapping(page); - - if (page_mapped(page) && mapping) - if (try_to_unmap(page, 1) != SWAP_SUCCESS) - goto unlock_retry; - - if (PageDirty(page)) { - /* Page is dirty, try to write it out here */ - switch(pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_retry; - - case PAGE_SUCCESS: - goto retry; - - case PAGE_CLEAN: - ; /* try to free the page below */ - } - } - - if (PagePrivate(page)) { - if (!try_to_release_page(page, GFP_KERNEL) || - (!mapping && page_count(page) == 1)) - goto unlock_retry; - } - - if (remove_mapping(mapping, page)) { - /* Success */ - unlock_page(page); - return 0; - } - -unlock_retry: - unlock_page(page); - -retry: - return -EAGAIN; -} -EXPORT_SYMBOL(swap_page); - -/* - * Page migration was first developed in the context of the memory hotplug - * project. The main authors of the migration code are: - * - * IWAMOTO Toshihiro - * Hirokazu Takahashi - * Dave Hansen - * Christoph Lameter - */ - -/* - * Remove references for a page and establish the new page with the correct - * basic settings to be able to stop accesses to the page. - */ -int migrate_page_remove_references(struct page *newpage, - struct page *page, int nr_refs) -{ - struct address_space *mapping = page_mapping(page); - struct page **radix_pointer; - - /* - * Avoid doing any of the following work if the page count - * indicates that the page is in use or truncate has removed - * the page. - */ - if (!mapping || page_mapcount(page) + nr_refs != page_count(page)) - return -EAGAIN; - - /* - * Establish swap ptes for anonymous pages or destroy pte - * maps for files. - * - * In order to reestablish file backed mappings the fault handlers - * will take the radix tree_lock which may then be used to stop - * processses from accessing this page until the new page is ready. 
- * - * A process accessing via a swap pte (an anonymous page) will take a - * page_lock on the old page which will block the process until the - * migration attempt is complete. At that time the PageSwapCache bit - * will be examined. If the page was migrated then the PageSwapCache - * bit will be clear and the operation to retrieve the page will be - * retried which will find the new page in the radix tree. Then a new - * direct mapping may be generated based on the radix tree contents. - * - * If the page was not migrated then the PageSwapCache bit - * is still set and the operation may continue. - */ - if (try_to_unmap(page, 1) == SWAP_FAIL) - /* A vma has VM_LOCKED set -> Permanent failure */ - return -EPERM; - - /* - * Give up if we were unable to remove all mappings. - */ - if (page_mapcount(page)) - return -EAGAIN; - - write_lock_irq(&mapping->tree_lock); - - radix_pointer = (struct page **)radix_tree_lookup_slot( - &mapping->page_tree, - page_index(page)); - - if (!page_mapping(page) || page_count(page) != nr_refs || - *radix_pointer != page) { - write_unlock_irq(&mapping->tree_lock); - return -EAGAIN; - } - - /* - * Now we know that no one else is looking at the page. - * - * Certain minimal information about a page must be available - * in order for other subsystems to properly handle the page if they - * find it through the radix tree update before we are finished - * copying the page. - */ - get_page(newpage); - newpage->index = page->index; - newpage->mapping = page->mapping; - if (PageSwapCache(page)) { - SetPageSwapCache(newpage); - set_page_private(newpage, page_private(page)); - } - - *radix_pointer = newpage; - __put_page(page); - write_unlock_irq(&mapping->tree_lock); - - return 0; -} -EXPORT_SYMBOL(migrate_page_remove_references); - -/* - * Copy the page to its new location - */ -void migrate_page_copy(struct page *newpage, struct page *page) -{ - copy_highpage(newpage, page); - - if (PageError(page)) - SetPageError(newpage); - if (PageReferenced(page)) - SetPageReferenced(newpage); - if (PageUptodate(page)) - SetPageUptodate(newpage); - if (PageActive(page)) - SetPageActive(newpage); - if (PageChecked(page)) - SetPageChecked(newpage); - if (PageMappedToDisk(page)) - SetPageMappedToDisk(newpage); - - if (PageDirty(page)) { - clear_page_dirty_for_io(page); - set_page_dirty(newpage); - } - - ClearPageSwapCache(page); - ClearPageActive(page); - ClearPagePrivate(page); - set_page_private(page, 0); - page->mapping = NULL; - - /* - * If any waiters have accumulated on the new page then - * wake them up. - */ - if (PageWriteback(newpage)) - end_page_writeback(newpage); -} -EXPORT_SYMBOL(migrate_page_copy); - -/* - * Common logic to directly migrate a single page suitable for - * pages that do not use PagePrivate. - * - * Pages are locked upon entry and exit. - */ -int migrate_page(struct page *newpage, struct page *page) -{ - int rc; - - BUG_ON(PageWriteback(page)); /* Writeback must be complete */ - - rc = migrate_page_remove_references(newpage, page, 2); - - if (rc) - return rc; - - migrate_page_copy(newpage, page); - - /* - * Remove auxiliary swap entries and replace - * them with real ptes. - * - * Note that a real pte entry will allow processes that are not - * waiting on the page lock to use the new page via the page tables - * before the new page is unlocked. - */ - remove_from_swap(newpage); - return 0; -} -EXPORT_SYMBOL(migrate_page); - -/* - * migrate_pages - * - * Two lists are passed to this function. 
The first list - * contains the pages isolated from the LRU to be migrated. - * The second list contains new pages that the pages isolated - * can be moved to. If the second list is NULL then all - * pages are swapped out. - * - * The function returns after 10 attempts or if no pages - * are movable anymore because to has become empty - * or no retryable pages exist anymore. - * - * Return: Number of pages not migrated when "to" ran empty. - */ -int migrate_pages(struct list_head *from, struct list_head *to, - struct list_head *moved, struct list_head *failed) -{ - int retry; - int nr_failed = 0; - int pass = 0; - struct page *page; - struct page *page2; - int swapwrite = current->flags & PF_SWAPWRITE; - int rc; - - if (!swapwrite) - current->flags |= PF_SWAPWRITE; - -redo: - retry = 0; - - list_for_each_entry_safe(page, page2, from, lru) { - struct page *newpage = NULL; - struct address_space *mapping; - - cond_resched(); - - rc = 0; - if (page_count(page) == 1) - /* page was freed from under us. So we are done. */ - goto next; - - if (to && list_empty(to)) - break; - - /* - * Skip locked pages during the first two passes to give the - * functions holding the lock time to release the page. Later we - * use lock_page() to have a higher chance of acquiring the - * lock. - */ - rc = -EAGAIN; - if (pass > 2) - lock_page(page); - else - if (TestSetPageLocked(page)) - goto next; - - /* - * Only wait on writeback if we have already done a pass where - * we we may have triggered writeouts for lots of pages. - */ - if (pass > 0) { - wait_on_page_writeback(page); - } else { - if (PageWriteback(page)) - goto unlock_page; - } - - /* - * Anonymous pages must have swap cache references otherwise - * the information contained in the page maps cannot be - * preserved. - */ - if (PageAnon(page) && !PageSwapCache(page)) { - if (!add_to_swap(page, GFP_KERNEL)) { - rc = -ENOMEM; - goto unlock_page; - } - } - - if (!to) { - rc = swap_page(page); - goto next; - } - - newpage = lru_to_page(to); - lock_page(newpage); - - /* - * Pages are properly locked and writeback is complete. - * Try to migrate the page. - */ - mapping = page_mapping(page); - if (!mapping) - goto unlock_both; - - if (mapping->a_ops->migratepage) { - /* - * Most pages have a mapping and most filesystems - * should provide a migration function. Anonymous - * pages are part of swap space which also has its - * own migration function. This is the most common - * path for page migration. - */ - rc = mapping->a_ops->migratepage(newpage, page); - goto unlock_both; - } - - /* - * Default handling if a filesystem does not provide - * a migration function. We can only migrate clean - * pages so try to write out any dirty pages first. - */ - if (PageDirty(page)) { - switch (pageout(page, mapping)) { - case PAGE_KEEP: - case PAGE_ACTIVATE: - goto unlock_both; - - case PAGE_SUCCESS: - unlock_page(newpage); - goto next; - - case PAGE_CLEAN: - ; /* try to migrate the page below */ - } - } - - /* - * Buffers are managed in a filesystem specific way. - * We must have no buffers or drop them. - */ - if (!page_has_buffers(page) || - try_to_release_page(page, GFP_KERNEL)) { - rc = migrate_page(newpage, page); - goto unlock_both; - } - - /* - * On early passes with mapped pages simply - * retry. There may be a lock held for some - * buffers that may go away. Later - * swap them out. - */ - if (pass > 4) { - /* - * Persistently unable to drop buffers..... As a - * measure of last resort we fall back to - * swap_page(). 
- */ - unlock_page(newpage); - newpage = NULL; - rc = swap_page(page); - goto next; - } - -unlock_both: - unlock_page(newpage); - -unlock_page: - unlock_page(page); - -next: - if (rc == -EAGAIN) { - retry++; - } else if (rc) { - /* Permanent failure */ - list_move(&page->lru, failed); - nr_failed++; - } else { - if (newpage) { - /* Successful migration. Return page to LRU */ - move_to_lru(newpage); - } - list_move(&page->lru, moved); - } - } - if (retry && pass++ < 10) - goto redo; - - if (!swapwrite) - current->flags &= ~PF_SWAPWRITE; - - return nr_failed + retry; -} - -/* - * Isolate one page from the LRU lists and put it on the - * indicated list with elevated refcount. - * - * Result: - * 0 = page not on LRU list - * 1 = page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page) -{ - int ret = 0; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - spin_lock_irq(&zone->lru_lock); - if (TestClearPageLRU(page)) { - ret = 1; - get_page(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - } - spin_unlock_irq(&zone->lru_lock); - } - - return ret; + return nr_reclaimed; } -#endif /* * zone->lru_lock is heavily contended. Some of the functions that @@ -1074,36 +575,35 @@ int isolate_lru_page(struct page *page) * * returns how many pages were moved onto *@dst. */ -static int isolate_lru_pages(int nr_to_scan, struct list_head *src, - struct list_head *dst, int *scanned) +static unsigned long isolate_lru_pages(unsigned long nr_to_scan, + struct list_head *src, struct list_head *dst, + unsigned long *scanned) { - int nr_taken = 0; + unsigned long nr_taken = 0; struct page *page; - int scan = 0; + unsigned long scan; - while (scan++ < nr_to_scan && !list_empty(src)) { + for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) { + struct list_head *target; page = lru_to_page(src); prefetchw_prev_lru_page(page, src, flags); + BUG_ON(!PageLRU(page)); + list_del(&page->lru); - if (unlikely(get_page_testone(page))) { + target = src; + if (likely(get_page_unless_zero(page))) { /* - * It is being freed elsewhere + * Be careful not to clear PageLRU until after we're + * sure the page is not being freed elsewhere -- the + * page release code relies on it. */ - __put_page(page); - list_add(&page->lru, src); - continue; - } + ClearPageLRU(page); + target = dst; + nr_taken++; + } /* else it is being freed elsewhere */ - /* - * Be careful not to clear PageLRU until after we're sure - * the page is not being freed elsewhere -- the page release - * code relies on it. - */ - if (!TestClearPageLRU(page)) - BUG(); - list_add(&page->lru, dst); - nr_taken++; + list_add(&page->lru, target); } *scanned = scan; @@ -1111,23 +611,26 @@ static int isolate_lru_pages(int nr_to_scan, struct list_head *src, } /* - * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed + * shrink_inactive_list() is a helper for shrink_zone(). 
It returns the number + * of reclaimed pages */ -static void shrink_cache(struct zone *zone, struct scan_control *sc) +static unsigned long shrink_inactive_list(unsigned long max_scan, + struct zone *zone, struct scan_control *sc) { LIST_HEAD(page_list); struct pagevec pvec; - int max_scan = sc->nr_to_scan; + unsigned long nr_scanned = 0; + unsigned long nr_reclaimed = 0; pagevec_init(&pvec, 1); lru_add_drain(); spin_lock_irq(&zone->lru_lock); - while (max_scan > 0) { + do { struct page *page; - int nr_taken; - int nr_scan; - int nr_freed; + unsigned long nr_taken; + unsigned long nr_scan; + unsigned long nr_freed; nr_taken = isolate_lru_pages(sc->swap_cluster_max, &zone->inactive_list, @@ -1136,12 +639,9 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) zone->pages_scanned += nr_scan; spin_unlock_irq(&zone->lru_lock); - if (nr_taken == 0) - goto done; - - max_scan -= nr_scan; - nr_freed = shrink_list(&page_list, sc); - + nr_scanned += nr_scan; + nr_freed = shrink_page_list(&page_list, sc); + nr_reclaimed += nr_freed; local_irq_disable(); if (current_is_kswapd()) { __mod_page_state_zone(zone, pgscan_kswapd, nr_scan); @@ -1150,14 +650,17 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) __mod_page_state_zone(zone, pgscan_direct, nr_scan); __mod_page_state_zone(zone, pgsteal, nr_freed); + if (nr_taken == 0) + goto done; + spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. */ while (!list_empty(&page_list)) { page = lru_to_page(&page_list); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); list_del(&page->lru); if (PageActive(page)) add_page_to_active_list(zone, page); @@ -1169,10 +672,12 @@ static void shrink_cache(struct zone *zone, struct scan_control *sc) spin_lock_irq(&zone->lru_lock); } } - } - spin_unlock_irq(&zone->lru_lock); + } while (nr_scanned < max_scan); + spin_unlock(&zone->lru_lock); done: + local_irq_enable(); pagevec_release(&pvec); + return nr_reclaimed; } /* @@ -1192,13 +697,12 @@ done: * The downside is that we have to touch page->_count against each page. * But we had to alter page->flags anyway. 
*/ -static void -refill_inactive_zone(struct zone *zone, struct scan_control *sc) +static void shrink_active_list(unsigned long nr_pages, struct zone *zone, + struct scan_control *sc) { - int pgmoved; + unsigned long pgmoved; int pgdeactivate = 0; - int pgscanned; - int nr_pages = sc->nr_to_scan; + unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ LIST_HEAD(l_active); /* Pages to go onto the active_list */ @@ -1206,7 +710,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) struct pagevec pvec; int reclaim_mapped = 0; - if (unlikely(sc->may_swap)) { + if (sc->may_swap) { long mapped_ratio; long distress; long swap_tendency; @@ -1276,10 +780,11 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); - if (TestSetPageLRU(page)) - BUG(); - if (!TestClearPageActive(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); + BUG_ON(!PageActive(page)); + ClearPageActive(page); + list_move(&page->lru, &zone->inactive_list); pgmoved++; if (!pagevec_add(&pvec, page)) { @@ -1305,8 +810,8 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) while (!list_empty(&l_active)) { page = lru_to_page(&l_active); prefetchw_prev_lru_page(page, &l_active, flags); - if (TestSetPageLRU(page)) - BUG(); + BUG_ON(PageLRU(page)); + SetPageLRU(page); BUG_ON(!PageActive(page)); list_move(&page->lru, &zone->active_list); pgmoved++; @@ -1331,11 +836,13 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static void -shrink_zone(struct zone *zone, struct scan_control *sc) +static unsigned long shrink_zone(int priority, struct zone *zone, + struct scan_control *sc) { unsigned long nr_active; unsigned long nr_inactive; + unsigned long nr_to_scan; + unsigned long nr_reclaimed = 0; atomic_inc(&zone->reclaim_in_progress); @@ -1343,14 +850,14 @@ shrink_zone(struct zone *zone, struct scan_control *sc) * Add one to `nr_to_scan' just to make sure that the kernel will * slowly sift through the active list. 
*/ - zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; + zone->nr_scan_active += (zone->nr_active >> priority) + 1; nr_active = zone->nr_scan_active; if (nr_active >= sc->swap_cluster_max) zone->nr_scan_active = 0; else nr_active = 0; - zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; + zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1; nr_inactive = zone->nr_scan_inactive; if (nr_inactive >= sc->swap_cluster_max) zone->nr_scan_inactive = 0; @@ -1359,23 +866,25 @@ shrink_zone(struct zone *zone, struct scan_control *sc) while (nr_active || nr_inactive) { if (nr_active) { - sc->nr_to_scan = min(nr_active, + nr_to_scan = min(nr_active, (unsigned long)sc->swap_cluster_max); - nr_active -= sc->nr_to_scan; - refill_inactive_zone(zone, sc); + nr_active -= nr_to_scan; + shrink_active_list(nr_to_scan, zone, sc); } if (nr_inactive) { - sc->nr_to_scan = min(nr_inactive, + nr_to_scan = min(nr_inactive, (unsigned long)sc->swap_cluster_max); - nr_inactive -= sc->nr_to_scan; - shrink_cache(zone, sc); + nr_inactive -= nr_to_scan; + nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, + sc); } } throttle_vm_writeout(); atomic_dec(&zone->reclaim_in_progress); + return nr_reclaimed; } /* @@ -1394,9 +903,10 @@ shrink_zone(struct zone *zone, struct scan_control *sc) * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static void -shrink_caches(struct zone **zones, struct scan_control *sc) +static unsigned long shrink_zones(int priority, struct zone **zones, + struct scan_control *sc) { + unsigned long nr_reclaimed = 0; int i; for (i = 0; zones[i] != NULL; i++) { @@ -1408,15 +918,16 @@ shrink_caches(struct zone **zones, struct scan_control *sc) if (!cpuset_zone_allowed(zone, __GFP_HARDWALL)) continue; - zone->temp_priority = sc->priority; - if (zone->prev_priority > sc->priority) - zone->prev_priority = sc->priority; + zone->temp_priority = priority; + if (zone->prev_priority > priority) + zone->prev_priority = priority; - if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) + if (zone->all_unreclaimable && priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - shrink_zone(zone, sc); + nr_reclaimed += shrink_zone(priority, zone, sc); } + return nr_reclaimed; } /* @@ -1432,19 +943,21 @@ shrink_caches(struct zone **zones, struct scan_control *sc) * holds filesystem locks which prevent writeout this might not work, and the * allocation attempt will fail. 
*/ -int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) +unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask) { int priority; int ret = 0; - int total_scanned = 0, total_reclaimed = 0; + unsigned long total_scanned = 0; + unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; - struct scan_control sc; unsigned long lru_pages = 0; int i; - - sc.gfp_mask = gfp_mask; - sc.may_writepage = !laptop_mode; - sc.may_swap = 1; + struct scan_control sc = { + .gfp_mask = gfp_mask, + .may_writepage = !laptop_mode, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_swap = 1, + }; inc_page_state(allocstall); @@ -1461,20 +974,16 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) for (priority = DEF_PRIORITY; priority >= 0; priority--) { sc.nr_mapped = read_page_state(nr_mapped); sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = priority; - sc.swap_cluster_max = SWAP_CLUSTER_MAX; if (!priority) disable_swap_token(); - shrink_caches(zones, &sc); + nr_reclaimed += shrink_zones(priority, zones, &sc); shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { - sc.nr_reclaimed += reclaim_state->reclaimed_slab; + nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } total_scanned += sc.nr_scanned; - total_reclaimed += sc.nr_reclaimed; - if (total_reclaimed >= sc.swap_cluster_max) { + if (nr_reclaimed >= sc.swap_cluster_max) { ret = 1; goto out; } @@ -1486,7 +995,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { + if (total_scanned > sc.swap_cluster_max + + sc.swap_cluster_max / 2) { wakeup_pdflush(laptop_mode ? 0 : total_scanned); sc.may_writepage = 1; } @@ -1532,22 +1042,26 @@ out: * the page allocator fallback scheme to ensure that aging of pages is balanced * across the zones. */ -static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) +static unsigned long balance_pgdat(pg_data_t *pgdat, unsigned long nr_pages, + int order) { - int to_free = nr_pages; + unsigned long to_free = nr_pages; int all_zones_ok; int priority; int i; - int total_scanned, total_reclaimed; + unsigned long total_scanned; + unsigned long nr_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; - struct scan_control sc; + struct scan_control sc = { + .gfp_mask = GFP_KERNEL, + .may_swap = 1, + .swap_cluster_max = nr_pages ? nr_pages : SWAP_CLUSTER_MAX, + }; loop_again: total_scanned = 0; - total_reclaimed = 0; - sc.gfp_mask = GFP_KERNEL; + nr_reclaimed = 0; sc.may_writepage = !laptop_mode; - sc.may_swap = 1; sc.nr_mapped = read_page_state(nr_mapped); inc_page_state(pageoutrun); @@ -1628,15 +1142,11 @@ scan: if (zone->prev_priority > priority) zone->prev_priority = priority; sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = priority; - sc.swap_cluster_max = nr_pages? 
nr_pages : SWAP_CLUSTER_MAX; - shrink_zone(zone, &sc); + nr_reclaimed += shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - total_reclaimed += sc.nr_reclaimed; + nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; if (zone->all_unreclaimable) continue; @@ -1649,10 +1159,10 @@ scan: * even in laptop mode */ if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > total_reclaimed+total_reclaimed/2) + total_scanned > nr_reclaimed + nr_reclaimed / 2) sc.may_writepage = 1; } - if (nr_pages && to_free > total_reclaimed) + if (nr_pages && to_free > nr_reclaimed) continue; /* swsusp: need to do more work */ if (all_zones_ok) break; /* kswapd: all done */ @@ -1669,7 +1179,7 @@ scan: * matches the direct reclaim path behaviour in terms of impact * on zone->*_priority. */ - if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) + if ((nr_reclaimed >= SWAP_CLUSTER_MAX) && !nr_pages) break; } out: @@ -1683,7 +1193,7 @@ out: goto loop_again; } - return total_reclaimed; + return nr_reclaimed; } /* @@ -1783,24 +1293,31 @@ void wakeup_kswapd(struct zone *zone, int order) * Try to free `nr_pages' of memory, system-wide. Returns the number of freed * pages. */ -int shrink_all_memory(int nr_pages) +unsigned long shrink_all_memory(unsigned long nr_pages) { pg_data_t *pgdat; - int nr_to_free = nr_pages; - int ret = 0; + unsigned long nr_to_free = nr_pages; + unsigned long ret = 0; + unsigned retry = 2; struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; current->reclaim_state = &reclaim_state; - for_each_pgdat(pgdat) { - int freed; +repeat: + for_each_online_pgdat(pgdat) { + unsigned long freed; + freed = balance_pgdat(pgdat, nr_to_free, 0); ret += freed; nr_to_free -= freed; - if (nr_to_free <= 0) + if ((long)nr_to_free <= 0) break; } + if (retry-- && ret < nr_pages) { + blk_congestion_wait(WRITE, HZ/5); + goto repeat; + } current->reclaim_state = NULL; return ret; } @@ -1811,15 +1328,14 @@ int shrink_all_memory(int nr_pages) not required for correctness. So if the last cpu in a node goes away, we get changed to run anywhere: as the first one comes back, restore their cpu bindings. */ -static int __devinit cpu_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) +static int cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) { pg_data_t *pgdat; cpumask_t mask; if (action == CPU_ONLINE) { - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { mask = node_to_cpumask(pgdat->node_id); if (any_online_cpu(mask) != NR_CPUS) /* One of our CPUs online: restore mask */ @@ -1833,10 +1349,17 @@ static int __devinit cpu_callback(struct notifier_block *nfb, static int __init kswapd_init(void) { pg_data_t *pgdat; + swap_setup(); - for_each_pgdat(pgdat) - pgdat->kswapd - = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); + for_each_online_pgdat(pgdat) { + pid_t pid; + + pid = kernel_thread(kswapd, pgdat, CLONE_KERNEL); + BUG_ON(pid < 0); + read_lock(&tasklist_lock); + pgdat->kswapd = find_task_by_pid(pid); + read_unlock(&tasklist_lock); + } total_memory = nr_free_pagecache_pages(); hotcpu_notifier(cpu_callback, 0); return 0; @@ -1878,46 +1401,24 @@ int zone_reclaim_interval __read_mostly = 30*HZ; /* * Try to free up some pages from this zone through reclaim. 
*/ -int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) { - int nr_pages; + /* Minimum pages needed in order to stay on node */ + const unsigned long nr_pages = 1 << order; struct task_struct *p = current; struct reclaim_state reclaim_state; - struct scan_control sc; - cpumask_t mask; - int node_id; - - if (time_before(jiffies, - zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) - return 0; - - if (!(gfp_mask & __GFP_WAIT) || - zone->all_unreclaimable || - atomic_read(&zone->reclaim_in_progress) > 0 || - (p->flags & PF_MEMALLOC)) - return 0; - - node_id = zone->zone_pgdat->node_id; - mask = node_to_cpumask(node_id); - if (!cpus_empty(mask) && node_id != numa_node_id()) - return 0; - - sc.may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE); - sc.may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP); - sc.nr_scanned = 0; - sc.nr_reclaimed = 0; - sc.priority = ZONE_RECLAIM_PRIORITY + 1; - sc.nr_mapped = read_page_state(nr_mapped); - sc.gfp_mask = gfp_mask; + int priority; + unsigned long nr_reclaimed = 0; + struct scan_control sc = { + .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), + .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .nr_mapped = read_page_state(nr_mapped), + .swap_cluster_max = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), + .gfp_mask = gfp_mask, + }; disable_swap_token(); - - nr_pages = 1 << order; - if (nr_pages > SWAP_CLUSTER_MAX) - sc.swap_cluster_max = nr_pages; - else - sc.swap_cluster_max = SWAP_CLUSTER_MAX; - cond_resched(); /* * We need to be able to allocate from the reserves for RECLAIM_SWAP @@ -1932,17 +1433,20 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Free memory by calling shrink zone with increasing priorities * until we have enough memory freed. */ + priority = ZONE_RECLAIM_PRIORITY; do { - sc.priority--; - shrink_zone(zone, &sc); + nr_reclaimed += shrink_zone(priority, zone, &sc); + priority--; + } while (priority >= 0 && nr_reclaimed < nr_pages); - } while (sc.nr_reclaimed < nr_pages && sc.priority > 0); - - if (sc.nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { + if (nr_reclaimed < nr_pages && (zone_reclaim_mode & RECLAIM_SLAB)) { /* - * shrink_slab does not currently allow us to determine - * how many pages were freed in the zone. So we just - * shake the slab and then go offnode for a single allocation. + * shrink_slab() does not currently allow us to determine how + * many pages were freed in this zone. So we just shake the slab + * a bit and then go off node for this particular allocation + * despite possibly having freed enough memory to allocate in + * this zone. If we freed local memory then the next + * allocations will be local again. * * shrink_slab will free memory on all zones and may take * a long time. @@ -1953,10 +1457,54 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - if (sc.nr_reclaimed == 0) + if (nr_reclaimed == 0) { + /* + * We were unable to reclaim enough pages to stay on node. We + * now allow off node accesses for a certain time period before + * trying again to reclaim pages from the local zone. 
+ */ zone->last_unsuccessful_zone_reclaim = jiffies; + } - return sc.nr_reclaimed >= nr_pages; + return nr_reclaimed >= nr_pages; } -#endif +int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) +{ + cpumask_t mask; + int node_id; + + /* + * Do not reclaim if there was a recent unsuccessful attempt at zone + * reclaim. In that case we let allocations go off node for the + * zone_reclaim_interval. Otherwise we would scan for each off-node + * page allocation. + */ + if (time_before(jiffies, + zone->last_unsuccessful_zone_reclaim + zone_reclaim_interval)) + return 0; + + /* + * Avoid concurrent zone reclaims, do not reclaim in a zone that does + * not have reclaimable pages and if we should not delay the allocation + * then do not scan. + */ + if (!(gfp_mask & __GFP_WAIT) || + zone->all_unreclaimable || + atomic_read(&zone->reclaim_in_progress) > 0 || + (current->flags & PF_MEMALLOC)) + return 0; + + /* + * Only run zone reclaim on the local zone or on zones that do not + * have associated processors. This will favor the local processor + * over remote processors and spread off node memory allocations + * as wide as possible. + */ + node_id = zone->zone_pgdat->node_id; + mask = node_to_cpumask(node_id); + if (!cpus_empty(mask) && node_id != numa_node_id()) + return 0; + return __zone_reclaim(zone, gfp_mask, order); +} +#endif