X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Fvmscan.c;h=a26dabd62fed40c8ec7832dc4656feaf7d3909f9;hb=4b11111aba6c80cc2969fd1806d2a869bfc9f357;hp=1b85217b528c38538487d46356069a8c9ffc0b5f;hpb=f1a9ee758de7de1e040de849fdef46e6802ea117;p=linux-2.6 diff --git a/mm/vmscan.c b/mm/vmscan.c index 1b85217b52..a26dabd62f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -126,6 +126,12 @@ long vm_total_pages; /* The total number of pages which the VM controls */ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); +#ifdef CONFIG_CGROUP_MEM_CONT +#define scan_global_lru(sc) (!(sc)->mem_cgroup) +#else +#define scan_global_lru(sc) (1) +#endif + /* * Add a shrinker callback to be called from the vm */ @@ -850,7 +856,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); __mod_zone_page_state(zone, NR_INACTIVE, -(nr_taken - nr_active)); - zone->pages_scanned += nr_scan; + if (scan_global_lru(sc)) + zone->pages_scanned += nr_scan; spin_unlock_irq(&zone->lru_lock); nr_scanned += nr_scan; @@ -882,8 +889,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, if (current_is_kswapd()) { __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); __count_vm_events(KSWAPD_STEAL, nr_freed); - } else + } else if (scan_global_lru(sc)) __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); + __count_zone_vm_events(PGSTEAL, zone, nr_freed); if (nr_taken == 0) @@ -936,6 +944,113 @@ static inline int zone_is_near_oom(struct zone *zone) + zone_page_state(zone, NR_INACTIVE))*3; } +/* + * Determine whether we should try to reclaim mapped pages. + * Handles both global LRU reclaim and mem_cgroup reclaim. 
+ */ +static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, + int priority) +{ + long mapped_ratio; + long distress; + long swap_tendency; + long imbalance; + int reclaim_mapped = 0; + int prev_priority; + + if (scan_global_lru(sc) && zone_is_near_oom(zone)) + return 1; + /* + * `distress' is a measure of how much trouble we're having + * reclaiming pages. 0 -> no problems. 100 -> great trouble. + */ + if (scan_global_lru(sc)) + prev_priority = zone->prev_priority; + else + prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup); + + distress = 100 >> min(prev_priority, priority); + + /* + * The point of this algorithm is to decide when to start + * reclaiming mapped memory instead of just pagecache. Work out + * how much memory + * is mapped. + */ + if (scan_global_lru(sc)) + mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + + global_page_state(NR_ANON_PAGES)) * 100) / + vm_total_pages; + else + mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup); + + /* + * Now decide how much we really want to unmap some pages. The + * mapped ratio is downgraded - just because there's a lot of + * mapped memory doesn't necessarily mean that page reclaim + * isn't succeeding. + * + * The distress ratio is important - we don't want to start + * going oom. + * + * A 100% value of vm_swappiness overrides this algorithm + * altogether. + */ + swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; + + /* + * If there's huge imbalance between active and inactive + * (think active 100 times larger than inactive) we should + * become more permissive, or the system will take too much + * cpu before it start swapping during memory pressure. + * Distress is about avoiding early-oom, this is about + * making swappiness graceful despite setting it to low + * values. + * + * Avoid div by zero with nr_inactive+1, and max resulting + * value is vm_total_pages. 
+ */ + if (scan_global_lru(sc)) { + imbalance = zone_page_state(zone, NR_ACTIVE); + imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; + } else + imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup); + + /* + * Reduce the effect of imbalance if swappiness is low, + * this means for a swappiness very low, the imbalance + * must be much higher than 100 for this logic to make + * the difference. + * + * Max temporary value is vm_total_pages*100. + */ + imbalance *= (vm_swappiness + 1); + imbalance /= 100; + + /* + * If not much of the ram is mapped, makes the imbalance + * less relevant, it's high priority we refill the inactive + * list with mapped pages only in presence of high ratio of + * mapped pages. + * + * Max temporary value is vm_total_pages*100. + */ + imbalance *= mapped_ratio; + imbalance /= 100; + + /* apply imbalance feedback to swap_tendency */ + swap_tendency += imbalance; + + /* + * Now use this metric to decide whether to start moving mapped + * memory onto the inactive list. + */ + if (swap_tendency >= 100) + reclaim_mapped = 1; + + return reclaim_mapped; +} + /* * This moves pages from the active list to the inactive list. * @@ -953,6 +1068,8 @@ static inline int zone_is_near_oom(struct zone *zone) * The downside is that we have to touch page->_count against each page. * But we had to alter page->flags anyway. */ + + static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority) { @@ -966,100 +1083,21 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct pagevec pvec; int reclaim_mapped = 0; - if (sc->may_swap) { - long mapped_ratio; - long distress; - long swap_tendency; - long imbalance; - - if (zone_is_near_oom(zone)) - goto force_reclaim_mapped; - - /* - * `distress' is a measure of how much trouble we're having - * reclaiming pages. 0 -> no problems. 100 -> great trouble. 
- */ - distress = 100 >> min(zone->prev_priority, priority); - - /* - * The point of this algorithm is to decide when to start - * reclaiming mapped memory instead of just pagecache. Work out - * how much memory - * is mapped. - */ - mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + - global_page_state(NR_ANON_PAGES)) * 100) / - vm_total_pages; - - /* - * Now decide how much we really want to unmap some pages. The - * mapped ratio is downgraded - just because there's a lot of - * mapped memory doesn't necessarily mean that page reclaim - * isn't succeeding. - * - * The distress ratio is important - we don't want to start - * going oom. - * - * A 100% value of vm_swappiness overrides this algorithm - * altogether. - */ - swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; - - /* - * If there's huge imbalance between active and inactive - * (think active 100 times larger than inactive) we should - * become more permissive, or the system will take too much - * cpu before it start swapping during memory pressure. - * Distress is about avoiding early-oom, this is about - * making swappiness graceful despite setting it to low - * values. - * - * Avoid div by zero with nr_inactive+1, and max resulting - * value is vm_total_pages. - */ - imbalance = zone_page_state(zone, NR_ACTIVE); - imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; - - /* - * Reduce the effect of imbalance if swappiness is low, - * this means for a swappiness very low, the imbalance - * must be much higher than 100 for this logic to make - * the difference. - * - * Max temporary value is vm_total_pages*100. - */ - imbalance *= (vm_swappiness + 1); - imbalance /= 100; - - /* - * If not much of the ram is mapped, makes the imbalance - * less relevant, it's high priority we refill the inactive - * list with mapped pages only in presence of high ratio of - * mapped pages. - * - * Max temporary value is vm_total_pages*100. 
- */ - imbalance *= mapped_ratio; - imbalance /= 100; - - /* apply imbalance feedback to swap_tendency */ - swap_tendency += imbalance; - - /* - * Now use this metric to decide whether to start moving mapped - * memory onto the inactive list. - */ - if (swap_tendency >= 100) -force_reclaim_mapped: - reclaim_mapped = 1; - } + if (sc->may_swap) + reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); lru_add_drain(); spin_lock_irq(&zone->lru_lock); pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE, zone, sc->mem_cgroup, 1); - zone->pages_scanned += pgscanned; + /* + * zone->pages_scanned is used to detect zone OOM; + * mem_cgroup remembers nr_scan by itself. + */ + if (scan_global_lru(sc)) + zone->pages_scanned += pgscanned; + __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); spin_unlock_irq(&zone->lru_lock); @@ -1149,25 +1187,39 @@ static unsigned long shrink_zone(int priority, struct zone *zone, unsigned long nr_to_scan; unsigned long nr_reclaimed = 0; - /* - * Add one to `nr_to_scan' just to make sure that the kernel will - * slowly sift through the active list. - */ - zone->nr_scan_active += - (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; - nr_active = zone->nr_scan_active; - if (nr_active >= sc->swap_cluster_max) - zone->nr_scan_active = 0; - else - nr_active = 0; + if (scan_global_lru(sc)) { + /* + * Add one to nr_to_scan just to make sure that the kernel + * will slowly sift through the active list. 
+ */ + zone->nr_scan_active += + (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; + nr_active = zone->nr_scan_active; + zone->nr_scan_inactive += + (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; + nr_inactive = zone->nr_scan_inactive; + if (nr_inactive >= sc->swap_cluster_max) + zone->nr_scan_inactive = 0; + else + nr_inactive = 0; + + if (nr_active >= sc->swap_cluster_max) + zone->nr_scan_active = 0; + else + nr_active = 0; + } else { + /* + * This reclaim occurs not because zone memory shortage but + * because memory controller hits its limit. + * Then, don't modify zone reclaim related data. + */ + nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup, + zone, priority); + + nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup, + zone, priority); + } - zone->nr_scan_inactive += - (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; - nr_inactive = zone->nr_scan_inactive; - if (nr_inactive >= sc->swap_cluster_max) - zone->nr_scan_inactive = 0; - else - nr_inactive = 0; while (nr_active || nr_inactive) { if (nr_active) { @@ -1212,25 +1264,39 @@ static unsigned long shrink_zones(int priority, struct zone **zones, unsigned long nr_reclaimed = 0; int i; + sc->all_unreclaimable = 1; for (i = 0; zones[i] != NULL; i++) { struct zone *zone = zones[i]; if (!populated_zone(zone)) continue; + /* + * Take care memory controller reclaiming has small influence + * to global LRU. 
+ */ + if (scan_global_lru(sc)) { + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + note_zone_scanning_priority(zone, priority); - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; - - note_zone_scanning_priority(zone, priority); - - if (zone_is_all_unreclaimable(zone) && priority != DEF_PRIORITY) - continue; /* Let kswapd poll it */ - - sc->all_unreclaimable = 0; + if (zone_is_all_unreclaimable(zone) && + priority != DEF_PRIORITY) + continue; /* Let kswapd poll it */ + sc->all_unreclaimable = 0; + } else { + /* + * Ignore cpuset limitation here. We just want to reduce + * # of used pages by us regardless of memory shortage. + */ + sc->all_unreclaimable = 0; + mem_cgroup_note_reclaim_priority(sc->mem_cgroup, + priority); + } nr_reclaimed += shrink_zone(priority, zone, sc); } + return nr_reclaimed; } @@ -1258,16 +1324,21 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, unsigned long lru_pages = 0; int i; - count_vm_event(ALLOCSTALL); - - for (i = 0; zones[i] != NULL; i++) { - struct zone *zone = zones[i]; + if (scan_global_lru(sc)) + count_vm_event(ALLOCSTALL); + /* + * mem_cgroup will not do shrink_slab. 
+ */ + if (scan_global_lru(sc)) { + for (i = 0; zones[i] != NULL; i++) { + struct zone *zone = zones[i]; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); + lru_pages += zone_page_state(zone, NR_ACTIVE) + + zone_page_state(zone, NR_INACTIVE); + } } for (priority = DEF_PRIORITY; priority >= 0; priority--) { @@ -1280,11 +1351,12 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, * Don't shrink slabs when reclaiming memory from * over limit cgroups */ - if (sc->mem_cgroup == NULL) + if (scan_global_lru(sc)) { shrink_slab(sc->nr_scanned, gfp_mask, lru_pages); - if (reclaim_state) { - nr_reclaimed += reclaim_state->reclaimed_slab; - reclaim_state->reclaimed_slab = 0; + if (reclaim_state) { + nr_reclaimed += reclaim_state->reclaimed_slab; + reclaim_state->reclaimed_slab = 0; + } } total_scanned += sc->nr_scanned; if (nr_reclaimed >= sc->swap_cluster_max) { @@ -1311,7 +1383,7 @@ static unsigned long do_try_to_free_pages(struct zone **zones, gfp_t gfp_mask, congestion_wait(WRITE, HZ/10); } /* top priority shrink_caches still had more to do? 
don't OOM, then */ - if (!sc->all_unreclaimable && sc->mem_cgroup == NULL) + if (!sc->all_unreclaimable && scan_global_lru(sc)) ret = 1; out: /* @@ -1323,14 +1395,19 @@ out: */ if (priority < 0) priority = 0; - for (i = 0; zones[i] != NULL; i++) { - struct zone *zone = zones[i]; - if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) - continue; + if (scan_global_lru(sc)) { + for (i = 0; zones[i] != NULL; i++) { + struct zone *zone = zones[i]; + + if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) + continue; + + zone->prev_priority = priority; + } + } else + mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority); - zone->prev_priority = priority; - } return ret; } @@ -1365,15 +1442,12 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, }; - int node; struct zone **zones; int target_zone = gfp_zone(GFP_HIGHUSER_MOVABLE); - for_each_online_node(node) { - zones = NODE_DATA(node)->node_zonelists[target_zone].zones; - if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) - return 1; - } + zones = NODE_DATA(numa_node_id())->node_zonelists[target_zone].zones; + if (do_try_to_free_pages(zones, sc.gfp_mask, &sc)) + return 1; return 0; } #endif