X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=8d088371196a559b5baa5a8dc02e2d07253fd50c;hb=11d31886dbcb61039ed3789e583d21c6e70960fd;hp=a9da20bc2ed09a754fb69046f32e7e38066507ce;hpb=0c35bbadc59f5ed105c34471143eceb4c0dd9c95;p=linux-2.6 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a9da20bc2e..8d08837119 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -68,7 +68,7 @@ EXPORT_SYMBOL(nr_swap_pages); * Used by page_zone() to look up the address of the struct zone whose * id is encoded in the upper bits of page->flags */ -struct zone *zone_table[1 << (ZONES_SHIFT + NODES_SHIFT)]; +struct zone *zone_table[1 << ZONETABLE_SHIFT]; EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" }; @@ -105,11 +105,13 @@ static void bad_page(const char *function, struct page *page) printk(KERN_EMERG "Backtrace:\n"); dump_stack(); printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); - page->flags &= ~(1 << PG_private | + page->flags &= ~(1 << PG_lru | + 1 << PG_private | 1 << PG_locked | - 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | + 1 << PG_reclaim | + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback); set_page_count(page, 0); @@ -440,14 +442,17 @@ void set_page_refs(struct page *page, int order) */ static void prep_new_page(struct page *page, int order) { - if (page->mapping || page_mapcount(page) || - (page->flags & ( + if ( page_mapcount(page) || + page->mapping != NULL || + page_count(page) != 0 || + (page->flags & ( + 1 << PG_lru | 1 << PG_private | 1 << PG_locked | - 1 << PG_lru | 1 << PG_active | 1 << PG_dirty | 1 << PG_reclaim | + 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback ))) bad_page(__FUNCTION__, page); @@ -511,6 +516,36 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, return allocated; } +#ifdef CONFIG_NUMA +/* Called from the slab reaper to drain remote pagesets */ +void drain_remote_pages(void) +{ + struct zone *zone; + int i; + unsigned long flags; + + local_irq_save(flags); + for_each_zone(zone) { + struct per_cpu_pageset *pset; + + /* Do not drain local pagesets */ + if (zone->zone_pgdat->node_id == numa_node_id()) + continue; + + pset = zone->pageset[smp_processor_id()]; + for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { + struct per_cpu_pages *pcp; + + pcp = &pset->pcp[i]; + if (pcp->count) + pcp->count -= free_pages_bulk(zone, pcp->count, + &pcp->list, 0); + } + } + local_irq_restore(flags); +} +#endif + #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { @@ -520,7 +555,7 @@ static void __drain_pages(unsigned int cpu) for_each_zone(zone) { struct per_cpu_pageset *pset; - pset = &zone->pageset[cpu]; + pset = zone_pcp(zone, cpu); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; @@ -583,12 +618,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) local_irq_save(flags); cpu = smp_processor_id(); - p = &z->pageset[cpu]; + p = zone_pcp(z,cpu); if (pg == orig) { - z->pageset[cpu].numa_hit++; + p->numa_hit++; } else { p->numa_miss++; - zonelist->zones[0]->pageset[cpu].numa_foreign++; + zone_pcp(zonelist->zones[0], cpu)->numa_foreign++; } if (pg == NODE_DATA(numa_node_id())) p->local_node++; @@ -615,12 +650,12 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (PageAnon(page)) page->mapping = NULL; free_pages_check(__FUNCTION__, page); - pcp = &zone->pageset[get_cpu()].pcp[cold]; + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); - 
if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); list_add(&page->lru, &pcp->list); pcp->count++; + if (pcp->count >= pcp->high) + pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); local_irq_restore(flags); put_cpu(); } @@ -659,7 +694,7 @@ buffered_rmqueue(struct zone *zone, int order, unsigned int __nocast gfp_flags) if (order == 0) { struct per_cpu_pages *pcp; - pcp = &zone->pageset[get_cpu()].pcp[cold]; + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); if (pcp->count <= pcp->low) pcp->count += rmqueue_bulk(zone, 0, @@ -854,7 +889,7 @@ rebalance: reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zones, gfp_mask, order); + did_some_progress = try_to_free_pages(zones, gfp_mask); p->reclaim_state = NULL; p->flags &= ~PF_MEMALLOC; @@ -862,12 +897,6 @@ rebalance: cond_resched(); if (likely(did_some_progress)) { - /* - * Go through the zonelist yet one more time, keep - * very high watermark here, this is only to catch - * a parallel oom killing, we must fail if we're still - * under heavy pressure. - */ for (i = 0; (z = zones[i]) != NULL; i++) { if (!zone_watermark_ok(z, order, z->pages_min, classzone_idx, can_try_harder, @@ -901,7 +930,7 @@ rebalance: goto got_pg; } - out_of_memory(gfp_mask); + out_of_memory(gfp_mask, order); goto restart; } @@ -930,6 +959,7 @@ nopage: " order:%d, mode:0x%x\n", p->comm, order, gfp_mask); dump_stack(); + show_mem(); } return NULL; got_pg: @@ -1031,20 +1061,19 @@ unsigned int nr_free_pages_pgdat(pg_data_t *pgdat) static unsigned int nr_free_zone_pages(int offset) { - pg_data_t *pgdat; + /* Just pick one node, since fallback list is circular */ + pg_data_t *pgdat = NODE_DATA(numa_node_id()); unsigned int sum = 0; - for_each_pgdat(pgdat) { - struct zonelist *zonelist = pgdat->node_zonelists + offset; - struct zone **zonep = zonelist->zones; - struct zone *zone; + struct zonelist *zonelist = pgdat->node_zonelists + offset; + struct zone **zonep = zonelist->zones; + struct zone *zone; - for (zone = *zonep++; zone; zone = *zonep++) { - unsigned long size = zone->present_pages; - unsigned long high = zone->pages_high; - if (size > high) - sum += size - high; - } + for (zone = *zonep++; zone; zone = *zonep++) { + unsigned long size = zone->present_pages; + unsigned long high = zone->pages_high; + if (size > high) + sum += size - high; } return sum; @@ -1139,7 +1168,7 @@ void get_full_page_state(struct page_state *ret) __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long)); } -unsigned long __read_page_state(unsigned offset) +unsigned long __read_page_state(unsigned long offset) { unsigned long ret = 0; int cpu; @@ -1153,7 +1182,7 @@ unsigned long __read_page_state(unsigned offset) return ret; } -void __mod_page_state(unsigned offset, unsigned long delta) +void __mod_page_state(unsigned long offset, unsigned long delta) { unsigned long flags; void* ptr; @@ -1262,22 +1291,23 @@ void show_free_areas(void) if (!cpu_possible(cpu)) continue; - pageset = zone->pageset + cpu; + pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d\n", + printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", cpu, temperature ? 
"cold" : "hot", pageset->pcp[temperature].low, pageset->pcp[temperature].high, - pageset->pcp[temperature].batch); + pageset->pcp[temperature].batch, + pageset->pcp[temperature].count); } } get_page_state(&ps); get_zone_counts(&active, &inactive, &free); - printk("\nFree pages: %11ukB (%ukB HighMem)\n", + printk("Free pages: %11ukB (%ukB HighMem)\n", K(nr_free_pages()), K(nr_free_highpages())); @@ -1612,11 +1642,17 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { - struct page *start = pfn_to_page(start_pfn); struct page *page; + unsigned long end_pfn = start_pfn + size; + unsigned long pfn; - for (page = start; page < (start + size); page++) { - set_page_zone(page, NODEZONE(nid, zone)); + for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { + if (!early_pfn_valid(pfn)) + continue; + if (!early_pfn_in_nid(pfn, nid)) + continue; + page = pfn_to_page(pfn); + set_page_links(page, zone, nid, pfn); set_page_count(page, 0); reset_page_mapcount(page); SetPageReserved(page); @@ -1624,9 +1660,8 @@ void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone, #ifdef WANT_PAGE_VIRTUAL /* The shift won't overflow because ZONE_NORMAL is below 4G. */ if (!is_highmem_idx(zone)) - set_page_address(page, __va(start_pfn << PAGE_SHIFT)); + set_page_address(page, __va(pfn << PAGE_SHIFT)); #endif - start_pfn++; } } @@ -1640,11 +1675,181 @@ void zone_init_free_lists(struct pglist_data *pgdat, struct zone *zone, } } +#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr) +void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, + unsigned long size) +{ + unsigned long snum = pfn_to_section_nr(pfn); + unsigned long end = pfn_to_section_nr(pfn + size); + + if (FLAGS_HAS_NODE) + zone_table[ZONETABLE_INDEX(nid, zid)] = zone; + else + for (; snum <= end; snum++) + zone_table[ZONETABLE_INDEX(snum, zid)] = zone; +} + #ifndef __HAVE_ARCH_MEMMAP_INIT #define memmap_init(size, nid, zone, start_pfn) \ memmap_init_zone((size), (nid), (zone), (start_pfn)) #endif +static int __devinit zone_batchsize(struct zone *zone) +{ + int batch; + + /* + * The per-cpu-pages pools are set to around 1000th of the + * size of the zone. But no more than 1/4 of a meg - there's + * no point in going beyond the size of L2 cache. + * + * OK, so we don't know how big the cache is. So guess. + */ + batch = zone->present_pages / 1024; + if (batch * PAGE_SIZE > 256 * 1024) + batch = (256 * 1024) / PAGE_SIZE; + batch /= 4; /* We effectively *= 4 below */ + if (batch < 1) + batch = 1; + + /* + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. + * + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. 
+ */ + batch = (1 << fls(batch + batch/2)) - 1; + return batch; +} + +inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + struct per_cpu_pages *pcp; + + pcp = &p->pcp[0]; /* hot */ + pcp->count = 0; + pcp->low = 2 * batch; + pcp->high = 6 * batch; + pcp->batch = max(1UL, 1 * batch); + INIT_LIST_HEAD(&pcp->list); + + pcp = &p->pcp[1]; /* cold*/ + pcp->count = 0; + pcp->low = 0; + pcp->high = 2 * batch; + pcp->batch = max(1UL, 1 * batch); + INIT_LIST_HEAD(&pcp->list); +} + +#ifdef CONFIG_NUMA +/* + * Boot pageset table. One per cpu which is going to be used for all + * zones and all nodes. The parameters will be set in such a way + * that an item put on a list will immediately be handed over to + * the buddy list. This is safe since pageset manipulation is done + * with interrupts disabled. + * + * Some NUMA counter updates may also be caught by the boot pagesets. + * + * The boot_pagesets must be kept even after bootup is complete for + * unused processors and/or zones. They do play a role for bootstrapping + * hotplugged processors. + * + * zoneinfo_show() and maybe other functions do + * not check if the processor is online before following the pageset pointer. + * Other parts of the kernel may not check if the zone is available. + */ +static struct per_cpu_pageset + boot_pageset[NR_CPUS]; + +/* + * Dynamically allocate memory for the + * per cpu pageset array in struct zone. + */ +static int __devinit process_zones(int cpu) +{ + struct zone *zone, *dzone; + + for_each_zone(zone) { + + zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), + GFP_KERNEL, cpu_to_node(cpu)); + if (!zone->pageset[cpu]) + goto bad; + + setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); + } + + return 0; +bad: + for_each_zone(dzone) { + if (dzone == zone) + break; + kfree(dzone->pageset[cpu]); + dzone->pageset[cpu] = NULL; + } + return -ENOMEM; +} + +static inline void free_zone_pagesets(int cpu) +{ +#ifdef CONFIG_NUMA + struct zone *zone; + + for_each_zone(zone) { + struct per_cpu_pageset *pset = zone_pcp(zone, cpu); + + zone_pcp(zone, cpu) = NULL; + kfree(pset); + } +#endif +} + +static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, + unsigned long action, + void *hcpu) +{ + int cpu = (long)hcpu; + int ret = NOTIFY_OK; + + switch (action) { + case CPU_UP_PREPARE: + if (process_zones(cpu)) + ret = NOTIFY_BAD; + break; +#ifdef CONFIG_HOTPLUG_CPU + case CPU_DEAD: + free_zone_pagesets(cpu); + break; +#endif + default: + break; + } + return ret; +} + +static struct notifier_block pageset_notifier = + { &pageset_cpuup_callback, NULL, 0 }; + +void __init setup_per_cpu_pageset() +{ + int err; + + /* Initialize per_cpu_pageset for cpu 0. 
+ * A cpuup callback will do this for every cpu + * as it comes online + */ + err = process_zones(smp_processor_id()); + BUG_ON(err); + register_cpu_notifier(&pageset_notifier); +} + +#endif + /* * Set up the zone data structures: * - mark all pages reserved @@ -1655,7 +1860,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { unsigned long i, j; - const unsigned long zone_required_alignment = 1UL << (MAX_ORDER-1); int cpu, nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; @@ -1668,7 +1872,6 @@ static void __init free_area_init_core(struct pglist_data *pgdat, unsigned long size, realsize; unsigned long batch; - zone_table[NODEZONE(nid, j)] = zone; realsize = size = zones_size[j]; if (zholes_size) realsize -= zholes_size[j]; @@ -1687,48 +1890,16 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->temp_priority = zone->prev_priority = DEF_PRIORITY; - /* - * The per-cpu-pages pools are set to around 1000th of the - * size of the zone. But no more than 1/4 of a meg - there's - * no point in going beyond the size of L2 cache. - * - * OK, so we don't know how big the cache is. So guess. - */ - batch = zone->present_pages / 1024; - if (batch * PAGE_SIZE > 256 * 1024) - batch = (256 * 1024) / PAGE_SIZE; - batch /= 4; /* We effectively *= 4 below */ - if (batch < 1) - batch = 1; - - /* - * Clamp the batch to a 2^n - 1 value. Having a power - * of 2 value was found to be more likely to have - * suboptimal cache aliasing properties in some cases. - * - * For example if 2 tasks are alternately allocating - * batches of pages, one task can end up with a lot - * of pages of one half of the possible page colors - * and the other with pages of the other colors. - */ - batch = (1 << fls(batch + batch/2)) - 1; + batch = zone_batchsize(zone); for (cpu = 0; cpu < NR_CPUS; cpu++) { - struct per_cpu_pages *pcp; - - pcp = &zone->pageset[cpu].pcp[0]; /* hot */ - pcp->count = 0; - pcp->low = 2 * batch; - pcp->high = 6 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); - - pcp = &zone->pageset[cpu].pcp[1]; /* cold */ - pcp->count = 0; - pcp->low = 0; - pcp->high = 2 * batch; - pcp->batch = 1 * batch; - INIT_LIST_HEAD(&pcp->list); +#ifdef CONFIG_NUMA + /* Early boot. 
Slab allocator not functional yet */ + zone->pageset[cpu] = &boot_pageset[cpu]; + setup_pageset(&boot_pageset[cpu],0); +#else + setup_pageset(zone_pcp(zone,cpu), batch); +#endif } printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", zone_names[j], realsize, batch); @@ -1738,6 +1909,7 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->nr_scan_inactive = 0; zone->nr_active = 0; zone->nr_inactive = 0; + atomic_set(&zone->reclaim_in_progress, -1); if (!size) continue; @@ -1760,11 +1932,10 @@ static void __init free_area_init_core(struct pglist_data *pgdat, zone->zone_mem_map = pfn_to_page(zone_start_pfn); zone->zone_start_pfn = zone_start_pfn; - if ((zone_start_pfn) & (zone_required_alignment-1)) - printk(KERN_CRIT "BUG: wrong zone alignment, it will crash\n"); - memmap_init(size, nid, j, zone_start_pfn); + zonetable_add(zone, nid, j, zone_start_pfn, size); + zone_start_pfn += size; zone_init_free_lists(pgdat, zone, zone->spanned_pages); @@ -1773,24 +1944,30 @@ static void __init free_area_init_core(struct pglist_data *pgdat, static void __init alloc_node_mem_map(struct pglist_data *pgdat) { - unsigned long size; - /* Skip empty nodes */ if (!pgdat->node_spanned_pages) return; +#ifdef CONFIG_FLAT_NODE_MEM_MAP /* ia64 gets its own node_mem_map, before this, without bootmem */ if (!pgdat->node_mem_map) { + unsigned long size; + struct page *map; + size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); - pgdat->node_mem_map = alloc_bootmem_node(pgdat, size); + map = alloc_remap(pgdat->node_id, size); + if (!map) + map = alloc_bootmem_node(pgdat, size); + pgdat->node_mem_map = map; } -#ifndef CONFIG_DISCONTIGMEM +#ifdef CONFIG_FLATMEM /* * With no DISCONTIG, the global mem_map is just set as node 0's */ if (pgdat == NODE_DATA(0)) mem_map = NODE_DATA(0)->node_mem_map; #endif +#endif /* CONFIG_FLAT_NODE_MEM_MAP */ } void __init free_area_init_node(int nid, struct pglist_data *pgdat, @@ -1806,18 +1983,18 @@ void __init free_area_init_node(int nid, struct pglist_data *pgdat, free_area_init_core(pgdat, zones_size, zholes_size); } -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES static bootmem_data_t contig_bootmem_data; struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data }; EXPORT_SYMBOL(contig_page_data); +#endif void __init free_area_init(unsigned long *zones_size) { - free_area_init_node(0, &contig_page_data, zones_size, + free_area_init_node(0, NODE_DATA(0), zones_size, __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -#endif #ifdef CONFIG_PROC_FS @@ -1928,7 +2105,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) struct per_cpu_pageset *pageset; int j; - pageset = &zone->pageset[i]; + pageset = zone_pcp(zone, i); for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { if (pageset->pcp[j].count) break;
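
Note on the zone_batchsize() helper factored out above: the per-cpu batch is roughly 1/4096th of the zone (present_pages / 1024, then / 4), capped at 256 KiB worth of pages, and finally rounded to a 2^n - 1 value so that two tasks freeing pages in lockstep do not end up with complementary power-of-two page-color sets. The user-space sketch below only reproduces the arithmetic for illustration; PAGE_SIZE, the 1 GiB example zone and the local fls() helper are assumptions, not kernel code.

/*
 * User-space sketch of the zone_batchsize() heuristic from the patch above.
 * Not kernel code: PAGE_SIZE, fls() and the example zone size are assumed.
 */
#include <stdio.h>

#define PAGE_SIZE 4096UL	/* assumed 4 KiB pages */

/* fls(): position of the most significant set bit, 1-based; fls(0) == 0 */
static int fls(unsigned long x)
{
	int bit = 0;

	while (x) {
		bit++;
		x >>= 1;
	}
	return bit;
}

static unsigned long zone_batchsize(unsigned long present_pages)
{
	unsigned long batch;

	/* ~1/1000th of the zone, but no more than 256 KiB worth of pages */
	batch = present_pages / 1024;
	if (batch * PAGE_SIZE > 256 * 1024)
		batch = (256 * 1024) / PAGE_SIZE;
	batch /= 4;		/* setup_pageset() effectively scales by 2..6 */
	if (batch < 1)
		batch = 1;

	/* clamp to 2^n - 1 to avoid power-of-two cache aliasing effects */
	batch = (1UL << fls(batch + batch / 2)) - 1;
	return batch;
}

int main(void)
{
	/* example: a 1 GiB zone -> 262144 pages of 4 KiB */
	unsigned long pages = (1UL << 30) / PAGE_SIZE;
	unsigned long batch = zone_batchsize(pages);

	printf("zone of %lu pages: batch=%lu, hot low/high=%lu/%lu, cold high=%lu\n",
	       pages, batch, 2 * batch, 6 * batch, 2 * batch);
	return 0;
}

For this assumed 1 GiB zone the result is batch = 31, so setup_pageset() configures the hot list with low/high = 62/186 and the cold list with high = 62, matching the 2*batch and 6*batch multipliers in the patch.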
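Note on the zone_table changes: with the switch to ZONETABLE_SHIFT and the new zonetable_add(), page_zone() looks up the struct zone through an index that packs a node id (or, when FLAGS_HAS_NODE is not set, a sparsemem section number) above the zone id, using the ZONETABLE_INDEX() macro added above. The sketch below only illustrates that indexing; the ZONES_SHIFT/NODES_SHIFT values and the use of strings in place of struct zone pointers are assumptions for the example.

/*
 * Sketch of the zone_table indexing that zonetable_add() fills in.
 * The shift values below are assumptions; the real ones come from
 * the kernel's page->flags layout.
 */
#include <stdio.h>

#define MAX_NR_ZONES	3			/* DMA, Normal, HighMem */
#define ZONES_SHIFT	2			/* assumed: 2 bits for the zone id */
#define NODES_SHIFT	2			/* assumed: up to 4 nodes */
#define ZONETABLE_SHIFT	(ZONES_SHIFT + NODES_SHIFT)

#define ZONETABLE_INDEX(x, zone_nr)	(((x) << ZONES_SHIFT) | (zone_nr))

static const char *zone_table[1 << ZONETABLE_SHIFT];

int main(void)
{
	static const char *names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
	int nid, zid;

	/* what zonetable_add() does in the FLAGS_HAS_NODE case */
	for (nid = 0; nid < (1 << NODES_SHIFT); nid++)
		for (zid = 0; zid < MAX_NR_ZONES; zid++)
			zone_table[ZONETABLE_INDEX(nid, zid)] = names[zid];

	/* what page_zone() effectively does with the bits in page->flags */
	printf("node 1, zone 2 -> %s\n",
	       zone_table[ZONETABLE_INDEX(1, 2)]);	/* "HighMem" */
	return 0;
}

In the FLAGS_HAS_NODE case one table entry per (node, zone) pair suffices; otherwise zonetable_add() fills one entry per memory section, which is why the patch loops from pfn_to_section_nr(pfn) to pfn_to_section_nr(pfn + size).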