X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=5af33186a25feb86afd4db40462f74340e09488e;hb=fadd8fbd153c12963f8fe3c9ef7f8967f286f98b;hp=dc523a1f270db11ad4930fa47548232c14e6e4ad;hpb=e02a4cabfcb9a999b74a2e2e6f13ffcb7ff2d606;p=linux-2.6 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc523a1f27..5af33186a2 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -37,8 +37,10 @@ #include #include #include +#include #include +#include #include "internal.h" /* @@ -51,6 +53,7 @@ nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; +unsigned long totalreserve_pages __read_mostly; long nr_swap_pages; int percpu_pagelist_fraction; @@ -81,8 +84,8 @@ EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; int min_free_kbytes = 1024; -unsigned long __initdata nr_kernel_pages; -unsigned long __initdata nr_all_pages; +unsigned long __meminitdata nr_kernel_pages; +unsigned long __meminitdata nr_all_pages; #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) @@ -151,7 +154,8 @@ static void bad_page(struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ); + 1 << PG_writeback | + 1 << PG_buddy ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -230,18 +234,20 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) * zone->lock is already acquired when we use these. * So, we don't need atomic page->flags operations here. */ -static inline unsigned long page_order(struct page *page) { +static inline unsigned long page_order(struct page *page) +{ return page_private(page); } -static inline void set_page_order(struct page *page, int order) { +static inline void set_page_order(struct page *page, int order) +{ set_page_private(page, order); - __SetPagePrivate(page); + __SetPageBuddy(page); } static inline void rmv_page_order(struct page *page) { - __ClearPagePrivate(page); + __ClearPageBuddy(page); set_page_private(page, 0); } @@ -280,24 +286,31 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if * (a) the buddy is not in a hole && - * (b) the buddy is free && - * (c) the buddy is on the buddy system && - * (d) a page and its buddy have the same order. - * for recording page's order, we use page_private(page) and PG_private. + * (b) the buddy is in the buddy system && + * (c) a page and its buddy have the same order && + * (d) a page and its buddy are in the same zone. * + * For recording whether a page is in the buddy system, we use PG_buddy. + * Setting, clearing, and testing PG_buddy is serialized by zone->lock. + * + * For recording page's order, we use page_private(page). */ -static inline int page_is_buddy(struct page *page, int order) +static inline int page_is_buddy(struct page *page, struct page *buddy, + int order) { #ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(page))) + if (!pfn_valid(page_to_pfn(buddy))) return 0; #endif - if (PagePrivate(page) && - (page_order(page) == order) && - page_count(page) == 0) - return 1; - return 0; + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + + if (PageBuddy(buddy) && page_order(buddy) == order) { + BUG_ON(page_count(buddy) != 0); + return 1; + } + return 0; } /* @@ -313,7 +326,7 @@ static inline int page_is_buddy(struct page *page, int order) * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_Private.Page's + * free pages of length of (1 << order) and marked with PG_buddy. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were @@ -345,7 +358,7 @@ static inline void __free_one_page(struct page *page, struct page *buddy; buddy = __page_find_buddy(page, page_idx, order); - if (!page_is_buddy(buddy, order)) + if (!page_is_buddy(page, buddy, order)) break; /* Move the buddy up one level. */ list_del(&buddy->lru); @@ -376,7 +389,8 @@ static inline int free_pages_check(struct page *page) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved )))) + 1 << PG_reserved | + 1 << PG_buddy )))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); @@ -524,7 +538,8 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved )))) + 1 << PG_reserved | + 1 << PG_buddy )))) bad_page(page); /* @@ -942,7 +957,7 @@ restart: goto got_pg; do { - if (cpuset_zone_allowed(*z, gfp_mask)) + if (cpuset_zone_allowed(*z, gfp_mask|__GFP_HARDWALL)) wakeup_kswapd(*z, order); } while (*(++z)); @@ -961,7 +976,8 @@ restart: alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) alloc_flags |= ALLOC_HIGH; - alloc_flags |= ALLOC_CPUSET; + if (wait) + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations @@ -1502,7 +1518,7 @@ void show_free_areas(void) * * Add all populated zones of a node to the zonelist. */ -static int __init build_zonelists_node(pg_data_t *pgdat, +static int __meminit build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int nr_zones, int zone_type) { struct zone *zone; @@ -1538,7 +1554,7 @@ static inline int highest_zone(int zone_bits) #ifdef CONFIG_NUMA #define MAX_NODE_LOAD (num_online_nodes()) -static int __initdata node_load[MAX_NUMNODES]; +static int __meminitdata node_load[MAX_NUMNODES]; /** * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending @@ -1553,7 +1569,7 @@ static int __initdata node_load[MAX_NUMNODES]; * on them otherwise. * It returns -1 if no node is found. */ -static int __init find_next_best_node(int node, nodemask_t *used_node_mask) +static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) { int n, val; int min_val = INT_MAX; @@ -1599,7 +1615,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask) return best_node; } -static void __init build_zonelists(pg_data_t *pgdat) +static void __meminit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; int prev_node, load; @@ -1651,7 +1667,7 @@ static void __init build_zonelists(pg_data_t *pgdat) #else /* CONFIG_NUMA */ -static void __init build_zonelists(pg_data_t *pgdat) +static void __meminit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; @@ -1689,14 +1705,29 @@ static void __init build_zonelists(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ -void __init build_all_zonelists(void) +/* return values int ....just for stop_machine_run() */ +static int __meminit __build_all_zonelists(void *dummy) { - int i; + int nid; + for_each_online_node(nid) + build_zonelists(NODE_DATA(nid)); + return 0; +} + +void __meminit build_all_zonelists(void) +{ + if (system_state == SYSTEM_BOOTING) { + __build_all_zonelists(0); + cpuset_init_current_mems_allowed(); + } else { + /* we have to stop all cpus to guaranntee there is no user + of zonelist */ + stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); + /* cpuset refresh routine should be here */ + } - for_each_online_node(i) - build_zonelists(NODE_DATA(i)); printk("Built %i zonelists\n", num_online_nodes()); - cpuset_init_current_mems_allowed(); + } /* @@ -1712,7 +1743,8 @@ void __init build_all_zonelists(void) */ #define PAGES_PER_WAITQUEUE 256 -static inline unsigned long wait_table_size(unsigned long pages) +#ifndef CONFIG_MEMORY_HOTPLUG +static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) { unsigned long size = 1; @@ -1730,6 +1762,29 @@ static inline unsigned long wait_table_size(unsigned long pages) return max(size, 4UL); } +#else +/* + * A zone's size might be changed by hot-add, so it is not possible to determine + * a suitable size for its wait_table. So we use the maximum size now. + * + * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: + * + * i386 (preemption config) : 4096 x 16 = 64Kbyte. + * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. + * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. + * + * The maximum entries are prepared when a zone's memory is (512K + 256) pages + * or more by the traditional way. (See above). It equals: + * + * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. + * ia64(16K page size) : = ( 8G + 4M)byte. + * powerpc (64K page size) : = (32G +16M)byte. + */ +static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) +{ + return 4096UL; +} +#endif /* * This is an integer logarithm so that shifts can be used later @@ -1954,7 +2009,7 @@ static inline void free_zone_pagesets(int cpu) } } -static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, +static int pageset_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1995,23 +2050,46 @@ void __init setup_per_cpu_pageset(void) #endif static __meminit -void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; struct pglist_data *pgdat = zone->zone_pgdat; + size_t alloc_size; /* * The per-page waitqueue mechanism uses hashed waitqueues * per zone. */ - zone->wait_table_size = wait_table_size(zone_size_pages); - zone->wait_table_bits = wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); + zone->wait_table_hash_nr_entries = + wait_table_hash_nr_entries(zone_size_pages); + zone->wait_table_bits = + wait_table_bits(zone->wait_table_hash_nr_entries); + alloc_size = zone->wait_table_hash_nr_entries + * sizeof(wait_queue_head_t); + + if (system_state == SYSTEM_BOOTING) { + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, alloc_size); + } else { + /* + * This case means that a zone whose size was 0 gets new memory + * via memory hot-add. + * But it may be the case that a new node was hot-added. In + * this case vmalloc() will not be able to use this new node's + * memory - this wait_table must be initialized to use this new + * node itself as well. + * To use this new node's memory, further consideration will be + * necessary. + */ + zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); + } + if (!zone->wait_table) + return -ENOMEM; - for(i = 0; i < zone->wait_table_size; ++i) + for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) init_waitqueue_head(zone->wait_table + i); + + return 0; } static __meminit void zone_pcp_init(struct zone *zone) @@ -2033,12 +2111,15 @@ static __meminit void zone_pcp_init(struct zone *zone) zone->name, zone->present_pages, batch); } -static __meminit void init_currently_empty_zone(struct zone *zone, - unsigned long zone_start_pfn, unsigned long size) +__meminit int init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, + unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; - - zone_wait_table_init(zone, size); + int ret; + ret = zone_wait_table_init(zone, size); + if (ret) + return ret; pgdat->nr_zones = zone_idx(zone) + 1; zone->zone_start_pfn = zone_start_pfn; @@ -2046,6 +2127,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone, memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); zone_init_free_lists(pgdat, zone, zone->spanned_pages); + + return 0; } /* @@ -2054,12 +2137,13 @@ static __meminit void init_currently_empty_zone(struct zone *zone, * - mark all memory queues empty * - clear the memory bitmaps */ -static void __init free_area_init_core(struct pglist_data *pgdat, +static void __meminit free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { unsigned long j; int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; + int ret; pgdat_resize_init(pgdat); pgdat->nr_zones = 0; @@ -2101,7 +2185,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat, continue; zonetable_add(zone, nid, j, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + ret = init_currently_empty_zone(zone, zone_start_pfn, size); + BUG_ON(ret); zone_start_pfn += size; } } @@ -2115,14 +2200,22 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) #ifdef CONFIG_FLAT_NODE_MEM_MAP /* ia64 gets its own node_mem_map, before this, without bootmem */ if (!pgdat->node_mem_map) { - unsigned long size; + unsigned long size, start, end; struct page *map; - size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + /* + * The zone's endpoints aren't required to be MAX_ORDER + * aligned but the node_mem_map endpoints must be in order + * for the buddy allocator to function correctly. + */ + start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + end = pgdat->node_start_pfn + pgdat->node_spanned_pages; + end = ALIGN(end, MAX_ORDER_NR_PAGES); + size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) map = alloc_bootmem_node(pgdat, size); - pgdat->node_mem_map = map; + pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifdef CONFIG_FLATMEM /* @@ -2134,7 +2227,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } -void __init free_area_init_node(int nid, struct pglist_data *pgdat, +void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { @@ -2471,6 +2564,38 @@ void __init page_alloc_init(void) hotcpu_notifier(page_alloc_cpu_notify, 0); } +/* + * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio + * or min_free_kbytes changes. + */ +static void calculate_totalreserve_pages(void) +{ + struct pglist_data *pgdat; + unsigned long reserve_pages = 0; + int i, j; + + for_each_online_pgdat(pgdat) { + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + unsigned long max = 0; + + /* Find valid and maximum lowmem_reserve in the zone */ + for (j = i; j < MAX_NR_ZONES; j++) { + if (zone->lowmem_reserve[j] > max) + max = zone->lowmem_reserve[j]; + } + + /* we treat pages_high as reserved pages. */ + max += zone->pages_high; + + if (max > zone->present_pages) + max = zone->present_pages; + reserve_pages += max; + } + } + totalreserve_pages = reserve_pages; +} + /* * setup_per_zone_lowmem_reserve - called whenever * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone @@ -2502,6 +2627,9 @@ static void setup_per_zone_lowmem_reserve(void) } } } + + /* update totalreserve_pages */ + calculate_totalreserve_pages(); } /* @@ -2523,9 +2651,11 @@ void setup_per_zone_pages_min(void) } for_each_zone(zone) { - unsigned long tmp; + u64 tmp; + spin_lock_irqsave(&zone->lru_lock, flags); - tmp = (pages_min * zone->present_pages) / lowmem_pages; + tmp = (u64)pages_min * zone->present_pages; + do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -2552,10 +2682,13 @@ void setup_per_zone_pages_min(void) zone->pages_min = tmp; } - zone->pages_low = zone->pages_min + tmp / 4; - zone->pages_high = zone->pages_min + tmp / 2; + zone->pages_low = zone->pages_min + (tmp >> 2); + zone->pages_high = zone->pages_min + (tmp >> 1); spin_unlock_irqrestore(&zone->lru_lock, flags); } + + /* update totalreserve_pages */ + calculate_totalreserve_pages(); } /* @@ -2746,42 +2879,14 @@ void *__init alloc_large_system_hash(const char *tablename, } #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE -/* - * pfn <-> page translation. out-of-line version. - * (see asm-generic/memory_model.h) - */ -#if defined(CONFIG_FLATMEM) -struct page *pfn_to_page(unsigned long pfn) -{ - return mem_map + (pfn - ARCH_PFN_OFFSET); -} -unsigned long page_to_pfn(struct page *page) -{ - return (page - mem_map) + ARCH_PFN_OFFSET; -} -#elif defined(CONFIG_DISCONTIGMEM) struct page *pfn_to_page(unsigned long pfn) { - int nid = arch_pfn_to_nid(pfn); - return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid); -} -unsigned long page_to_pfn(struct page *page) -{ - struct pglist_data *pgdat = NODE_DATA(page_to_nid(page)); - return (page - pgdat->node_mem_map) + pgdat->node_start_pfn; + return __pfn_to_page(pfn); } -#elif defined(CONFIG_SPARSEMEM) -struct page *pfn_to_page(unsigned long pfn) -{ - return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn; -} - unsigned long page_to_pfn(struct page *page) { - long section_id = page_to_section(page); - return page - __section_mem_map_addr(__nr_to_section(section_id)); + return __page_to_pfn(page); } -#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */ EXPORT_SYMBOL(pfn_to_page); EXPORT_SYMBOL(page_to_pfn); #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */