[PATCH] support for panic at OOM

[linux-2.6] / mm / page_alloc.c
diff --git a/mm/page_alloc.c b/mm/page_alloc.c

index 5ae75bead4df053cf939e2c87433598fbd8ca895..5af33186a25feb86afd4db40462f74340e09488e 100644 (file)
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -37,6 +37,7 @@
  #include <linux/nodemask.h>
  #include <linux/vmalloc.h>
  #include <linux/mempolicy.h>
+#include <linux/stop_machine.h>
  
  #include <asm/tlbflush.h>
  #include <asm/div64.h>
@@ -1704,14 +1705,29 @@ static void __meminit build_zonelists(pg_data_t *pgdat)
  
  #endif /* CONFIG_NUMA */
  
-void __init build_all_zonelists(void)
+/* Returns int only to match the callback type of stop_machine_run(). */
+static int __meminit __build_all_zonelists(void *dummy)
  {
-       int i;
+       int nid;
+       for_each_online_node(nid)
+               build_zonelists(NODE_DATA(nid));
+       return 0;
+}
+
+void __meminit build_all_zonelists(void)
+{
+       if (system_state == SYSTEM_BOOTING) {
+               __build_all_zonelists(0);
+               cpuset_init_current_mems_allowed();
+       } else {
+               /* We have to stop all CPUs to guarantee that nobody is
+                  using the zonelists while they are rebuilt. */
+               stop_machine_run(__build_all_zonelists, NULL, NR_CPUS);
+               /* cpuset refresh routine should be here */
+       }
  
-       for_each_online_node(i)
-               build_zonelists(NODE_DATA(i));
         printk("Built %i zonelists\n", num_online_nodes());
-       cpuset_init_current_mems_allowed();
+
  }
  
  /*
@@ -1727,6 +1743,7 @@ void __init build_all_zonelists(void)
   */
  #define PAGES_PER_WAITQUEUE    256
  
+#ifndef CONFIG_MEMORY_HOTPLUG
  static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
  {
         unsigned long size = 1;
@@ -1745,6 +1762,29 @@ static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
  
         return max(size, 4UL);
  }
+#else
+/*
+ * A zone's size might be changed by hot-add, so it is not possible to determine
+ * a suitable size for its wait_table.  So we use the maximum size now.
+ *
+ * The max wait table size = 4096 x sizeof(wait_queue_head_t).   ie:
+ *
+ *    i386 (preemption config)    : 4096 x 16 = 64Kbyte.
+ *    ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte.
+ *    ia64, x86-64 (preemption)   : 4096 x 24 = 96Kbyte.
+ *
+ * With the traditional sizing (see above), the maximum number of entries is
+ * reached once a zone has (512K + 256) pages or more.  That corresponds to:
+ *
+ *    i386, x86-64, powerpc(4K page size) : =  ( 2G + 1M)byte.
+ *    ia64(16K page size)                 : =  ( 8G + 4M)byte.
+ *    powerpc (64K page size)             : =  (32G +16M)byte.
+ */
+static inline unsigned long wait_table_hash_nr_entries(unsigned long pages)
+{
+       return 4096UL;
+}
+#endif
  
  /*
   * This is an integer logarithm so that shifts can be used later
@@ -2010,10 +2050,11 @@ void __init setup_per_cpu_pageset(void)
  #endif
  
  static __meminit
-void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
+int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
  {
         int i;
         struct pglist_data *pgdat = zone->zone_pgdat;
+       size_t alloc_size;
  
         /*
          * The per-page waitqueue mechanism uses hashed waitqueues
@@ -2023,12 +2064,32 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
                  wait_table_hash_nr_entries(zone_size_pages);
         zone->wait_table_bits =
                 wait_table_bits(zone->wait_table_hash_nr_entries);
-       zone->wait_table = (wait_queue_head_t *)
-               alloc_bootmem_node(pgdat, zone->wait_table_hash_nr_entries
-                                       * sizeof(wait_queue_head_t));
+       alloc_size = zone->wait_table_hash_nr_entries
+                                       * sizeof(wait_queue_head_t);
+
+       if (system_state == SYSTEM_BOOTING) {
+               zone->wait_table = (wait_queue_head_t *)
+                       alloc_bootmem_node(pgdat, alloc_size);
+       } else {
+               /*
+                * This case means that a zone whose size was 0 gets new memory
+                * via memory hot-add.
+                * But it may be the case that a new node was hot-added.  In
+                * this case vmalloc() will not be able to use this new node's
+                * memory - this wait_table must be initialized to use this new
+                * node itself as well.
+                * To use this new node's memory, further consideration will be
+                * necessary.
+                */
+               zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size);
+       }
+       if (!zone->wait_table)
+               return -ENOMEM;
  
         for(i = 0; i < zone->wait_table_hash_nr_entries; ++i)
                 init_waitqueue_head(zone->wait_table + i);
+
+       return 0;
  }
  
  static __meminit void zone_pcp_init(struct zone *zone)
@@ -2050,12 +2111,15 @@ static __meminit void zone_pcp_init(struct zone *zone)
                         zone->name, zone->present_pages, batch);
  }
  
-static __meminit void init_currently_empty_zone(struct zone *zone,
-               unsigned long zone_start_pfn, unsigned long size)
+__meminit int init_currently_empty_zone(struct zone *zone,
+                                       unsigned long zone_start_pfn,
+                                       unsigned long size)
  {
         struct pglist_data *pgdat = zone->zone_pgdat;
-
-       zone_wait_table_init(zone, size);
+       int ret;
+       ret = zone_wait_table_init(zone, size);
+       if (ret)
+               return ret;
         pgdat->nr_zones = zone_idx(zone) + 1;
  
         zone->zone_start_pfn = zone_start_pfn;
@@ -2063,6 +2127,8 @@ static __meminit void init_currently_empty_zone(struct zone *zone,
         memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
  
         zone_init_free_lists(pgdat, zone, zone->spanned_pages);
+
+       return 0;
  }
  
  /*
@@ -2077,6 +2143,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
         unsigned long j;
         int nid = pgdat->node_id;
         unsigned long zone_start_pfn = pgdat->node_start_pfn;
+       int ret;
  
         pgdat_resize_init(pgdat);
         pgdat->nr_zones = 0;
@@ -2118,7 +2185,8 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat,
                         continue;
  
                 zonetable_add(zone, nid, j, zone_start_pfn, size);
-               init_currently_empty_zone(zone, zone_start_pfn, size);
+               ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+               BUG_ON(ret);
                 zone_start_pfn += size;
         }
  }
@@ -2811,42 +2879,14 @@ void *__init alloc_large_system_hash(const char *tablename,
  }
  
  #ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE
-/*
- * pfn <-> page translation. out-of-line version.
- * (see asm-generic/memory_model.h)
- */
-#if defined(CONFIG_FLATMEM)
  struct page *pfn_to_page(unsigned long pfn)
  {
-       return mem_map + (pfn - ARCH_PFN_OFFSET);
+       return __pfn_to_page(pfn);
  }
  unsigned long page_to_pfn(struct page *page)
  {
-       return (page - mem_map) + ARCH_PFN_OFFSET;
-}
-#elif defined(CONFIG_DISCONTIGMEM)
-struct page *pfn_to_page(unsigned long pfn)
-{
-       int nid = arch_pfn_to_nid(pfn);
-       return NODE_DATA(nid)->node_mem_map + arch_local_page_offset(pfn,nid);
-}
-unsigned long page_to_pfn(struct page *page)
-{
-       struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
-       return (page - pgdat->node_mem_map) + pgdat->node_start_pfn;
-}
-#elif defined(CONFIG_SPARSEMEM)
-struct page *pfn_to_page(unsigned long pfn)
-{
-       return __section_mem_map_addr(__pfn_to_section(pfn)) + pfn;
-}
-
-unsigned long page_to_pfn(struct page *page)
-{
-       long section_id = page_to_section(page);
-       return page - __section_mem_map_addr(__nr_to_section(section_id));
+       return __page_to_pfn(page);
  }
-#endif /* CONFIG_FLATMEM/DISCONTIGMME/SPARSEMEM */
  EXPORT_SYMBOL(pfn_to_page);
  EXPORT_SYMBOL(page_to_pfn);
  #endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */