/*
 * Generic VM initialization for x86-64 NUMA setups.
 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
 */
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/bootmem.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/module.h>
#include <linux/nodemask.h>

#include <asm/e820.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/acpi.h>

#ifndef Dprintk
#define Dprintk(x...)
#endif
struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
bootmem_data_t plat_node_bdata[MAX_NUMNODES];

struct memnode memnode;
unsigned char cpu_to_node[NR_CPUS] __read_mostly = {
        [0 ... NR_CPUS-1] = NUMA_NO_NODE
};
unsigned char apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
        [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};
cpumask_t node_to_cpumask[MAX_NUMNODES] __read_mostly;
int numa_off __initdata;
unsigned long __initdata nodemap_addr;
unsigned long __initdata nodemap_size;
/*
 * Given a shift value, try to populate memnodemap[].
 * Returns:
 * 1 if OK
 * 0 if memnodemap[] too small (or shift too small)
 * -1 if node overlap or lost ram (shift too big)
 */
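/*
 * Example: with shift = 24 (16MB granularity), a node spanning
 * 0x0000000-0x2000000 stamps its node id into memnodemap[0] and
 * memnodemap[1]; phys_to_nid(addr) then reduces to a single lookup,
 * memnodemap[addr >> shift].
 */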
static int __init
populate_memnodemap(const struct bootnode *nodes, int numnodes, int shift)
{
        int i;
        int res = -1;
        unsigned long addr, end;

        memset(memnodemap, 0xff, memnodemapsize);
        for (i = 0; i < numnodes; i++) {
                addr = nodes[i].start;
                end = nodes[i].end;
                if (addr >= end)
                        continue;
                if ((end >> shift) >= memnodemapsize)
                        return 0;
                do {
                        if (memnodemap[addr >> shift] != 0xff)
                                return -1;
                        memnodemap[addr >> shift] = i;
                        addr += (1UL << shift);
                } while (addr < end);
                res = 1;
        }
        return res;
}
static int __init allocate_cachealigned_memnodemap(void)
{
        unsigned long pad, pad_addr;

        memnodemap = memnode.embedded_map;
        if (memnodemapsize <= 48)
                return 0;

        pad = L1_CACHE_BYTES - 1;
        pad_addr = 0x8000;
        nodemap_size = pad + memnodemapsize;
        nodemap_addr = find_e820_area(pad_addr, end_pfn<<PAGE_SHIFT,
                                      nodemap_size);
        if (nodemap_addr == -1UL) {
                printk(KERN_ERR
                       "NUMA: Unable to allocate Memory to Node hash map\n");
                nodemap_addr = nodemap_size = 0;
                return -1;
        }
        pad_addr = (nodemap_addr + pad) & ~pad;
        memnodemap = phys_to_virt(pad_addr);

        printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
               nodemap_addr, nodemap_addr + nodemap_size);

        return 0;
}
/*
 * The LSB of all start and end addresses in the node map is the value of the
 * maximum possible shift.
 */
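/*
 * Example: nodes [0, 0x40000000) and [0x40000000, 0x80000000) OR together
 * to a bitfield whose lowest set bit is bit 30, so the hash shift is 30
 * (1GB buckets) and memnodemapsize becomes (0x80000000 >> 30) + 1 = 3.
 */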
static int __init
extract_lsb_from_nodes(const struct bootnode *nodes, int numnodes)
{
        int i;
        unsigned long start, end;
        unsigned long bitfield = 0, memtop = 0;

        for (i = 0; i < numnodes; i++) {
                start = nodes[i].start;
                end = nodes[i].end;
                if (start >= end)
                        continue;
                bitfield |= start | end;
                if (end > memtop)
                        memtop = end;
        }
        i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
        memnodemapsize = (memtop >> i) + 1;
        return i;
}
int __init compute_hash_shift(struct bootnode *nodes, int numnodes)
{
        int shift;

        shift = extract_lsb_from_nodes(nodes, numnodes);
        if (allocate_cachealigned_memnodemap())
                return -1;
        printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n", shift);

        if (populate_memnodemap(nodes, numnodes, shift) != 1) {
                printk(KERN_INFO
                       "Your memory is not aligned; you need to rebuild your "
                       "kernel with a bigger NODEMAPSIZE, shift=%d\n", shift);
                return -1;
        }
        return shift;
}
#ifdef CONFIG_SPARSEMEM
int early_pfn_to_nid(unsigned long pfn)
{
        return phys_to_nid(pfn << PAGE_SHIFT);
}
#endif
static void * __init
early_node_mem(int nodeid, unsigned long start, unsigned long end,
               unsigned long size)
{
        unsigned long mem = find_e820_area(start, end, size);
        void *ptr;

        if (mem != -1L)
                return __va(mem);
        ptr = __alloc_bootmem_nopanic(size,
                                      SMP_CACHE_BYTES, __pa(MAX_DMA_ADDRESS));
        if (ptr == NULL) {
                printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
                       size, nodeid);
                return NULL;
        }
        return ptr;
}
/* Initialize bootmem allocator for a node */
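/*
 * Both the node's pg_data_t and its bootmem bitmap are carved out of the
 * node's own memory range via early_node_mem(), which falls back to the
 * generic bootmem allocator when no suitable e820 area is found; the
 * bitmap search starts at the first page boundary after the pgdat.
 */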
void __init setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
{
        unsigned long start_pfn, end_pfn, bootmap_pages, bootmap_size, bootmap_start;
        unsigned long nodedata_phys;
        void *bootmap;
        const int pgdat_size = round_up(sizeof(pg_data_t), PAGE_SIZE);

        start = round_up(start, ZONE_ALIGN);

        printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, start, end);

        start_pfn = start >> PAGE_SHIFT;
        end_pfn = end >> PAGE_SHIFT;

        node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size);
        if (node_data[nodeid] == NULL)
                return;
        nodedata_phys = __pa(node_data[nodeid]);

        memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
        NODE_DATA(nodeid)->bdata = &plat_node_bdata[nodeid];
        NODE_DATA(nodeid)->node_start_pfn = start_pfn;
        NODE_DATA(nodeid)->node_spanned_pages = end_pfn - start_pfn;
        /* Find a place for the bootmem map */
        bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
        bootmap_start = round_up(nodedata_phys + pgdat_size, PAGE_SIZE);
        bootmap = early_node_mem(nodeid, bootmap_start, end,
                                 bootmap_pages<<PAGE_SHIFT);
        if (bootmap == NULL) {
                if (nodedata_phys < start || nodedata_phys >= end)
                        free_bootmem((unsigned long)node_data[nodeid], pgdat_size);
                node_data[nodeid] = NULL;
                return;
        }
        bootmap_start = __pa(bootmap);
        Dprintk("bootmap start %lu pages %lu\n", bootmap_start, bootmap_pages);

        bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
                                         bootmap_start >> PAGE_SHIFT,
                                         start_pfn, end_pfn);

        free_bootmem_with_active_regions(nodeid, end);

        reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys, pgdat_size);
        reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start,
                             bootmap_pages<<PAGE_SHIFT);
#ifdef CONFIG_ACPI_NUMA
        srat_reserve_add_area(nodeid);
#endif
        node_set_online(nodeid);
}
/* Initialize final allocator for a zone */
void __init setup_node_zones(int nodeid)
{
        unsigned long start_pfn, end_pfn, memmapsize, limit;

        start_pfn = node_start_pfn(nodeid);
        end_pfn = node_end_pfn(nodeid);

        Dprintk(KERN_INFO "Setting up memmap for node %d %lx-%lx\n",
                nodeid, start_pfn, end_pfn);

        /* Try to allocate mem_map at end to not fill up precious <4GB
           memory. */
        memmapsize = sizeof(struct page) * (end_pfn - start_pfn);
        limit = end_pfn << PAGE_SHIFT;
#ifdef CONFIG_FLAT_NODE_MEM_MAP
        NODE_DATA(nodeid)->node_mem_map =
                __alloc_bootmem_core(NODE_DATA(nodeid)->bdata,
                                     memmapsize, SMP_CACHE_BYTES,
                                     round_down(limit - memmapsize, PAGE_SIZE),
                                     limit);
#endif
}
void __init numa_init_array(void)
{
        int rr, i;
        /* There are unfortunately some poorly designed mainboards around
           that only connect memory to a single CPU. This breaks the 1:1
           cpu->node mapping. To avoid this, fill in the mapping for all
           possible CPUs, as the number of CPUs is not known yet.
           We round robin the existing nodes. */
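        /* Example: with nodes 0 and 1 online, any still-unmapped CPUs
           0..3 are assigned nodes 0, 1, 0, 1 in turn. */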
        rr = first_node(node_online_map);
        for (i = 0; i < NR_CPUS; i++) {
                if (cpu_to_node[i] != NUMA_NO_NODE)
                        continue;
                numa_set_node(i, rr);
                rr = next_node(rr, node_online_map);
                if (rr == MAX_NUMNODES)
                        rr = first_node(node_online_map);
        }
}
#ifdef CONFIG_NUMA_EMU
int numa_fake __initdata = 0;

/* Numa emulation */
static int __init numa_emulation(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;
        struct bootnode nodes[MAX_NUMNODES];
        unsigned long sz = ((end_pfn - start_pfn)<<PAGE_SHIFT) / numa_fake;
        /* Kludge needed for the hash function */
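        /* Example: a 1.5GB share (sz = 0x60000000, two bits set) is rounded
           down to the next power of two, 0x40000000 (1GB), so node
           boundaries stay hash-friendly; the last node absorbs the
           remainder below. */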
        if (hweight64(sz) > 1) {
                unsigned long x = 1;
                while ((x << 1) < sz)
                        x <<= 1;
                if (x < sz/2)
                        printk(KERN_ERR "Numa emulation unbalanced. Complain to maintainer\n");
                sz = x;
        }

        memset(&nodes, 0, sizeof(nodes));
        for (i = 0; i < numa_fake; i++) {
                nodes[i].start = (start_pfn<<PAGE_SHIFT) + i*sz;
                if (i == numa_fake - 1)
                        sz = (end_pfn<<PAGE_SHIFT) - nodes[i].start;
                nodes[i].end = nodes[i].start + sz;
                printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n",
                       i,
                       nodes[i].start, nodes[i].end,
                       (nodes[i].end - nodes[i].start) >> 20);
                node_set_online(i);
        }
        memnode_shift = compute_hash_shift(nodes, numa_fake);
        if (memnode_shift < 0) {
                memnode_shift = 0;
                printk(KERN_ERR "No NUMA hash function found. Emulation disabled.\n");
                return -1;
        }
        for_each_online_node(i) {
                e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
                                             nodes[i].end >> PAGE_SHIFT);
                setup_node_bootmem(i, nodes[i].start, nodes[i].end);
        }
        numa_init_array();
        return 0;
}
#endif
void __init numa_initmem_init(unsigned long start_pfn, unsigned long end_pfn)
{
        int i;

#ifdef CONFIG_NUMA_EMU
        if (numa_fake && !numa_emulation(start_pfn, end_pfn))
                return;
#endif

#ifdef CONFIG_ACPI_NUMA
        if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
                                          end_pfn << PAGE_SHIFT))
                return;
#endif

#ifdef CONFIG_K8_NUMA
        if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT))
                return;
#endif
        printk(KERN_INFO "%s\n",
               numa_off ? "NUMA turned off" : "No NUMA configuration found");

        printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
               start_pfn << PAGE_SHIFT,
               end_pfn << PAGE_SHIFT);
        /* setup dummy node covering all memory */
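        /* A shift of 63 hashes every physical address to bucket 0 of the
           embedded map, so phys_to_nid() reports node 0 for all memory. */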
        memnode_shift = 63;
        memnodemap = memnode.embedded_map;
        memnodemap[0] = 0;
        nodes_clear(node_online_map);
        node_set_online(0);
        for (i = 0; i < NR_CPUS; i++)
                numa_set_node(i, 0);
        node_to_cpumask[0] = cpumask_of_cpu(0);
        e820_register_active_regions(0, start_pfn, end_pfn);
        setup_node_bootmem(0, start_pfn << PAGE_SHIFT, end_pfn << PAGE_SHIFT);
}
__cpuinit void numa_add_cpu(int cpu)
{
        set_bit(cpu, (unsigned long *)&node_to_cpumask[cpu_to_node(cpu)]);
}
void __cpuinit numa_set_node(int cpu, int node)
{
        cpu_pda(cpu)->nodenumber = node;
        cpu_to_node[cpu] = node;
}
unsigned long __init numa_free_all_bootmem(void)
{
        int i;
        unsigned long pages = 0;

        for_each_online_node(i) {
                pages += free_all_bootmem_node(NODE_DATA(i));
        }
        return pages;
}
#ifdef CONFIG_SPARSEMEM
static void __init arch_sparse_init(void)
{
        int i;

        for_each_online_node(i)
                memory_present(i, node_start_pfn(i), node_end_pfn(i));

        sparse_init();
}
#else
#define arch_sparse_init() do {} while (0)
#endif
void __init paging_init(void)
{
        int i;
        unsigned long max_zone_pfns[MAX_NR_ZONES];

        memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
        max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
        max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
        max_zone_pfns[ZONE_NORMAL] = end_pfn;

        arch_sparse_init();

        for_each_online_node(i) {
                setup_node_zones(i);
        }

        free_area_init_nodes(max_zone_pfns);
}
static __init int numa_setup(char *opt)
{
        if (!opt)
                return -EINVAL;
        if (!strncmp(opt, "off", 3))
                numa_off = 1;
#ifdef CONFIG_NUMA_EMU
        if (!strncmp(opt, "fake=", 5)) {
                numa_fake = simple_strtoul(opt+5, NULL, 0);
                if (numa_fake >= MAX_NUMNODES)
                        numa_fake = MAX_NUMNODES;
        }
#endif
#ifdef CONFIG_ACPI_NUMA
        if (!strncmp(opt, "noacpi", 6))
                acpi_numa = -1;
        if (!strncmp(opt, "hotadd=", 7))
                hotadd_percent = simple_strtoul(opt+7, NULL, 10);
#endif
        return 0;
}

early_param("numa", numa_setup);
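/*
 * Boot-time usage, as parsed above: "numa=off" disables NUMA discovery,
 * "numa=fake=4" splits memory into four emulated nodes (CONFIG_NUMA_EMU),
 * "numa=noacpi" skips SRAT-based setup and "numa=hotadd=N" limits the
 * memory reserved for hotplug to N percent (both CONFIG_ACPI_NUMA).
 */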
/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if x86_cpu_to_apicid[]
 * and apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and the faked-node case (when running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round-robin manner at numa_init_array,
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 */
void __init init_cpu_to_node(void)
{
        int i;

        for (i = 0; i < NR_CPUS; i++) {
                u8 apicid = x86_cpu_to_apicid[i];
                if (apicid == BAD_APICID)
                        continue;
                if (apicid_to_node[apicid] == NUMA_NO_NODE)
                        continue;
                numa_set_node(i, apicid_to_node[apicid]);
        }
}
EXPORT_SYMBOL(cpu_to_node);
EXPORT_SYMBOL(node_to_cpumask);
EXPORT_SYMBOL(memnode);
EXPORT_SYMBOL(node_data);
#ifdef CONFIG_DISCONTIGMEM
/*
 * Functions to convert PFNs from/to per node page addresses.
 * These are out of line because they are quite big.
 * They could all be tuned by pre-caching more state.
 * Should do that.
 */
int pfn_valid(unsigned long pfn)
{
        unsigned nid;
        if (pfn >= num_physpages)
                return 0;
        nid = pfn_to_nid(pfn);
        if (nid == 0xff)
                return 0;
        return pfn >= node_start_pfn(nid) && pfn < node_end_pfn(nid);
}
EXPORT_SYMBOL(pfn_valid);
#endif