2 * linux/arch/x86_64/mm/init.c
4 * Copyright (C) 1995 Linus Torvalds
5 * Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
6 * Copyright (C) 2002,2003 Andi Kleen <ak@suse.de>
9 #include <linux/signal.h>
10 #include <linux/sched.h>
11 #include <linux/kernel.h>
12 #include <linux/errno.h>
13 #include <linux/string.h>
14 #include <linux/types.h>
15 #include <linux/ptrace.h>
16 #include <linux/mman.h>
18 #include <linux/swap.h>
19 #include <linux/smp.h>
20 #include <linux/init.h>
21 #include <linux/pagemap.h>
22 #include <linux/bootmem.h>
23 #include <linux/proc_fs.h>
24 #include <linux/pci.h>
25 #include <linux/pfn.h>
26 #include <linux/poison.h>
27 #include <linux/dma-mapping.h>
28 #include <linux/module.h>
29 #include <linux/memory_hotplug.h>
30 #include <linux/nmi.h>
32 #include <asm/processor.h>
33 #include <asm/system.h>
34 #include <asm/uaccess.h>
35 #include <asm/pgtable.h>
36 #include <asm/pgalloc.h>
38 #include <asm/fixmap.h>
42 #include <asm/mmu_context.h>
43 #include <asm/proto.h>
45 #include <asm/sections.h>
46 #include <asm/kdebug.h>
/* Pointer to the active DMA mapping operations; exported via EXPORT_SYMBOL. */
53 const struct dma_mapping_ops* dma_ops;
54 EXPORT_SYMBOL(dma_ops);
/*
 * Running count of pages reserved below MAX_DMA_PFN; accumulated in
 * reserve_bootmem_generic() and passed to set_dma_reserve().
 */
56 static unsigned long dma_reserve __initdata;
/* One struct mmu_gather per CPU. */
58 DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
61 * NOTE: pagetable_init allocates all the fixmap pagetables contiguously
62 * in physical space, so we can cache the location of the first one and
63 * move around without checking the pgd every time.
/*
 * Body of the memory-statistics dump.  The function header is not part of
 * this listing (presumably show_mem() -- TODO confirm).
 *
 * Prints the free swap size, then walks the spanned pages of every online
 * node, skipping invalid pfns, and classifies each page as reserved,
 * swap-cached or in use; for in-use pages page_count() - 1 is added to
 * 'shared'.  The counter increments for the other categories fall on lines
 * missing from this listing.  The totals are printed at the end.
 *
 * NOTE(review): total/reserved/shared/cached are declared 'long' but are
 * printed with %lu below; %ld would match the declared type.
 */
68 long i, total = 0, reserved = 0;
69 long shared = 0, cached = 0;
73 printk(KERN_INFO "Mem-info:\n");
75 printk(KERN_INFO "Free swap: %6ldkB\n", nr_swap_pages<<(PAGE_SHIFT-10));
77 for_each_online_pgdat(pgdat) {
78 for (i = 0; i < pgdat->node_spanned_pages; ++i) {
79 /* this loop can take a while with 256 GB and 4k pages
80 so update the NMI watchdog */
81 if (unlikely(i % MAX_ORDER_NR_PAGES == 0)) {
84 if (!pfn_valid(pgdat->node_start_pfn + i))
86 page = pfn_to_page(pgdat->node_start_pfn + i);
88 if (PageReserved(page))
90 else if (PageSwapCache(page))
92 else if (page_count(page))
93 shared += page_count(page) - 1;
96 printk(KERN_INFO "%lu pages of RAM\n", total);
97 printk(KERN_INFO "%lu reserved pages\n",reserved);
98 printk(KERN_INFO "%lu pages shared\n",shared);
99 printk(KERN_INFO "%lu pages swap cached\n",cached);
/*
 * Allocate one page for use by set_pte_phys().
 *
 * Two allocation calls are visible: get_zeroed_page(GFP_ATOMIC) and
 * alloc_bootmem_pages(PAGE_SIZE).  The condition choosing between them is on
 * lines missing from this listing (presumably keyed on after_bootmem, which
 * the panic message also reports -- TODO confirm).
 *
 * Panics if the allocation returned NULL or a pointer that is not
 * page-aligned.
 */
104 static __init void *spp_getpage(void)
108 ptr = (void *) get_zeroed_page(GFP_ATOMIC);
110 ptr = alloc_bootmem_pages(PAGE_SIZE);
111 if (!ptr || ((unsigned long)ptr & ~PAGE_MASK))
112 panic("set_pte_phys: cannot allocate page data %s\n", after_bootmem?"after bootmem":"");
114 Dprintk("spp_getpage %p\n", ptr);
/*
 * Install a kernel PTE that maps virtual address 'vaddr' to physical
 * address 'phys' with protection 'prot'.
 *
 * Walks pgd -> pud -> pmd -> pte starting from pgd_offset_k(vaddr).  A
 * missing pgd entry is only reported with printk; missing pmd and pte
 * tables are allocated with spp_getpage() and linked in.  The new PTE is
 * written with set_pte() and the single TLB entry for vaddr is flushed.
 *
 * NOTE(review): the intermediate table entries are created with
 * _KERNPG_TABLE | _PAGE_USER; the leaf permissions come from 'prot'.
 */
118 static __init void set_pte_phys(unsigned long vaddr,
119 unsigned long phys, pgprot_t prot)
126 Dprintk("set_pte_phys %lx to %lx\n", vaddr, phys);
128 pgd = pgd_offset_k(vaddr);
129 if (pgd_none(*pgd)) {
130 printk("PGD FIXMAP MISSING, it should be setup in head.S!\n");
133 pud = pud_offset(pgd, vaddr);
134 if (pud_none(*pud)) {
135 pmd = (pmd_t *) spp_getpage();
136 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE | _PAGE_USER));
/* Sanity check: the freshly linked table must be what the walk now finds. */
137 if (pmd != pmd_offset(pud, 0)) {
138 printk("PAGETABLE BUG #01! %p <-> %p\n", pmd, pmd_offset(pud,0));
142 pmd = pmd_offset(pud, vaddr);
143 if (pmd_none(*pmd)) {
144 pte = (pte_t *) spp_getpage();
145 set_pmd(pmd, __pmd(__pa(pte) | _KERNPG_TABLE | _PAGE_USER));
146 if (pte != pte_offset_kernel(pmd, 0)) {
147 printk("PAGETABLE BUG #02!\n");
151 new_pte = pfn_pte(phys >> PAGE_SHIFT, prot);
153 pte = pte_offset_kernel(pmd, vaddr);
/*
 * Detects an already-present PTE that differs from the new value; the
 * action taken is on a line missing from this listing.
 */
154 if (!pte_none(*pte) &&
155 pte_val(*pte) != (pte_val(new_pte) & __supported_pte_mask))
157 set_pte(pte, new_pte);
160 * It's enough to flush this one mapping.
161 * (PGE mappings get flushed as well)
163 __flush_tlb_one(vaddr);
/*
 * Map fixmap slot 'idx' to physical address 'phys' with protection 'prot'.
 * The virtual address is taken from __fix_to_virt(idx) and the mapping is
 * installed with set_pte_phys().  An idx at or beyond
 * __end_of_fixed_addresses is reported with printk; the early exit that
 * presumably follows is on a line missing from this listing.
 */
166 /* NOTE: this is meant to be run only at boot */
168 __set_fixmap (enum fixed_addresses idx, unsigned long phys, pgprot_t prot)
170 unsigned long address = __fix_to_virt(idx);
172 if (idx >= __end_of_fixed_addresses) {
173 printk("Invalid __set_fixmap\n");
176 set_pte_phys(address, phys, prot);
/*
 * Page-frame numbers bounding the early page-table area.  Both are set by
 * find_early_table_space(); table_end is advanced by alloc_low_page() each
 * time a page is handed out, and the range is reserved with reserve_early()
 * in init_memory_mapping().
 */
179 static unsigned long __initdata table_start;
180 static unsigned long __meminitdata table_end;
/*
 * Hand out one page for a kernel page table.
 *
 * Two paths are visible; the conditions selecting them are on lines missing
 * from this listing:
 *   - get_zeroed_page(GFP_ATOMIC), or
 *   - the next pfn taken from table_end, mapped with early_ioremap(),
 *     zeroed with memset(), with its physical address stored in *phys.
 *
 * Note that table_end is post-incremented in the declaration of 'pfn', i.e.
 * before either path is chosen.
 */
182 static __meminit void *alloc_low_page(unsigned long *phys)
184 unsigned long pfn = table_end++;
188 adr = (void *)get_zeroed_page(GFP_ATOMIC);
194 panic("alloc_low_page: ran out of memory");
196 adr = early_ioremap(pfn * PAGE_SIZE, PAGE_SIZE);
197 memset(adr, 0, PAGE_SIZE);
198 *phys = pfn * PAGE_SIZE;
/*
 * Counterpart of alloc_low_page(): removes the temporary mapping of 'adr'
 * with early_iounmap(adr, PAGE_SIZE).  Any guard condition is on lines
 * missing from this listing.
 */
202 static __meminit void unmap_low_page(void *adr)
208 early_iounmap(adr, PAGE_SIZE);
/*
 * Temporarily map the physical range [addr, addr + size) using PMD-level
 * entries in level2_kernel_pgt.
 *
 * 'pmds' is the number of PMD entries needed to cover the range, including
 * the offset of addr within its PMD.  The table is scanned, testing
 * pmd_present() on candidate runs of 'pmds' entries; the chosen run is
 * filled with __PAGE_KERNEL_LARGE_EXEC mappings.  The returned pointer is
 * the matching address in the __START_KERNEL_map region plus the offset of
 * addr within its PMD.  On failure a message is printed.
 *
 * NOTE(review): the scan lets 'pmd' advance up to last_pmd while indexing
 * pmd[i] for i < pmds, which looks like it can read past the end of the
 * table when pmds > 1 -- confirm against the complete source.
 */
211 /* Must run before zap_low_mappings */
212 __meminit void *early_ioremap(unsigned long addr, unsigned long size)
215 pmd_t *pmd, *last_pmd;
218 pmds = ((addr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
219 vaddr = __START_KERNEL_map;
220 pmd = level2_kernel_pgt;
221 last_pmd = level2_kernel_pgt + PTRS_PER_PMD - 1;
222 for (; pmd <= last_pmd; pmd++, vaddr += PMD_SIZE) {
223 for (i = 0; i < pmds; i++) {
224 if (pmd_present(pmd[i]))
227 vaddr += addr & ~PMD_MASK;
229 for (i = 0; i < pmds; i++, addr += PMD_SIZE)
230 set_pmd(pmd+i, __pmd(addr | __PAGE_KERNEL_LARGE_EXEC));
232 return (void *)vaddr;
236 printk("early_ioremap(0x%lx, %lu) failed\n", addr, size);
/*
 * Tear down a mapping created by early_ioremap().  Recomputes the number of
 * PMD entries from the virtual address and size, locates the first entry in
 * level2_kernel_pgt with pmd_index(), and iterates over the entries.  The
 * loop body is on lines missing from this listing.
 */
240 /* To avoid virtual aliases later */
241 __meminit void early_iounmap(void *addr, unsigned long size)
247 vaddr = (unsigned long)addr;
248 pmds = ((vaddr & ~PMD_MASK) + size + ~PMD_MASK) / PMD_SIZE;
249 pmd = level2_kernel_pgt + pmd_index(vaddr);
250 for (i = 0; i < pmds; i++)
/*
 * Fill the PMD table 'pmd_page' with large-page entries mapping physical
 * addresses from 'address' upward, starting at slot pmd_index(address).
 *
 * Once 'address' reaches 'end', the remaining slots are cleared with
 * set_pmd(pmd, __pmd(0)).  Otherwise the entry is
 * __PAGE_KERNEL_LARGE | _PAGE_GLOBAL | address, masked with
 * __supported_pte_mask.
 */
255 static void __meminit
256 phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end)
258 int i = pmd_index(address);
260 for (; i < PTRS_PER_PMD; i++, address += PMD_SIZE) {
262 pmd_t *pmd = pmd_page + pmd_index(address);
264 if (address >= end) {
266 for (; i < PTRS_PER_PMD; i++, pmd++)
267 set_pmd(pmd, __pmd(0));
274 entry = __PAGE_KERNEL_LARGE|_PAGE_GLOBAL|address;
275 entry &= __supported_pte_mask;
276 set_pmd(pmd, __pmd(entry));
/*
 * Re-run phys_pmd_init() on the PMD table already linked from 'pud', for
 * the range [address, end), while holding init_mm.page_table_lock.
 */
280 static void __meminit
281 phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end)
283 pmd_t *pmd = pmd_offset(pud,0);
284 spin_lock(&init_mm.page_table_lock);
285 phys_pmd_init(pmd, address, end);
286 spin_unlock(&init_mm.page_table_lock);
/*
 * Populate the PUD table 'pud_page' for the physical range [addr, end),
 * starting at slot pud_index(addr) and advancing one PUD_SIZE-aligned step
 * per iteration.
 *
 * Per slot, as far as visible in this listing:
 *   - before bootmem is up (!after_bootmem), a range with no e820 mapping
 *     gets a cleared PUD entry;
 *   - phys_pmd_update() is called for a PUD that is handled in place (its
 *     guarding condition is on a missing line);
 *   - otherwise a new PMD page from alloc_low_page() is linked in and filled
 *     by phys_pmd_init(), both under init_mm.page_table_lock.
 */
290 static void __meminit phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end)
292 int i = pud_index(addr);
295 for (; i < PTRS_PER_PUD; i++, addr = (addr & PUD_MASK) + PUD_SIZE ) {
296 unsigned long pmd_phys;
297 pud_t *pud = pud_page + pud_index(addr);
303 if (!after_bootmem && !e820_any_mapped(addr,addr+PUD_SIZE,0)) {
304 set_pud(pud, __pud(0));
309 phys_pmd_update(pud, addr, end);
313 pmd = alloc_low_page(&pmd_phys);
314 spin_lock(&init_mm.page_table_lock);
315 set_pud(pud, __pud(pmd_phys | _KERNPG_TABLE));
316 phys_pmd_init(pmd, addr, end);
317 spin_unlock(&init_mm.page_table_lock);
/*
 * Find a physical area large enough for the PUD and PMD tables needed to
 * map memory up to 'end'.
 *
 * The size is the page-rounded space for ceil(end / PUD_SIZE) pud_t entries
 * plus the page-rounded space for ceil(end / PMD_SIZE) pmd_t entries.  The
 * area is searched with find_e820_area(); failure (-1UL) panics.  On
 * success table_start and table_end are both set to the first pfn of the
 * area and the range is printed.
 *
 * The assignment of 'start' is on a line missing from this listing.
 */
323 static void __init find_early_table_space(unsigned long end)
325 unsigned long puds, pmds, tables, start;
327 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
328 pmds = (end + PMD_SIZE - 1) >> PMD_SHIFT;
329 tables = round_up(puds * sizeof(pud_t), PAGE_SIZE) +
330 round_up(pmds * sizeof(pmd_t), PAGE_SIZE);
332 /* RED-PEN putting page tables only on node 0 could
333 cause a hotspot and fill up ZONE_DMA. The page tables
334 need roughly 0.5KB per GB. */
336 table_start = find_e820_area(start, end, tables);
337 if (table_start == -1UL)
338 panic("Cannot find space for the kernel page tables");
/* From here on table_start/table_end are page-frame numbers, not addresses. */
340 table_start >>= PAGE_SHIFT;
341 table_end = table_start;
343 early_printk("kernel direct mapping tables up to %lx @ %lx-%lx\n",
344 end, table_start << PAGE_SHIFT,
345 (table_start << PAGE_SHIFT) + tables);
348 /* Setup the direct mapping of the physical memory at PAGE_OFFSET.
349 This runs before bootmem is initialized and gets pages directly from the
350 physical memory. To access them they are temporarily mapped. */
/*
 * Steps visible in this listing:
 *   1. find_early_table_space(end) picks the area for the page tables;
 *   2. start/end are converted to direct-map virtual addresses with __va();
 *   3. the range is walked one PGDIR_SIZE step at a time; each step uses
 *      either the existing PUD table (pud_offset) or a new page from
 *      alloc_low_page() -- the selecting condition is on missing lines --
 *      then fills it with phys_pud_init() and installs it with set_pgd();
 *   4. CR4 is saved in mmu_cr4_features and the table pages handed out so
 *      far are reserved with reserve_early().
 */
351 void __init_refok init_memory_mapping(unsigned long start, unsigned long end)
355 Dprintk("init_memory_mapping\n");
358 * Find space for the kernel direct mapping tables.
359 * Later we should allocate these tables in the local node of the memory
360 * mapped. Unfortunately this is done currently before the nodes are
364 find_early_table_space(end);
366 start = (unsigned long)__va(start);
367 end = (unsigned long)__va(end);
369 for (; start < end; start = next) {
370 unsigned long pud_phys;
371 pgd_t *pgd = pgd_offset_k(start);
375 pud = pud_offset(pgd, start & PGDIR_MASK);
377 pud = alloc_low_page(&pud_phys);
379 next = start + PGDIR_SIZE;
382 phys_pud_init(pud, __pa(start), __pa(next));
384 set_pgd(pgd_offset_k(start), mk_kernel_pgd(pud_phys));
389 mmu_cr4_features = read_cr4();
392 reserve_early(table_start << PAGE_SHIFT, table_end << PAGE_SHIFT);
/*
 * Set the upper pfn limit of each zone (ZONE_DMA: MAX_DMA_PFN, ZONE_DMA32:
 * MAX_DMA32_PFN, ZONE_NORMAL: end_pfn; all others 0), mark pfns
 * [0, end_pfn) present on node 0 with memory_present(), and pass the limits
 * to free_area_init_nodes().
 */
396 void __init paging_init(void)
398 unsigned long max_zone_pfns[MAX_NR_ZONES];
399 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
400 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
401 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
402 max_zone_pfns[ZONE_NORMAL] = end_pfn;
404 memory_present(0, 0, end_pfn);
406 free_area_init_nodes(max_zone_pfns);
410 /* Unmap a kernel mapping if it exists. This is useful to avoid prefetches
411 from the CPU leading to inconsistent cache lines. address and size
412 must be aligned to 2MB boundaries.
413 Does nothing when the mapping doesn't exist. */
/*
 * Misaligned 'address' or 'size' (checked against LARGE_PAGE_MASK) trips
 * BUG_ON().  The range is walked in LARGE_PAGE_SIZE steps; for each step
 * the PMD entry is looked up and, if present, cleared with
 * set_pmd(pmd, __pmd(0)).  An entry without _PAGE_PSE (already split into
 * small pages) is reported with a "will leak memory" message.  The
 * statements taken on the skip paths are on lines missing from this
 * listing.
 */
414 void __init clear_kernel_mapping(unsigned long address, unsigned long size)
416 unsigned long end = address + size;
418 BUG_ON(address & ~LARGE_PAGE_MASK);
419 BUG_ON(size & ~LARGE_PAGE_MASK);
421 for (; address < end; address += LARGE_PAGE_SIZE) {
422 pgd_t *pgd = pgd_offset_k(address);
427 pud = pud_offset(pgd, address);
430 pmd = pmd_offset(pud, address);
431 if (!pmd || pmd_none(*pmd))
433 if (0 == (pmd_val(*pmd) & _PAGE_PSE)) {
434 /* Could handle this, but it should not happen currently. */
436 "clear_kernel_mapping: mapping has been split. will leak memory\n");
439 set_pmd(pmd, __pmd(0));
445 * Memory hotplug specific functions
/*
 * Prepare 'page' for use: clear its reserved flag and initialise its
 * reference count.  Any further steps are on lines missing from this
 * listing.
 */
447 void online_page(struct page *page)
449 ClearPageReserved(page);
450 init_page_count(page);
456 #ifdef CONFIG_MEMORY_HOTPLUG
458 * Memory is always added to the NORMAL zone. This means you will never
459 * get additional DMA/DMA32 memory.
/*
 * Add the physical range [start, start + size) to node 'nid'.
 *
 * The range is first mapped with init_memory_mapping(), then its pages are
 * added to the node's ZONE_NORMAL with __add_pages().  A failure message is
 * printed on the error path; the return statements are on lines missing
 * from this listing.
 *
 * NOTE(review): init_memory_mapping() is passed start + size - 1 as its
 * 'end', while its loop treats 'end' as exclusive (start < end) -- confirm
 * which convention is intended.
 */
461 int arch_add_memory(int nid, u64 start, u64 size)
463 struct pglist_data *pgdat = NODE_DATA(nid);
464 struct zone *zone = pgdat->node_zones + ZONE_NORMAL;
465 unsigned long start_pfn = start >> PAGE_SHIFT;
466 unsigned long nr_pages = size >> PAGE_SHIFT;
469 init_memory_mapping(start, (start + size -1));
471 ret = __add_pages(zone, start_pfn, nr_pages);
477 printk("%s: Problem encountered in __add_pages!\n", __func__);
480 EXPORT_SYMBOL_GPL(arch_add_memory);
482 #if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
/*
 * Translate a physical start address into a node id.  Only built when
 * CONFIG_NUMA is set and CONFIG_ACPI_NUMA is not.  The function body is on
 * lines missing from this listing.
 */
483 int memory_add_physaddr_to_nid(u64 start)
487 EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
490 #endif /* CONFIG_MEMORY_HOTPLUG */
492 static struct kcore_list kcore_mem, kcore_vmalloc, kcore_kernel, kcore_modules,
/*
 * Late memory initialisation.  Steps visible in this listing:
 *   - check that the first 1024 entries of empty_zero_page are zero;
 *   - release bootmem to the page allocator and record the page count in
 *     totalram_pages (numa_free_all_bootmem() or free_all_bootmem(); the
 *     selection is on missing lines);
 *   - compute reserved pages as end_pfn minus totalram_pages minus the
 *     absent pages in [0, end_pfn);
 *   - compute code/data/init section sizes from the linker symbols;
 *   - register the regions exposed through /proc/kcore;
 *   - print a one-line memory summary.
 */
495 void __init mem_init(void)
497 long codesize, reservedpages, datasize, initsize;
501 /* clear_bss() has already cleared the empty_zero_page */
503 /* temporary debugging - double check it's true: */
507 for (i = 0; i < 1024; i++)
508 WARN_ON_ONCE(empty_zero_page[i]);
513 /* this will put all low memory onto the freelists */
515 totalram_pages = numa_free_all_bootmem();
517 totalram_pages = free_all_bootmem();
519 reservedpages = end_pfn - totalram_pages -
520 absent_pages_in_range(0, end_pfn);
524 codesize = (unsigned long) &_etext - (unsigned long) &_text;
525 datasize = (unsigned long) &_edata - (unsigned long) &_etext;
526 initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin;
528 /* Register memory areas for /proc/kcore */
529 kclist_add(&kcore_mem, __va(0), max_low_pfn << PAGE_SHIFT);
530 kclist_add(&kcore_vmalloc, (void *)VMALLOC_START,
531 VMALLOC_END-VMALLOC_START);
532 kclist_add(&kcore_kernel, &_stext, _end - _stext);
533 kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_LEN);
534 kclist_add(&kcore_vsyscall, (void *)VSYSCALL_START,
535 VSYSCALL_END - VSYSCALL_START);
/* Page counts are converted to kB by shifting left by (PAGE_SHIFT - 10). */
537 printk("Memory: %luk/%luk available (%ldk kernel code, %ldk reserved, %ldk data, %ldk init)\n",
538 (unsigned long) nr_free_pages() << (PAGE_SHIFT-10),
539 end_pfn << (PAGE_SHIFT-10),
541 reservedpages << (PAGE_SHIFT-10),
/*
 * Release the pages in the virtual range [begin, end).  'what' is a label
 * used only in the log message.
 *
 * For every page in the range: clear the reserved flag, initialise the
 * reference count and overwrite the page with POISON_FREE_INITMEM.  The
 * call that hands the page to the allocator is on lines missing from this
 * listing.  The CONFIG_DEBUG_RODATA section that follows changes the page
 * attributes of the range when it lies at or above __START_KERNEL_map.
 */
546 void free_init_pages(char *what, unsigned long begin, unsigned long end)
553 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
554 for (addr = begin; addr < end; addr += PAGE_SIZE) {
555 ClearPageReserved(virt_to_page(addr));
556 init_page_count(virt_to_page(addr));
557 memset((void *)(addr & ~(PAGE_SIZE-1)),
558 POISON_FREE_INITMEM, PAGE_SIZE);
562 #ifdef CONFIG_DEBUG_RODATA
564 * This will make the __init pages not present and
565 * not executable, so that any attempt to use a
566 * __init function from now on will fault immediately
567 * rather than spuriously later when memory gets reused.
569 * We only do this for DEBUG_RODATA to not break up the
570 * 2MB kernel mapping just for this debug feature.
572 if (begin >= __START_KERNEL_map) {
573 set_memory_np(begin, (end - begin)/PAGE_SIZE);
574 set_memory_nx(begin, (end - begin)/PAGE_SIZE);
/*
 * Free the kernel's init section: passes the range between the linker
 * symbols __init_begin and __init_end to free_init_pages().
 */
579 void free_initmem(void)
581 free_init_pages("unused kernel memory",
582 (unsigned long)(&__init_begin),
583 (unsigned long)(&__init_end));
586 #ifdef CONFIG_DEBUG_RODATA
/*
 * Write-protect a range of the kernel image ending at __end_rodata.
 *
 * 'start' defaults to _stext.  With CONFIG_HOTPLUG_CPU and more than one
 * possible CPU it moves to _etext so that SMP alternatives can still be
 * applied; with CONFIG_KPROBES it is set to __start_rodata.  'start' is
 * then rounded up to a page boundary and the range is made read-only with
 * set_memory_ro().
 *
 * With CONFIG_CPA_DEBUG the protection is undone and re-applied once as a
 * self-test of the page-attribute code.
 */
588 void mark_rodata_ro(void)
590 unsigned long start = (unsigned long)_stext, end;
592 #ifdef CONFIG_HOTPLUG_CPU
593 /* It must still be possible to apply SMP alternatives. */
594 if (num_possible_cpus() > 1)
595 start = (unsigned long)_etext;
598 #ifdef CONFIG_KPROBES
599 start = (unsigned long)__start_rodata;
602 end = (unsigned long)__end_rodata;
603 start = (start + PAGE_SIZE - 1) & PAGE_MASK;
608 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
610 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
611 (end - start) >> 10);
613 #ifdef CONFIG_CPA_DEBUG
614 printk("Testing CPA: undo %lx-%lx\n", start, end);
615 set_memory_rw(start, (end-start) >> PAGE_SHIFT);
617 printk("Testing CPA: again\n");
618 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
623 #ifdef CONFIG_BLK_DEV_INITRD
/* Free the initrd's virtual range [start, end) via free_init_pages(). */
624 void free_initrd_mem(unsigned long start, unsigned long end)
626 free_init_pages("initrd memory", start, end);
/*
 * Reserve 'len' bytes of bootmem starting at physical address 'phys'.
 *
 * A start pfn at or beyond end_pfn is handled specially: pfns still below
 * end_pfn_map take one path, anything else is logged as an illegal reserve
 * (the statements on both paths are on missing lines).  Otherwise the range
 * is reserved with reserve_bootmem_node() on the node from phys_to_nid(),
 * or with reserve_bootmem(); the selection is on missing lines.
 *
 * When the range ends at or below MAX_DMA_PFN * PAGE_SIZE, its page count
 * is added to dma_reserve and pushed out with set_dma_reserve().
 *
 * NOTE(review): 'len' is a plain 'unsigned', so a single call cannot
 * describe a reservation of 4GB or more.
 */
630 void __init reserve_bootmem_generic(unsigned long phys, unsigned len)
633 int nid = phys_to_nid(phys);
635 unsigned long pfn = phys >> PAGE_SHIFT;
636 if (pfn >= end_pfn) {
637 /* This can happen with kdump kernels when accessing firmware
639 if (pfn < end_pfn_map)
641 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %u\n",
646 /* Should check here against the e820 map to avoid double free */
648 reserve_bootmem_node(NODE_DATA(nid), phys, len);
650 reserve_bootmem(phys, len);
652 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
653 dma_reserve += len / PAGE_SIZE;
654 set_dma_reserve(dma_reserve);
/*
 * Check whether kernel virtual address 'addr' is backed by a valid page
 * frame.
 *
 * The address bits above __VIRTUAL_MASK_SHIFT must be all zeros or all ones
 * (the arithmetic shift of the signed value yields 0 or -1); anything else
 * is rejected.  The page tables are then walked from pgd_offset_k(addr).
 * Two results are visible: pfn_valid() of the PMD's pfn, and pfn_valid() of
 * the PTE's pfn.  The presence checks and the condition selecting the PMD
 * result are on lines missing from this listing.
 */
658 int kern_addr_valid(unsigned long addr)
660 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
666 if (above != 0 && above != -1UL)
669 pgd = pgd_offset_k(addr);
673 pud = pud_offset(pgd, addr);
677 pmd = pmd_offset(pud, addr);
681 return pfn_valid(pmd_pfn(*pmd));
683 pte = pte_offset_kernel(pmd, addr);
686 return pfn_valid(pte_pfn(*pte));
689 /* A pseudo VMA to allow ptrace access for the vsyscall page. This only
690 covers the 64bit vsyscall page now. 32bit has a real VMA now and does
691 not need special handling anymore. */
/*
 * Spans VSYSCALL_MAPPED_PAGES pages starting at VSYSCALL_START, marked
 * readable and executable (no write flag).
 */
693 static struct vm_area_struct gate_vma = {
694 .vm_start = VSYSCALL_START,
695 .vm_end = VSYSCALL_START + (VSYSCALL_MAPPED_PAGES << PAGE_SHIFT),
696 .vm_page_prot = PAGE_READONLY_EXEC,
697 .vm_flags = VM_READ | VM_EXEC
/*
 * Return the gate VMA applicable to task 'tsk'.  With CONFIG_IA32_EMULATION
 * a task with TIF_IA32 set is treated specially; the values returned on
 * either path are on lines missing from this listing.
 */
700 struct vm_area_struct *get_gate_vma(struct task_struct *tsk)
702 #ifdef CONFIG_IA32_EMULATION
703 if (test_tsk_thread_flag(tsk, TIF_IA32))
/*
 * Return non-zero when 'addr' lies inside the gate VMA of 'task', i.e. in
 * the half-open range [vm_start, vm_end).  Handling of a missing VMA is on
 * lines not part of this listing.
 */
709 int in_gate_area(struct task_struct *task, unsigned long addr)
711 struct vm_area_struct *vma = get_gate_vma(task);
714 return (addr >= vma->vm_start) && (addr < vma->vm_end);
/*
 * Task-independent variant of in_gate_area(): returns non-zero when 'addr'
 * lies in [VSYSCALL_START, VSYSCALL_END).
 */
717 /* Use this when you have no reliable task/vma, typically from interrupt
718 * context. It is less reliable than using the task's vma and may give
721 int in_gate_area_no_task(unsigned long addr)
723 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
/*
 * Return an architecture-specific name for 'vma'.  Two cases are
 * recognised: a VMA starting at the owning mm's context.vdso address, and
 * the static gate_vma.  The returned strings are on lines missing from
 * this listing.
 */
726 const char *arch_vma_name(struct vm_area_struct *vma)
728 if (vma->vm_mm && vma->vm_start == (long)vma->vm_mm->context.vdso)
730 if (vma == &gate_vma)
735 #ifdef CONFIG_SPARSEMEM_VMEMMAP
737 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
/*
 * Back the virtual memmap range covering 'size' struct page entries
 * starting at 'start_page' with PMD-level mappings on 'node'.
 *
 * The range is walked one PMD step at a time (pmd_addr_end).  For each
 * step the pgd and pud levels are populated with the vmemmap_*_populate()
 * helpers.  An empty PMD gets a PMD_SIZE block from vmemmap_alloc_block(),
 * mapped with PAGE_KERNEL_LARGE and logged; the vmemmap_verify() call
 * presumably covers already-populated PMDs (the else is on a missing
 * line -- TODO confirm).  Allocation-failure handling and the return
 * statements are on lines missing from this listing.
 */
739 int __meminit vmemmap_populate(struct page *start_page,
740 unsigned long size, int node)
742 unsigned long addr = (unsigned long)start_page;
743 unsigned long end = (unsigned long)(start_page + size);
749 for (; addr < end; addr = next) {
750 next = pmd_addr_end(addr, end);
752 pgd = vmemmap_pgd_populate(addr, node);
755 pud = vmemmap_pud_populate(pgd, addr, node);
759 pmd = pmd_offset(pud, addr);
760 if (pmd_none(*pmd)) {
762 void *p = vmemmap_alloc_block(PMD_SIZE, node);
766 entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL_LARGE);
767 set_pmd(pmd, __pmd(pte_val(entry)));
769 printk(KERN_DEBUG " [%lx-%lx] PMD ->%p on node %d\n",
770 addr, addr + PMD_SIZE - 1, p, node);
772 vmemmap_verify((pte_t *)pmd, node, addr, next);