X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=arch%2Fx86%2Fmm%2Fpageattr.c;h=7b79f6be4e7df99dc663d2d4c935a7e35f877a3b;hb=2a5f2e3e6cd1ce9fb3f8b186b6bc9aa1f1497a92;hp=4119379f80fff553de99fd30facb39485659f250;hpb=c58310bf4933986513020fa90b4190c7492995ae;p=linux-2.6 diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4119379f80..60bcb5b6a3 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -9,6 +9,8 @@ #include #include #include +#include +#include #include #include @@ -16,6 +18,8 @@ #include #include #include +#include +#include /* * The current flushing context - we pass it instead of 5 arguments: @@ -26,8 +30,30 @@ struct cpa_data { pgprot_t mask_clr; int numpages; int flushtlb; + unsigned long pfn; + unsigned force_split : 1; }; +#ifdef CONFIG_X86_64 + +static inline unsigned long highmap_start_pfn(void) +{ + return __pa(_text) >> PAGE_SHIFT; +} + +static inline unsigned long highmap_end_pfn(void) +{ + return __pa(round_up((unsigned long)_end, PMD_SIZE)) >> PAGE_SHIFT; +} + +#endif + +#ifdef CONFIG_DEBUG_PAGEALLOC +# define debug_pagealloc 1 +#else +# define debug_pagealloc 0 +#endif + static inline int within(unsigned long addr, unsigned long start, unsigned long end) { @@ -123,29 +149,14 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } -#define HIGH_MAP_START __START_KERNEL_map -#define HIGH_MAP_END (__START_KERNEL_map + KERNEL_TEXT_SIZE) - - -/* - * Converts a virtual address to a X86-64 highmap address - */ -static unsigned long virt_to_highmap(void *address) -{ -#ifdef CONFIG_X86_64 - return __pa((unsigned long)address) + HIGH_MAP_START - phys_base; -#else - return (unsigned long)address; -#endif -} - /* * Certain areas of memory on x86 require very specific protection flags, * for example the BIOS area or kernel text. Callers don't always get this * right (again, ioremap() on BIOS memory is not uncommon) so this function * checks and fixes these known static required protection bits. */ -static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) +static inline pgprot_t static_protections(pgprot_t prot, unsigned long address, + unsigned long pfn) { pgprot_t forbidden = __pgprot(0); @@ -153,30 +164,23 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address) * The BIOS area between 640k and 1Mb needs to be executable for * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. */ - if (within(__pa(address), BIOS_BEGIN, BIOS_END)) + if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_NX; /* * The kernel text needs to be executable for obvious reasons - * Does not cover __inittext since that is gone later on + * Does not cover __inittext since that is gone later on. On + * 64bit we do not enforce !NX on the low mapping */ if (within(address, (unsigned long)_text, (unsigned long)_etext)) pgprot_val(forbidden) |= _PAGE_NX; - /* - * Do the same for the x86-64 high kernel mapping - */ - if (within(address, virt_to_highmap(_text), virt_to_highmap(_etext))) - pgprot_val(forbidden) |= _PAGE_NX; - /* The .rodata section needs to be read-only */ - if (within(address, (unsigned long)__start_rodata, - (unsigned long)__end_rodata)) - pgprot_val(forbidden) |= _PAGE_RW; /* - * Do the same for the x86-64 high kernel mapping + * The .rodata section needs to be read-only. Using the pfn + * catches all aliases. */ - if (within(address, virt_to_highmap(__start_rodata), - virt_to_highmap(__end_rodata))) + if (within(pfn, __pa((unsigned long)__start_rodata) >> PAGE_SHIFT, + __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) pgprot_val(forbidden) |= _PAGE_RW; prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); @@ -253,12 +257,15 @@ static int try_preserve_large_page(pte_t *kpte, unsigned long address, struct cpa_data *cpa) { - unsigned long nextpage_addr, numpages, pmask, psize, flags, addr; + unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; pte_t new_pte, old_pte, *tmp; pgprot_t old_prot, new_prot; int i, do_split = 1; unsigned int level; + if (cpa->force_split) + return 1; + spin_lock_irqsave(&pgd_lock, flags); /* * Check for races, another CPU might have split this page @@ -301,7 +308,15 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); - new_prot = static_protections(new_prot, address); + + /* + * old_pte points to the large page base address. So we need + * to add the offset of the virtual address: + */ + pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); + cpa->pfn = pfn; + + new_prot = static_protections(new_prot, address, pfn); /* * We need to check the full range, whether @@ -309,8 +324,9 @@ try_preserve_large_page(pte_t *kpte, unsigned long address, * the pages in the range we try to preserve: */ addr = address + PAGE_SIZE; - for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE) { - pgprot_t chk_prot = static_protections(new_prot, addr); + pfn++; + for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { + pgprot_t chk_prot = static_protections(new_prot, addr, pfn); if (pgprot_val(chk_prot) != pgprot_val(new_prot)) goto out_unlock; @@ -352,45 +368,48 @@ out_unlock: static LIST_HEAD(page_pool); static unsigned long pool_size, pool_pages, pool_low; -static unsigned long pool_used, pool_failed, pool_refill; +static unsigned long pool_used, pool_failed; -static void cpa_fill_pool(void) +static void cpa_fill_pool(struct page **ret) { - struct page *p; gfp_t gfp = GFP_KERNEL; + unsigned long flags; + struct page *p; - /* Do not allocate from interrupt context */ - if (in_irq() || irqs_disabled()) - return; /* - * Check unlocked. I does not matter when we have one more - * page in the pool. The bit lock avoids recursive pool - * allocations: + * Avoid recursion (on debug-pagealloc) and also signal + * our priority to get to these pagetables: */ - if (pool_pages >= pool_size || test_and_set_bit_lock(0, &pool_refill)) + if (current->flags & PF_MEMALLOC) return; + current->flags |= PF_MEMALLOC; -#ifdef CONFIG_DEBUG_PAGEALLOC /* - * We could do: - * gfp = in_atomic() ? GFP_ATOMIC : GFP_KERNEL; - * but this fails on !PREEMPT kernels + * Allocate atomically from atomic contexts: */ - gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; -#endif + if (in_atomic() || irqs_disabled() || debug_pagealloc) + gfp = GFP_ATOMIC | __GFP_NORETRY | __GFP_NOWARN; - while (pool_pages < pool_size) { + while (pool_pages < pool_size || (ret && !*ret)) { p = alloc_pages(gfp, 0); if (!p) { pool_failed++; break; } - spin_lock_irq(&pgd_lock); + /* + * If the call site needs a page right now, provide it: + */ + if (ret && !*ret) { + *ret = p; + continue; + } + spin_lock_irqsave(&pgd_lock, flags); list_add(&p->lru, &page_pool); pool_pages++; - spin_unlock_irq(&pgd_lock); + spin_unlock_irqrestore(&pgd_lock, flags); } - clear_bit_unlock(0, &pool_refill); + + current->flags &= ~PF_MEMALLOC; } #define SHIFT_MB (20 - PAGE_SHIFT) @@ -411,11 +430,15 @@ void __init cpa_init(void) * GiB. Shift MiB to Gib and multiply the result by * POOL_PAGES_PER_GB: */ - gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; - pool_size = POOL_PAGES_PER_GB * gb; + if (debug_pagealloc) { + gb = ((si.totalram >> SHIFT_MB) + ROUND_MB_GB) >> SHIFT_MB_GB; + pool_size = POOL_PAGES_PER_GB * gb; + } else { + pool_size = 1; + } pool_low = pool_size; - cpa_fill_pool(); + cpa_fill_pool(NULL); printk(KERN_DEBUG "CPA: page pool initialized %lu of %lu pages preallocated\n", pool_pages, pool_size); @@ -437,16 +460,20 @@ static int split_large_page(pte_t *kpte, unsigned long address) spin_lock_irqsave(&pgd_lock, flags); if (list_empty(&page_pool)) { spin_unlock_irqrestore(&pgd_lock, flags); - return -ENOMEM; + base = NULL; + cpa_fill_pool(&base); + if (!base) + return -ENOMEM; + spin_lock_irqsave(&pgd_lock, flags); + } else { + base = list_first_entry(&page_pool, struct page, lru); + list_del(&base->lru); + pool_pages--; + + if (pool_pages < pool_low) + pool_low = pool_pages; } - base = list_first_entry(&page_pool, struct page, lru); - list_del(&base->lru); - pool_pages--; - - if (pool_pages < pool_low) - pool_low = pool_pages; - /* * Check for races, another CPU might have split this page * up for us already: @@ -456,9 +483,7 @@ static int split_large_page(pte_t *kpte, unsigned long address) goto out_unlock; pbase = (pte_t *)page_address(base); -#ifdef CONFIG_X86_32 - paravirt_alloc_pt(&init_mm, page_to_pfn(base)); -#endif + paravirt_alloc_pte(&init_mm, page_to_pfn(base)); ref_prot = pte_pgprot(pte_clrhuge(*kpte)); #ifdef CONFIG_X86_64 @@ -505,46 +530,46 @@ out_unlock: return 0; } -static int __change_page_attr(unsigned long address, struct cpa_data *cpa) +static int __change_page_attr(struct cpa_data *cpa, int primary) { + unsigned long address = cpa->vaddr; int do_split, err; unsigned int level; - struct page *kpte_page; - pte_t *kpte; + pte_t *kpte, old_pte; repeat: kpte = lookup_address(address, &level); if (!kpte) - return -EINVAL; + return 0; - kpte_page = virt_to_page(kpte); - BUG_ON(PageLRU(kpte_page)); - BUG_ON(PageCompound(kpte_page)); + old_pte = *kpte; + if (!pte_val(old_pte)) { + if (!primary) + return 0; + printk(KERN_WARNING "CPA: called for zero pte. " + "vaddr = %lx cpa->vaddr = %lx\n", address, + cpa->vaddr); + WARN_ON(1); + return -EINVAL; + } if (level == PG_LEVEL_4K) { - pte_t new_pte, old_pte = *kpte; + pte_t new_pte; pgprot_t new_prot = pte_pgprot(old_pte); - - if(!pte_val(old_pte)) { - printk(KERN_WARNING "CPA: called for zero pte. " - "vaddr = %lx cpa->vaddr = %lx\n", address, - cpa->vaddr); - WARN_ON(1); - return -EINVAL; - } + unsigned long pfn = pte_pfn(old_pte); pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); - new_prot = static_protections(new_prot, address); + new_prot = static_protections(new_prot, address, pfn); /* * We need to keep the pfn from the existing PTE, * after all we're only going to change it's attributes * not the memory it points to */ - new_pte = pfn_pte(pte_pfn(old_pte), canon_pgprot(new_prot)); - + new_pte = pfn_pte(pfn, canon_pgprot(new_prot)); + cpa->pfn = pfn; /* * Do we really change anything ? */ @@ -581,67 +606,59 @@ repeat: return err; } -/** - * change_page_attr_addr - Change page table attributes in linear mapping - * @address: Virtual address in linear mapping. - * @prot: New page table attribute (PAGE_*) - * - * Change page attributes of a page in the direct mapping. This is a variant - * of change_page_attr() that also works on memory holes that do not have - * mem_map entry (pfn_valid() is false). - * - * See change_page_attr() documentation for more details. - * - * Modules and drivers should use the set_memory_* APIs instead. - */ -static int change_page_attr_addr(struct cpa_data *cpa) +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias); + +static int cpa_process_alias(struct cpa_data *cpa) { - int err; - unsigned long address = cpa->vaddr; + struct cpa_data alias_cpa; + int ret = 0; -#ifdef CONFIG_X86_64 - unsigned long phys_addr = __pa(address); + if (cpa->pfn > max_pfn_mapped) + return 0; /* - * If we are inside the high mapped kernel range, then we - * fixup the low mapping first. __va() returns the virtual - * address in the linear mapping: + * No need to redo, when the primary call touched the direct + * mapping already: */ - if (within(address, HIGH_MAP_START, HIGH_MAP_END)) - address = (unsigned long) __va(phys_addr); -#endif + if (!within(cpa->vaddr, PAGE_OFFSET, + PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT))) { - err = __change_page_attr(address, cpa); - if (err) - return err; + alias_cpa = *cpa; + alias_cpa.vaddr = (unsigned long) __va(cpa->pfn << PAGE_SHIFT); + + ret = __change_page_attr_set_clr(&alias_cpa, 0); + } #ifdef CONFIG_X86_64 + if (ret) + return ret; + /* + * No need to redo, when the primary call touched the high + * mapping already: + */ + if (within(cpa->vaddr, (unsigned long) _text, (unsigned long) _end)) + return 0; + /* * If the physical address is inside the kernel map, we need * to touch the high mapped kernel as well: */ - if (within(phys_addr, 0, KERNEL_TEXT_SIZE)) { - /* - * Calc the high mapping address. See __phys_addr() - * for the non obvious details. - * - * Note that NX and other required permissions are - * checked in static_protections(). - */ - address = phys_addr + HIGH_MAP_START - phys_base; + if (!within(cpa->pfn, highmap_start_pfn(), highmap_end_pfn())) + return 0; - /* - * Our high aliases are imprecise, because we check - * everything between 0 and KERNEL_TEXT_SIZE, so do - * not propagate lookup failures back to users: - */ - __change_page_attr(address, cpa); - } + alias_cpa = *cpa; + alias_cpa.vaddr = + (cpa->pfn << PAGE_SHIFT) + __START_KERNEL_map - phys_base; + + /* + * The high mapping range is imprecise, so ignore the return value. + */ + __change_page_attr_set_clr(&alias_cpa, 0); #endif - return err; + return ret; } -static int __change_page_attr_set_clr(struct cpa_data *cpa) +static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) { int ret, numpages = cpa->numpages; @@ -651,10 +668,17 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa) * preservation check. */ cpa->numpages = numpages; - ret = change_page_attr_addr(cpa); + + ret = __change_page_attr(cpa, checkalias); if (ret) return ret; + if (checkalias) { + ret = cpa_process_alias(cpa); + if (ret) + return ret; + } + /* * Adjust the number of pages with the result of the * CPA operation. Either a large page has been @@ -674,10 +698,11 @@ static inline int cache_attr(pgprot_t attr) } static int change_page_attr_set_clr(unsigned long addr, int numpages, - pgprot_t mask_set, pgprot_t mask_clr) + pgprot_t mask_set, pgprot_t mask_clr, + int force_split) { struct cpa_data cpa; - int ret, cache; + int ret, cache, checkalias; /* * Check, if we are requested to change a not supported @@ -685,7 +710,7 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, */ mask_set = canon_pgprot(mask_set); mask_clr = canon_pgprot(mask_clr); - if (!pgprot_val(mask_set) && !pgprot_val(mask_clr)) + if (!pgprot_val(mask_set) && !pgprot_val(mask_clr) && !force_split) return 0; /* Ensure we are PAGE_SIZE aligned */ @@ -702,8 +727,12 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa.mask_set = mask_set; cpa.mask_clr = mask_clr; cpa.flushtlb = 0; + cpa.force_split = force_split; - ret = __change_page_attr_set_clr(&cpa); + /* No alias checking for _NX bit modifications */ + checkalias = (pgprot_val(mask_set) | pgprot_val(mask_clr)) != _PAGE_NX; + + ret = __change_page_attr_set_clr(&cpa, checkalias); /* * Check whether we really changed something: @@ -729,33 +758,75 @@ static int change_page_attr_set_clr(unsigned long addr, int numpages, cpa_flush_all(cache); out: - cpa_fill_pool(); + cpa_fill_pool(NULL); + return ret; } static inline int change_page_attr_set(unsigned long addr, int numpages, pgprot_t mask) { - return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0)); + return change_page_attr_set_clr(addr, numpages, mask, __pgprot(0), 0); } static inline int change_page_attr_clear(unsigned long addr, int numpages, pgprot_t mask) { - return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask); + return change_page_attr_set_clr(addr, numpages, __pgprot(0), mask, 0); } -int set_memory_uc(unsigned long addr, int numpages) +int _set_memory_uc(unsigned long addr, int numpages) { + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ return change_page_attr_set(addr, numpages, - __pgprot(_PAGE_PCD | _PAGE_PWT)); + __pgprot(_PAGE_CACHE_UC_MINUS)); +} + +int set_memory_uc(unsigned long addr, int numpages) +{ + /* + * for now UC MINUS. see comments in ioremap_nocache() + */ + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + _PAGE_CACHE_UC_MINUS, NULL)) + return -EINVAL; + + return _set_memory_uc(addr, numpages); } EXPORT_SYMBOL(set_memory_uc); -int set_memory_wb(unsigned long addr, int numpages) +int _set_memory_wc(unsigned long addr, int numpages) +{ + return change_page_attr_set(addr, numpages, + __pgprot(_PAGE_CACHE_WC)); +} + +int set_memory_wc(unsigned long addr, int numpages) +{ + if (!pat_wc_enabled) + return set_memory_uc(addr, numpages); + + if (reserve_memtype(addr, addr + numpages * PAGE_SIZE, + _PAGE_CACHE_WC, NULL)) + return -EINVAL; + + return _set_memory_wc(addr, numpages); +} +EXPORT_SYMBOL(set_memory_wc); + +int _set_memory_wb(unsigned long addr, int numpages) { return change_page_attr_clear(addr, numpages, - __pgprot(_PAGE_PCD | _PAGE_PWT)); + __pgprot(_PAGE_CACHE_MASK)); +} + +int set_memory_wb(unsigned long addr, int numpages) +{ + free_memtype(addr, addr + numpages * PAGE_SIZE); + + return _set_memory_wb(addr, numpages); } EXPORT_SYMBOL(set_memory_wb); @@ -786,6 +857,12 @@ int set_memory_np(unsigned long addr, int numpages) return change_page_attr_clear(addr, numpages, __pgprot(_PAGE_PRESENT)); } +int set_memory_4k(unsigned long addr, int numpages) +{ + return change_page_attr_set_clr(addr, numpages, __pgprot(0), + __pgprot(0), 1); +} + int set_pages_uc(struct page *page, int numpages) { unsigned long addr = (unsigned long)page_address(page); @@ -841,7 +918,7 @@ static int __set_pages_p(struct page *page, int numpages) .mask_set = __pgprot(_PAGE_PRESENT | _PAGE_RW), .mask_clr = __pgprot(0)}; - return __change_page_attr_set_clr(&cpa); + return __change_page_attr_set_clr(&cpa, 1); } static int __set_pages_np(struct page *page, int numpages) @@ -851,7 +928,7 @@ static int __set_pages_np(struct page *page, int numpages) .mask_set = __pgprot(0), .mask_clr = __pgprot(_PAGE_PRESENT | _PAGE_RW)}; - return __change_page_attr_set_clr(&cpa); + return __change_page_attr_set_clr(&cpa, 1); } void kernel_map_pages(struct page *page, int numpages, int enable) @@ -892,10 +969,66 @@ void kernel_map_pages(struct page *page, int numpages, int enable) * Try to refill the page pool here. We can do this only after * the tlb flush. */ - cpa_fill_pool(); + cpa_fill_pool(NULL); } + +#ifdef CONFIG_DEBUG_FS +static int dpa_show(struct seq_file *m, void *v) +{ + seq_puts(m, "DEBUG_PAGEALLOC\n"); + seq_printf(m, "pool_size : %lu\n", pool_size); + seq_printf(m, "pool_pages : %lu\n", pool_pages); + seq_printf(m, "pool_low : %lu\n", pool_low); + seq_printf(m, "pool_used : %lu\n", pool_used); + seq_printf(m, "pool_failed : %lu\n", pool_failed); + + return 0; +} + +static int dpa_open(struct inode *inode, struct file *filp) +{ + return single_open(filp, dpa_show, NULL); +} + +static const struct file_operations dpa_fops = { + .open = dpa_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static int __init debug_pagealloc_proc_init(void) +{ + struct dentry *de; + + de = debugfs_create_file("debug_pagealloc", 0600, NULL, NULL, + &dpa_fops); + if (!de) + return -ENOMEM; + + return 0; +} +__initcall(debug_pagealloc_proc_init); #endif +#ifdef CONFIG_HIBERNATION + +bool kernel_page_present(struct page *page) +{ + unsigned int level; + pte_t *pte; + + if (PageHighMem(page)) + return false; + + pte = lookup_address((unsigned long)page_address(page), &level); + return (pte_val(*pte) & _PAGE_PRESENT); +} + +#endif /* CONFIG_HIBERNATION */ + +#endif /* CONFIG_DEBUG_PAGEALLOC */ + /* * The testcases use internal knowledge of the implementation that shouldn't * be exposed to the rest of the kernel. Include these directly here.