X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Fmemory.c;h=e046b7e4b53092bb879f8d183d24de36b3389d83;hb=ea48e705be4f886c16313c882a6623b442bab0eb;hp=fb6e5deb873a863cd69de41fd5be7c1da2701c11;hpb=1da177e4c3f41524e886b7f1b8a0c1fc7321cac2;p=linux-2.6 diff --git a/mm/memory.c b/mm/memory.c index fb6e5deb87..e046b7e4b5 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -58,7 +58,7 @@ #include #include -#ifndef CONFIG_DISCONTIGMEM +#ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; struct page *mem_map; @@ -110,90 +110,178 @@ void pmd_clear_bad(pmd_t *pmd) * Note: this doesn't free the actual pages themselves. That * has been handled earlier when unmapping all the memory regions. */ -static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd, - unsigned long addr, unsigned long end) +static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd) { - if (!((addr | end) & ~PMD_MASK)) { - /* Only free fully aligned ranges */ - struct page *page = pmd_page(*pmd); - pmd_clear(pmd); - dec_page_state(nr_page_table_pages); - tlb->mm->nr_ptes--; - pte_free_tlb(tlb, page); - } + struct page *page = pmd_page(*pmd); + pmd_clear(pmd); + pte_free_tlb(tlb, page); + dec_page_state(nr_page_table_pages); + tlb->mm->nr_ptes--; } -static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud, - unsigned long addr, unsigned long end) +static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) { pmd_t *pmd; unsigned long next; - pmd_t *empty_pmd = NULL; + unsigned long start; + start = addr; pmd = pmd_offset(pud, addr); - - /* Only free fully aligned ranges */ - if (!((addr | end) & ~PUD_MASK)) - empty_pmd = pmd; do { next = pmd_addr_end(addr, end); if (pmd_none_or_clear_bad(pmd)) continue; - clear_pte_range(tlb, pmd, addr, next); + free_pte_range(tlb, pmd); } while (pmd++, addr = next, addr != 
end); - if (empty_pmd) { - pud_clear(pud); - pmd_free_tlb(tlb, empty_pmd); + start &= PUD_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PUD_MASK; + if (!ceiling) + return; } + if (end - 1 > ceiling - 1) + return; + + pmd = pmd_offset(pud, start); + pud_clear(pud); + pmd_free_tlb(tlb, pmd); } -static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd, - unsigned long addr, unsigned long end) +static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) { pud_t *pud; unsigned long next; - pud_t *empty_pud = NULL; + unsigned long start; + start = addr; pud = pud_offset(pgd, addr); - - /* Only free fully aligned ranges */ - if (!((addr | end) & ~PGDIR_MASK)) - empty_pud = pud; do { next = pud_addr_end(addr, end); if (pud_none_or_clear_bad(pud)) continue; - clear_pmd_range(tlb, pud, addr, next); + free_pmd_range(tlb, pud, addr, next, floor, ceiling); } while (pud++, addr = next, addr != end); - if (empty_pud) { - pgd_clear(pgd); - pud_free_tlb(tlb, empty_pud); + start &= PGDIR_MASK; + if (start < floor) + return; + if (ceiling) { + ceiling &= PGDIR_MASK; + if (!ceiling) + return; } + if (end - 1 > ceiling - 1) + return; + + pud = pud_offset(pgd, start); + pgd_clear(pgd); + pud_free_tlb(tlb, pud); } /* - * This function clears user-level page tables of a process. - * Unlike other pagetable walks, some memory layouts might give end 0. + * This function frees user-level page tables of a process. + * * Must be called with pagetable lock held. */ -void clear_page_range(struct mmu_gather *tlb, - unsigned long addr, unsigned long end) +void free_pgd_range(struct mmu_gather **tlb, + unsigned long addr, unsigned long end, + unsigned long floor, unsigned long ceiling) { pgd_t *pgd; unsigned long next; + unsigned long start; + + /* + * The next few lines have given us lots of grief... + * + * Why are we testing PMD* at this top level? 
Because often + * there will be no work to do at all, and we'd prefer not to + * go all the way down to the bottom just to discover that. + * + * Why all these "- 1"s? Because 0 represents both the bottom + * of the address space and the top of it (using -1 for the + * top wouldn't help much: the masks would do the wrong thing). + * The rule is that addr 0 and floor 0 refer to the bottom of + * the address space, but end 0 and ceiling 0 refer to the top. + * Comparisons need to use "end - 1" and "ceiling - 1" (though + * that end 0 case should be mythical). + * + * Wherever addr is brought up or ceiling brought down, we must + * be careful to reject "the opposite 0" before it confuses the + * subsequent tests. But what about where end is brought down + * by PMD_SIZE below? No, end can't go down to 0 there. + * + * Whereas we round start (addr) and ceiling down, by different + * masks at different levels, in order to test whether a table + * now has no other vmas using it, so can be freed, we don't + * bother to round floor or end up - the tests don't need that. 
+ */ - pgd = pgd_offset(tlb->mm, addr); + addr &= PMD_MASK; + if (addr < floor) { + addr += PMD_SIZE; + if (!addr) + return; + } + if (ceiling) { + ceiling &= PMD_MASK; + if (!ceiling) + return; + } + if (end - 1 > ceiling - 1) + end -= PMD_SIZE; + if (addr > end - 1) + return; + + start = addr; + pgd = pgd_offset((*tlb)->mm, addr); do { next = pgd_addr_end(addr, end); if (pgd_none_or_clear_bad(pgd)) continue; - clear_pud_range(tlb, pgd, addr, next); + free_pud_range(*tlb, pgd, addr, next, floor, ceiling); } while (pgd++, addr = next, addr != end); + + if (!tlb_is_full_mm(*tlb)) + flush_tlb_pgtables((*tlb)->mm, start, end); } -pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address) +void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma, + unsigned long floor, unsigned long ceiling) +{ + while (vma) { + struct vm_area_struct *next = vma->vm_next; + unsigned long addr = vma->vm_start; + + if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) { + hugetlb_free_pgd_range(tlb, addr, vma->vm_end, + floor, next? next->vm_start: ceiling); + } else { + /* + * Optimization: gather nearby vmas into one call down + */ + while (next && next->vm_start <= vma->vm_end + PMD_SIZE + && !is_hugepage_only_range(vma->vm_mm, next->vm_start, + HPAGE_SIZE)) { + vma = next; + next = vma->vm_next; + } + free_pgd_range(tlb, addr, vma->vm_end, + floor, next? next->vm_start: ceiling); + } + vma = next; + } +} + +pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, + unsigned long address) { if (!pmd_present(*pmd)) { struct page *new; @@ -567,7 +655,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here * @details: details of nonlinear truncation or shared cache invalidation * - * Returns the number of vma's which were covered by the unmapping. + * Returns the end address of the unmapping (restart addr if interrupted). 
* * Unmap all pages in the vma list. Called under page_table_lock. * @@ -584,7 +672,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, +unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr, unsigned long *nr_accounted, struct zap_details *details) @@ -592,12 +680,11 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, unsigned long zap_bytes = ZAP_BLOCK_SIZE; unsigned long tlb_start = 0; /* For tlb_finish_mmu */ int tlb_start_valid = 0; - int ret = 0; + unsigned long start = start_addr; spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL; int fullmm = tlb_is_full_mm(*tlbp); for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) { - unsigned long start; unsigned long end; start = max(vma->vm_start, start_addr); @@ -610,7 +697,6 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, if (vma->vm_flags & VM_ACCOUNT) *nr_accounted += (end - start) >> PAGE_SHIFT; - ret++; while (start != end) { unsigned long block; @@ -641,7 +727,6 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, if (i_mmap_lock) { /* must reset count of rss freed */ *tlbp = tlb_gather_mmu(mm, fullmm); - details->break_addr = start; goto out; } spin_unlock(&mm->page_table_lock); @@ -655,7 +740,7 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm, } } out: - return ret; + return start; /* which is now the end (or restart) address */ } /** @@ -665,7 +750,7 @@ out: * @size: number of bytes to zap * @details: details of nonlinear truncation or shared cache invalidation */ -void zap_page_range(struct vm_area_struct *vma, unsigned long address, +unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long 
size, struct zap_details *details) { struct mm_struct *mm = vma->vm_mm; @@ -675,23 +760,24 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address, if (is_vm_hugetlb_page(vma)) { zap_hugepage_range(vma, address, size); - return; + return end; } lru_add_drain(); spin_lock(&mm->page_table_lock); tlb = tlb_gather_mmu(mm, 0); - unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); + end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details); tlb_finish_mmu(tlb, address, end); spin_unlock(&mm->page_table_lock); + return end; } /* * Do a quick page-table lookup for a single page. * mm->page_table_lock must be held. */ -static struct page * -__follow_page(struct mm_struct *mm, unsigned long address, int read, int write) +static struct page *__follow_page(struct mm_struct *mm, unsigned long address, + int read, int write, int accessed) { pgd_t *pgd; pud_t *pud; @@ -732,9 +818,11 @@ __follow_page(struct mm_struct *mm, unsigned long address, int read, int write) pfn = pte_pfn(pte); if (pfn_valid(pfn)) { page = pfn_to_page(pfn); - if (write && !pte_dirty(pte) && !PageDirty(page)) - set_page_dirty(page); - mark_page_accessed(page); + if (accessed) { + if (write && !pte_dirty(pte) &&!PageDirty(page)) + set_page_dirty(page); + mark_page_accessed(page); + } return page; } } @@ -743,33 +831,21 @@ out: return NULL; } -struct page * +inline struct page * follow_page(struct mm_struct *mm, unsigned long address, int write) { - return __follow_page(mm, address, /*read*/0, write); -} - -int -check_user_page_readable(struct mm_struct *mm, unsigned long address) -{ - return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL; + return __follow_page(mm, address, 0, write, 1); } -EXPORT_SYMBOL(check_user_page_readable); - -/* - * Given a physical address, is there a useful struct page pointing to - * it? This may become more complex in the future if we start dealing - * with IO-aperture pages for direct-IO. 
+/* + * check_user_page_readable() can be called from interrupt context by oprofile, + * so we need to avoid taking any non-irq-safe locks */ - -static inline struct page *get_page_map(struct page *page) +int check_user_page_readable(struct mm_struct *mm, unsigned long address) { - if (!pfn_valid(page_to_pfn(page))) - return NULL; - return page; + return __follow_page(mm, address, 1, 0, 0) != NULL; } - +EXPORT_SYMBOL(check_user_page_readable); static inline int untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, @@ -801,7 +877,6 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma, return 0; } - int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, int len, int write, int force, struct page **pages, struct vm_area_struct **vmas) @@ -838,9 +913,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pud = pud_offset(pgd, pg); BUG_ON(pud_none(*pud)); pmd = pmd_offset(pud, pg); - BUG_ON(pmd_none(*pmd)); + if (pmd_none(*pmd)) + return i ? : -EFAULT; pte = pte_offset_map(pmd, pg); - BUG_ON(pte_none(*pte)); + if (pte_none(*pte)) { + pte_unmap(pte); + return i ? : -EFAULT; + } if (pages) { pages[i] = pte_page(*pte); get_page(pages[i]); @@ -865,25 +944,37 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } spin_lock(&mm->page_table_lock); do { - struct page *map; - int lookup_write = write; + int write_access = write; + struct page *page; cond_resched_lock(&mm->page_table_lock); - while (!(map = follow_page(mm, start, lookup_write))) { + while (!(page = follow_page(mm, start, write_access))) { + int ret; + /* * Shortcut for anonymous pages. We don't want * to force the creation of pages tables for - * insanly big anonymously mapped areas that + * insanely big anonymously mapped areas that * nobody touched so far. This is important * for doing a core dump for these mappings. 
*/ - if (!lookup_write && - untouched_anonymous_page(mm,vma,start)) { - map = ZERO_PAGE(start); + if (!write && untouched_anonymous_page(mm,vma,start)) { + page = ZERO_PAGE(start); break; } spin_unlock(&mm->page_table_lock); - switch (handle_mm_fault(mm,vma,start,write)) { + ret = __handle_mm_fault(mm, vma, start, write_access); + + /* + * The VM_FAULT_WRITE bit tells us that do_wp_page has + * broken COW when necessary, even if maybe_mkwrite + * decided not to set pte_write. We can thus safely do + * subsequent page lookups as if they were reads. + */ + if (ret & VM_FAULT_WRITE) + write_access = 0; + + switch (ret & ~VM_FAULT_WRITE) { case VM_FAULT_MINOR: tsk->min_flt++; break; @@ -897,41 +988,24 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, default: BUG(); } - /* - * Now that we have performed a write fault - * and surely no longer have a shared page we - * shouldn't write, we shouldn't ignore an - * unwritable page in the page table if - * we are forcing write access. 
- */ - lookup_write = write && !force; spin_lock(&mm->page_table_lock); } if (pages) { - pages[i] = get_page_map(map); - if (!pages[i]) { - spin_unlock(&mm->page_table_lock); - while (i--) - page_cache_release(pages[i]); - i = -EFAULT; - goto out; - } - flush_dcache_page(pages[i]); - if (!PageReserved(pages[i])) - page_cache_get(pages[i]); + pages[i] = page; + flush_dcache_page(page); + if (!PageReserved(page)) + page_cache_get(page); } if (vmas) vmas[i] = vma; i++; start += PAGE_SIZE; len--; - } while(len && start < vma->vm_end); + } while (len && start < vma->vm_end); spin_unlock(&mm->page_table_lock); - } while(len); -out: + } while (len); return i; } - EXPORT_SYMBOL(get_user_pages); static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd, @@ -1078,7 +1152,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, { pgd_t *pgd; unsigned long next; - unsigned long end = addr + size; + unsigned long end = addr + PAGE_ALIGN(size); struct mm_struct *mm = vma->vm_mm; int err; @@ -1163,6 +1237,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, struct page *old_page, *new_page; unsigned long pfn = pte_pfn(pte); pte_t entry; + int ret; if (unlikely(!pfn_valid(pfn))) { /* @@ -1178,7 +1253,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, } old_page = pfn_to_page(pfn); - if (!TestSetPageLocked(old_page)) { + if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); unlock_page(old_page); if (reuse) { @@ -1190,7 +1265,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, lazy_mmu_prot_update(entry); pte_unmap(page_table); spin_unlock(&mm->page_table_lock); - return VM_FAULT_MINOR; + return VM_FAULT_MINOR|VM_FAULT_WRITE; } } pte_unmap(page_table); @@ -1217,6 +1292,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, /* * Re-check the pte - we dropped the lock */ + ret = VM_FAULT_MINOR; 
spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (likely(pte_same(*page_table, pte))) { @@ -1233,12 +1309,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma, /* Free the old page.. */ new_page = old_page; + ret |= VM_FAULT_WRITE; } pte_unmap(page_table); page_cache_release(new_page); page_cache_release(old_page); spin_unlock(&mm->page_table_lock); - return VM_FAULT_MINOR; + return ret; no_new_page: page_cache_release(old_page); @@ -1270,7 +1347,7 @@ no_new_page: * i_mmap_lock. * * In order to make forward progress despite repeatedly restarting some - * large vma, note the break_addr set by unmap_vmas when it breaks out: + * large vma, note the restart_addr from unmap_vmas when it breaks out: * and restart from that address when we reach that vma again. It might * have been split or merged, shrunk or extended, but never shifted: so * restart_addr remains valid so long as it remains in the vma's range. @@ -1308,8 +1385,8 @@ again: } } - details->break_addr = end_addr; - zap_page_range(vma, start_addr, end_addr - start_addr, details); + restart_addr = zap_page_range(vma, start_addr, + end_addr - start_addr, details); /* * We cannot rely on the break test in unmap_vmas: @@ -1320,14 +1397,14 @@ again: need_break = need_resched() || need_lockbreak(details->i_mmap_lock); - if (details->break_addr >= end_addr) { + if (restart_addr >= end_addr) { /* We have now completed this vma: mark it so */ vma->vm_truncate_count = details->truncate_count; if (!need_break) return 0; } else { /* Note restart_addr in vma's truncate_count field */ - vma->vm_truncate_count = details->break_addr; + vma->vm_truncate_count = restart_addr; if (!need_break) goto again; } @@ -1397,7 +1474,7 @@ restart: * unmap_mapping_range - unmap the portion of all mmaps * in the specified address_space corresponding to the specified * page range in the underlying file. - * @address_space: the address space containing mmaps to be unmapped. 
+ * @mapping: the address space containing mmaps to be unmapped. * @holebegin: byte in first page to unmap, relative to the start of * the underlying file. This will be rounded down to a PAGE_SIZE * boundary. Note that this is different from vmtruncate(), which @@ -1615,19 +1692,16 @@ static int do_swap_page(struct mm_struct * mm, spin_lock(&mm->page_table_lock); page_table = pte_offset_map(pmd, address); if (unlikely(!pte_same(*page_table, orig_pte))) { - pte_unmap(page_table); - spin_unlock(&mm->page_table_lock); - unlock_page(page); - page_cache_release(page); ret = VM_FAULT_MINOR; - goto out; + goto out_nomap; + } + + if (unlikely(!PageUptodate(page))) { + ret = VM_FAULT_SIGBUS; + goto out_nomap; } /* The page isn't present yet, go ahead with the fault. */ - - swap_free(entry); - if (vm_swap_full()) - remove_exclusive_swap_page(page); inc_mm_counter(mm, rss); pte = mk_pte(page, vma->vm_page_prot); @@ -1635,12 +1709,16 @@ static int do_swap_page(struct mm_struct * mm, pte = maybe_mkwrite(pte_mkdirty(pte), vma); write_access = 0; } - unlock_page(page); flush_icache_page(vma, page); set_pte_at(mm, address, page_table, pte); page_add_anon_rmap(page, vma, address); + swap_free(entry); + if (vm_swap_full()) + remove_exclusive_swap_page(page); + unlock_page(page); + if (write_access) { if (do_wp_page(mm, vma, address, page_table, pmd, pte) == VM_FAULT_OOM) @@ -1655,6 +1733,12 @@ static int do_swap_page(struct mm_struct * mm, spin_unlock(&mm->page_table_lock); out: return ret; +out_nomap: + pte_unmap(page_table); + spin_unlock(&mm->page_table_lock); + unlock_page(page); + page_cache_release(page); + goto out; } /* @@ -1923,7 +2007,6 @@ static inline int handle_pte_fault(struct mm_struct *mm, if (write_access) { if (!pte_write(entry)) return do_wp_page(mm, vma, address, pte, pmd, entry); - entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); @@ -1938,7 +2021,7 @@ static inline int handle_pte_fault(struct mm_struct *mm, /* * By the time we get here, we already hold 
the mm semaphore */ -int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, +int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma, unsigned long address, int write_access) { pgd_t *pgd;