X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Fmemory.c;h=a596c1172248e56b8fb220408548330ebd1de538;hb=4d7670e0f649f9e6e6ea6c8bb9f52441fa00f92b;hp=d209f745db7fbc3154e83cf04666770068986724;hpb=1a9505996dd0c12a2e56d2c6af00846e75a3850d;p=linux-2.6

diff --git a/mm/memory.c b/mm/memory.c
index d209f745db..a596c11722 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -58,7 +58,7 @@
 #include <linux/swapops.h>
 #include <linux/elf.h>
 
-#ifndef CONFIG_DISCONTIGMEM
+#ifndef CONFIG_NEED_MULTIPLE_NODES
 /* use the per-pgdat data instead for discontigmem - mbligh */
 unsigned long max_mapnr;
 struct page *mem_map;
@@ -498,6 +498,17 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	unsigned long addr = vma->vm_start;
 	unsigned long end = vma->vm_end;
 
+	/*
+	 * Don't copy ptes where a page fault will fill them correctly.
+	 * Fork becomes much lighter when there are big shared or private
+	 * readonly mappings. The tradeoff is that copy_page_range is more
+	 * efficient than faulting.
+	 */
+	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) {
+		if (!vma->anon_vma)
+			return 0;
+	}
+
 	if (is_vm_hugetlb_page(vma))
 		return copy_hugetlb_page_range(dst_mm, src_mm, vma);
 
@@ -776,8 +787,8 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
  */
-static struct page *
-__follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
+static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
+			int read, int write, int accessed)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -818,9 +829,11 @@ __follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
 		pfn = pte_pfn(pte);
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
-			if (write && !pte_dirty(pte) && !PageDirty(page))
-				set_page_dirty(page);
-			mark_page_accessed(page);
+			if (accessed) {
+				if (write && !pte_dirty(pte) && !PageDirty(page))
+					set_page_dirty(page);
+				mark_page_accessed(page);
+			}
 			return page;
 		}
 	}
@@ -829,33 +842,21 @@ out:
 	return NULL;
 }
 
-struct page *
+inline struct page *
 follow_page(struct mm_struct *mm, unsigned long address, int write)
 {
-	return __follow_page(mm, address, /*read*/0, write);
-}
-
-int
-check_user_page_readable(struct mm_struct *mm, unsigned long address)
-{
-	return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
+	return __follow_page(mm, address, 0, write, 1);
 }
-EXPORT_SYMBOL(check_user_page_readable);
-
-/*
- * Given a physical address, is there a useful struct page pointing to
- * it? This may become more complex in the future if we start dealing
- * with IO-aperture pages for direct-IO.
+/*
+ * check_user_page_readable() can be called from interrupt context by oprofile,
+ * so we need to avoid taking any non-irq-safe locks
 */
-
-static inline struct page *get_page_map(struct page *page)
+int check_user_page_readable(struct mm_struct *mm, unsigned long address)
 {
-	if (!pfn_valid(page_to_pfn(page)))
-		return NULL;
-	return page;
+	return __follow_page(mm, address, 1, 0, 0) != NULL;
 }
-
+EXPORT_SYMBOL(check_user_page_readable);
 
 static inline int
 untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
@@ -887,7 +888,6 @@ untouched_anonymous_page(struct mm_struct* mm, struct vm_area_struct *vma,
 	return 0;
 }
 
-
 int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		unsigned long start, int len, int write, int force,
 		struct page **pages, struct vm_area_struct **vmas)
@@ -924,9 +924,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			pud = pud_offset(pgd, pg);
 			BUG_ON(pud_none(*pud));
 			pmd = pmd_offset(pud, pg);
-			BUG_ON(pmd_none(*pmd));
+			if (pmd_none(*pmd))
+				return i ? : -EFAULT;
 			pte = pte_offset_map(pmd, pg);
-			BUG_ON(pte_none(*pte));
+			if (pte_none(*pte)) {
+				pte_unmap(pte);
+				return i ? : -EFAULT;
+			}
 			if (pages) {
 				pages[i] = pte_page(*pte);
 				get_page(pages[i]);
@@ -951,25 +955,37 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		}
 		spin_lock(&mm->page_table_lock);
 		do {
-			struct page *map;
-			int lookup_write = write;
+			int write_access = write;
+			struct page *page;
 
 			cond_resched_lock(&mm->page_table_lock);
-			while (!(map = follow_page(mm, start, lookup_write))) {
+			while (!(page = follow_page(mm, start, write_access))) {
+				int ret;
+
 				/*
 				 * Shortcut for anonymous pages. We don't want
 				 * to force the creation of page tables for
-				 * insanly big anonymously mapped areas that
+				 * insanely big anonymously mapped areas that
 				 * nobody touched so far. This is important
 				 * for doing a core dump for these mappings.
 				 */
-				if (!lookup_write &&
-				    untouched_anonymous_page(mm,vma,start)) {
-					map = ZERO_PAGE(start);
+				if (!write && untouched_anonymous_page(mm,vma,start)) {
+					page = ZERO_PAGE(start);
 					break;
 				}
 				spin_unlock(&mm->page_table_lock);
-				switch (handle_mm_fault(mm,vma,start,write)) {
+				ret = __handle_mm_fault(mm, vma, start, write_access);
+
+				/*
+				 * The VM_FAULT_WRITE bit tells us that do_wp_page has
+				 * broken COW when necessary, even if maybe_mkwrite
+				 * decided not to set pte_write. We can thus safely do
+				 * subsequent page lookups as if they were reads.
+				 */
+				if (ret & VM_FAULT_WRITE)
+					write_access = 0;
+
+				switch (ret & ~VM_FAULT_WRITE) {
 				case VM_FAULT_MINOR:
 					tsk->min_flt++;
 					break;
@@ -983,41 +999,24 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				default:
 					BUG();
 				}
-				/*
-				 * Now that we have performed a write fault
-				 * and surely no longer have a shared page we
-				 * shouldn't write, we shouldn't ignore an
-				 * unwritable page in the page table if
-				 * we are forcing write access.
- */
-				lookup_write = write && !force;
 				spin_lock(&mm->page_table_lock);
 			}
 			if (pages) {
-				pages[i] = get_page_map(map);
-				if (!pages[i]) {
-					spin_unlock(&mm->page_table_lock);
-					while (i--)
-						page_cache_release(pages[i]);
-					i = -EFAULT;
-					goto out;
-				}
-				flush_dcache_page(pages[i]);
-				if (!PageReserved(pages[i]))
-					page_cache_get(pages[i]);
+				pages[i] = page;
+				flush_dcache_page(page);
+				if (!PageReserved(page))
+					page_cache_get(page);
 			}
 			if (vmas)
 				vmas[i] = vma;
 			i++;
 			start += PAGE_SIZE;
 			len--;
-		} while(len && start < vma->vm_end);
+		} while (len && start < vma->vm_end);
 		spin_unlock(&mm->page_table_lock);
-	} while(len);
-out:
+	} while (len);
 	return i;
 }
-
 EXPORT_SYMBOL(get_user_pages);
 
 static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
@@ -1164,7 +1163,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 {
 	pgd_t *pgd;
 	unsigned long next;
-	unsigned long end = addr + size;
+	unsigned long end = addr + PAGE_ALIGN(size);
 	struct mm_struct *mm = vma->vm_mm;
 	int err;
 
@@ -1249,6 +1248,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 	struct page *old_page, *new_page;
 	unsigned long pfn = pte_pfn(pte);
 	pte_t entry;
+	int ret;
 
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
@@ -1264,7 +1264,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 	}
 	old_page = pfn_to_page(pfn);
 
-	if (!TestSetPageLocked(old_page)) {
+	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
 		int reuse = can_share_swap_page(old_page);
 		unlock_page(old_page);
 		if (reuse) {
@@ -1276,7 +1276,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 			lazy_mmu_prot_update(entry);
 			pte_unmap(page_table);
 			spin_unlock(&mm->page_table_lock);
-			return VM_FAULT_MINOR;
+			return VM_FAULT_MINOR|VM_FAULT_WRITE;
 		}
 	}
 	pte_unmap(page_table);
@@ -1303,6 +1303,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 	/*
 	 * Re-check the pte - we dropped the lock
 	 */
+	ret = VM_FAULT_MINOR;
 	spin_lock(&mm->page_table_lock);
 	page_table = pte_offset_map(pmd, address);
 	if (likely(pte_same(*page_table, pte))) {
@@ -1319,12 +1320,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct * vma,
 
 		/* Free the old page.. */
 		new_page = old_page;
+		ret |= VM_FAULT_WRITE;
 	}
 	pte_unmap(page_table);
 	page_cache_release(new_page);
 	page_cache_release(old_page);
 	spin_unlock(&mm->page_table_lock);
-	return VM_FAULT_MINOR;
+	return ret;
 
 no_new_page:
 	page_cache_release(old_page);
@@ -1483,7 +1485,7 @@ restart:
  * unmap_mapping_range - unmap the portion of all mmaps
  * in the specified address_space corresponding to the specified
  * page range in the underlying file.
- * @address_space: the address space containing mmaps to be unmapped.
+ * @mapping: the address space containing mmaps to be unmapped.
 * @holebegin: byte in first page to unmap, relative to the start of
 * the underlying file.  This will be rounded down to a PAGE_SIZE
 * boundary.  Note that this is different from vmtruncate(), which
@@ -1711,10 +1713,6 @@ static int do_swap_page(struct mm_struct * mm,
 	}
 
 	/* The page isn't present yet, go ahead with the fault. */
-
-	swap_free(entry);
-	if (vm_swap_full())
-		remove_exclusive_swap_page(page);
 	inc_mm_counter(mm, rss);
 	pte = mk_pte(page, vma->vm_page_prot);
 
@@ -1722,12 +1720,16 @@ static int do_swap_page(struct mm_struct * mm,
 		pte = maybe_mkwrite(pte_mkdirty(pte), vma);
 		write_access = 0;
 	}
-	unlock_page(page);
 
 	flush_icache_page(vma, page);
 	set_pte_at(mm, address, page_table, pte);
 	page_add_anon_rmap(page, vma, address);
+	swap_free(entry);
+	if (vm_swap_full())
+		remove_exclusive_swap_page(page);
+	unlock_page(page);
+
 	if (write_access) {
 		if (do_wp_page(mm, vma, address,
 				page_table, pmd, pte) == VM_FAULT_OOM)
@@ -2016,7 +2018,6 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 	if (write_access) {
 		if (!pte_write(entry))
 			return do_wp_page(mm, vma, address, pte, pmd, entry);
-		entry = pte_mkdirty(entry);
 	}
 	entry = pte_mkyoung(entry);
 
@@ -2031,7 +2032,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
 /*
  * By the time we get here, we already hold the mm semaphore
  */
-int handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
+int __handle_mm_fault(struct mm_struct *mm, struct vm_area_struct * vma,
 		unsigned long address, int write_access)
 {
 	pgd_t *pgd;
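
The copy_page_range hunk near the top is the fork() fast path: a vma with no anonymous pages (no anon_vma) can have its ptes refilled by fault, so fork skips copying them. A hypothetical userspace sketch of the visible effect, not part of the patch; the 512MB size is arbitrary and absolute timings vary by kernel and hardware:

/*
 * Illustrative only: time fork() against a large private mapping,
 * first while it is untouched (no anon_vma, so a kernel with this
 * patch returns early from copy_page_range), then after every page
 * has been written (anon ptes now exist and must be copied).
 */
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/time.h>
#include <sys/wait.h>
#include <unistd.h>

static double time_fork_us(void)
{
	struct timeval t0, t1;
	pid_t pid;

	gettimeofday(&t0, NULL);
	pid = fork();
	if (pid == 0)
		_exit(0);			/* child does nothing */
	waitpid(pid, NULL, 0);
	gettimeofday(&t1, NULL);
	return (t1.tv_sec - t0.tv_sec) * 1e6 + (t1.tv_usec - t0.tv_usec);
}

int main(void)
{
	size_t len = 512UL << 20;		/* 512MB, arbitrary */
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
		       MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (p == MAP_FAILED) {
		perror("mmap");
		return 1;
	}
	printf("fork, mapping untouched: %8.0f us\n", time_fork_us());
	memset(p, 1, len);			/* fault in every page */
	printf("fork, mapping written:   %8.0f us\n", time_fork_us());
	return 0;
}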
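
The get_user_pages and do_wp_page hunks rely on a return-value convention introduced with them: VM_FAULT_WRITE travels in a bit distinct from the VM_FAULT_* status codes, so the caller tests the flag and masks it off before switching on the status. A standalone sketch of that convention; the constants mirror include/linux/mm.h of this era, and the surrounding program is illustrative:

/*
 * Illustrative only: a status code with an orthogonal flag bit OR-ed
 * in, as do_wp_page()/__handle_mm_fault() do in this patch.
 */
#include <stdio.h>

#define VM_FAULT_OOM	0x00
#define VM_FAULT_SIGBUS	0x01
#define VM_FAULT_MINOR	0x02
#define VM_FAULT_MAJOR	0x03
#define VM_FAULT_WRITE	0x10	/* kept in a distinct bit */

int main(void)
{
	int write_access = 1;
	int ret = VM_FAULT_MINOR | VM_FAULT_WRITE;	/* COW was broken */

	if (ret & VM_FAULT_WRITE)
		write_access = 0;	/* later lookups can be reads */

	switch (ret & ~VM_FAULT_WRITE) {	/* mask before switching */
	case VM_FAULT_MINOR:
		printf("minor fault, write_access now %d\n", write_access);
		break;
	case VM_FAULT_MAJOR:
		printf("major fault\n");
		break;
	default:
		printf("SIGBUS or OOM\n");
	}
	return 0;
}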
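
The one-line remap_pfn_range change rounds the requested size up to a page boundary, so a caller passing a sub-page size still maps the whole final page rather than computing an end address mid-page. A small sketch of the arithmetic; PAGE_ALIGN is written out as in the kernel headers, while the 4K page size and sample values are illustrative:

/*
 * Illustrative only: why "addr + size" is wrong for sub-page sizes
 * and "addr + PAGE_ALIGN(size)" is what remap_pfn_range needs.
 */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(x)	(((x) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long addr = 0x10000000UL;
	unsigned long size = 100;	/* sub-page size from a caller */

	printf("addr + size             = %#lx (stops mid-page)\n",
	       addr + size);
	printf("addr + PAGE_ALIGN(size) = %#lx (full page mapped)\n",
	       addr + PAGE_ALIGN(size));
	return 0;
}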