ocfs2: release page lock before calling ->page_mkwrite

diff --git a/mm/memory.c b/mm/memory.c
index e7066e71dfa3b36daeb4830e3032091e38f0bfc1..7abd3899848bf9ff4f032fda6a48ad86e1f92d18 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -78,11 +78,9 @@ unsigned long num_physpages;
  * and ZONE_HIGHMEM.
  */
 void * high_memory;
-unsigned long vmalloc_earlyreserve;
 
 EXPORT_SYMBOL(num_physpages);
 EXPORT_SYMBOL(high_memory);
-EXPORT_SYMBOL(vmalloc_earlyreserve);
 
 int randomize_va_space __read_mostly = 1;
 
@@ -481,7 +479,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        page = vm_normal_page(vma, addr, pte);
        if (page) {
                get_page(page);
-               page_dup_rmap(page);
+               page_dup_rmap(page, vma, addr);
                rss[!!PageAnon(page)]++;
        }
 
@@ -1049,12 +1047,21 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                if (pages)
                        foll_flags |= FOLL_GET;
                if (!write && !(vma->vm_flags & VM_LOCKED) &&
-                   (!vma->vm_ops || !vma->vm_ops->nopage))
+                   (!vma->vm_ops || (!vma->vm_ops->nopage &&
+                                       !vma->vm_ops->fault)))
                        foll_flags |= FOLL_ANON;
 
                do {
                        struct page *page;
 
+                       /*
+                        * If tsk is ooming, cut off its access to large memory
+                        * allocations. It has a pending SIGKILL, but it can't
+                        * be processed until returning to user space.
+                        */
+                       if (unlikely(test_tsk_thread_flag(tsk, TIF_MEMDIE)))
+                               return -ENOMEM;
+
                        if (write)
                                foll_flags |= FOLL_WRITE;
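
For illustration, a hypothetical caller now sees the OOM cutoff as a plain
-ENOMEM return once the OOM killer has set TIF_MEMDIE on the task (sketch
only, not part of this diff; pin_one_user_page is an invented name):

#include <linux/mm.h>
#include <linux/sched.h>

/* Hypothetical caller sketch: a dying task is refused further pins. */
static int pin_one_user_page(unsigned long addr, struct page **page)
{
        int ret;

        down_read(&current->mm->mmap_sem);
        ret = get_user_pages(current, current->mm, addr, 1,
                             0 /* write */, 0 /* force */, page, NULL);
        up_read(&current->mm->mmap_sem);

        if (ret == -ENOMEM)
                return -ENOMEM; /* possibly an OOM kill in progress */
        return ret == 1 ? 0 : -EFAULT; /* caller must page_cache_release() */
}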
 
@@ -1448,6 +1455,100 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 }
 EXPORT_SYMBOL(remap_pfn_range);
 
+static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
+                                    unsigned long addr, unsigned long end,
+                                    pte_fn_t fn, void *data)
+{
+       pte_t *pte;
+       int err;
+       struct page *pmd_page;
+       spinlock_t *uninitialized_var(ptl);
+
+       pte = (mm == &init_mm) ?
+               pte_alloc_kernel(pmd, addr) :
+               pte_alloc_map_lock(mm, pmd, addr, &ptl);
+       if (!pte)
+               return -ENOMEM;
+
+       BUG_ON(pmd_huge(*pmd));
+
+       pmd_page = pmd_page(*pmd);
+
+       do {
+               err = fn(pte, pmd_page, addr, data);
+               if (err)
+                       break;
+       } while (pte++, addr += PAGE_SIZE, addr != end);
+
+       if (mm != &init_mm)
+               pte_unmap_unlock(pte-1, ptl);
+       return err;
+}
+
+static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
+                                    unsigned long addr, unsigned long end,
+                                    pte_fn_t fn, void *data)
+{
+       pmd_t *pmd;
+       unsigned long next;
+       int err;
+
+       pmd = pmd_alloc(mm, pud, addr);
+       if (!pmd)
+               return -ENOMEM;
+       do {
+               next = pmd_addr_end(addr, end);
+               err = apply_to_pte_range(mm, pmd, addr, next, fn, data);
+               if (err)
+                       break;
+       } while (pmd++, addr = next, addr != end);
+       return err;
+}
+
+static int apply_to_pud_range(struct mm_struct *mm, pgd_t *pgd,
+                                    unsigned long addr, unsigned long end,
+                                    pte_fn_t fn, void *data)
+{
+       pud_t *pud;
+       unsigned long next;
+       int err;
+
+       pud = pud_alloc(mm, pgd, addr);
+       if (!pud)
+               return -ENOMEM;
+       do {
+               next = pud_addr_end(addr, end);
+               err = apply_to_pmd_range(mm, pud, addr, next, fn, data);
+               if (err)
+                       break;
+       } while (pud++, addr = next, addr != end);
+       return err;
+}
+
+/*
+ * Scan a region of virtual memory, filling in page tables as necessary
+ * and calling a provided function on each leaf page table.
+ */
+int apply_to_page_range(struct mm_struct *mm, unsigned long addr,
+                       unsigned long size, pte_fn_t fn, void *data)
+{
+       pgd_t *pgd;
+       unsigned long next;
+       unsigned long end = addr + size;
+       int err;
+
+       BUG_ON(addr >= end);
+       pgd = pgd_offset(mm, addr);
+       do {
+               next = pgd_addr_end(addr, end);
+               err = apply_to_pud_range(mm, pgd, addr, next, fn, data);
+               if (err)
+                       break;
+       } while (pgd++, addr = next, addr != end);
+       return err;
+}
+EXPORT_SYMBOL_GPL(apply_to_page_range);
+
 /*
  * handle_pte_fault chooses page fault handler according to an entry
  * which was read non-atomically.  Before making any commitment, on
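
For illustration, a minimal user of the new export could look like the
sketch below; count_kernel_ptes is an invented name, and the callback
signature is the pte_fn_t that apply_to_page_range() invokes above:

#include <linux/mm.h>

/* Hypothetical pte_fn_t callback: count each leaf PTE visited. */
static int count_pte(pte_t *pte, struct page *pmd_page,
                     unsigned long addr, void *data)
{
        unsigned long *count = data;

        (*count)++;
        return 0;               /* a nonzero return aborts the walk */
}

static unsigned long count_kernel_ptes(unsigned long addr, unsigned long size)
{
        unsigned long count = 0;

        /* init_mm takes the pte_alloc_kernel() path in apply_to_pte_range */
        if (apply_to_page_range(&init_mm, addr, size, count_pte, &count))
                return 0;       /* ran out of memory mid-walk */
        return count;
}
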
@@ -1597,9 +1698,10 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
                flush_cache_page(vma, address, pte_pfn(orig_pte));
                entry = pte_mkyoung(orig_pte);
                entry = maybe_mkwrite(pte_mkdirty(entry), vma);
-               ptep_set_access_flags(vma, address, page_table, entry, 1);
-               update_mmu_cache(vma, address, entry);
-               lazy_mmu_prot_update(entry);
+               if (ptep_set_access_flags(vma, address, page_table, entry, 1)) {
+                       update_mmu_cache(vma, address, entry);
+                       lazy_mmu_prot_update(entry);
+               }
                ret |= VM_FAULT_WRITE;
                goto unlock;
        }
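
The guard above relies on ptep_set_access_flags() now returning nonzero
only when the PTE was actually updated. On architectures without their own
version, the generic helper presumably reduces to something like this
sketch:

/* Sketch of the assumed generic contract: report whether anything
 * changed, so callers can skip the MMU-cache and lazy-MMU work. */
#define ptep_set_access_flags(__vma, __address, __ptep, __entry, __dirty) \
({                                                                        \
        int __changed = !pte_same(*(__ptep), __entry);                    \
        if (__changed) {                                                  \
                set_pte_at((__vma)->vm_mm, __address, __ptep, __entry);   \
                flush_tlb_page(__vma, __address);                         \
        }                                                                 \
        __changed;                                                        \
})
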
@@ -1614,11 +1716,11 @@ gotten:
        if (unlikely(anon_vma_prepare(vma)))
                goto oom;
        if (old_page == ZERO_PAGE(address)) {
-               new_page = alloc_zeroed_user_highpage(vma, address);
+               new_page = alloc_zeroed_user_highpage_movable(vma, address);
                if (!new_page)
                        goto oom;
        } else {
-               new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
+               new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
                if (!new_page)
                        goto oom;
                cow_user_page(new_page, old_page, address, vma);
@@ -1730,6 +1832,13 @@ static int unmap_mapping_range_vma(struct vm_area_struct *vma,
        unsigned long restart_addr;
        int need_break;
 
+       /*
+        * Files that support invalidating or truncating portions of the
+        * file from under mmapped areas must set the VM_CAN_INVALIDATE
+        * flag and have their ->fault/->nopage handlers return the page
+        * locked.
+        */
+       BUG_ON(!(vma->vm_flags & VM_CAN_INVALIDATE));
+
 again:
        restart_addr = vma->vm_truncate_count;
        if (is_restart_addr(restart_addr) && start_addr < restart_addr) {
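
A filesystem opting in would set the flag when the file is mmapped, along
the lines of this sketch (examplefs_* names are invented; its fault handler
is assumed to return pages locked):

/* ->fault/->nopage must return locked pages under this protocol */
static struct vm_operations_struct examplefs_vm_ops;

static int examplefs_file_mmap(struct file *file, struct vm_area_struct *vma)
{
        vma->vm_ops = &examplefs_vm_ops;
        vma->vm_flags |= VM_CAN_INVALIDATE;
        return 0;
}
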
@@ -1858,17 +1967,8 @@ void unmap_mapping_range(struct address_space *mapping,
 
        spin_lock(&mapping->i_mmap_lock);
 
-       /* serialize i_size write against truncate_count write */
-       smp_wmb();
-       /* Protect against page faults, and endless unmapping loops */
+       /* Protect against endless unmapping loops */
        mapping->truncate_count++;
-       /*
-        * For archs where spin_lock has inclusive semantics like ia64
-        * this smp_mb() will prevent to read pagetable contents
-        * before the truncate_count increment is visible to
-        * other cpus.
-        */
-       smp_mb();
        if (unlikely(is_restart_addr(mapping->truncate_count))) {
                if (mapping->truncate_count == 0)
                        reset_vma_truncate_counts(mapping);
@@ -1907,8 +2007,18 @@ int vmtruncate(struct inode * inode, loff_t offset)
        if (IS_SWAPFILE(inode))
                goto out_busy;
        i_size_write(inode, offset);
+
+       /*
+        * unmap_mapping_range is called twice, first simply for efficiency
+        * so that truncate_inode_pages does fewer single-page unmaps. However
+        * after this first call, and before truncate_inode_pages finishes,
+        * it is possible for private pages to be COWed, which remain after
+        * truncate_inode_pages finishes, hence the second unmap_mapping_range
+        * call must be made for correctness.
+        */
        unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
        truncate_inode_pages(mapping, offset);
+       unmap_mapping_range(mapping, offset + PAGE_SIZE - 1, 0, 1);
        goto out_truncate;
 
 do_expand:
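
The window the comment describes, sketched as a timeline (illustrative):

CPU A: vmtruncate                   CPU B: write fault, private mapping
-----------------                   -----------------------------------
i_size_write(inode, offset)
unmap_mapping_range(...)  /* #1 */
                                    fault in a file page beyond offset,
                                    COW it into a private anon copy
truncate_inode_pages(...)           /* the anon copy is not in the page
                                       cache, so truncate misses it */
unmap_mapping_range(...)  /* #2: even_cows == 1 zaps the fresh copy */
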
@@ -1948,6 +2058,7 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end)
        down_write(&inode->i_alloc_sem);
        unmap_mapping_range(mapping, offset, (end - offset), 1);
        truncate_inode_pages_range(mapping, offset, end);
+       unmap_mapping_range(mapping, offset, (end - offset), 1);
        inode->i_op->truncate_range(inode, offset, end);
        up_write(&inode->i_alloc_sem);
        mutex_unlock(&inode->i_mutex);
@@ -2105,7 +2216,6 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
        /* No need to invalidate - it was non-present before */
        update_mmu_cache(vma, address, pte);
-       lazy_mmu_prot_update(pte);
 unlock:
        pte_unmap_unlock(page_table, ptl);
 out:
@@ -2136,7 +2246,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 
                if (unlikely(anon_vma_prepare(vma)))
                        goto oom;
-               page = alloc_zeroed_user_highpage(vma, address);
+               page = alloc_zeroed_user_highpage_movable(vma, address);
                if (!page)
                        goto oom;
 
@@ -2179,10 +2289,10 @@ oom:
 }
 
 /*
- * do_no_page() tries to create a new page mapping. It aggressively
+ * __do_fault() tries to create a new page mapping. It aggressively
  * tries to share with existing pages, but makes a separate copy if
- * the "write_access" parameter is true in order to avoid the next
- * page fault.
+ * the FAULT_FLAG_WRITE is set in the flags parameter in order to avoid
+ * the next page fault.
  *
  * As this is called only for pages that do not currently exist, we
  * do not need to flush old virtual caches or the TLB.
@@ -2191,89 +2301,88 @@ oom:
  * but allow concurrent faults), and pte mapped but not yet locked.
  * We return with mmap_sem still held, but pte unmapped and unlocked.
  */
-static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
+static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
                unsigned long address, pte_t *page_table, pmd_t *pmd,
-               int write_access)
+               pgoff_t pgoff, unsigned int flags, pte_t orig_pte)
 {
        spinlock_t *ptl;
-       struct page *new_page;
-       struct address_space *mapping = NULL;
+       struct page *page, *faulted_page;
        pte_t entry;
-       unsigned int sequence = 0;
-       int ret = VM_FAULT_MINOR;
        int anon = 0;
        struct page *dirty_page = NULL;
+       struct fault_data fdata;
+
+       fdata.address = address & PAGE_MASK;
+       fdata.pgoff = pgoff;
+       fdata.flags = flags;
 
        pte_unmap(page_table);
        BUG_ON(vma->vm_flags & VM_PFNMAP);
 
-       if (vma->vm_file) {
-               mapping = vma->vm_file->f_mapping;
-               sequence = mapping->truncate_count;
-               smp_rmb(); /* serializes i_size against truncate_count */
+       if (likely(vma->vm_ops->fault)) {
+               fdata.type = -1;
+               faulted_page = vma->vm_ops->fault(vma, &fdata);
+               WARN_ON(fdata.type == -1);
+               if (unlikely(!faulted_page))
+                       return fdata.type;
+       } else {
+               /* Legacy ->nopage path */
+               fdata.type = VM_FAULT_MINOR;
+               faulted_page = vma->vm_ops->nopage(vma, address & PAGE_MASK,
+                                                               &fdata.type);
+               /* no page was available -- either SIGBUS or OOM */
+               if (unlikely(faulted_page == NOPAGE_SIGBUS))
+                       return VM_FAULT_SIGBUS;
+               else if (unlikely(faulted_page == NOPAGE_OOM))
+                       return VM_FAULT_OOM;
        }
-retry:
-       new_page = vma->vm_ops->nopage(vma, address & PAGE_MASK, &ret);
+
        /*
-        * No smp_rmb is needed here as long as there's a full
-        * spin_lock/unlock sequence inside the ->nopage callback
-        * (for the pagecache lookup) that acts as an implicit
-        * smp_mb() and prevents the i_size read to happen
-        * after the next truncate_count read.
+        * For consistency in subsequent calls, make the faulted_page always
+        * locked.
         */
-
-       /* no page was available -- either SIGBUS, OOM or REFAULT */
-       if (unlikely(new_page == NOPAGE_SIGBUS))
-               return VM_FAULT_SIGBUS;
-       else if (unlikely(new_page == NOPAGE_OOM))
-               return VM_FAULT_OOM;
-       else if (unlikely(new_page == NOPAGE_REFAULT))
-               return VM_FAULT_MINOR;
+       if (unlikely(!(vma->vm_flags & VM_CAN_INVALIDATE)))
+               lock_page(faulted_page);
+       else
+               BUG_ON(!PageLocked(faulted_page));
 
        /*
         * Should we do an early C-O-W break?
         */
-       if (write_access) {
+       page = faulted_page;
+       if (flags & FAULT_FLAG_WRITE) {
                if (!(vma->vm_flags & VM_SHARED)) {
-                       struct page *page;
-
-                       if (unlikely(anon_vma_prepare(vma)))
-                               goto oom;
-                       page = alloc_page_vma(GFP_HIGHUSER, vma, address);
-                       if (!page)
-                               goto oom;
-                       copy_user_highpage(page, new_page, address, vma);
-                       page_cache_release(new_page);
-                       new_page = page;
                        anon = 1;
-
+                       if (unlikely(anon_vma_prepare(vma))) {
+                               fdata.type = VM_FAULT_OOM;
+                               goto out;
+                       }
+                       page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+                       if (!page) {
+                               fdata.type = VM_FAULT_OOM;
+                               goto out;
+                       }
+                       copy_user_highpage(page, faulted_page, address, vma);
                } else {
-                       /* if the page will be shareable, see if the backing
+                       /*
+                        * If the page will be shareable, see if the backing
                         * address space wants to know that the page is about
-                        * to become writable */
-                       if (vma->vm_ops->page_mkwrite &&
-                           vma->vm_ops->page_mkwrite(vma, new_page) < 0
-                           ) {
-                               page_cache_release(new_page);
-                               return VM_FAULT_SIGBUS;
+                        * to become writable
+                        */
+                       if (vma->vm_ops->page_mkwrite) {
+                               unlock_page(page);
+                               if (vma->vm_ops->page_mkwrite(vma, page) < 0) {
+                                       fdata.type = VM_FAULT_SIGBUS;
+                                       anon = 1; /* no anon but release faulted_page */
+                                       goto out_unlocked;
+                               }
+                               lock_page(page);
                        }
                }
+
        }
 
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
-       /*
-        * For a file-backed vma, someone could have truncated or otherwise
-        * invalidated this page.  If unmap_mapping_range got called,
-        * retry getting the page.
-        */
-       if (mapping && unlikely(sequence != mapping->truncate_count)) {
-               pte_unmap_unlock(page_table, ptl);
-               page_cache_release(new_page);
-               cond_resched();
-               sequence = mapping->truncate_count;
-               smp_rmb();
-               goto retry;
-       }
 
        /*
         * This silly early PAGE_DIRTY setting removes a race
@@ -2286,43 +2395,69 @@ retry:
         * handle that later.
         */
        /* Only go through if we didn't race with anybody else... */
-       if (pte_none(*page_table)) {
-               flush_icache_page(vma, new_page);
-               entry = mk_pte(new_page, vma->vm_page_prot);
-               if (write_access)
+       if (likely(pte_same(*page_table, orig_pte))) {
+               flush_icache_page(vma, page);
+               entry = mk_pte(page, vma->vm_page_prot);
+               if (flags & FAULT_FLAG_WRITE)
                        entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                set_pte_at(mm, address, page_table, entry);
                if (anon) {
-                       inc_mm_counter(mm, anon_rss);
-                       lru_cache_add_active(new_page);
-                       page_add_new_anon_rmap(new_page, vma, address);
+                       inc_mm_counter(mm, anon_rss);
+                       lru_cache_add_active(page);
+                       page_add_new_anon_rmap(page, vma, address);
                } else {
                        inc_mm_counter(mm, file_rss);
-                       page_add_file_rmap(new_page);
-                       if (write_access) {
-                               dirty_page = new_page;
+                       page_add_file_rmap(page);
+                       if (flags & FAULT_FLAG_WRITE) {
+                               dirty_page = page;
                                get_page(dirty_page);
                        }
                }
+
+               /* no need to invalidate: a not-present page won't be cached */
+               update_mmu_cache(vma, address, entry);
+               lazy_mmu_prot_update(entry);
        } else {
-               /* One of our sibling threads was faster, back out. */
-               page_cache_release(new_page);
-               goto unlock;
+               if (anon)
+                       page_cache_release(page);
+               else
+                       anon = 1; /* no anon but release faulted_page */
        }
 
-       /* no need to invalidate: a not-present page shouldn't be cached */
-       update_mmu_cache(vma, address, entry);
-       lazy_mmu_prot_update(entry);
-unlock:
        pte_unmap_unlock(page_table, ptl);
-       if (dirty_page) {
+
+out:
+       unlock_page(faulted_page);
+out_unlocked:
+       if (anon)
+               page_cache_release(faulted_page);
+       else if (dirty_page) {
                set_page_dirty_balance(dirty_page);
                put_page(dirty_page);
        }
-       return ret;
-oom:
-       page_cache_release(new_page);
-       return VM_FAULT_OOM;
+
+       return fdata.type;
+}
+
+static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               int write_access, pte_t orig_pte)
+{
+       pgoff_t pgoff = (((address & PAGE_MASK)
+                       - vma->vm_start) >> PAGE_CACHE_SHIFT) + vma->vm_pgoff;
+       unsigned int flags = (write_access ? FAULT_FLAG_WRITE : 0);
+
+       return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
+}
+
+static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
+               unsigned long address, pte_t *page_table, pmd_t *pmd,
+               int write_access, pgoff_t pgoff, pte_t orig_pte)
+{
+       unsigned int flags = FAULT_FLAG_NONLINEAR |
+                               (write_access ? FAULT_FLAG_WRITE : 0);
+
+       return __do_fault(mm, vma, address, page_table, pmd, pgoff, flags, orig_pte);
 }
 
 /*
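
Under the new interface, a driver converts from ->nopage to ->fault roughly
as in the sketch below, written against the fault_data API added above
(examplefs_* names are invented; find_lock_page() stands in for whatever
lookup the driver really does):

/* Hypothetical ->fault handler: fill fdata->type and return the page
 * locked (required here because the vma sets VM_CAN_INVALIDATE). */
static struct page *examplefs_fault(struct vm_area_struct *vma,
                                    struct fault_data *fdata)
{
        struct page *page;

        page = find_lock_page(vma->vm_file->f_mapping, fdata->pgoff);
        if (!page) {
                fdata->type = VM_FAULT_SIGBUS;  /* or VM_FAULT_OOM */
                return NULL;
        }
        fdata->type = VM_FAULT_MINOR;
        return page;
}

static struct vm_operations_struct examplefs_vm_ops = {
        .fault  = examplefs_fault,
};
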
@@ -2401,9 +2536,14 @@ static int do_file_page(struct mm_struct *mm, struct vm_area_struct *vma,
                print_bad_pte(vma, orig_pte, address);
                return VM_FAULT_OOM;
        }
-       /* We can then assume vm->vm_ops && vma->vm_ops->populate */
 
        pgoff = pte_to_pgoff(orig_pte);
+
+       if (vma->vm_ops && vma->vm_ops->fault)
+               return do_nonlinear_fault(mm, vma, address, page_table, pmd,
+                                       write_access, pgoff, orig_pte);
+
+       /* We can then assume vma->vm_ops && vma->vm_ops->populate */
        err = vma->vm_ops->populate(vma, address & PAGE_MASK, PAGE_SIZE,
                                        vma->vm_page_prot, pgoff, 0);
        if (err == -ENOMEM)
@@ -2431,17 +2571,15 @@ static inline int handle_pte_fault(struct mm_struct *mm,
                pte_t *pte, pmd_t *pmd, int write_access)
 {
        pte_t entry;
-       pte_t old_entry;
        spinlock_t *ptl;
 
-       old_entry = entry = *pte;
+       entry = *pte;
        if (!pte_present(entry)) {
                if (pte_none(entry)) {
                        if (vma->vm_ops) {
-                               if (vma->vm_ops->nopage)
-                                       return do_no_page(mm, vma, address,
-                                                         pte, pmd,
-                                                         write_access);
+                               if (vma->vm_ops->fault || vma->vm_ops->nopage)
+                                       return do_linear_fault(mm, vma, address,
+                                               pte, pmd, write_access, entry);
                                if (unlikely(vma->vm_ops->nopfn))
                                        return do_no_pfn(mm, vma, address, pte,
                                                         pmd, write_access);
@@ -2467,8 +2605,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
                entry = pte_mkdirty(entry);
        }
        entry = pte_mkyoung(entry);
-       if (!pte_same(old_entry, entry)) {
-               ptep_set_access_flags(vma, address, pte, entry, write_access);
+       if (ptep_set_access_flags(vma, address, pte, entry, write_access)) {
                update_mmu_cache(vma, address, entry);
                lazy_mmu_prot_update(entry);
        } else {
@@ -2539,12 +2676,6 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
        spin_unlock(&mm->page_table_lock);
        return 0;
 }
-#else
-/* Workaround for gcc 2.96 */
-int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
-{
-       return 0;
-}
 #endif /* __PAGETABLE_PUD_FOLDED */
 
 #ifndef __PAGETABLE_PMD_FOLDED
@@ -2573,12 +2704,6 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
        spin_unlock(&mm->page_table_lock);
        return 0;
 }
-#else
-/* Workaround for gcc 2.96 */
-int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
-{
-       return 0;
-}
 #endif /* __PAGETABLE_PMD_FOLDED */
 
 int make_pages_present(unsigned long addr, unsigned long end)
@@ -2592,7 +2717,7 @@ int make_pages_present(unsigned long addr, unsigned long end)
        write = (vma->vm_flags & VM_WRITE) != 0;
        BUG_ON(addr >= end);
        BUG_ON(end > vma->vm_end);
-       len = (end+PAGE_SIZE-1)/PAGE_SIZE-addr/PAGE_SIZE;
+       len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE;
        ret = get_user_pages(current, current->mm, addr,
                        len, write, 0, NULL, NULL);
        if (ret < 0)
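
As a worked check of the rewritten length calculation: with 4096-byte pages,
addr = 0x1000 and end = 0x2001 give DIV_ROUND_UP(0x2001, 4096) - 0x1000/4096
= 3 - 1 = 2 pages, counting the partially covered final page exactly as the
old (end+PAGE_SIZE-1)/PAGE_SIZE - addr/PAGE_SIZE form did.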