[DCCP] ackvec: infrastructure for sending more than one ackvec per packet

[linux-2.6] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index 92a3ebd8d7951daff767f409836c8490cf283028..156861fcac436e4716537c7e5dff565dded43224 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -467,7 +467,7 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
          */
         if (is_cow_mapping(vm_flags)) {
                 ptep_set_wrprotect(src_mm, addr, src_pte);
-               pte = *src_pte;
+               pte = pte_wrprotect(pte);
         }
  
         /*
@@ -506,6 +506,7 @@ again:
         src_pte = pte_offset_map_nested(src_pmd, addr);
         src_ptl = pte_lockptr(src_mm, src_pmd);
         spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
+       arch_enter_lazy_mmu_mode();
  
         do {
                 /*
@@ -527,6 +528,7 @@ again:
                 progress += 8;
         } while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
  
+       arch_leave_lazy_mmu_mode();
         spin_unlock(src_ptl);
         pte_unmap_nested(src_pte - 1);
         add_mm_rss(dst_mm, rss[0], rss[1]);
@@ -628,6 +630,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
         int anon_rss = 0;
  
         pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
+       arch_enter_lazy_mmu_mode();
         do {
                 pte_t ptent = *pte;
                 if (pte_none(ptent)) {
@@ -690,10 +693,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
                         continue;
                 if (!pte_file(ptent))
                         free_swap_and_cache(pte_to_swp_entry(ptent));
-               pte_clear_full(mm, addr, pte, tlb->fullmm);
+               pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
         } while (pte++, addr += PAGE_SIZE, (addr != end && *zap_work > 0));
  
         add_mm_rss(mm, file_rss, anon_rss);
+       arch_leave_lazy_mmu_mode();
         pte_unmap_unlock(pte - 1, ptl);
  
         return addr;
@@ -1082,6 +1086,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
                                 default:
                                         BUG();
                                 }
+                               cond_resched();
                         }
                         if (pages) {
                                 pages[i] = page;
@@ -1109,6 +1114,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
         pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
         if (!pte)
                 return -ENOMEM;
+       arch_enter_lazy_mmu_mode();
         do {
                 struct page *page = ZERO_PAGE(addr);
                 pte_t zero_pte = pte_wrprotect(mk_pte(page, prot));
@@ -1118,6 +1124,7 @@ static int zeromap_pte_range(struct mm_struct *mm, pmd_t *pmd,
                 BUG_ON(!pte_none(*pte));
                 set_pte_at(mm, addr, pte, zero_pte);
         } while (pte++, addr += PAGE_SIZE, addr != end);
+       arch_leave_lazy_mmu_mode();
         pte_unmap_unlock(pte - 1, ptl);
         return 0;
  }
@@ -1275,11 +1282,13 @@ static int remap_pte_range(struct mm_struct *mm, pmd_t *pmd,
         pte = pte_alloc_map_lock(mm, pmd, addr, &ptl);
         if (!pte)
                 return -ENOMEM;
+       arch_enter_lazy_mmu_mode();
         do {
                 BUG_ON(!pte_none(*pte));
                 set_pte_at(mm, addr, pte, pfn_pte(pfn, prot));
                 pfn++;
         } while (pte++, addr += PAGE_SIZE, addr != end);
+       arch_leave_lazy_mmu_mode();
         pte_unmap_unlock(pte - 1, ptl);
         return 0;
  }
@@ -1443,6 +1452,7 @@ static inline void cow_user_page(struct page *dst, struct page *src, unsigned lo
                 if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE))
                         memset(kaddr, 0, PAGE_SIZE);
                 kunmap_atomic(kaddr, KM_USER0);
+               flush_dcache_page(dst);
                 return;
                 
         }
@@ -1577,7 +1587,14 @@ gotten:
                 entry = mk_pte(new_page, vma->vm_page_prot);
                 entry = maybe_mkwrite(pte_mkdirty(entry), vma);
                 lazy_mmu_prot_update(entry);
-               ptep_establish(vma, address, page_table, entry);
+               /*
+                * Clear the pte entry and flush it first, before updating the
+                * pte with the new entry. This will avoid a race condition
+                * seen in the presence of one thread doing SMC and another
+                * thread doing COW.
+                */
+               ptep_clear_flush(vma, address, page_table);
+               set_pte_at(mm, address, page_table, entry);
                 update_mmu_cache(vma, address, entry);
                 lru_cache_add_active(new_page);
                 page_add_new_anon_rmap(new_page, vma, address);
@@ -2154,11 +2171,13 @@ retry:
          * after the next truncate_count read.
          */
  
-       /* no page was available -- either SIGBUS or OOM */
-       if (new_page == NOPAGE_SIGBUS)
+       /* no page was available -- either SIGBUS, OOM or REFAULT */
+       if (unlikely(new_page == NOPAGE_SIGBUS))
                 return VM_FAULT_SIGBUS;
-       if (new_page == NOPAGE_OOM)
+       else if (unlikely(new_page == NOPAGE_OOM))
                 return VM_FAULT_OOM;
+       else if (unlikely(new_page == NOPAGE_REFAULT))
+               return VM_FAULT_MINOR;
  
         /*
          * Should we do an early C-O-W break?
@@ -2255,6 +2274,54 @@ oom:
         return VM_FAULT_OOM;
  }
  
+/*
+ * do_no_pfn() tries to create a new page mapping for a page without
+ * a struct page backing it, using the pfn returned by vma->vm_ops->nopfn().
+ *
+ * As this is called only for pages that do not currently exist, we
+ * do not need to flush old virtual caches or the TLB.
+ *
+ * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * but allow concurrent faults), and pte mapped but not yet locked.
+ * We return with mmap_sem still held, but pte unmapped and unlocked.
+ *
+ * It is expected that the ->nopfn handler always returns the same pfn
+ * for a given virtual mapping.  Returns VM_FAULT_OOM or VM_FAULT_SIGBUS
+ * when the handler reports NOPFN_OOM or NOPFN_SIGBUS, else VM_FAULT_MINOR.
+ * Mark this `noinline' to prevent it from bloating the main pagefault code.
+ */
+static noinline int do_no_pfn(struct mm_struct *mm, struct vm_area_struct *vma,
+                    unsigned long address, pte_t *page_table, pmd_t *pmd,
+                    int write_access)
+{
+       spinlock_t *ptl;
+       pte_t entry;
+       unsigned long pfn;
+       int ret = VM_FAULT_MINOR;
+
+       pte_unmap(page_table);
+       BUG_ON(!(vma->vm_flags & VM_PFNMAP));
+       BUG_ON(is_cow_mapping(vma->vm_flags));
+
+       pfn = vma->vm_ops->nopfn(vma, address & PAGE_MASK);
+       if (pfn == NOPFN_OOM)
+               return VM_FAULT_OOM;
+       if (pfn == NOPFN_SIGBUS)
+               return VM_FAULT_SIGBUS;
+
+       page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
+
+       /* Install the pte only if nobody else populated it while unlocked. */
+       if (pte_none(*page_table)) {
+               entry = pfn_pte(pfn, vma->vm_page_prot);
+               if (write_access)
+                       entry = maybe_mkwrite(pte_mkdirty(entry), vma);
+               set_pte_at(mm, address, page_table, entry);
+       }
+       pte_unmap_unlock(page_table, ptl);
+       return ret;
+}
+
  /*
   * Fault of a previously existing named mapping. Repopulate the pte
   * from the encoded file_pte if possible. This enables swappable
@@ -2317,11 +2384,17 @@ static inline int handle_pte_fault(struct mm_struct *mm,
         old_entry = entry = *pte;
         if (!pte_present(entry)) {
                 if (pte_none(entry)) {
-                       if (!vma->vm_ops || !vma->vm_ops->nopage)
-                               return do_anonymous_page(mm, vma, address,
-                                       pte, pmd, write_access);
-                       return do_no_page(mm, vma, address,
-                                       pte, pmd, write_access);
+                       if (vma->vm_ops) {
+                               if (vma->vm_ops->nopage)
+                                       return do_no_page(mm, vma, address,
+                                                         pte, pmd,
+                                                         write_access);
+                               if (unlikely(vma->vm_ops->nopfn))
+                                       return do_no_pfn(mm, vma, address, pte,
+                                                        pmd, write_access);
+                       }
+                       return do_anonymous_page(mm, vma, address,
+                                                pte, pmd, write_access);
                 }
                 if (pte_file(entry))
                         return do_file_page(mm, vma, address,
@@ -2550,3 +2623,56 @@ int in_gate_area_no_task(unsigned long addr)
  }
  
  #endif /* __HAVE_ARCH_GATE_AREA */
+
+/*
+ * Access another process' address space; returns the number of bytes copied.
+ * The source/target buffer must be in kernel space.
+ * Do not walk the page table directly; use get_user_pages().
+ */
+int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, int write)
+{
+       struct mm_struct *mm;
+       struct vm_area_struct *vma;
+       struct page *page;
+       void *old_buf = buf;
+
+       mm = get_task_mm(tsk);
+       if (!mm)
+               return 0;
+
+       down_read(&mm->mmap_sem);
+       /* ignore errors, just check how much was successfully transferred */
+       while (len) {
+               int bytes, ret, offset;
+               void *maddr;
+
+               ret = get_user_pages(tsk, mm, addr, 1,
+                               write, 1, &page, &vma);
+               if (ret <= 0)
+                       break;
+
+               bytes = len;
+               offset = addr & (PAGE_SIZE-1);
+               if (bytes > PAGE_SIZE-offset)
+                       bytes = PAGE_SIZE-offset;
+
+               maddr = kmap(page);
+               if (write) {
+                       copy_to_user_page(vma, page, addr,
+                                         maddr + offset, buf, bytes);
+                       set_page_dirty_lock(page);
+               } else {
+                       copy_from_user_page(vma, page, addr,
+                                           buf, maddr + offset, bytes);
+               }
+               kunmap(page);
+               page_cache_release(page);
+               len -= bytes;
+               buf += bytes;
+               addr += bytes;
+       }
+       up_read(&mm->mmap_sem);
+       mmput(mm);
+
+       return buf - old_buf;
+}