[PATCH] CLASS: move a "simple" class logic into the class core.

[linux-2.6] / mm / memory.c
diff --git a/mm/memory.c b/mm/memory.c

index fb6e5deb873a863cd69de41fd5be7c1da2701c11..d209f745db7fbc3154e83cf04666770068986724 100644 (file)
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -110,90 +110,178 @@ void pmd_clear_bad(pmd_t *pmd)
   * Note: this doesn't free the actual pages themselves. That
   * has been handled earlier when unmapping all the memory regions.
   */
-static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
-                               unsigned long addr, unsigned long end)
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
  {
-       if (!((addr | end) & ~PMD_MASK)) {
-               /* Only free fully aligned ranges */
-               struct page *page = pmd_page(*pmd);
-               pmd_clear(pmd);
-               dec_page_state(nr_page_table_pages);
-               tlb->mm->nr_ptes--;
-               pte_free_tlb(tlb, page);
-       }
+       struct page *page = pmd_page(*pmd);
+       pmd_clear(pmd);
+       pte_free_tlb(tlb, page);
+       dec_page_state(nr_page_table_pages);
+       tlb->mm->nr_ptes--;
  }
  
-static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,
-                               unsigned long addr, unsigned long end)
+static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+                               unsigned long addr, unsigned long end,
+                               unsigned long floor, unsigned long ceiling)
  {
         pmd_t *pmd;
         unsigned long next;
-       pmd_t *empty_pmd = NULL;
+       unsigned long start;
  
+       start = addr;
         pmd = pmd_offset(pud, addr);
-
-       /* Only free fully aligned ranges */
-       if (!((addr | end) & ~PUD_MASK))
-               empty_pmd = pmd;
         do {
                 next = pmd_addr_end(addr, end);
                 if (pmd_none_or_clear_bad(pmd))
                         continue;
-               clear_pte_range(tlb, pmd, addr, next);
+               free_pte_range(tlb, pmd);
         } while (pmd++, addr = next, addr != end);
  
-       if (empty_pmd) {
-               pud_clear(pud);
-               pmd_free_tlb(tlb, empty_pmd);
+       start &= PUD_MASK;
+       if (start < floor)
+               return;
+       if (ceiling) {
+               ceiling &= PUD_MASK;
+               if (!ceiling)
+                       return;
         }
+       if (end - 1 > ceiling - 1)
+               return;
+
+       pmd = pmd_offset(pud, start);
+       pud_clear(pud);
+       pmd_free_tlb(tlb, pmd);
  }
  
-static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
-                               unsigned long addr, unsigned long end)
+static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+                               unsigned long addr, unsigned long end,
+                               unsigned long floor, unsigned long ceiling)
  {
         pud_t *pud;
         unsigned long next;
-       pud_t *empty_pud = NULL;
+       unsigned long start;
  
+       start = addr;
         pud = pud_offset(pgd, addr);
-
-       /* Only free fully aligned ranges */
-       if (!((addr | end) & ~PGDIR_MASK))
-               empty_pud = pud;
         do {
                 next = pud_addr_end(addr, end);
                 if (pud_none_or_clear_bad(pud))
                         continue;
-               clear_pmd_range(tlb, pud, addr, next);
+               free_pmd_range(tlb, pud, addr, next, floor, ceiling);
         } while (pud++, addr = next, addr != end);
  
-       if (empty_pud) {
-               pgd_clear(pgd);
-               pud_free_tlb(tlb, empty_pud);
+       start &= PGDIR_MASK;
+       if (start < floor)
+               return;
+       if (ceiling) {
+               ceiling &= PGDIR_MASK;
+               if (!ceiling)
+                       return;
         }
+       if (end - 1 > ceiling - 1)
+               return;
+
+       pud = pud_offset(pgd, start);
+       pgd_clear(pgd);
+       pud_free_tlb(tlb, pud);
  }
  
  /*
- * This function clears user-level page tables of a process.
- * Unlike other pagetable walks, some memory layouts might give end 0.
+ * This function frees user-level page tables of a process.
+ *
   * Must be called with pagetable lock held.
   */
-void clear_page_range(struct mmu_gather *tlb,
-                               unsigned long addr, unsigned long end)
+void free_pgd_range(struct mmu_gather **tlb,
+                       unsigned long addr, unsigned long end,
+                       unsigned long floor, unsigned long ceiling)
  {
         pgd_t *pgd;
         unsigned long next;
+       unsigned long start;
+
+       /*
+        * The next few lines have given us lots of grief...
+        *
+        * Why are we testing PMD* at this top level?  Because often
+        * there will be no work to do at all, and we'd prefer not to
+        * go all the way down to the bottom just to discover that.
+        *
+        * Why all these "- 1"s?  Because 0 represents both the bottom
+        * of the address space and the top of it (using -1 for the
+        * top wouldn't help much: the masks would do the wrong thing).
+        * The rule is that addr 0 and floor 0 refer to the bottom of
+        * the address space, but end 0 and ceiling 0 refer to the top.
+        * Comparisons need to use "end - 1" and "ceiling - 1" (though
+        * that end 0 case should be mythical).
+        *
+        * Wherever addr is brought up or ceiling brought down, we must
+        * be careful to reject "the opposite 0" before it confuses the
+        * subsequent tests.  But what about where end is brought down
+        * by PMD_SIZE below? No, end can't go down to 0 there.
+        *
+        * Whereas we round start (addr) and ceiling down, by different
+        * masks at different levels, in order to test whether a table
+        * now has no other vmas using it, so can be freed, we don't
+        * bother to round floor or end up - the tests don't need that.
+        */
  
-       pgd = pgd_offset(tlb->mm, addr);
+       addr &= PMD_MASK;
+       if (addr < floor) {
+               addr += PMD_SIZE;
+               if (!addr)
+                       return;
+       }
+       if (ceiling) {
+               ceiling &= PMD_MASK;
+               if (!ceiling)
+                       return;
+       }
+       if (end - 1 > ceiling - 1)
+               end -= PMD_SIZE;
+       if (addr > end - 1)
+               return;
+
+       start = addr;
+       pgd = pgd_offset((*tlb)->mm, addr);
         do {
                 next = pgd_addr_end(addr, end);
                 if (pgd_none_or_clear_bad(pgd))
                         continue;
-               clear_pud_range(tlb, pgd, addr, next);
+               free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
         } while (pgd++, addr = next, addr != end);
+
+       if (!tlb_is_full_mm(*tlb))
+               flush_tlb_pgtables((*tlb)->mm, start, end);
+}
+
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+               unsigned long floor, unsigned long ceiling)
+{
+       while (vma) {
+               struct vm_area_struct *next = vma->vm_next;
+               unsigned long addr = vma->vm_start;
+
+               if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
+                       hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
+                               floor, next? next->vm_start: ceiling);
+               } else {
+                       /*
+                        * Optimization: gather nearby vmas into one call down
+                        */
+                       while (next && next->vm_start <= vma->vm_end + PMD_SIZE
+                         && !is_hugepage_only_range(vma->vm_mm, next->vm_start,
+                                                       HPAGE_SIZE)) {
+                               vma = next;
+                               next = vma->vm_next;
+                       }
+                       free_pgd_range(tlb, addr, vma->vm_end,
+                               floor, next? next->vm_start: ceiling);
+               }
+               vma = next;
+       }
  }
  
-pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
+                               unsigned long address)
  {
         if (!pmd_present(*pmd)) {
                 struct page *new;
@@ -567,7 +655,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
   * @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
   * @details: details of nonlinear truncation or shared cache invalidation
   *
- * Returns the number of vma's which were covered by the unmapping.
+ * Returns the end address of the unmapping (restart addr if interrupted).
   *
   * Unmap all pages in the vma list.  Called under page_table_lock.
   *
@@ -584,7 +672,7 @@ static void unmap_page_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
   * ensure that any thus-far unmapped pages are flushed before unmap_vmas()
   * drops the lock and schedules.
   */
-int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
                 struct vm_area_struct *vma, unsigned long start_addr,
                 unsigned long end_addr, unsigned long *nr_accounted,
                 struct zap_details *details)
@@ -592,12 +680,11 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
         unsigned long zap_bytes = ZAP_BLOCK_SIZE;
         unsigned long tlb_start = 0;    /* For tlb_finish_mmu */
         int tlb_start_valid = 0;
-       int ret = 0;
+       unsigned long start = start_addr;
         spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
         int fullmm = tlb_is_full_mm(*tlbp);
  
         for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
-               unsigned long start;
                 unsigned long end;
  
                 start = max(vma->vm_start, start_addr);
@@ -610,7 +697,6 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
                 if (vma->vm_flags & VM_ACCOUNT)
                         *nr_accounted += (end - start) >> PAGE_SHIFT;
  
-               ret++;
                 while (start != end) {
                         unsigned long block;
  
@@ -641,7 +727,6 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
                                 if (i_mmap_lock) {
                                         /* must reset count of rss freed */
                                         *tlbp = tlb_gather_mmu(mm, fullmm);
-                                       details->break_addr = start;
                                         goto out;
                                 }
                                 spin_unlock(&mm->page_table_lock);
@@ -655,7 +740,7 @@ int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
                 }
         }
  out:
-       return ret;
+       return start;   /* which is now the end (or restart) address */
  }
  
  /**
@@ -665,7 +750,7 @@ out:
   * @size: number of bytes to zap
   * @details: details of nonlinear truncation or shared cache invalidation
   */
-void zap_page_range(struct vm_area_struct *vma, unsigned long address,
+unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
                 unsigned long size, struct zap_details *details)
  {
         struct mm_struct *mm = vma->vm_mm;
@@ -675,15 +760,16 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long address,
  
         if (is_vm_hugetlb_page(vma)) {
                 zap_hugepage_range(vma, address, size);
-               return;
+               return end;
         }
  
         lru_add_drain();
         spin_lock(&mm->page_table_lock);
         tlb = tlb_gather_mmu(mm, 0);
-       unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
+       end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
         tlb_finish_mmu(tlb, address, end);
         spin_unlock(&mm->page_table_lock);
+       return end;
  }
  
  /*
@@ -1270,7 +1356,7 @@ no_new_page:
   * i_mmap_lock.
   *
   * In order to make forward progress despite repeatedly restarting some
- * large vma, note the break_addr set by unmap_vmas when it breaks out:
+ * large vma, note the restart_addr from unmap_vmas when it breaks out:
   * and restart from that address when we reach that vma again.  It might
   * have been split or merged, shrunk or extended, but never shifted: so
   * restart_addr remains valid so long as it remains in the vma's range.
@@ -1308,8 +1394,8 @@ again:
                 }
         }
  
-       details->break_addr = end_addr;
-       zap_page_range(vma, start_addr, end_addr - start_addr, details);
+       restart_addr = zap_page_range(vma, start_addr,
+                                       end_addr - start_addr, details);
  
         /*
          * We cannot rely on the break test in unmap_vmas:
@@ -1320,14 +1406,14 @@ again:
         need_break = need_resched() ||
                         need_lockbreak(details->i_mmap_lock);
  
-       if (details->break_addr >= end_addr) {
+       if (restart_addr >= end_addr) {
                 /* We have now completed this vma: mark it so */
                 vma->vm_truncate_count = details->truncate_count;
                 if (!need_break)
                         return 0;
         } else {
                 /* Note restart_addr in vma's truncate_count field */
-               vma->vm_truncate_count = details->break_addr;
+               vma->vm_truncate_count = restart_addr;
                 if (!need_break)
                         goto again;
         }
@@ -1615,12 +1701,13 @@ static int do_swap_page(struct mm_struct * mm,
         spin_lock(&mm->page_table_lock);
         page_table = pte_offset_map(pmd, address);
         if (unlikely(!pte_same(*page_table, orig_pte))) {
-               pte_unmap(page_table);
-               spin_unlock(&mm->page_table_lock);
-               unlock_page(page);
-               page_cache_release(page);
                 ret = VM_FAULT_MINOR;
-               goto out;
+               goto out_nomap;
+       }
+
+       if (unlikely(!PageUptodate(page))) {
+               ret = VM_FAULT_SIGBUS;
+               goto out_nomap;
         }
  
         /* The page isn't present yet, go ahead with the fault. */
@@ -1655,6 +1742,12 @@ static int do_swap_page(struct mm_struct * mm,
         spin_unlock(&mm->page_table_lock);
  out:
         return ret;
+out_nomap:
+       pte_unmap(page_table);
+       spin_unlock(&mm->page_table_lock);
+       unlock_page(page);
+       page_cache_release(page);
+       goto out;
  }
  
  /*