* Note: this doesn't free the actual pages themselves. That
* has been handled earlier when unmapping all the memory regions.
*/
-static inline void clear_pte_range(struct mmu_gather *tlb, pmd_t *pmd,
- unsigned long addr, unsigned long end)
+static void free_pte_range(struct mmu_gather *tlb, pmd_t *pmd)
{
- if (!((addr | end) & ~PMD_MASK)) {
- /* Only free fully aligned ranges */
- struct page *page = pmd_page(*pmd);
- pmd_clear(pmd);
- dec_page_state(nr_page_table_pages);
- tlb->mm->nr_ptes--;
- pte_free_tlb(tlb, page);
- }
+ struct page *page = pmd_page(*pmd);
+ pmd_clear(pmd);
+ pte_free_tlb(tlb, page);
+ dec_page_state(nr_page_table_pages);
+ tlb->mm->nr_ptes--;
}
-static inline void clear_pmd_range(struct mmu_gather *tlb, pud_t *pud,
- unsigned long addr, unsigned long end)
+static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
{
pmd_t *pmd;
unsigned long next;
- pmd_t *empty_pmd = NULL;
+ unsigned long start;
+ start = addr;
pmd = pmd_offset(pud, addr);
-
- /* Only free fully aligned ranges */
- if (!((addr | end) & ~PUD_MASK))
- empty_pmd = pmd;
do {
next = pmd_addr_end(addr, end);
if (pmd_none_or_clear_bad(pmd))
continue;
- clear_pte_range(tlb, pmd, addr, next);
+ free_pte_range(tlb, pmd);
} while (pmd++, addr = next, addr != end);
- if (empty_pmd) {
- pud_clear(pud);
- pmd_free_tlb(tlb, empty_pmd);
+ start &= PUD_MASK;
+ if (start < floor)
+ return;
+ if (ceiling) {
+ ceiling &= PUD_MASK;
+ if (!ceiling)
+ return;
}
+ if (end - 1 > ceiling - 1)
+ return;
+
+ pmd = pmd_offset(pud, start);
+ pud_clear(pud);
+ pmd_free_tlb(tlb, pmd);
}
-static inline void clear_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
- unsigned long addr, unsigned long end)
+static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
{
pud_t *pud;
unsigned long next;
- pud_t *empty_pud = NULL;
+ unsigned long start;
+ start = addr;
pud = pud_offset(pgd, addr);
-
- /* Only free fully aligned ranges */
- if (!((addr | end) & ~PGDIR_MASK))
- empty_pud = pud;
do {
next = pud_addr_end(addr, end);
if (pud_none_or_clear_bad(pud))
continue;
- clear_pmd_range(tlb, pud, addr, next);
+ free_pmd_range(tlb, pud, addr, next, floor, ceiling);
} while (pud++, addr = next, addr != end);
- if (empty_pud) {
- pgd_clear(pgd);
- pud_free_tlb(tlb, empty_pud);
+ start &= PGDIR_MASK;
+ if (start < floor)
+ return;
+ if (ceiling) {
+ ceiling &= PGDIR_MASK;
+ if (!ceiling)
+ return;
}
+ if (end - 1 > ceiling - 1)
+ return;
+
+ pud = pud_offset(pgd, start);
+ pgd_clear(pgd);
+ pud_free_tlb(tlb, pud);
}
/*
- * This function clears user-level page tables of a process.
- * Unlike other pagetable walks, some memory layouts might give end 0.
+ * This function frees user-level page tables of a process.
+ *
* Must be called with pagetable lock held.
*/
-void clear_page_range(struct mmu_gather *tlb,
- unsigned long addr, unsigned long end)
+void free_pgd_range(struct mmu_gather **tlb,
+ unsigned long addr, unsigned long end,
+ unsigned long floor, unsigned long ceiling)
{
pgd_t *pgd;
unsigned long next;
+ unsigned long start;
+
+ /*
+ * The next few lines have given us lots of grief...
+ *
+ * Why are we testing PMD* at this top level? Because often
+ * there will be no work to do at all, and we'd prefer not to
+ * go all the way down to the bottom just to discover that.
+ *
+ * Why all these "- 1"s? Because 0 represents both the bottom
+ * of the address space and the top of it (using -1 for the
+ * top wouldn't help much: the masks would do the wrong thing).
+ * The rule is that addr 0 and floor 0 refer to the bottom of
+ * the address space, but end 0 and ceiling 0 refer to the top
+ * Comparisons need to use "end - 1" and "ceiling - 1" (though
+ * that end 0 case should be mythical).
+ *
+ * Wherever addr is brought up or ceiling brought down, we must
+ * be careful to reject "the opposite 0" before it confuses the
+ * subsequent tests. But what about where end is brought down
+ * by PMD_SIZE below? no, end can't go down to 0 there.
+ *
+ * Whereas we round start (addr) and ceiling down, by different
+ * masks at different levels, in order to test whether a table
+ * now has no other vmas using it, so can be freed, we don't
+ * bother to round floor or end up - the tests don't need that.
+ */
- pgd = pgd_offset(tlb->mm, addr);
+ addr &= PMD_MASK;
+ if (addr < floor) {
+ addr += PMD_SIZE;
+ if (!addr)
+ return;
+ }
+ if (ceiling) {
+ ceiling &= PMD_MASK;
+ if (!ceiling)
+ return;
+ }
+ if (end - 1 > ceiling - 1)
+ end -= PMD_SIZE;
+ if (addr > end - 1)
+ return;
+
+ start = addr;
+ pgd = pgd_offset((*tlb)->mm, addr);
do {
next = pgd_addr_end(addr, end);
if (pgd_none_or_clear_bad(pgd))
continue;
- clear_pud_range(tlb, pgd, addr, next);
+ free_pud_range(*tlb, pgd, addr, next, floor, ceiling);
} while (pgd++, addr = next, addr != end);
+
+ if (!tlb_is_full_mm(*tlb))
+ flush_tlb_pgtables((*tlb)->mm, start, end);
+}
+
+void free_pgtables(struct mmu_gather **tlb, struct vm_area_struct *vma,
+ unsigned long floor, unsigned long ceiling)
+{
+ while (vma) {
+ struct vm_area_struct *next = vma->vm_next;
+ unsigned long addr = vma->vm_start;
+
+ if (is_hugepage_only_range(vma->vm_mm, addr, HPAGE_SIZE)) {
+ hugetlb_free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next? next->vm_start: ceiling);
+ } else {
+ /*
+ * Optimization: gather nearby vmas into one call down
+ */
+ while (next && next->vm_start <= vma->vm_end + PMD_SIZE
+ && !is_hugepage_only_range(vma->vm_mm, next->vm_start,
+ HPAGE_SIZE)) {
+ vma = next;
+ next = vma->vm_next;
+ }
+ free_pgd_range(tlb, addr, vma->vm_end,
+ floor, next? next->vm_start: ceiling);
+ }
+ vma = next;
+ }
}
-pte_t fastcall * pte_alloc_map(struct mm_struct *mm, pmd_t *pmd, unsigned long address)
+pte_t fastcall *pte_alloc_map(struct mm_struct *mm, pmd_t *pmd,
+ unsigned long address)
{
if (!pmd_present(*pmd)) {
struct page *new;
* @nr_accounted: Place number of unmapped pages in vm-accountable vma's here
* @details: details of nonlinear truncation or shared cache invalidation
*
- * Returns the number of vma's which were covered by the unmapping.
+ * Returns the end address of the unmapping (restart addr if interrupted).
*
* Unmap all pages in the vma list. Called under page_table_lock.
*
* ensure that any thus-far unmapped pages are flushed before unmap_vmas()
* drops the lock and schedules.
*/
-int unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
+unsigned long unmap_vmas(struct mmu_gather **tlbp, struct mm_struct *mm,
struct vm_area_struct *vma, unsigned long start_addr,
unsigned long end_addr, unsigned long *nr_accounted,
struct zap_details *details)
unsigned long zap_bytes = ZAP_BLOCK_SIZE;
unsigned long tlb_start = 0; /* For tlb_finish_mmu */
int tlb_start_valid = 0;
- int ret = 0;
+ unsigned long start = start_addr;
spinlock_t *i_mmap_lock = details? details->i_mmap_lock: NULL;
int fullmm = tlb_is_full_mm(*tlbp);
for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) {
- unsigned long start;
unsigned long end;
start = max(vma->vm_start, start_addr);
if (vma->vm_flags & VM_ACCOUNT)
*nr_accounted += (end - start) >> PAGE_SHIFT;
- ret++;
while (start != end) {
unsigned long block;
if (i_mmap_lock) {
/* must reset count of rss freed */
*tlbp = tlb_gather_mmu(mm, fullmm);
- details->break_addr = start;
goto out;
}
spin_unlock(&mm->page_table_lock);
}
}
out:
- return ret;
+ return start; /* which is now the end (or restart) address */
}
/**
* @size: number of bytes to zap
* @details: details of nonlinear truncation or shared cache invalidation
*/
-void zap_page_range(struct vm_area_struct *vma, unsigned long address,
+unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
unsigned long size, struct zap_details *details)
{
struct mm_struct *mm = vma->vm_mm;
if (is_vm_hugetlb_page(vma)) {
zap_hugepage_range(vma, address, size);
- return;
+ return end;
}
lru_add_drain();
spin_lock(&mm->page_table_lock);
tlb = tlb_gather_mmu(mm, 0);
- unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
+ end = unmap_vmas(&tlb, mm, vma, address, end, &nr_accounted, details);
tlb_finish_mmu(tlb, address, end);
spin_unlock(&mm->page_table_lock);
+ return end;
}
/*
* i_mmap_lock.
*
* In order to make forward progress despite repeatedly restarting some
- * large vma, note the break_addr set by unmap_vmas when it breaks out:
+ * large vma, note the restart_addr from unmap_vmas when it breaks out:
* and restart from that address when we reach that vma again. It might
* have been split or merged, shrunk or extended, but never shifted: so
* restart_addr remains valid so long as it remains in the vma's range.
}
}
- details->break_addr = end_addr;
- zap_page_range(vma, start_addr, end_addr - start_addr, details);
+ restart_addr = zap_page_range(vma, start_addr,
+ end_addr - start_addr, details);
/*
* We cannot rely on the break test in unmap_vmas:
need_break = need_resched() ||
need_lockbreak(details->i_mmap_lock);
- if (details->break_addr >= end_addr) {
+ if (restart_addr >= end_addr) {
/* We have now completed this vma: mark it so */
vma->vm_truncate_count = details->truncate_count;
if (!need_break)
return 0;
} else {
/* Note restart_addr in vma's truncate_count field */
- vma->vm_truncate_count = details->break_addr;
+ vma->vm_truncate_count = restart_addr;
if (!need_break)
goto again;
}
spin_lock(&mm->page_table_lock);
page_table = pte_offset_map(pmd, address);
if (unlikely(!pte_same(*page_table, orig_pte))) {
- pte_unmap(page_table);
- spin_unlock(&mm->page_table_lock);
- unlock_page(page);
- page_cache_release(page);
ret = VM_FAULT_MINOR;
- goto out;
+ goto out_nomap;
+ }
+
+ if (unlikely(!PageUptodate(page))) {
+ ret = VM_FAULT_SIGBUS;
+ goto out_nomap;
}
/* The page isn't present yet, go ahead with the fault. */
spin_unlock(&mm->page_table_lock);
out:
return ret;
+out_nomap:
+ pte_unmap(page_table);
+ spin_unlock(&mm->page_table_lock);
+ unlock_page(page);
+ page_cache_release(page);
+ goto out;
}
/*