}
/*
 * Copy one huge page from src to dst, one base page at a time.
 * Each of the HPAGE_SIZE/PAGE_SIZE sub-pages is copied with
 * copy_user_highpage(), passing the matching user address
 * (addr + i*PAGE_SIZE); cond_resched() is called before every
 * sub-page copy. In this diff the function gains a vma parameter,
 * which is forwarded unchanged to copy_user_highpage().
 */
static void copy_huge_page(struct page *dst, struct page *src,
- unsigned long addr)
+ unsigned long addr, struct vm_area_struct *vma)
{
int i;
might_sleep();
for (i = 0; i < HPAGE_SIZE/PAGE_SIZE; i++) {
cond_resched();
- copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE);
+ copy_user_highpage(dst + i, src + i, addr + i*PAGE_SIZE, vma);
}
}
for (z = zonelist->zones; *z; z++) {
nid = zone_to_nid(*z);
- if (cpuset_zone_allowed(*z, GFP_HIGHUSER) &&
+ if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) &&
!list_empty(&hugepage_freelists[nid]))
break;
}
if (nid == MAX_NUMNODES)
nid = first_node(node_online_map);
if (page) {
- page[1].lru.next = (void *)free_huge_page; /* dtor */
+ set_compound_page_dtor(page, free_huge_page);
spin_lock(&hugetlb_lock);
nr_huge_pages++;
nr_huge_pages_node[page_to_nid(page)]++;
return page;
fail:
+ if (vma->vm_flags & VM_MAYSHARE)
+ resv_huge_pages++;
spin_unlock(&hugetlb_lock);
return NULL;
}
}
__setup("hugepages=", hugetlb_setup);
+static unsigned int cpuset_mems_nr(unsigned int *array)
+{
+ int node;
+ unsigned int nr = 0;
+ /* Sum array[node] over every node in cpuset_current_mems_allowed. */
+ for_each_node_mask(node, cpuset_current_mems_allowed)
+ nr += array[node];
+
+ return nr;
+}
+
#ifdef CONFIG_SYSCTL
static void update_and_free_page(struct page *page)
{
pte_t entry;
entry = pte_mkwrite(pte_mkdirty(*ptep));
- ptep_set_access_flags(vma, address, ptep, entry, 1);
- update_mmu_cache(vma, address, entry);
- lazy_mmu_prot_update(entry);
+ if (ptep_set_access_flags(vma, address, ptep, entry, 1)) {
+ update_mmu_cache(vma, address, entry);
+ lazy_mmu_prot_update(entry);
+ }
}
entry = *src_pte;
ptepage = pte_page(entry);
get_page(ptepage);
- add_mm_counter(dst, file_rss, HPAGE_SIZE / PAGE_SIZE);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
pte_t pte;
struct page *page;
struct page *tmp;
+ /*
+ * A page gathering list, protected by the per-file i_mmap_lock. The
+ * lock is used to avoid list corruption from multiple unmappings
+ * of the same page, since we are using page->lru.
+ */
LIST_HEAD(page_list);
WARN_ON(!is_vm_hugetlb_page(vma));
BUG_ON(end & ~HPAGE_MASK);
spin_lock(&mm->page_table_lock);
-
- /* Update high watermark before we lower rss */
- update_hiwater_rss(mm);
-
for (address = start; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
+ if (huge_pmd_unshare(mm, &address, ptep))
+ continue;
+
pte = huge_ptep_get_and_clear(mm, address, ptep);
if (pte_none(pte))
continue;
page = pte_page(pte);
+ if (pte_dirty(pte))
+ set_page_dirty(page);
list_add(&page->lru, &page_list);
- add_mm_counter(mm, file_rss, (int) -(HPAGE_SIZE / PAGE_SIZE));
}
-
spin_unlock(&mm->page_table_lock);
flush_tlb_range(vma, start, end);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
}
spin_unlock(&mm->page_table_lock);
- copy_huge_page(new_page, old_page, address);
+ copy_huge_page(new_page, old_page, address, vma);
spin_lock(&mm->page_table_lock);
ptep = huge_pte_offset(mm, address & HPAGE_MASK);
if (!pte_none(*ptep))
goto backout;
- add_mm_counter(mm, file_rss, HPAGE_SIZE / PAGE_SIZE);
new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE)
&& (vma->vm_flags & VM_SHARED)));
set_huge_pte_at(mm, address, ptep, new_pte);
BUG_ON(address >= end);
flush_cache_range(vma, address, end);
+ spin_lock(&vma->vm_file->f_mapping->i_mmap_lock);
spin_lock(&mm->page_table_lock);
for (; address < end; address += HPAGE_SIZE) {
ptep = huge_pte_offset(mm, address);
if (!ptep)
continue;
+ if (huge_pmd_unshare(mm, &address, ptep))
+ continue;
if (!pte_none(*ptep)) {
pte = huge_ptep_get_and_clear(mm, address, ptep);
pte = pte_mkhuge(pte_modify(pte, newprot));
}
}
spin_unlock(&mm->page_table_lock);
+ spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock);
flush_tlb_range(vma, start, end);
}
chg = region_chg(&inode->i_mapping->private_list, from, to);
if (chg < 0)
return chg;
+ /*
+ * When cpuset is configured, it breaks the strict hugetlb page
+ * reservation as the accounting is done on a global variable. Such
+ * reservation is completely rubbish in the presence of cpuset because
+ * the reservation is not checked against page availability for the
+ * current cpuset. An application can still potentially be OOM-killed
+ * by the kernel for lack of free htlb pages in the cpuset that the
+ * task is in. Attempting to enforce strict accounting with cpuset is
+ * almost impossible (or too ugly) because cpuset is so fluid that
+ * tasks or memory nodes can be dynamically moved between cpusets.
+ *
+ * The change of semantics for shared hugetlb mapping with cpuset is
+ * undesirable. However, in order to preserve some of the semantics,
+ * we fall back to checking against current free page availability as
+ * a best attempt, hopefully minimizing the impact of the change in
+ * semantics that cpuset brings.
+ */
+ if (chg > cpuset_mems_nr(free_huge_pages_node))
+ return -ENOMEM;
+
ret = hugetlb_acct_memory(chg);
if (ret < 0)
return ret;