X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Fhugetlb.c;h=06fd80149e47b39dab90d3087f5ef7274edd8ef0;hb=48f13bf3e742fca8aab87f6c39451d03bf5952d4;hp=36db012b38dde252c827d0af4719c2e7b79e5083;hpb=920841d8d1d61bc12b43f95a579a5374f6d98f81;p=linux-2.6 diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 36db012b38..06fd80149e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -27,6 +27,9 @@ unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; +static gfp_t htlb_alloc_mask = GFP_HIGHUSER; +unsigned long hugepages_treat_as_movable; + /* * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages */ @@ -39,7 +42,7 @@ static void clear_huge_page(struct page *page, unsigned long addr) might_sleep(); for (i = 0; i < (HPAGE_SIZE/PAGE_SIZE); i++) { cond_resched(); - clear_user_highpage(page + i, addr); + clear_user_highpage(page + i, addr + i * PAGE_SIZE); } } @@ -66,25 +69,26 @@ static void enqueue_huge_page(struct page *page) static struct page *dequeue_huge_page(struct vm_area_struct *vma, unsigned long address) { - int nid = numa_node_id(); + int nid; struct page *page = NULL; - struct zonelist *zonelist = huge_zonelist(vma, address); + struct mempolicy *mpol; + struct zonelist *zonelist = huge_zonelist(vma, address, + htlb_alloc_mask, &mpol); struct zone **z; for (z = zonelist->zones; *z; z++) { nid = zone_to_nid(*z); - if (cpuset_zone_allowed_softwall(*z, GFP_HIGHUSER) && - !list_empty(&hugepage_freelists[nid])) + if (cpuset_zone_allowed_softwall(*z, htlb_alloc_mask) && + !list_empty(&hugepage_freelists[nid])) { + page = list_entry(hugepage_freelists[nid].next, + struct page, lru); + list_del(&page->lru); + free_huge_pages--; + free_huge_pages_node[nid]--; break; + } } - - if (*z) { - page = list_entry(hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - free_huge_pages--; - free_huge_pages_node[nid]--; - } + mpol_free(mpol); /* unref if mpol !NULL */ return page; } @@ -101,13 +105,24 @@ static void free_huge_page(struct page *page) static int alloc_fresh_huge_page(void) { - static int nid = 0; + static int prev_nid; struct page *page; - page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN, - HUGETLB_PAGE_ORDER); - nid = next_node(nid, node_online_map); + int nid; + + /* + * Copy static prev_nid to local nid, work on that, then copy it + * back to prev_nid afterwards: otherwise there's a window in which + * a racer might pass invalid nid MAX_NUMNODES to alloc_pages_node. + * But we don't need to use a spin_lock here: it really doesn't + * matter if occasionally a racer chooses the same nid as we do. + */ + nid = next_node(prev_nid, node_online_map); if (nid == MAX_NUMNODES) nid = first_node(node_online_map); + prev_nid = nid; + + page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_NOWARN, + HUGETLB_PAGE_ORDER); if (page) { set_compound_page_dtor(page, free_huge_page); spin_lock(&hugetlb_lock); @@ -140,6 +155,8 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; fail: + if (vma->vm_flags & VM_MAYSHARE) + resv_huge_pages++; spin_unlock(&hugetlb_lock); return NULL; } @@ -172,6 +189,17 @@ static int __init hugetlb_setup(char *s) } __setup("hugepages=", hugetlb_setup); +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + #ifdef CONFIG_SYSCTL static void update_and_free_page(struct page *page) { @@ -183,7 +211,7 @@ static void update_and_free_page(struct page *page) 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | 1 << PG_private | 1<< PG_writeback); } - page[1].lru.next = NULL; + set_compound_page_dtor(page, NULL); set_page_refcounted(page); __free_pages(page, HUGETLB_PAGE_ORDER); } @@ -243,6 +271,19 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, max_huge_pages = set_max_huge_pages(max_huge_pages); return 0; } + +int hugetlb_treat_movable_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + proc_dointvec(table, write, file, buffer, length, ppos); + if (hugepages_treat_as_movable) + htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; + else + htlb_alloc_mask = GFP_HIGHUSER; + return 0; +} + #endif /* CONFIG_SYSCTL */ int hugetlb_report_meminfo(char *buf) @@ -279,15 +320,14 @@ unsigned long hugetlb_total_pages(void) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. */ -static struct page *hugetlb_nopage(struct vm_area_struct *vma, - unsigned long address, int *unused) +static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { BUG(); - return NULL; + return 0; } struct vm_operations_struct hugetlb_vm_ops = { - .nopage = hugetlb_nopage, + .fault = hugetlb_vm_op_fault, }; static pte_t make_huge_pte(struct vm_area_struct *vma, struct page *page, @@ -313,9 +353,9 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, pte_t entry; entry = pte_mkwrite(pte_mkdirty(*ptep)); - ptep_set_access_flags(vma, address, ptep, entry, 1); - update_mmu_cache(vma, address, entry); - lazy_mmu_prot_update(entry); + if (ptep_set_access_flags(vma, address, ptep, entry, 1)) { + update_mmu_cache(vma, address, entry); + } } @@ -432,7 +472,7 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, avoidcopy = (page_count(old_page) == 1); if (avoidcopy) { set_huge_ptep_writable(vma, address, ptep); - return VM_FAULT_MINOR; + return 0; } page_cache_get(old_page); @@ -457,10 +497,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma, } page_cache_release(new_page); page_cache_release(old_page); - return VM_FAULT_MINOR; + return 0; } -int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, +static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { int ret = VM_FAULT_SIGBUS; @@ -514,7 +554,7 @@ retry: if (idx >= size) goto backout; - ret = VM_FAULT_MINOR; + ret = 0; if (!pte_none(*ptep)) goto backout; @@ -565,7 +605,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return ret; } - ret = VM_FAULT_MINOR; + ret = 0; spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ @@ -604,7 +644,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(&mm->page_table_lock); ret = hugetlb_fault(mm, vma, vaddr, 0); spin_lock(&mm->page_table_lock); - if (ret == VM_FAULT_MINOR) + if (!(ret & VM_FAULT_ERROR)) continue; remainder = 0; @@ -667,7 +707,6 @@ void hugetlb_change_protection(struct vm_area_struct *vma, pte = huge_ptep_get_and_clear(mm, address, ptep); pte = pte_mkhuge(pte_modify(pte, newprot)); set_huge_pte_at(mm, address, ptep, pte); - lazy_mmu_prot_update(pte); } } spin_unlock(&mm->page_table_lock); @@ -817,6 +856,26 @@ int hugetlb_reserve_pages(struct inode *inode, long from, long to) chg = region_chg(&inode->i_mapping->private_list, from, to); if (chg < 0) return chg; + /* + * When cpuset is configured, it breaks the strict hugetlb page + * reservation as the accounting is done on a global variable. Such + * reservation is completely rubbish in the presence of cpuset because + * the reservation is not checked against page availability for the + * current cpuset. Application can still potentially OOM'ed by kernel + * with lack of free htlb page in cpuset that the task is in. + * Attempt to enforce strict accounting with cpuset is almost + * impossible (or too ugly) because cpuset is too fluid that + * task or memory node can be dynamically moved between cpusets. + * + * The change of semantics for shared hugetlb mapping with cpuset is + * undesirable. However, in order to preserve some of the semantics, + * we fall back to check against current free page availability as + * a best attempt and hopefully to minimize the impact of changing + * semantics that cpuset has. + */ + if (chg > cpuset_mems_nr(free_huge_pages_node)) + return -ENOMEM; + ret = hugetlb_acct_memory(chg); if (ret < 0) return ret;