X-Git-Url: https://err.no/cgi-bin/gitweb.cgi?a=blobdiff_plain;f=mm%2Fmempolicy.c;h=b21869a39f0b0380265fb3bd9e0ed885a29afa0d;hb=2d2020242d14dc8caf83d7682d91368cff64290c;hp=27da6d5c77bade350dd88f15eed6424923d91de7;hpb=7e2ab150d1b3b286a4c864c60a549b2601777b63;p=linux-2.6 diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 27da6d5c77..b21869a39f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -132,19 +132,29 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) } return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; } + /* Generate a custom zonelist for the BIND policy. */ static struct zonelist *bind_zonelist(nodemask_t *nodes) { struct zonelist *zl; - int num, max, nd; + int num, max, nd, k; max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); - zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); + zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); if (!zl) return NULL; num = 0; - for_each_node_mask(nd, *nodes) - zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; + /* First put in the highest zones from all nodes, then all the next + lower zones etc. Avoid empty zones because the memory allocator + doesn't like them. If you implement node hot removal you + have to fix that. */ + for (k = policy_zone; k >= 0; k--) { + for_each_node_mask(nd, *nodes) { + struct zone *z = &NODE_DATA(nd)->node_zones[k]; + if (z->present_pages > 0) + zl->zones[num++] = z; + } + } zl->zones[num] = NULL; return zl; } @@ -187,7 +197,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) return policy; } -static void gather_stats(struct page *, void *); +static void gather_stats(struct page *, void *, int pte_dirty); static void migrate_page_add(struct page *page, struct list_head *pagelist, unsigned long flags); @@ -229,7 +239,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; if (flags & MPOL_MF_STATS) - gather_stats(page, private); + gather_stats(page, private, pte_dirty(*pte)); else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) migrate_page_add(page, private, flags); else @@ -320,9 +330,19 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, int err; struct vm_area_struct *first, *vma, *prev; - /* Clear the LRU lists so pages can be isolated */ - if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + /* Must have swap device for migration */ + if (nr_swap_pages <= 0) + return ERR_PTR(-ENODEV); + + /* + * Clear the LRU lists so pages can be isolated. + * Note that pages may be moved off the LRU after we have + * drained them. Those pages will fail to migrate like other + * pages that may be busy. + */ lru_add_drain_all(); + } first = find_vma(mm, start); if (!first) @@ -542,7 +562,7 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, */ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { if (isolate_lru_page(page)) - list_add(&page->lru, pagelist); + list_add_tail(&page->lru, pagelist); } } @@ -559,6 +579,7 @@ static int migrate_pages_to(struct list_head *pagelist, LIST_HEAD(moved); LIST_HEAD(failed); int err = 0; + unsigned long offset = 0; int nr_pages; struct page *page; struct list_head *p; @@ -566,8 +587,21 @@ static int migrate_pages_to(struct list_head *pagelist, redo: nr_pages = 0; list_for_each(p, pagelist) { - if (vma) - page = alloc_page_vma(GFP_HIGHUSER, vma, vma->vm_start); + if (vma) { + /* + * The address passed to alloc_page_vma is used to + * generate the proper interleave behavior. We fake + * the address here by an increasing offset in order + * to get the proper distribution of pages. + * + * No decision has been made as to which page + * a certain old page is moved to so we cannot + * specify the correct address. + */ + page = alloc_page_vma(GFP_HIGHUSER, vma, + offset + vma->vm_start); + offset += PAGE_SIZE; + } else page = alloc_pages_node(dest, GFP_HIGHUSER, 0); @@ -575,9 +609,9 @@ redo: err = -ENOMEM; goto out; } - list_add(&page->lru, &newlist); + list_add_tail(&page->lru, &newlist); nr_pages++; - if (nr_pages > MIGRATE_CHUNK_SIZE); + if (nr_pages > MIGRATE_CHUNK_SIZE) break; } err = migrate_pages(pagelist, &newlist, &moved, &failed); @@ -724,7 +758,7 @@ long do_mbind(unsigned long start, unsigned long len, MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || mode > MPOL_MAX) return -EINVAL; - if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) @@ -798,6 +832,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; + if (maxnode > PAGE_SIZE*BITS_PER_BYTE) + return -EINVAL; nlongs = BITS_TO_LONGS(maxnode); if ((maxnode % BITS_PER_LONG) == 0) @@ -916,19 +952,20 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, */ if ((current->euid != task->suid) && (current->euid != task->uid) && (current->uid != task->suid) && (current->uid != task->uid) && - !capable(CAP_SYS_ADMIN)) { + !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; } task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ - if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { + if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; } - err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); + err = do_migrate_pages(mm, &old, &new, + capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); out: mmput(mm); return err; @@ -1159,6 +1196,7 @@ static inline unsigned interleave_nid(struct mempolicy *pol, return interleave_nodes(pol); } +#ifdef CONFIG_HUGETLBFS /* Return a zonelist suitable for a huge page allocation. */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) { @@ -1172,6 +1210,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) } return zonelist_policy(GFP_HIGHUSER, pol); } +#endif /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ @@ -1724,66 +1763,145 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) struct numa_maps { unsigned long pages; unsigned long anon; - unsigned long mapped; + unsigned long active; + unsigned long writeback; unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; unsigned long node[MAX_NUMNODES]; }; -static void gather_stats(struct page *page, void *private) +static void gather_stats(struct page *page, void *private, int pte_dirty) { struct numa_maps *md = private; int count = page_mapcount(page); - if (count) - md->mapped++; + md->pages++; + if (pte_dirty || PageDirty(page)) + md->dirty++; - if (count > md->mapcount_max) - md->mapcount_max = count; + if (PageSwapCache(page)) + md->swapcache++; - md->pages++; + if (PageActive(page)) + md->active++; + + if (PageWriteback(page)) + md->writeback++; if (PageAnon(page)) md->anon++; + if (count > md->mapcount_max) + md->mapcount_max = count; + md->node[page_to_nid(page)]++; cond_resched(); } +#ifdef CONFIG_HUGETLB_PAGE +static void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ + unsigned long addr; + struct page *page; + + for (addr = start; addr < end; addr += HPAGE_SIZE) { + pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); + pte_t pte; + + if (!ptep) + continue; + + pte = *ptep; + if (pte_none(pte)) + continue; + + page = pte_page(pte); + if (!page) + continue; + + gather_stats(page, md, pte_dirty(*ptep)); + } +} +#else +static inline void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ +} +#endif + int show_numa_map(struct seq_file *m, void *v) { struct task_struct *task = m->private; struct vm_area_struct *vma = v; struct numa_maps *md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; int n; char buffer[50]; - if (!vma->vm_mm) + if (!mm) return 0; md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); if (!md) return 0; - check_pgd_range(vma, vma->vm_start, vma->vm_end, - &node_online_map, MPOL_MF_STATS, md); + mpol_to_str(buffer, sizeof(buffer), + get_vma_policy(task, vma, vma->vm_start)); - if (md->pages) { - mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(task, vma, vma->vm_start)); + seq_printf(m, "%08lx %s", vma->vm_start, buffer); - seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", - vma->vm_start, buffer, md->pages, - md->mapped, md->mapcount_max); + if (file) { + seq_printf(m, " file="); + seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + seq_printf(m, " stack"); + } + + if (is_vm_hugetlb_page(vma)) { + check_huge_range(vma, vma->vm_start, vma->vm_end, md); + seq_printf(m, " huge"); + } else { + check_pgd_range(vma, vma->vm_start, vma->vm_end, + &node_online_map, MPOL_MF_STATS, md); + } - if (md->anon) - seq_printf(m," anon=%lu",md->anon); + if (!md->pages) + goto out; - for_each_online_node(n) - if (md->node[n]) - seq_printf(m, " N%d=%lu", n, md->node[n]); + if (md->anon) + seq_printf(m," anon=%lu",md->anon); - seq_putc(m, '\n'); - } + if (md->dirty) + seq_printf(m," dirty=%lu",md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m," swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m," active=%lu", md->active); + + if (md->writeback) + seq_printf(m," writeback=%lu", md->writeback); + + for_each_online_node(n) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); kfree(md); if (m->count < m->size)