err.no Git - linux-2.6/blob - mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy an process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem kernel lowmem allocation don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89
  90 #include <asm/tlbflush.h>
  91 #include <asm/uaccess.h>
  92
  93 /* Internal flags */
  94 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  95 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  96 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  97
  98 static kmem_cache_t *policy_cache;
  99 static kmem_cache_t *sn_cache;
 100
 101 #define PDprintk(fmt...)
 102
 103 /* Highest zone. An specific allocation for a zone below that is not
 104    policied. */
 105 int policy_zone = ZONE_DMA;
 106
 107 struct mempolicy default_policy = {
 108         .refcnt = ATOMIC_INIT(1), /* never free it */
 109         .policy = MPOL_DEFAULT,
 110 };
 111
 112 /* Do sanity checking on a policy */
 113 static int mpol_check_policy(int mode, nodemask_t *nodes)
 114 {
 115         int empty = nodes_empty(*nodes);
 116
 117         switch (mode) {
 118         case MPOL_DEFAULT:
 119                 if (!empty)
 120                         return -EINVAL;
 121                 break;
 122         case MPOL_BIND:
 123         case MPOL_INTERLEAVE:
 124                 /* Preferred will only use the first bit, but allow
 125                    more for now. */
 126                 if (empty)
 127                         return -EINVAL;
 128                 break;
 129         }
 130         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 131 }
 132 /* Generate a custom zonelist for the BIND policy. */
 133 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 134 {
 135         struct zonelist *zl;
 136         int num, max, nd;
 137
 138         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 139         zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
 140         if (!zl)
 141                 return NULL;
 142         num = 0;
 143         for_each_node_mask(nd, *nodes)
 144                 zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
 145         zl->zones[num] = NULL;
 146         return zl;
 147 }
 148
 149 /* Create a new policy */
 150 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 151 {
 152         struct mempolicy *policy;
 153
 154         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 155         if (mode == MPOL_DEFAULT)
 156                 return NULL;
 157         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 158         if (!policy)
 159                 return ERR_PTR(-ENOMEM);
 160         atomic_set(&policy->refcnt, 1);
 161         switch (mode) {
 162         case MPOL_INTERLEAVE:
 163                 policy->v.nodes = *nodes;
 164                 if (nodes_weight(*nodes) == 0) {
 165                         kmem_cache_free(policy_cache, policy);
 166                         return ERR_PTR(-EINVAL);
 167                 }
 168                 break;
 169         case MPOL_PREFERRED:
 170                 policy->v.preferred_node = first_node(*nodes);
 171                 if (policy->v.preferred_node >= MAX_NUMNODES)
 172                         policy->v.preferred_node = -1;
 173                 break;
 174         case MPOL_BIND:
 175                 policy->v.zonelist = bind_zonelist(nodes);
 176                 if (policy->v.zonelist == NULL) {
 177                         kmem_cache_free(policy_cache, policy);
 178                         return ERR_PTR(-ENOMEM);
 179                 }
 180                 break;
 181         }
 182         policy->policy = mode;
 183         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 184         return policy;
 185 }
 186
 187 static void gather_stats(struct page *, void *);
 188 static void migrate_page_add(struct vm_area_struct *vma,
 189         struct page *page, struct list_head *pagelist, unsigned long flags);
 190
 191 /* Scan through pages checking if pages follow certain conditions. */
 192 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 193                 unsigned long addr, unsigned long end,
 194                 const nodemask_t *nodes, unsigned long flags,
 195                 void *private)
 196 {
 197         pte_t *orig_pte;
 198         pte_t *pte;
 199         spinlock_t *ptl;
 200
 201         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 202         do {
 203                 struct page *page;
 204                 unsigned int nid;
 205
 206                 if (!pte_present(*pte))
 207                         continue;
 208                 page = vm_normal_page(vma, addr, *pte);
 209                 if (!page)
 210                         continue;
 211                 /*
 212                  * The check for PageReserved here is important to avoid
 213                  * handling zero pages and other pages that may have been
 214                  * marked special by the system.
 215                  *
 216                  * If the PageReserved would not be checked here then f.e.
 217                  * the location of the zero page could have an influence
 218                  * on MPOL_MF_STRICT, zero pages would be counted for
 219                  * the per node stats, and there would be useless attempts
 220                  * to put zero pages on the migration list.
 221                  */
 222                 if (PageReserved(page))
 223                         continue;
 224                 nid = page_to_nid(page);
 225                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 226                         continue;
 227
 228                 if (flags & MPOL_MF_STATS)
 229                         gather_stats(page, private);
 230                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 231                         migrate_page_add(vma, page, private, flags);
 232                 else
 233                         break;
 234         } while (pte++, addr += PAGE_SIZE, addr != end);
 235         pte_unmap_unlock(orig_pte, ptl);
 236         return addr != end;
 237 }
 238
 239 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 240                 unsigned long addr, unsigned long end,
 241                 const nodemask_t *nodes, unsigned long flags,
 242                 void *private)
 243 {
 244         pmd_t *pmd;
 245         unsigned long next;
 246
 247         pmd = pmd_offset(pud, addr);
 248         do {
 249                 next = pmd_addr_end(addr, end);
 250                 if (pmd_none_or_clear_bad(pmd))
 251                         continue;
 252                 if (check_pte_range(vma, pmd, addr, next, nodes,
 253                                     flags, private))
 254                         return -EIO;
 255         } while (pmd++, addr = next, addr != end);
 256         return 0;
 257 }
 258
 259 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 260                 unsigned long addr, unsigned long end,
 261                 const nodemask_t *nodes, unsigned long flags,
 262                 void *private)
 263 {
 264         pud_t *pud;
 265         unsigned long next;
 266
 267         pud = pud_offset(pgd, addr);
 268         do {
 269                 next = pud_addr_end(addr, end);
 270                 if (pud_none_or_clear_bad(pud))
 271                         continue;
 272                 if (check_pmd_range(vma, pud, addr, next, nodes,
 273                                     flags, private))
 274                         return -EIO;
 275         } while (pud++, addr = next, addr != end);
 276         return 0;
 277 }
 278
 279 static inline int check_pgd_range(struct vm_area_struct *vma,
 280                 unsigned long addr, unsigned long end,
 281                 const nodemask_t *nodes, unsigned long flags,
 282                 void *private)
 283 {
 284         pgd_t *pgd;
 285         unsigned long next;
 286
 287         pgd = pgd_offset(vma->vm_mm, addr);
 288         do {
 289                 next = pgd_addr_end(addr, end);
 290                 if (pgd_none_or_clear_bad(pgd))
 291                         continue;
 292                 if (check_pud_range(vma, pgd, addr, next, nodes,
 293                                     flags, private))
 294                         return -EIO;
 295         } while (pgd++, addr = next, addr != end);
 296         return 0;
 297 }
 298
 299 /* Check if a vma is migratable */
 300 static inline int vma_migratable(struct vm_area_struct *vma)
 301 {
 302         if (vma->vm_flags & (
 303                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 304                 return 0;
 305         return 1;
 306 }
 307
 308 /*
 309  * Check if all pages in a range are on a set of nodes.
 310  * If pagelist != NULL then isolate pages from the LRU and
 311  * put them on the pagelist.
 312  */
 313 static struct vm_area_struct *
 314 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 315                 const nodemask_t *nodes, unsigned long flags, void *private)
 316 {
 317         int err;
 318         struct vm_area_struct *first, *vma, *prev;
 319
 320         /* Clear the LRU lists so pages can be isolated */
 321         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 322                 lru_add_drain_all();
 323
 324         first = find_vma(mm, start);
 325         if (!first)
 326                 return ERR_PTR(-EFAULT);
 327         prev = NULL;
 328         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 329                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 330                         if (!vma->vm_next && vma->vm_end < end)
 331                                 return ERR_PTR(-EFAULT);
 332                         if (prev && prev->vm_end < vma->vm_start)
 333                                 return ERR_PTR(-EFAULT);
 334                 }
 335                 if (!is_vm_hugetlb_page(vma) &&
 336                     ((flags & MPOL_MF_STRICT) ||
 337                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 338                                 vma_migratable(vma)))) {
 339                         unsigned long endvma = vma->vm_end;
 340
 341                         if (endvma > end)
 342                                 endvma = end;
 343                         if (vma->vm_start > start)
 344                                 start = vma->vm_start;
 345                         err = check_pgd_range(vma, start, endvma, nodes,
 346                                                 flags, private);
 347                         if (err) {
 348                                 first = ERR_PTR(err);
 349                                 break;
 350                         }
 351                 }
 352                 prev = vma;
 353         }
 354         return first;
 355 }
 356
 357 /* Apply policy to a single VMA */
 358 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 359 {
 360         int err = 0;
 361         struct mempolicy *old = vma->vm_policy;
 362
 363         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 364                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 365                  vma->vm_ops, vma->vm_file,
 366                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 367
 368         if (vma->vm_ops && vma->vm_ops->set_policy)
 369                 err = vma->vm_ops->set_policy(vma, new);
 370         if (!err) {
 371                 mpol_get(new);
 372                 vma->vm_policy = new;
 373                 mpol_free(old);
 374         }
 375         return err;
 376 }
 377
 378 /* Step 2: apply policy to a range and do splits. */
 379 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 380                        unsigned long end, struct mempolicy *new)
 381 {
 382         struct vm_area_struct *next;
 383         int err;
 384
 385         err = 0;
 386         for (; vma && vma->vm_start < end; vma = next) {
 387                 next = vma->vm_next;
 388                 if (vma->vm_start < start)
 389                         err = split_vma(vma->vm_mm, vma, start, 1);
 390                 if (!err && vma->vm_end > end)
 391                         err = split_vma(vma->vm_mm, vma, end, 0);
 392                 if (!err)
 393                         err = policy_vma(vma, new);
 394                 if (err)
 395                         break;
 396         }
 397         return err;
 398 }
 399
 400 static int contextualize_policy(int mode, nodemask_t *nodes)
 401 {
 402         if (!nodes)
 403                 return 0;
 404
 405         cpuset_update_task_memory_state();
 406         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 407                 return -EINVAL;
 408         return mpol_check_policy(mode, nodes);
 409 }
 410
 411 /* Set the process memory policy */
 412 long do_set_mempolicy(int mode, nodemask_t *nodes)
 413 {
 414         struct mempolicy *new;
 415
 416         if (contextualize_policy(mode, nodes))
 417                 return -EINVAL;
 418         new = mpol_new(mode, nodes);
 419         if (IS_ERR(new))
 420                 return PTR_ERR(new);
 421         mpol_free(current->mempolicy);
 422         current->mempolicy = new;
 423         if (new && new->policy == MPOL_INTERLEAVE)
 424                 current->il_next = first_node(new->v.nodes);
 425         return 0;
 426 }
 427
 428 /* Fill a zone bitmap for a policy */
 429 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 430 {
 431         int i;
 432
 433         nodes_clear(*nodes);
 434         switch (p->policy) {
 435         case MPOL_BIND:
 436                 for (i = 0; p->v.zonelist->zones[i]; i++)
 437                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 438                                 *nodes);
 439                 break;
 440         case MPOL_DEFAULT:
 441                 break;
 442         case MPOL_INTERLEAVE:
 443                 *nodes = p->v.nodes;
 444                 break;
 445         case MPOL_PREFERRED:
 446                 /* or use current node instead of online map? */
 447                 if (p->v.preferred_node < 0)
 448                         *nodes = node_online_map;
 449                 else
 450                         node_set(p->v.preferred_node, *nodes);
 451                 break;
 452         default:
 453                 BUG();
 454         }
 455 }
 456
 457 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 458 {
 459         struct page *p;
 460         int err;
 461
 462         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 463         if (err >= 0) {
 464                 err = page_to_nid(p);
 465                 put_page(p);
 466         }
 467         return err;
 468 }
 469
 470 /* Retrieve NUMA policy */
 471 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 472                         unsigned long addr, unsigned long flags)
 473 {
 474         int err;
 475         struct mm_struct *mm = current->mm;
 476         struct vm_area_struct *vma = NULL;
 477         struct mempolicy *pol = current->mempolicy;
 478
 479         cpuset_update_task_memory_state();
 480         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 481                 return -EINVAL;
 482         if (flags & MPOL_F_ADDR) {
 483                 down_read(&mm->mmap_sem);
 484                 vma = find_vma_intersection(mm, addr, addr+1);
 485                 if (!vma) {
 486                         up_read(&mm->mmap_sem);
 487                         return -EFAULT;
 488                 }
 489                 if (vma->vm_ops && vma->vm_ops->get_policy)
 490                         pol = vma->vm_ops->get_policy(vma, addr);
 491                 else
 492                         pol = vma->vm_policy;
 493         } else if (addr)
 494                 return -EINVAL;
 495
 496         if (!pol)
 497                 pol = &default_policy;
 498
 499         if (flags & MPOL_F_NODE) {
 500                 if (flags & MPOL_F_ADDR) {
 501                         err = lookup_node(mm, addr);
 502                         if (err < 0)
 503                                 goto out;
 504                         *policy = err;
 505                 } else if (pol == current->mempolicy &&
 506                                 pol->policy == MPOL_INTERLEAVE) {
 507                         *policy = current->il_next;
 508                 } else {
 509                         err = -EINVAL;
 510                         goto out;
 511                 }
 512         } else
 513                 *policy = pol->policy;
 514
 515         if (vma) {
 516                 up_read(&current->mm->mmap_sem);
 517                 vma = NULL;
 518         }
 519
 520         err = 0;
 521         if (nmask)
 522                 get_zonemask(pol, nmask);
 523
 524  out:
 525         if (vma)
 526                 up_read(&current->mm->mmap_sem);
 527         return err;
 528 }
 529
 530 /*
 531  * page migration
 532  */
 533
 534 /* Check if we are the only process mapping the page in question */
 535 static inline int single_mm_mapping(struct mm_struct *mm,
 536                         struct address_space *mapping)
 537 {
 538         struct vm_area_struct *vma;
 539         struct prio_tree_iter iter;
 540         int rc = 1;
 541
 542         spin_lock(&mapping->i_mmap_lock);
 543         vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
 544                 if (mm != vma->vm_mm) {
 545                         rc = 0;
 546                         goto out;
 547                 }
 548         list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
 549                 if (mm != vma->vm_mm) {
 550                         rc = 0;
 551                         goto out;
 552                 }
 553 out:
 554         spin_unlock(&mapping->i_mmap_lock);
 555         return rc;
 556 }
 557
 558 /*
 559  * Add a page to be migrated to the pagelist
 560  */
 561 static void migrate_page_add(struct vm_area_struct *vma,
 562         struct page *page, struct list_head *pagelist, unsigned long flags)
 563 {
 564         /*
 565          * Avoid migrating a page that is shared by others and not writable.
 566          */
 567         if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
 568             mapping_writably_mapped(page->mapping) ||
 569             single_mm_mapping(vma->vm_mm, page->mapping)) {
 570                 if (isolate_lru_page(page))
 571                         list_add(&page->lru, pagelist);
 572         }
 573 }
 574
 575 static int swap_pages(struct list_head *pagelist)
 576 {
 577         LIST_HEAD(moved);
 578         LIST_HEAD(failed);
 579         int n;
 580
 581         n = migrate_pages(pagelist, NULL, &moved, &failed);
 582         putback_lru_pages(&failed);
 583         putback_lru_pages(&moved);
 584
 585         return n;
 586 }
 587
 588 /*
 589  * For now migrate_pages simply swaps out the pages from nodes that are in
 590  * the source set but not in the target set. In the future, we would
 591  * want a function that moves pages between the two nodesets in such
 592  * a way as to preserve the physical layout as much as possible.
 593  *
 594  * Returns the number of page that could not be moved.
 595  */
 596 int do_migrate_pages(struct mm_struct *mm,
 597         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 598 {
 599         LIST_HEAD(pagelist);
 600         int count = 0;
 601         nodemask_t nodes;
 602
 603         nodes_andnot(nodes, *from_nodes, *to_nodes);
 604
 605         down_read(&mm->mmap_sem);
 606         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
 607                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 608
 609         if (!list_empty(&pagelist)) {
 610                 count = swap_pages(&pagelist);
 611                 putback_lru_pages(&pagelist);
 612         }
 613
 614         up_read(&mm->mmap_sem);
 615         return count;
 616 }
 617
 618 long do_mbind(unsigned long start, unsigned long len,
 619                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 620 {
 621         struct vm_area_struct *vma;
 622         struct mm_struct *mm = current->mm;
 623         struct mempolicy *new;
 624         unsigned long end;
 625         int err;
 626         LIST_HEAD(pagelist);
 627
 628         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 629                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 630             || mode > MPOL_MAX)
 631                 return -EINVAL;
 632         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
 633                 return -EPERM;
 634
 635         if (start & ~PAGE_MASK)
 636                 return -EINVAL;
 637
 638         if (mode == MPOL_DEFAULT)
 639                 flags &= ~MPOL_MF_STRICT;
 640
 641         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 642         end = start + len;
 643
 644         if (end < start)
 645                 return -EINVAL;
 646         if (end == start)
 647                 return 0;
 648
 649         if (mpol_check_policy(mode, nmask))
 650                 return -EINVAL;
 651
 652         new = mpol_new(mode, nmask);
 653         if (IS_ERR(new))
 654                 return PTR_ERR(new);
 655
 656         /*
 657          * If we are using the default policy then operation
 658          * on discontinuous address spaces is okay after all
 659          */
 660         if (!new)
 661                 flags |= MPOL_MF_DISCONTIG_OK;
 662
 663         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 664                         mode,nodes_addr(nodes)[0]);
 665
 666         down_write(&mm->mmap_sem);
 667         vma = check_range(mm, start, end, nmask,
 668                           flags | MPOL_MF_INVERT, &pagelist);
 669
 670         err = PTR_ERR(vma);
 671         if (!IS_ERR(vma)) {
 672                 int nr_failed = 0;
 673
 674                 err = mbind_range(vma, start, end, new);
 675                 if (!list_empty(&pagelist))
 676                         nr_failed = swap_pages(&pagelist);
 677
 678                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 679                         err = -EIO;
 680         }
 681         if (!list_empty(&pagelist))
 682                 putback_lru_pages(&pagelist);
 683
 684         up_write(&mm->mmap_sem);
 685         mpol_free(new);
 686         return err;
 687 }
 688
 689 /*
 690  * User space interface with variable sized bitmaps for nodelists.
 691  */
 692
 693 /* Copy a node mask from user space. */
 694 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 695                      unsigned long maxnode)
 696 {
 697         unsigned long k;
 698         unsigned long nlongs;
 699         unsigned long endmask;
 700
 701         --maxnode;
 702         nodes_clear(*nodes);
 703         if (maxnode == 0 || !nmask)
 704                 return 0;
 705
 706         nlongs = BITS_TO_LONGS(maxnode);
 707         if ((maxnode % BITS_PER_LONG) == 0)
 708                 endmask = ~0UL;
 709         else
 710                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 711
 712         /* When the user specified more nodes than supported just check
 713            if the non supported part is all zero. */
 714         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 715                 if (nlongs > PAGE_SIZE/sizeof(long))
 716                         return -EINVAL;
 717                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 718                         unsigned long t;
 719                         if (get_user(t, nmask + k))
 720                                 return -EFAULT;
 721                         if (k == nlongs - 1) {
 722                                 if (t & endmask)
 723                                         return -EINVAL;
 724                         } else if (t)
 725                                 return -EINVAL;
 726                 }
 727                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 728                 endmask = ~0UL;
 729         }
 730
 731         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 732                 return -EFAULT;
 733         nodes_addr(*nodes)[nlongs-1] &= endmask;
 734         return 0;
 735 }
 736
 737 /* Copy a kernel node mask to user space */
 738 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 739                               nodemask_t *nodes)
 740 {
 741         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 742         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 743
 744         if (copy > nbytes) {
 745                 if (copy > PAGE_SIZE)
 746                         return -EINVAL;
 747                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 748                         return -EFAULT;
 749                 copy = nbytes;
 750         }
 751         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 752 }
 753
 754 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 755                         unsigned long mode,
 756                         unsigned long __user *nmask, unsigned long maxnode,
 757                         unsigned flags)
 758 {
 759         nodemask_t nodes;
 760         int err;
 761
 762         err = get_nodes(&nodes, nmask, maxnode);
 763         if (err)
 764                 return err;
 765         return do_mbind(start, len, mode, &nodes, flags);
 766 }
 767
 768 /* Set the process memory policy */
 769 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 770                 unsigned long maxnode)
 771 {
 772         int err;
 773         nodemask_t nodes;
 774
 775         if (mode < 0 || mode > MPOL_MAX)
 776                 return -EINVAL;
 777         err = get_nodes(&nodes, nmask, maxnode);
 778         if (err)
 779                 return err;
 780         return do_set_mempolicy(mode, &nodes);
 781 }
 782
 783 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 784                 const unsigned long __user *old_nodes,
 785                 const unsigned long __user *new_nodes)
 786 {
 787         struct mm_struct *mm;
 788         struct task_struct *task;
 789         nodemask_t old;
 790         nodemask_t new;
 791         nodemask_t task_nodes;
 792         int err;
 793
 794         err = get_nodes(&old, old_nodes, maxnode);
 795         if (err)
 796                 return err;
 797
 798         err = get_nodes(&new, new_nodes, maxnode);
 799         if (err)
 800                 return err;
 801
 802         /* Find the mm_struct */
 803         read_lock(&tasklist_lock);
 804         task = pid ? find_task_by_pid(pid) : current;
 805         if (!task) {
 806                 read_unlock(&tasklist_lock);
 807                 return -ESRCH;
 808         }
 809         mm = get_task_mm(task);
 810         read_unlock(&tasklist_lock);
 811
 812         if (!mm)
 813                 return -EINVAL;
 814
 815         /*
 816          * Check if this process has the right to modify the specified
 817          * process. The right exists if the process has administrative
 818          * capabilities, superuser priviledges or the same
 819          * userid as the target process.
 820          */
 821         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 822             (current->uid != task->suid) && (current->uid != task->uid) &&
 823             !capable(CAP_SYS_ADMIN)) {
 824                 err = -EPERM;
 825                 goto out;
 826         }
 827
 828         task_nodes = cpuset_mems_allowed(task);
 829         /* Is the user allowed to access the target nodes? */
 830         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
 831                 err = -EPERM;
 832                 goto out;
 833         }
 834
 835         err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
 836 out:
 837         mmput(mm);
 838         return err;
 839 }
 840
 841
 842 /* Retrieve NUMA policy */
 843 asmlinkage long sys_get_mempolicy(int __user *policy,
 844                                 unsigned long __user *nmask,
 845                                 unsigned long maxnode,
 846                                 unsigned long addr, unsigned long flags)
 847 {
 848         int err, pval;
 849         nodemask_t nodes;
 850
 851         if (nmask != NULL && maxnode < MAX_NUMNODES)
 852                 return -EINVAL;
 853
 854         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 855
 856         if (err)
 857                 return err;
 858
 859         if (policy && put_user(pval, policy))
 860                 return -EFAULT;
 861
 862         if (nmask)
 863                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 864
 865         return err;
 866 }
 867
 868 #ifdef CONFIG_COMPAT
 869
 870 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 871                                      compat_ulong_t __user *nmask,
 872                                      compat_ulong_t maxnode,
 873                                      compat_ulong_t addr, compat_ulong_t flags)
 874 {
 875         long err;
 876         unsigned long __user *nm = NULL;
 877         unsigned long nr_bits, alloc_size;
 878         DECLARE_BITMAP(bm, MAX_NUMNODES);
 879
 880         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 881         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 882
 883         if (nmask)
 884                 nm = compat_alloc_user_space(alloc_size);
 885
 886         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
 887
 888         if (!err && nmask) {
 889                 err = copy_from_user(bm, nm, alloc_size);
 890                 /* ensure entire bitmap is zeroed */
 891                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
 892                 err |= compat_put_bitmap(nmask, bm, nr_bits);
 893         }
 894
 895         return err;
 896 }
 897
 898 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
 899                                      compat_ulong_t maxnode)
 900 {
 901         long err = 0;
 902         unsigned long __user *nm = NULL;
 903         unsigned long nr_bits, alloc_size;
 904         DECLARE_BITMAP(bm, MAX_NUMNODES);
 905
 906         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 907         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 908
 909         if (nmask) {
 910                 err = compat_get_bitmap(bm, nmask, nr_bits);
 911                 nm = compat_alloc_user_space(alloc_size);
 912                 err |= copy_to_user(nm, bm, alloc_size);
 913         }
 914
 915         if (err)
 916                 return -EFAULT;
 917
 918         return sys_set_mempolicy(mode, nm, nr_bits+1);
 919 }
 920
 921 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
 922                              compat_ulong_t mode, compat_ulong_t __user *nmask,
 923                              compat_ulong_t maxnode, compat_ulong_t flags)
 924 {
 925         long err = 0;
 926         unsigned long __user *nm = NULL;
 927         unsigned long nr_bits, alloc_size;
 928         nodemask_t bm;
 929
 930         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 931         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 932
 933         if (nmask) {
 934                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
 935                 nm = compat_alloc_user_space(alloc_size);
 936                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
 937         }
 938
 939         if (err)
 940                 return -EFAULT;
 941
 942         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
 943 }
 944
 945 #endif
 946
 947 /* Return effective policy for a VMA */
 948 static struct mempolicy * get_vma_policy(struct task_struct *task,
 949                 struct vm_area_struct *vma, unsigned long addr)
 950 {
 951         struct mempolicy *pol = task->mempolicy;
 952
 953         if (vma) {
 954                 if (vma->vm_ops && vma->vm_ops->get_policy)
 955                         pol = vma->vm_ops->get_policy(vma, addr);
 956                 else if (vma->vm_policy &&
 957                                 vma->vm_policy->policy != MPOL_DEFAULT)
 958                         pol = vma->vm_policy;
 959         }
 960         if (!pol)
 961                 pol = &default_policy;
 962         return pol;
 963 }
 964
 965 /* Return a zonelist representing a mempolicy */
 966 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
 967 {
 968         int nd;
 969
 970         switch (policy->policy) {
 971         case MPOL_PREFERRED:
 972                 nd = policy->v.preferred_node;
 973                 if (nd < 0)
 974                         nd = numa_node_id();
 975                 break;
 976         case MPOL_BIND:
 977                 /* Lower zones don't get a policy applied */
 978                 /* Careful: current->mems_allowed might have moved */
 979                 if (gfp_zone(gfp) >= policy_zone)
 980                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
 981                                 return policy->v.zonelist;
 982                 /*FALL THROUGH*/
 983         case MPOL_INTERLEAVE: /* should not happen */
 984         case MPOL_DEFAULT:
 985                 nd = numa_node_id();
 986                 break;
 987         default:
 988                 nd = 0;
 989                 BUG();
 990         }
 991         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
 992 }
 993
 994 /* Do dynamic interleaving for a process */
 995 static unsigned interleave_nodes(struct mempolicy *policy)
 996 {
 997         unsigned nid, next;
 998         struct task_struct *me = current;
 999
1000         nid = me->il_next;
1001         next = next_node(nid, policy->v.nodes);
1002         if (next >= MAX_NUMNODES)
1003                 next = first_node(policy->v.nodes);
1004         me->il_next = next;
1005         return nid;
1006 }
1007
1008 /* Do static interleaving for a VMA with known offset. */
1009 static unsigned offset_il_node(struct mempolicy *pol,
1010                 struct vm_area_struct *vma, unsigned long off)
1011 {
1012         unsigned nnodes = nodes_weight(pol->v.nodes);
1013         unsigned target = (unsigned)off % nnodes;
1014         int c;
1015         int nid = -1;
1016
1017         c = 0;
1018         do {
1019                 nid = next_node(nid, pol->v.nodes);
1020                 c++;
1021         } while (c <= target);
1022         return nid;
1023 }
1024
1025 /* Determine a node number for interleave */
1026 static inline unsigned interleave_nid(struct mempolicy *pol,
1027                  struct vm_area_struct *vma, unsigned long addr, int shift)
1028 {
1029         if (vma) {
1030                 unsigned long off;
1031
1032                 off = vma->vm_pgoff;
1033                 off += (addr - vma->vm_start) >> shift;
1034                 return offset_il_node(pol, vma, off);
1035         } else
1036                 return interleave_nodes(pol);
1037 }
1038
1039 /* Return a zonelist suitable for a huge page allocation. */
1040 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1041 {
1042         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1043
1044         if (pol->policy == MPOL_INTERLEAVE) {
1045                 unsigned nid;
1046
1047                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1048                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1049         }
1050         return zonelist_policy(GFP_HIGHUSER, pol);
1051 }
1052
1053 /* Allocate a page in interleaved policy.
1054    Own path because it needs to do special accounting. */
1055 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1056                                         unsigned nid)
1057 {
1058         struct zonelist *zl;
1059         struct page *page;
1060
1061         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1062         page = __alloc_pages(gfp, order, zl);
1063         if (page && page_zone(page) == zl->zones[0]) {
1064                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1065                 put_cpu();
1066         }
1067         return page;
1068 }
1069
1070 /**
1071  *      alloc_page_vma  - Allocate a page for a VMA.
1072  *
1073  *      @gfp:
1074  *      %GFP_USER    user allocation.
1075  *      %GFP_KERNEL  kernel allocations,
1076  *      %GFP_HIGHMEM highmem/user allocations,
1077  *      %GFP_FS      allocation should not call back into a file system.
1078  *      %GFP_ATOMIC  don't sleep.
1079  *
1080  *      @vma:  Pointer to VMA or NULL if not available.
1081  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1082  *
1083  *      This function allocates a page from the kernel page pool and applies
1084  *      a NUMA policy associated with the VMA or the current process.
1085  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1086  *      mm_struct of the VMA to prevent it from going away. Should be used for
1087  *      all allocations for pages that will be mapped into
1088  *      user space. Returns NULL when no page can be allocated.
1089  *
1090  *      Should be called with the mm_sem of the vma hold.
1091  */
1092 struct page *
1093 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1094 {
1095         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1096
1097         cpuset_update_task_memory_state();
1098
1099         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1100                 unsigned nid;
1101
1102                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1103                 return alloc_page_interleave(gfp, 0, nid);
1104         }
1105         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1106 }
1107
1108 /**
1109  *      alloc_pages_current - Allocate pages.
1110  *
1111  *      @gfp:
1112  *              %GFP_USER   user allocation,
1113  *              %GFP_KERNEL kernel allocation,
1114  *              %GFP_HIGHMEM highmem allocation,
1115  *              %GFP_FS     don't call back into a file system.
1116  *              %GFP_ATOMIC don't sleep.
1117  *      @order: Power of two of allocation size in pages. 0 is a single page.
1118  *
1119  *      Allocate a page from the kernel page pool.  When not in
1120  *      interrupt context and apply the current process NUMA policy.
1121  *      Returns NULL when no page can be allocated.
1122  *
1123  *      Don't call cpuset_update_task_memory_state() unless
1124  *      1) it's ok to take cpuset_sem (can WAIT), and
1125  *      2) allocating for current task (not interrupt).
1126  */
1127 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1128 {
1129         struct mempolicy *pol = current->mempolicy;
1130
1131         if ((gfp & __GFP_WAIT) && !in_interrupt())
1132                 cpuset_update_task_memory_state();
1133         if (!pol || in_interrupt())
1134                 pol = &default_policy;
1135         if (pol->policy == MPOL_INTERLEAVE)
1136                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1137         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1138 }
1139 EXPORT_SYMBOL(alloc_pages_current);
1140
1141 /*
1142  * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
1143  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1144  * with the mems_allowed returned by cpuset_mems_allowed().  This
1145  * keeps mempolicies cpuset relative after its cpuset moves.  See
1146  * further kernel/cpuset.c update_nodemask().
1147  */
1148 void *cpuset_being_rebound;
1149
1150 /* Slow path of a mempolicy copy */
1151 struct mempolicy *__mpol_copy(struct mempolicy *old)
1152 {
1153         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1154
1155         if (!new)
1156                 return ERR_PTR(-ENOMEM);
1157         if (current_cpuset_is_being_rebound()) {
1158                 nodemask_t mems = cpuset_mems_allowed(current);
1159                 mpol_rebind_policy(old, &mems);
1160         }
1161         *new = *old;
1162         atomic_set(&new->refcnt, 1);
1163         if (new->policy == MPOL_BIND) {
1164                 int sz = ksize(old->v.zonelist);
1165                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1166                 if (!new->v.zonelist) {
1167                         kmem_cache_free(policy_cache, new);
1168                         return ERR_PTR(-ENOMEM);
1169                 }
1170                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1171         }
1172         return new;
1173 }
1174
1175 /* Slow path of a mempolicy comparison */
1176 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1177 {
1178         if (!a || !b)
1179                 return 0;
1180         if (a->policy != b->policy)
1181                 return 0;
1182         switch (a->policy) {
1183         case MPOL_DEFAULT:
1184                 return 1;
1185         case MPOL_INTERLEAVE:
1186                 return nodes_equal(a->v.nodes, b->v.nodes);
1187         case MPOL_PREFERRED:
1188                 return a->v.preferred_node == b->v.preferred_node;
1189         case MPOL_BIND: {
1190                 int i;
1191                 for (i = 0; a->v.zonelist->zones[i]; i++)
1192                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1193                                 return 0;
1194                 return b->v.zonelist->zones[i] == NULL;
1195         }
1196         default:
1197                 BUG();
1198                 return 0;
1199         }
1200 }
1201
1202 /* Slow path of a mpol destructor. */
1203 void __mpol_free(struct mempolicy *p)
1204 {
1205         if (!atomic_dec_and_test(&p->refcnt))
1206                 return;
1207         if (p->policy == MPOL_BIND)
1208                 kfree(p->v.zonelist);
1209         p->policy = MPOL_DEFAULT;
1210         kmem_cache_free(policy_cache, p);
1211 }
1212
1213 /*
1214  * Shared memory backing store policy support.
1215  *
1216  * Remember policies even when nobody has shared memory mapped.
1217  * The policies are kept in Red-Black tree linked from the inode.
1218  * They are protected by the sp->lock spinlock, which should be held
1219  * for any accesses to the tree.
1220  */
1221
1222 /* lookup first element intersecting start-end */
1223 /* Caller holds sp->lock */
1224 static struct sp_node *
1225 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1226 {
1227         struct rb_node *n = sp->root.rb_node;
1228
1229         while (n) {
1230                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1231
1232                 if (start >= p->end)
1233                         n = n->rb_right;
1234                 else if (end <= p->start)
1235                         n = n->rb_left;
1236                 else
1237                         break;
1238         }
1239         if (!n)
1240                 return NULL;
1241         for (;;) {
1242                 struct sp_node *w = NULL;
1243                 struct rb_node *prev = rb_prev(n);
1244                 if (!prev)
1245                         break;
1246                 w = rb_entry(prev, struct sp_node, nd);
1247                 if (w->end <= start)
1248                         break;
1249                 n = prev;
1250         }
1251         return rb_entry(n, struct sp_node, nd);
1252 }
1253
1254 /* Insert a new shared policy into the list. */
1255 /* Caller holds sp->lock */
1256 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1257 {
1258         struct rb_node **p = &sp->root.rb_node;
1259         struct rb_node *parent = NULL;
1260         struct sp_node *nd;
1261
1262         while (*p) {
1263                 parent = *p;
1264                 nd = rb_entry(parent, struct sp_node, nd);
1265                 if (new->start < nd->start)
1266                         p = &(*p)->rb_left;
1267                 else if (new->end > nd->end)
1268                         p = &(*p)->rb_right;
1269                 else
1270                         BUG();
1271         }
1272         rb_link_node(&new->nd, parent, p);
1273         rb_insert_color(&new->nd, &sp->root);
1274         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1275                  new->policy ? new->policy->policy : 0);
1276 }
1277
1278 /* Find shared policy intersecting idx */
1279 struct mempolicy *
1280 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1281 {
1282         struct mempolicy *pol = NULL;
1283         struct sp_node *sn;
1284
1285         if (!sp->root.rb_node)
1286                 return NULL;
1287         spin_lock(&sp->lock);
1288         sn = sp_lookup(sp, idx, idx+1);
1289         if (sn) {
1290                 mpol_get(sn->policy);
1291                 pol = sn->policy;
1292         }
1293         spin_unlock(&sp->lock);
1294         return pol;
1295 }
1296
1297 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1298 {
1299         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1300         rb_erase(&n->nd, &sp->root);
1301         mpol_free(n->policy);
1302         kmem_cache_free(sn_cache, n);
1303 }
1304
1305 struct sp_node *
1306 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1307 {
1308         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1309
1310         if (!n)
1311                 return NULL;
1312         n->start = start;
1313         n->end = end;
1314         mpol_get(pol);
1315         n->policy = pol;
1316         return n;
1317 }
1318
1319 /* Replace a policy range. */
1320 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1321                                  unsigned long end, struct sp_node *new)
1322 {
1323         struct sp_node *n, *new2 = NULL;
1324
1325 restart:
1326         spin_lock(&sp->lock);
1327         n = sp_lookup(sp, start, end);
1328         /* Take care of old policies in the same range. */
1329         while (n && n->start < end) {
1330                 struct rb_node *next = rb_next(&n->nd);
1331                 if (n->start >= start) {
1332                         if (n->end <= end)
1333                                 sp_delete(sp, n);
1334                         else
1335                                 n->start = end;
1336                 } else {
1337                         /* Old policy spanning whole new range. */
1338                         if (n->end > end) {
1339                                 if (!new2) {
1340                                         spin_unlock(&sp->lock);
1341                                         new2 = sp_alloc(end, n->end, n->policy);
1342                                         if (!new2)
1343                                                 return -ENOMEM;
1344                                         goto restart;
1345                                 }
1346                                 n->end = start;
1347                                 sp_insert(sp, new2);
1348                                 new2 = NULL;
1349                                 break;
1350                         } else
1351                                 n->end = start;
1352                 }
1353                 if (!next)
1354                         break;
1355                 n = rb_entry(next, struct sp_node, nd);
1356         }
1357         if (new)
1358                 sp_insert(sp, new);
1359         spin_unlock(&sp->lock);
1360         if (new2) {
1361                 mpol_free(new2->policy);
1362                 kmem_cache_free(sn_cache, new2);
1363         }
1364         return 0;
1365 }
1366
1367 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1368                                 nodemask_t *policy_nodes)
1369 {
1370         info->root = RB_ROOT;
1371         spin_lock_init(&info->lock);
1372
1373         if (policy != MPOL_DEFAULT) {
1374                 struct mempolicy *newpol;
1375
1376                 /* Falls back to MPOL_DEFAULT on any error */
1377                 newpol = mpol_new(policy, policy_nodes);
1378                 if (!IS_ERR(newpol)) {
1379                         /* Create pseudo-vma that contains just the policy */
1380                         struct vm_area_struct pvma;
1381
1382                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1383                         /* Policy covers entire file */
1384                         pvma.vm_end = TASK_SIZE;
1385                         mpol_set_shared_policy(info, &pvma, newpol);
1386                         mpol_free(newpol);
1387                 }
1388         }
1389 }
1390
1391 int mpol_set_shared_policy(struct shared_policy *info,
1392                         struct vm_area_struct *vma, struct mempolicy *npol)
1393 {
1394         int err;
1395         struct sp_node *new = NULL;
1396         unsigned long sz = vma_pages(vma);
1397
1398         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1399                  vma->vm_pgoff,
1400                  sz, npol? npol->policy : -1,
1401                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1402
1403         if (npol) {
1404                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1405                 if (!new)
1406                         return -ENOMEM;
1407         }
1408         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1409         if (err && new)
1410                 kmem_cache_free(sn_cache, new);
1411         return err;
1412 }
1413
1414 /* Free a backing policy store on inode delete. */
1415 void mpol_free_shared_policy(struct shared_policy *p)
1416 {
1417         struct sp_node *n;
1418         struct rb_node *next;
1419
1420         if (!p->root.rb_node)
1421                 return;
1422         spin_lock(&p->lock);
1423         next = rb_first(&p->root);
1424         while (next) {
1425                 n = rb_entry(next, struct sp_node, nd);
1426                 next = rb_next(&n->nd);
1427                 rb_erase(&n->nd, &p->root);
1428                 mpol_free(n->policy);
1429                 kmem_cache_free(sn_cache, n);
1430         }
1431         spin_unlock(&p->lock);
1432 }
1433
1434 /* assumes fs == KERNEL_DS */
1435 void __init numa_policy_init(void)
1436 {
1437         policy_cache = kmem_cache_create("numa_policy",
1438                                          sizeof(struct mempolicy),
1439                                          0, SLAB_PANIC, NULL, NULL);
1440
1441         sn_cache = kmem_cache_create("shared_policy_node",
1442                                      sizeof(struct sp_node),
1443                                      0, SLAB_PANIC, NULL, NULL);
1444
1445         /* Set interleaving policy for system init. This way not all
1446            the data structures allocated at system boot end up in node zero. */
1447
1448         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1449                 printk("numa_policy_init: interleaving failed\n");
1450 }
1451
1452 /* Reset policy of current process to default */
1453 void numa_default_policy(void)
1454 {
1455         do_set_mempolicy(MPOL_DEFAULT, NULL);
1456 }
1457
1458 /* Migrate a policy to a different set of nodes */
1459 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1460 {
1461         nodemask_t *mpolmask;
1462         nodemask_t tmp;
1463
1464         if (!pol)
1465                 return;
1466         mpolmask = &pol->cpuset_mems_allowed;
1467         if (nodes_equal(*mpolmask, *newmask))
1468                 return;
1469
1470         switch (pol->policy) {
1471         case MPOL_DEFAULT:
1472                 break;
1473         case MPOL_INTERLEAVE:
1474                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1475                 pol->v.nodes = tmp;
1476                 *mpolmask = *newmask;
1477                 current->il_next = node_remap(current->il_next,
1478                                                 *mpolmask, *newmask);
1479                 break;
1480         case MPOL_PREFERRED:
1481                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1482                                                 *mpolmask, *newmask);
1483                 *mpolmask = *newmask;
1484                 break;
1485         case MPOL_BIND: {
1486                 nodemask_t nodes;
1487                 struct zone **z;
1488                 struct zonelist *zonelist;
1489
1490                 nodes_clear(nodes);
1491                 for (z = pol->v.zonelist->zones; *z; z++)
1492                         node_set((*z)->zone_pgdat->node_id, nodes);
1493                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1494                 nodes = tmp;
1495
1496                 zonelist = bind_zonelist(&nodes);
1497
1498                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1499                  * If that old zonelist has no remaining mems_allowed nodes,
1500                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1501                  */
1502
1503                 if (zonelist) {
1504                         /* Good - got mem - substitute new zonelist */
1505                         kfree(pol->v.zonelist);
1506                         pol->v.zonelist = zonelist;
1507                 }
1508                 *mpolmask = *newmask;
1509                 break;
1510         }
1511         default:
1512                 BUG();
1513                 break;
1514         }
1515 }
1516
1517 /*
1518  * Wrapper for mpol_rebind_policy() that just requires task
1519  * pointer, and updates task mempolicy.
1520  */
1521
1522 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1523 {
1524         mpol_rebind_policy(tsk->mempolicy, new);
1525 }
1526
1527 /*
1528  * Rebind each vma in mm to new nodemask.
1529  *
1530  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1531  */
1532
1533 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1534 {
1535         struct vm_area_struct *vma;
1536
1537         down_write(&mm->mmap_sem);
1538         for (vma = mm->mmap; vma; vma = vma->vm_next)
1539                 mpol_rebind_policy(vma->vm_policy, new);
1540         up_write(&mm->mmap_sem);
1541 }
1542
1543 /*
1544  * Display pages allocated per node and memory policy via /proc.
1545  */
1546
1547 static const char *policy_types[] = { "default", "prefer", "bind",
1548                                       "interleave" };
1549
1550 /*
1551  * Convert a mempolicy into a string.
1552  * Returns the number of characters in buffer (if positive)
1553  * or an error (negative)
1554  */
1555 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1556 {
1557         char *p = buffer;
1558         int l;
1559         nodemask_t nodes;
1560         int mode = pol ? pol->policy : MPOL_DEFAULT;
1561
1562         switch (mode) {
1563         case MPOL_DEFAULT:
1564                 nodes_clear(nodes);
1565                 break;
1566
1567         case MPOL_PREFERRED:
1568                 nodes_clear(nodes);
1569                 node_set(pol->v.preferred_node, nodes);
1570                 break;
1571
1572         case MPOL_BIND:
1573                 get_zonemask(pol, &nodes);
1574                 break;
1575
1576         case MPOL_INTERLEAVE:
1577                 nodes = pol->v.nodes;
1578                 break;
1579
1580         default:
1581                 BUG();
1582                 return -EFAULT;
1583         }
1584
1585         l = strlen(policy_types[mode]);
1586         if (buffer + maxlen < p + l + 1)
1587                 return -ENOSPC;
1588
1589         strcpy(p, policy_types[mode]);
1590         p += l;
1591
1592         if (!nodes_empty(nodes)) {
1593                 if (buffer + maxlen < p + 2)
1594                         return -ENOSPC;
1595                 *p++ = '=';
1596                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1597         }
1598         return p - buffer;
1599 }
1600
1601 struct numa_maps {
1602         unsigned long pages;
1603         unsigned long anon;
1604         unsigned long mapped;
1605         unsigned long mapcount_max;
1606         unsigned long node[MAX_NUMNODES];
1607 };
1608
1609 static void gather_stats(struct page *page, void *private)
1610 {
1611         struct numa_maps *md = private;
1612         int count = page_mapcount(page);
1613
1614         if (count)
1615                 md->mapped++;
1616
1617         if (count > md->mapcount_max)
1618                 md->mapcount_max = count;
1619
1620         md->pages++;
1621
1622         if (PageAnon(page))
1623                 md->anon++;
1624
1625         md->node[page_to_nid(page)]++;
1626         cond_resched();
1627 }
1628
1629 int show_numa_map(struct seq_file *m, void *v)
1630 {
1631         struct task_struct *task = m->private;
1632         struct vm_area_struct *vma = v;
1633         struct numa_maps *md;
1634         int n;
1635         char buffer[50];
1636
1637         if (!vma->vm_mm)
1638                 return 0;
1639
1640         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1641         if (!md)
1642                 return 0;
1643
1644         check_pgd_range(vma, vma->vm_start, vma->vm_end,
1645                     &node_online_map, MPOL_MF_STATS, md);
1646
1647         if (md->pages) {
1648                 mpol_to_str(buffer, sizeof(buffer),
1649                             get_vma_policy(task, vma, vma->vm_start));
1650
1651                 seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
1652                            vma->vm_start, buffer, md->pages,
1653                            md->mapped, md->mapcount_max);
1654
1655                 if (md->anon)
1656                         seq_printf(m," anon=%lu",md->anon);
1657
1658                 for_each_online_node(n)
1659                         if (md->node[n])
1660                                 seq_printf(m, " N%d=%lu", n, md->node[n]);
1661
1662                 seq_putc(m, '\n');
1663         }
1664         kfree(md);
1665
1666         if (m->count < m->size)
1667                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1668         return 0;
1669 }
1670