SAFE public projects git trees. - safe/jmp/linux-2.6/blob - mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/mm.h>
  76 #include <linux/nodemask.h>
  77 #include <linux/cpuset.h>
  78 #include <linux/gfp.h>
  79 #include <linux/slab.h>
  80 #include <linux/string.h>
  81 #include <linux/module.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/mempolicy.h>
  86 #include <linux/swap.h>
  87 #include <linux/seq_file.h>
  88 #include <linux/proc_fs.h>
  89 #include <linux/migrate.h>
  90 #include <linux/rmap.h>
  91 #include <linux/security.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
  96 /* Internal flags */
  97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
  98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
  99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 100
 101 static struct kmem_cache *policy_cache;
 102 static struct kmem_cache *sn_cache;
 103
 104 #define PDprintk(fmt...)
 105
 106 /* Highest zone. An specific allocation for a zone below that is not
 107    policied. */
 108 int policy_zone = ZONE_DMA;
 109
 110 struct mempolicy default_policy = {
 111         .refcnt = ATOMIC_INIT(1), /* never free it */
 112         .policy = MPOL_DEFAULT,
 113 };
 114
 115 /* Do sanity checking on a policy */
 116 static int mpol_check_policy(int mode, nodemask_t *nodes)
 117 {
 118         int empty = nodes_empty(*nodes);
 119
 120         switch (mode) {
 121         case MPOL_DEFAULT:
 122                 if (!empty)
 123                         return -EINVAL;
 124                 break;
 125         case MPOL_BIND:
 126         case MPOL_INTERLEAVE:
 127                 /* Preferred will only use the first bit, but allow
 128                    more for now. */
 129                 if (empty)
 130                         return -EINVAL;
 131                 break;
 132         }
 133         return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
 134 }
 135
 136 /* Generate a custom zonelist for the BIND policy. */
 137 static struct zonelist *bind_zonelist(nodemask_t *nodes)
 138 {
 139         struct zonelist *zl;
 140         int num, max, nd, k;
 141
 142         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
 143         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
 144         if (!zl)
 145                 return NULL;
 146         num = 0;
 147         /* First put in the highest zones from all nodes, then all the next
 148            lower zones etc. Avoid empty zones because the memory allocator
 149            doesn't like them. If you implement node hot removal you
 150            have to fix that. */
 151         for (k = policy_zone; k >= 0; k--) {
 152                 for_each_node_mask(nd, *nodes) {
 153                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
 154                         if (z->present_pages > 0)
 155                                 zl->zones[num++] = z;
 156                 }
 157         }
 158         zl->zones[num] = NULL;
 159         return zl;
 160 }
 161
 162 /* Create a new policy */
 163 static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
 164 {
 165         struct mempolicy *policy;
 166
 167         PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
 168         if (mode == MPOL_DEFAULT)
 169                 return NULL;
 170         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 171         if (!policy)
 172                 return ERR_PTR(-ENOMEM);
 173         atomic_set(&policy->refcnt, 1);
 174         switch (mode) {
 175         case MPOL_INTERLEAVE:
 176                 policy->v.nodes = *nodes;
 177                 if (nodes_weight(*nodes) == 0) {
 178                         kmem_cache_free(policy_cache, policy);
 179                         return ERR_PTR(-EINVAL);
 180                 }
 181                 break;
 182         case MPOL_PREFERRED:
 183                 policy->v.preferred_node = first_node(*nodes);
 184                 if (policy->v.preferred_node >= MAX_NUMNODES)
 185                         policy->v.preferred_node = -1;
 186                 break;
 187         case MPOL_BIND:
 188                 policy->v.zonelist = bind_zonelist(nodes);
 189                 if (policy->v.zonelist == NULL) {
 190                         kmem_cache_free(policy_cache, policy);
 191                         return ERR_PTR(-ENOMEM);
 192                 }
 193                 break;
 194         }
 195         policy->policy = mode;
 196         policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
 197         return policy;
 198 }
 199
 200 static void gather_stats(struct page *, void *, int pte_dirty);
 201 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 202                                 unsigned long flags);
 203
 204 /* Scan through pages checking if pages follow certain conditions. */
 205 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 206                 unsigned long addr, unsigned long end,
 207                 const nodemask_t *nodes, unsigned long flags,
 208                 void *private)
 209 {
 210         pte_t *orig_pte;
 211         pte_t *pte;
 212         spinlock_t *ptl;
 213
 214         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 215         do {
 216                 struct page *page;
 217                 unsigned int nid;
 218
 219                 if (!pte_present(*pte))
 220                         continue;
 221                 page = vm_normal_page(vma, addr, *pte);
 222                 if (!page)
 223                         continue;
 224                 /*
 225                  * The check for PageReserved here is important to avoid
 226                  * handling zero pages and other pages that may have been
 227                  * marked special by the system.
 228                  *
 229                  * If the PageReserved would not be checked here then f.e.
 230                  * the location of the zero page could have an influence
 231                  * on MPOL_MF_STRICT, zero pages would be counted for
 232                  * the per node stats, and there would be useless attempts
 233                  * to put zero pages on the migration list.
 234                  */
 235                 if (PageReserved(page))
 236                         continue;
 237                 nid = page_to_nid(page);
 238                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 239                         continue;
 240
 241                 if (flags & MPOL_MF_STATS)
 242                         gather_stats(page, private, pte_dirty(*pte));
 243                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 244                         migrate_page_add(page, private, flags);
 245                 else
 246                         break;
 247         } while (pte++, addr += PAGE_SIZE, addr != end);
 248         pte_unmap_unlock(orig_pte, ptl);
 249         return addr != end;
 250 }
 251
 252 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 253                 unsigned long addr, unsigned long end,
 254                 const nodemask_t *nodes, unsigned long flags,
 255                 void *private)
 256 {
 257         pmd_t *pmd;
 258         unsigned long next;
 259
 260         pmd = pmd_offset(pud, addr);
 261         do {
 262                 next = pmd_addr_end(addr, end);
 263                 if (pmd_none_or_clear_bad(pmd))
 264                         continue;
 265                 if (check_pte_range(vma, pmd, addr, next, nodes,
 266                                     flags, private))
 267                         return -EIO;
 268         } while (pmd++, addr = next, addr != end);
 269         return 0;
 270 }
 271
 272 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 273                 unsigned long addr, unsigned long end,
 274                 const nodemask_t *nodes, unsigned long flags,
 275                 void *private)
 276 {
 277         pud_t *pud;
 278         unsigned long next;
 279
 280         pud = pud_offset(pgd, addr);
 281         do {
 282                 next = pud_addr_end(addr, end);
 283                 if (pud_none_or_clear_bad(pud))
 284                         continue;
 285                 if (check_pmd_range(vma, pud, addr, next, nodes,
 286                                     flags, private))
 287                         return -EIO;
 288         } while (pud++, addr = next, addr != end);
 289         return 0;
 290 }
 291
 292 static inline int check_pgd_range(struct vm_area_struct *vma,
 293                 unsigned long addr, unsigned long end,
 294                 const nodemask_t *nodes, unsigned long flags,
 295                 void *private)
 296 {
 297         pgd_t *pgd;
 298         unsigned long next;
 299
 300         pgd = pgd_offset(vma->vm_mm, addr);
 301         do {
 302                 next = pgd_addr_end(addr, end);
 303                 if (pgd_none_or_clear_bad(pgd))
 304                         continue;
 305                 if (check_pud_range(vma, pgd, addr, next, nodes,
 306                                     flags, private))
 307                         return -EIO;
 308         } while (pgd++, addr = next, addr != end);
 309         return 0;
 310 }
 311
 312 /* Check if a vma is migratable */
 313 static inline int vma_migratable(struct vm_area_struct *vma)
 314 {
 315         if (vma->vm_flags & (
 316                 VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
 317                 return 0;
 318         return 1;
 319 }
 320
 321 /*
 322  * Check if all pages in a range are on a set of nodes.
 323  * If pagelist != NULL then isolate pages from the LRU and
 324  * put them on the pagelist.
 325  */
 326 static struct vm_area_struct *
 327 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 328                 const nodemask_t *nodes, unsigned long flags, void *private)
 329 {
 330         int err;
 331         struct vm_area_struct *first, *vma, *prev;
 332
 333         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
 334
 335                 err = migrate_prep();
 336                 if (err)
 337                         return ERR_PTR(err);
 338         }
 339
 340         first = find_vma(mm, start);
 341         if (!first)
 342                 return ERR_PTR(-EFAULT);
 343         prev = NULL;
 344         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 345                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 346                         if (!vma->vm_next && vma->vm_end < end)
 347                                 return ERR_PTR(-EFAULT);
 348                         if (prev && prev->vm_end < vma->vm_start)
 349                                 return ERR_PTR(-EFAULT);
 350                 }
 351                 if (!is_vm_hugetlb_page(vma) &&
 352                     ((flags & MPOL_MF_STRICT) ||
 353                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 354                                 vma_migratable(vma)))) {
 355                         unsigned long endvma = vma->vm_end;
 356
 357                         if (endvma > end)
 358                                 endvma = end;
 359                         if (vma->vm_start > start)
 360                                 start = vma->vm_start;
 361                         err = check_pgd_range(vma, start, endvma, nodes,
 362                                                 flags, private);
 363                         if (err) {
 364                                 first = ERR_PTR(err);
 365                                 break;
 366                         }
 367                 }
 368                 prev = vma;
 369         }
 370         return first;
 371 }
 372
 373 /* Apply policy to a single VMA */
 374 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 375 {
 376         int err = 0;
 377         struct mempolicy *old = vma->vm_policy;
 378
 379         PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 380                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 381                  vma->vm_ops, vma->vm_file,
 382                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 383
 384         if (vma->vm_ops && vma->vm_ops->set_policy)
 385                 err = vma->vm_ops->set_policy(vma, new);
 386         if (!err) {
 387                 mpol_get(new);
 388                 vma->vm_policy = new;
 389                 mpol_free(old);
 390         }
 391         return err;
 392 }
 393
 394 /* Step 2: apply policy to a range and do splits. */
 395 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 396                        unsigned long end, struct mempolicy *new)
 397 {
 398         struct vm_area_struct *next;
 399         int err;
 400
 401         err = 0;
 402         for (; vma && vma->vm_start < end; vma = next) {
 403                 next = vma->vm_next;
 404                 if (vma->vm_start < start)
 405                         err = split_vma(vma->vm_mm, vma, start, 1);
 406                 if (!err && vma->vm_end > end)
 407                         err = split_vma(vma->vm_mm, vma, end, 0);
 408                 if (!err)
 409                         err = policy_vma(vma, new);
 410                 if (err)
 411                         break;
 412         }
 413         return err;
 414 }
 415
 416 static int contextualize_policy(int mode, nodemask_t *nodes)
 417 {
 418         if (!nodes)
 419                 return 0;
 420
 421         cpuset_update_task_memory_state();
 422         if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
 423                 return -EINVAL;
 424         return mpol_check_policy(mode, nodes);
 425 }
 426
 427
 428 /*
 429  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 430  * mempolicy.  Allows more rapid checking of this (combined perhaps
 431  * with other PF_* flag bits) on memory allocation hot code paths.
 432  *
 433  * If called from outside this file, the task 'p' should -only- be
 434  * a newly forked child not yet visible on the task list, because
 435  * manipulating the task flags of a visible task is not safe.
 436  *
 437  * The above limitation is why this routine has the funny name
 438  * mpol_fix_fork_child_flag().
 439  *
 440  * It is also safe to call this with a task pointer of current,
 441  * which the static wrapper mpol_set_task_struct_flag() does,
 442  * for use within this file.
 443  */
 444
 445 void mpol_fix_fork_child_flag(struct task_struct *p)
 446 {
 447         if (p->mempolicy)
 448                 p->flags |= PF_MEMPOLICY;
 449         else
 450                 p->flags &= ~PF_MEMPOLICY;
 451 }
 452
 453 static void mpol_set_task_struct_flag(void)
 454 {
 455         mpol_fix_fork_child_flag(current);
 456 }
 457
 458 /* Set the process memory policy */
 459 long do_set_mempolicy(int mode, nodemask_t *nodes)
 460 {
 461         struct mempolicy *new;
 462
 463         if (contextualize_policy(mode, nodes))
 464                 return -EINVAL;
 465         new = mpol_new(mode, nodes);
 466         if (IS_ERR(new))
 467                 return PTR_ERR(new);
 468         mpol_free(current->mempolicy);
 469         current->mempolicy = new;
 470         mpol_set_task_struct_flag();
 471         if (new && new->policy == MPOL_INTERLEAVE)
 472                 current->il_next = first_node(new->v.nodes);
 473         return 0;
 474 }
 475
 476 /* Fill a zone bitmap for a policy */
 477 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 478 {
 479         int i;
 480
 481         nodes_clear(*nodes);
 482         switch (p->policy) {
 483         case MPOL_BIND:
 484                 for (i = 0; p->v.zonelist->zones[i]; i++)
 485                         node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
 486                                 *nodes);
 487                 break;
 488         case MPOL_DEFAULT:
 489                 break;
 490         case MPOL_INTERLEAVE:
 491                 *nodes = p->v.nodes;
 492                 break;
 493         case MPOL_PREFERRED:
 494                 /* or use current node instead of online map? */
 495                 if (p->v.preferred_node < 0)
 496                         *nodes = node_online_map;
 497                 else
 498                         node_set(p->v.preferred_node, *nodes);
 499                 break;
 500         default:
 501                 BUG();
 502         }
 503 }
 504
 505 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 506 {
 507         struct page *p;
 508         int err;
 509
 510         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 511         if (err >= 0) {
 512                 err = page_to_nid(p);
 513                 put_page(p);
 514         }
 515         return err;
 516 }
 517
 518 /* Retrieve NUMA policy */
 519 long do_get_mempolicy(int *policy, nodemask_t *nmask,
 520                         unsigned long addr, unsigned long flags)
 521 {
 522         int err;
 523         struct mm_struct *mm = current->mm;
 524         struct vm_area_struct *vma = NULL;
 525         struct mempolicy *pol = current->mempolicy;
 526
 527         cpuset_update_task_memory_state();
 528         if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
 529                 return -EINVAL;
 530         if (flags & MPOL_F_ADDR) {
 531                 down_read(&mm->mmap_sem);
 532                 vma = find_vma_intersection(mm, addr, addr+1);
 533                 if (!vma) {
 534                         up_read(&mm->mmap_sem);
 535                         return -EFAULT;
 536                 }
 537                 if (vma->vm_ops && vma->vm_ops->get_policy)
 538                         pol = vma->vm_ops->get_policy(vma, addr);
 539                 else
 540                         pol = vma->vm_policy;
 541         } else if (addr)
 542                 return -EINVAL;
 543
 544         if (!pol)
 545                 pol = &default_policy;
 546
 547         if (flags & MPOL_F_NODE) {
 548                 if (flags & MPOL_F_ADDR) {
 549                         err = lookup_node(mm, addr);
 550                         if (err < 0)
 551                                 goto out;
 552                         *policy = err;
 553                 } else if (pol == current->mempolicy &&
 554                                 pol->policy == MPOL_INTERLEAVE) {
 555                         *policy = current->il_next;
 556                 } else {
 557                         err = -EINVAL;
 558                         goto out;
 559                 }
 560         } else
 561                 *policy = pol->policy;
 562
 563         if (vma) {
 564                 up_read(&current->mm->mmap_sem);
 565                 vma = NULL;
 566         }
 567
 568         err = 0;
 569         if (nmask)
 570                 get_zonemask(pol, nmask);
 571
 572  out:
 573         if (vma)
 574                 up_read(&current->mm->mmap_sem);
 575         return err;
 576 }
 577
 578 #ifdef CONFIG_MIGRATION
 579 /*
 580  * page migration
 581  */
 582 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 583                                 unsigned long flags)
 584 {
 585         /*
 586          * Avoid migrating a page that is shared with others.
 587          */
 588         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 589                 isolate_lru_page(page, pagelist);
 590 }
 591
 592 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 593 {
 594         return alloc_pages_node(node, GFP_HIGHUSER, 0);
 595 }
 596
 597 /*
 598  * Migrate pages from one node to a target node.
 599  * Returns error or the number of pages not migrated.
 600  */
 601 int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
 602 {
 603         nodemask_t nmask;
 604         LIST_HEAD(pagelist);
 605         int err = 0;
 606
 607         nodes_clear(nmask);
 608         node_set(source, nmask);
 609
 610         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 611                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 612
 613         if (!list_empty(&pagelist))
 614                 err = migrate_pages(&pagelist, new_node_page, dest);
 615
 616         return err;
 617 }
 618
 619 /*
 620  * Move pages between the two nodesets so as to preserve the physical
 621  * layout as much as possible.
 622  *
 623  * Returns the number of page that could not be moved.
 624  */
 625 int do_migrate_pages(struct mm_struct *mm,
 626         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 627 {
 628         LIST_HEAD(pagelist);
 629         int busy = 0;
 630         int err = 0;
 631         nodemask_t tmp;
 632
 633         down_read(&mm->mmap_sem);
 634
 635 /*
 636  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 637  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 638  * bit in 'tmp', and return that <source, dest> pair for migration.
 639  * The pair of nodemasks 'to' and 'from' define the map.
 640  *
 641  * If no pair of bits is found that way, fallback to picking some
 642  * pair of 'source' and 'dest' bits that are not the same.  If the
 643  * 'source' and 'dest' bits are the same, this represents a node
 644  * that will be migrating to itself, so no pages need move.
 645  *
 646  * If no bits are left in 'tmp', or if all remaining bits left
 647  * in 'tmp' correspond to the same bit in 'to', return false
 648  * (nothing left to migrate).
 649  *
 650  * This lets us pick a pair of nodes to migrate between, such that
 651  * if possible the dest node is not already occupied by some other
 652  * source node, minimizing the risk of overloading the memory on a
 653  * node that would happen if we migrated incoming memory to a node
 654  * before migrating outgoing memory source that same node.
 655  *
 656  * A single scan of tmp is sufficient.  As we go, we remember the
 657  * most recent <s, d> pair that moved (s != d).  If we find a pair
 658  * that not only moved, but what's better, moved to an empty slot
 659  * (d is not set in tmp), then we break out then, with that pair.
 660  * Otherwise when we finish scannng from_tmp, we at least have the
 661  * most recent <s, d> pair that moved.  If we get all the way through
 662  * the scan of tmp without finding any node that moved, much less
 663  * moved to an empty node, then there is nothing left worth migrating.
 664  */
 665
 666         tmp = *from_nodes;
 667         while (!nodes_empty(tmp)) {
 668                 int s,d;
 669                 int source = -1;
 670                 int dest = 0;
 671
 672                 for_each_node_mask(s, tmp) {
 673                         d = node_remap(s, *from_nodes, *to_nodes);
 674                         if (s == d)
 675                                 continue;
 676
 677                         source = s;     /* Node moved. Memorize */
 678                         dest = d;
 679
 680                         /* dest not in remaining from nodes? */
 681                         if (!node_isset(dest, tmp))
 682                                 break;
 683                 }
 684                 if (source == -1)
 685                         break;
 686
 687                 node_clear(source, tmp);
 688                 err = migrate_to_node(mm, source, dest, flags);
 689                 if (err > 0)
 690                         busy += err;
 691                 if (err < 0)
 692                         break;
 693         }
 694
 695         up_read(&mm->mmap_sem);
 696         if (err < 0)
 697                 return err;
 698         return busy;
 699
 700 }
 701
 702 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 703 {
 704         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 705
 706         return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
 707 }
 708 #else
 709
 710 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 711                                 unsigned long flags)
 712 {
 713 }
 714
 715 int do_migrate_pages(struct mm_struct *mm,
 716         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 717 {
 718         return -ENOSYS;
 719 }
 720
 721 static struct page *new_vma_page(struct page *page, unsigned long private)
 722 {
 723         return NULL;
 724 }
 725 #endif
 726
 727 long do_mbind(unsigned long start, unsigned long len,
 728                 unsigned long mode, nodemask_t *nmask, unsigned long flags)
 729 {
 730         struct vm_area_struct *vma;
 731         struct mm_struct *mm = current->mm;
 732         struct mempolicy *new;
 733         unsigned long end;
 734         int err;
 735         LIST_HEAD(pagelist);
 736
 737         if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
 738                                       MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 739             || mode > MPOL_MAX)
 740                 return -EINVAL;
 741         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 742                 return -EPERM;
 743
 744         if (start & ~PAGE_MASK)
 745                 return -EINVAL;
 746
 747         if (mode == MPOL_DEFAULT)
 748                 flags &= ~MPOL_MF_STRICT;
 749
 750         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 751         end = start + len;
 752
 753         if (end < start)
 754                 return -EINVAL;
 755         if (end == start)
 756                 return 0;
 757
 758         if (mpol_check_policy(mode, nmask))
 759                 return -EINVAL;
 760
 761         new = mpol_new(mode, nmask);
 762         if (IS_ERR(new))
 763                 return PTR_ERR(new);
 764
 765         /*
 766          * If we are using the default policy then operation
 767          * on discontinuous address spaces is okay after all
 768          */
 769         if (!new)
 770                 flags |= MPOL_MF_DISCONTIG_OK;
 771
 772         PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
 773                         mode,nodes_addr(nodes)[0]);
 774
 775         down_write(&mm->mmap_sem);
 776         vma = check_range(mm, start, end, nmask,
 777                           flags | MPOL_MF_INVERT, &pagelist);
 778
 779         err = PTR_ERR(vma);
 780         if (!IS_ERR(vma)) {
 781                 int nr_failed = 0;
 782
 783                 err = mbind_range(vma, start, end, new);
 784
 785                 if (!list_empty(&pagelist))
 786                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 787                                                 (unsigned long)vma);
 788
 789                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 790                         err = -EIO;
 791         }
 792
 793         up_write(&mm->mmap_sem);
 794         mpol_free(new);
 795         return err;
 796 }
 797
 798 /*
 799  * User space interface with variable sized bitmaps for nodelists.
 800  */
 801
 802 /* Copy a node mask from user space. */
 803 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 804                      unsigned long maxnode)
 805 {
 806         unsigned long k;
 807         unsigned long nlongs;
 808         unsigned long endmask;
 809
 810         --maxnode;
 811         nodes_clear(*nodes);
 812         if (maxnode == 0 || !nmask)
 813                 return 0;
 814         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
 815                 return -EINVAL;
 816
 817         nlongs = BITS_TO_LONGS(maxnode);
 818         if ((maxnode % BITS_PER_LONG) == 0)
 819                 endmask = ~0UL;
 820         else
 821                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
 822
 823         /* When the user specified more nodes than supported just check
 824            if the non supported part is all zero. */
 825         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
 826                 if (nlongs > PAGE_SIZE/sizeof(long))
 827                         return -EINVAL;
 828                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
 829                         unsigned long t;
 830                         if (get_user(t, nmask + k))
 831                                 return -EFAULT;
 832                         if (k == nlongs - 1) {
 833                                 if (t & endmask)
 834                                         return -EINVAL;
 835                         } else if (t)
 836                                 return -EINVAL;
 837                 }
 838                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
 839                 endmask = ~0UL;
 840         }
 841
 842         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
 843                 return -EFAULT;
 844         nodes_addr(*nodes)[nlongs-1] &= endmask;
 845         return 0;
 846 }
 847
 848 /* Copy a kernel node mask to user space */
 849 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 850                               nodemask_t *nodes)
 851 {
 852         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 853         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 854
 855         if (copy > nbytes) {
 856                 if (copy > PAGE_SIZE)
 857                         return -EINVAL;
 858                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 859                         return -EFAULT;
 860                 copy = nbytes;
 861         }
 862         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 863 }
 864
 865 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 866                         unsigned long mode,
 867                         unsigned long __user *nmask, unsigned long maxnode,
 868                         unsigned flags)
 869 {
 870         nodemask_t nodes;
 871         int err;
 872
 873         err = get_nodes(&nodes, nmask, maxnode);
 874         if (err)
 875                 return err;
 876         return do_mbind(start, len, mode, &nodes, flags);
 877 }
 878
 879 /* Set the process memory policy */
 880 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 881                 unsigned long maxnode)
 882 {
 883         int err;
 884         nodemask_t nodes;
 885
 886         if (mode < 0 || mode > MPOL_MAX)
 887                 return -EINVAL;
 888         err = get_nodes(&nodes, nmask, maxnode);
 889         if (err)
 890                 return err;
 891         return do_set_mempolicy(mode, &nodes);
 892 }
 893
 894 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 895                 const unsigned long __user *old_nodes,
 896                 const unsigned long __user *new_nodes)
 897 {
 898         struct mm_struct *mm;
 899         struct task_struct *task;
 900         nodemask_t old;
 901         nodemask_t new;
 902         nodemask_t task_nodes;
 903         int err;
 904
 905         err = get_nodes(&old, old_nodes, maxnode);
 906         if (err)
 907                 return err;
 908
 909         err = get_nodes(&new, new_nodes, maxnode);
 910         if (err)
 911                 return err;
 912
 913         /* Find the mm_struct */
 914         read_lock(&tasklist_lock);
 915         task = pid ? find_task_by_pid(pid) : current;
 916         if (!task) {
 917                 read_unlock(&tasklist_lock);
 918                 return -ESRCH;
 919         }
 920         mm = get_task_mm(task);
 921         read_unlock(&tasklist_lock);
 922
 923         if (!mm)
 924                 return -EINVAL;
 925
 926         /*
 927          * Check if this process has the right to modify the specified
 928          * process. The right exists if the process has administrative
 929          * capabilities, superuser privileges or the same
 930          * userid as the target process.
 931          */
 932         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 933             (current->uid != task->suid) && (current->uid != task->uid) &&
 934             !capable(CAP_SYS_NICE)) {
 935                 err = -EPERM;
 936                 goto out;
 937         }
 938
 939         task_nodes = cpuset_mems_allowed(task);
 940         /* Is the user allowed to access the target nodes? */
 941         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 942                 err = -EPERM;
 943                 goto out;
 944         }
 945
 946         err = security_task_movememory(task);
 947         if (err)
 948                 goto out;
 949
 950         err = do_migrate_pages(mm, &old, &new,
 951                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 952 out:
 953         mmput(mm);
 954         return err;
 955 }
 956
 957
 958 /* Retrieve NUMA policy */
 959 asmlinkage long sys_get_mempolicy(int __user *policy,
 960                                 unsigned long __user *nmask,
 961                                 unsigned long maxnode,
 962                                 unsigned long addr, unsigned long flags)
 963 {
 964         int err, pval;
 965         nodemask_t nodes;
 966
 967         if (nmask != NULL && maxnode < MAX_NUMNODES)
 968                 return -EINVAL;
 969
 970         err = do_get_mempolicy(&pval, &nodes, addr, flags);
 971
 972         if (err)
 973                 return err;
 974
 975         if (policy && put_user(pval, policy))
 976                 return -EFAULT;
 977
 978         if (nmask)
 979                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
 980
 981         return err;
 982 }
 983
 984 #ifdef CONFIG_COMPAT
 985
 986 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
 987                                      compat_ulong_t __user *nmask,
 988                                      compat_ulong_t maxnode,
 989                                      compat_ulong_t addr, compat_ulong_t flags)
 990 {
 991         long err;
 992         unsigned long __user *nm = NULL;
 993         unsigned long nr_bits, alloc_size;
 994         DECLARE_BITMAP(bm, MAX_NUMNODES);
 995
 996         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
 997         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
 998
 999         if (nmask)
1000                 nm = compat_alloc_user_space(alloc_size);
1001
1002         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1003
1004         if (!err && nmask) {
1005                 err = copy_from_user(bm, nm, alloc_size);
1006                 /* ensure entire bitmap is zeroed */
1007                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1008                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1009         }
1010
1011         return err;
1012 }
1013
1014 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1015                                      compat_ulong_t maxnode)
1016 {
1017         long err = 0;
1018         unsigned long __user *nm = NULL;
1019         unsigned long nr_bits, alloc_size;
1020         DECLARE_BITMAP(bm, MAX_NUMNODES);
1021
1022         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1023         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1024
1025         if (nmask) {
1026                 err = compat_get_bitmap(bm, nmask, nr_bits);
1027                 nm = compat_alloc_user_space(alloc_size);
1028                 err |= copy_to_user(nm, bm, alloc_size);
1029         }
1030
1031         if (err)
1032                 return -EFAULT;
1033
1034         return sys_set_mempolicy(mode, nm, nr_bits+1);
1035 }
1036
1037 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1038                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1039                              compat_ulong_t maxnode, compat_ulong_t flags)
1040 {
1041         long err = 0;
1042         unsigned long __user *nm = NULL;
1043         unsigned long nr_bits, alloc_size;
1044         nodemask_t bm;
1045
1046         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1047         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1048
1049         if (nmask) {
1050                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1051                 nm = compat_alloc_user_space(alloc_size);
1052                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1053         }
1054
1055         if (err)
1056                 return -EFAULT;
1057
1058         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1059 }
1060
1061 #endif
1062
1063 /* Return effective policy for a VMA */
1064 static struct mempolicy * get_vma_policy(struct task_struct *task,
1065                 struct vm_area_struct *vma, unsigned long addr)
1066 {
1067         struct mempolicy *pol = task->mempolicy;
1068
1069         if (vma) {
1070                 if (vma->vm_ops && vma->vm_ops->get_policy)
1071                         pol = vma->vm_ops->get_policy(vma, addr);
1072                 else if (vma->vm_policy &&
1073                                 vma->vm_policy->policy != MPOL_DEFAULT)
1074                         pol = vma->vm_policy;
1075         }
1076         if (!pol)
1077                 pol = &default_policy;
1078         return pol;
1079 }
1080
1081 /* Return a zonelist representing a mempolicy */
1082 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1083 {
1084         int nd;
1085
1086         switch (policy->policy) {
1087         case MPOL_PREFERRED:
1088                 nd = policy->v.preferred_node;
1089                 if (nd < 0)
1090                         nd = numa_node_id();
1091                 break;
1092         case MPOL_BIND:
1093                 /* Lower zones don't get a policy applied */
1094                 /* Careful: current->mems_allowed might have moved */
1095                 if (gfp_zone(gfp) >= policy_zone)
1096                         if (cpuset_zonelist_valid_mems_allowed(policy->v.zonelist))
1097                                 return policy->v.zonelist;
1098                 /*FALL THROUGH*/
1099         case MPOL_INTERLEAVE: /* should not happen */
1100         case MPOL_DEFAULT:
1101                 nd = numa_node_id();
1102                 break;
1103         default:
1104                 nd = 0;
1105                 BUG();
1106         }
1107         return NODE_DATA(nd)->node_zonelists + gfp_zone(gfp);
1108 }
1109
1110 /* Do dynamic interleaving for a process */
1111 static unsigned interleave_nodes(struct mempolicy *policy)
1112 {
1113         unsigned nid, next;
1114         struct task_struct *me = current;
1115
1116         nid = me->il_next;
1117         next = next_node(nid, policy->v.nodes);
1118         if (next >= MAX_NUMNODES)
1119                 next = first_node(policy->v.nodes);
1120         me->il_next = next;
1121         return nid;
1122 }
1123
1124 /*
1125  * Depending on the memory policy provide a node from which to allocate the
1126  * next slab entry.
1127  */
1128 unsigned slab_node(struct mempolicy *policy)
1129 {
1130         switch (policy->policy) {
1131         case MPOL_INTERLEAVE:
1132                 return interleave_nodes(policy);
1133
1134         case MPOL_BIND:
1135                 /*
1136                  * Follow bind policy behavior and start allocation at the
1137                  * first node.
1138                  */
1139                 return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
1140
1141         case MPOL_PREFERRED:
1142                 if (policy->v.preferred_node >= 0)
1143                         return policy->v.preferred_node;
1144                 /* Fall through */
1145
1146         default:
1147                 return numa_node_id();
1148         }
1149 }
1150
1151 /* Do static interleaving for a VMA with known offset. */
1152 static unsigned offset_il_node(struct mempolicy *pol,
1153                 struct vm_area_struct *vma, unsigned long off)
1154 {
1155         unsigned nnodes = nodes_weight(pol->v.nodes);
1156         unsigned target = (unsigned)off % nnodes;
1157         int c;
1158         int nid = -1;
1159
1160         c = 0;
1161         do {
1162                 nid = next_node(nid, pol->v.nodes);
1163                 c++;
1164         } while (c <= target);
1165         return nid;
1166 }
1167
1168 /* Determine a node number for interleave */
1169 static inline unsigned interleave_nid(struct mempolicy *pol,
1170                  struct vm_area_struct *vma, unsigned long addr, int shift)
1171 {
1172         if (vma) {
1173                 unsigned long off;
1174
1175                 off = vma->vm_pgoff;
1176                 off += (addr - vma->vm_start) >> shift;
1177                 return offset_il_node(pol, vma, off);
1178         } else
1179                 return interleave_nodes(pol);
1180 }
1181
1182 #ifdef CONFIG_HUGETLBFS
1183 /* Return a zonelist suitable for a huge page allocation. */
1184 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
1185 {
1186         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1187
1188         if (pol->policy == MPOL_INTERLEAVE) {
1189                 unsigned nid;
1190
1191                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1192                 return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
1193         }
1194         return zonelist_policy(GFP_HIGHUSER, pol);
1195 }
1196 #endif
1197
1198 /* Allocate a page in interleaved policy.
1199    Own path because it needs to do special accounting. */
1200 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1201                                         unsigned nid)
1202 {
1203         struct zonelist *zl;
1204         struct page *page;
1205
1206         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
1207         page = __alloc_pages(gfp, order, zl);
1208         if (page && page_zone(page) == zl->zones[0]) {
1209                 zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
1210                 put_cpu();
1211         }
1212         return page;
1213 }
1214
1215 /**
1216  *      alloc_page_vma  - Allocate a page for a VMA.
1217  *
1218  *      @gfp:
1219  *      %GFP_USER    user allocation.
1220  *      %GFP_KERNEL  kernel allocations,
1221  *      %GFP_HIGHMEM highmem/user allocations,
1222  *      %GFP_FS      allocation should not call back into a file system.
1223  *      %GFP_ATOMIC  don't sleep.
1224  *
1225  *      @vma:  Pointer to VMA or NULL if not available.
1226  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1227  *
1228  *      This function allocates a page from the kernel page pool and applies
1229  *      a NUMA policy associated with the VMA or the current process.
1230  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1231  *      mm_struct of the VMA to prevent it from going away. Should be used for
1232  *      all allocations for pages that will be mapped into
1233  *      user space. Returns NULL when no page can be allocated.
1234  *
1235  *      Should be called with the mm_sem of the vma hold.
1236  */
1237 struct page *
1238 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1239 {
1240         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1241
1242         cpuset_update_task_memory_state();
1243
1244         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1245                 unsigned nid;
1246
1247                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1248                 return alloc_page_interleave(gfp, 0, nid);
1249         }
1250         return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
1251 }
1252
1253 /**
1254  *      alloc_pages_current - Allocate pages.
1255  *
1256  *      @gfp:
1257  *              %GFP_USER   user allocation,
1258  *              %GFP_KERNEL kernel allocation,
1259  *              %GFP_HIGHMEM highmem allocation,
1260  *              %GFP_FS     don't call back into a file system.
1261  *              %GFP_ATOMIC don't sleep.
1262  *      @order: Power of two of allocation size in pages. 0 is a single page.
1263  *
1264  *      Allocate a page from the kernel page pool.  When not in
1265  *      interrupt context and apply the current process NUMA policy.
1266  *      Returns NULL when no page can be allocated.
1267  *
1268  *      Don't call cpuset_update_task_memory_state() unless
1269  *      1) it's ok to take cpuset_sem (can WAIT), and
1270  *      2) allocating for current task (not interrupt).
1271  */
1272 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1273 {
1274         struct mempolicy *pol = current->mempolicy;
1275
1276         if ((gfp & __GFP_WAIT) && !in_interrupt())
1277                 cpuset_update_task_memory_state();
1278         if (!pol || in_interrupt())
1279                 pol = &default_policy;
1280         if (pol->policy == MPOL_INTERLEAVE)
1281                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1282         return __alloc_pages(gfp, order, zonelist_policy(gfp, pol));
1283 }
1284 EXPORT_SYMBOL(alloc_pages_current);
1285
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after their cpuset moves.  See
 * update_nodemask() in kernel/cpuset.c for further details.
 */
1293 void *cpuset_being_rebound;
1294
1295 /* Slow path of a mempolicy copy */
1296 struct mempolicy *__mpol_copy(struct mempolicy *old)
1297 {
1298         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1299
1300         if (!new)
1301                 return ERR_PTR(-ENOMEM);
1302         if (current_cpuset_is_being_rebound()) {
1303                 nodemask_t mems = cpuset_mems_allowed(current);
1304                 mpol_rebind_policy(old, &mems);
1305         }
1306         *new = *old;
1307         atomic_set(&new->refcnt, 1);
1308         if (new->policy == MPOL_BIND) {
1309                 int sz = ksize(old->v.zonelist);
1310                 new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
1311                 if (!new->v.zonelist) {
1312                         kmem_cache_free(policy_cache, new);
1313                         return ERR_PTR(-ENOMEM);
1314                 }
1315                 memcpy(new->v.zonelist, old->v.zonelist, sz);
1316         }
1317         return new;
1318 }
1319
1320 /* Slow path of a mempolicy comparison */
1321 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1322 {
1323         if (!a || !b)
1324                 return 0;
1325         if (a->policy != b->policy)
1326                 return 0;
1327         switch (a->policy) {
1328         case MPOL_DEFAULT:
1329                 return 1;
1330         case MPOL_INTERLEAVE:
1331                 return nodes_equal(a->v.nodes, b->v.nodes);
1332         case MPOL_PREFERRED:
1333                 return a->v.preferred_node == b->v.preferred_node;
1334         case MPOL_BIND: {
1335                 int i;
1336                 for (i = 0; a->v.zonelist->zones[i]; i++)
1337                         if (a->v.zonelist->zones[i] != b->v.zonelist->zones[i])
1338                                 return 0;
1339                 return b->v.zonelist->zones[i] == NULL;
1340         }
1341         default:
1342                 BUG();
1343                 return 0;
1344         }
1345 }
1346
1347 /* Slow path of a mpol destructor. */
1348 void __mpol_free(struct mempolicy *p)
1349 {
1350         if (!atomic_dec_and_test(&p->refcnt))
1351                 return;
1352         if (p->policy == MPOL_BIND)
1353                 kfree(p->v.zonelist);
1354         p->policy = MPOL_DEFAULT;
1355         kmem_cache_free(policy_cache, p);
1356 }
1357
1358 /*
1359  * Shared memory backing store policy support.
1360  *
1361  * Remember policies even when nobody has shared memory mapped.
1362  * The policies are kept in Red-Black tree linked from the inode.
1363  * They are protected by the sp->lock spinlock, which should be held
1364  * for any accesses to the tree.
1365  */
1366
1367 /* lookup first element intersecting start-end */
1368 /* Caller holds sp->lock */
1369 static struct sp_node *
1370 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1371 {
1372         struct rb_node *n = sp->root.rb_node;
1373
1374         while (n) {
1375                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1376
1377                 if (start >= p->end)
1378                         n = n->rb_right;
1379                 else if (end <= p->start)
1380                         n = n->rb_left;
1381                 else
1382                         break;
1383         }
1384         if (!n)
1385                 return NULL;
1386         for (;;) {
1387                 struct sp_node *w = NULL;
1388                 struct rb_node *prev = rb_prev(n);
1389                 if (!prev)
1390                         break;
1391                 w = rb_entry(prev, struct sp_node, nd);
1392                 if (w->end <= start)
1393                         break;
1394                 n = prev;
1395         }
1396         return rb_entry(n, struct sp_node, nd);
1397 }
1398
1399 /* Insert a new shared policy into the list. */
1400 /* Caller holds sp->lock */
1401 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1402 {
1403         struct rb_node **p = &sp->root.rb_node;
1404         struct rb_node *parent = NULL;
1405         struct sp_node *nd;
1406
1407         while (*p) {
1408                 parent = *p;
1409                 nd = rb_entry(parent, struct sp_node, nd);
1410                 if (new->start < nd->start)
1411                         p = &(*p)->rb_left;
1412                 else if (new->end > nd->end)
1413                         p = &(*p)->rb_right;
1414                 else
1415                         BUG();
1416         }
1417         rb_link_node(&new->nd, parent, p);
1418         rb_insert_color(&new->nd, &sp->root);
1419         PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
1420                  new->policy ? new->policy->policy : 0);
1421 }
1422
1423 /* Find shared policy intersecting idx */
1424 struct mempolicy *
1425 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1426 {
1427         struct mempolicy *pol = NULL;
1428         struct sp_node *sn;
1429
1430         if (!sp->root.rb_node)
1431                 return NULL;
1432         spin_lock(&sp->lock);
1433         sn = sp_lookup(sp, idx, idx+1);
1434         if (sn) {
1435                 mpol_get(sn->policy);
1436                 pol = sn->policy;
1437         }
1438         spin_unlock(&sp->lock);
1439         return pol;
1440 }
1441
1442 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1443 {
1444         PDprintk("deleting %lx-l%x\n", n->start, n->end);
1445         rb_erase(&n->nd, &sp->root);
1446         mpol_free(n->policy);
1447         kmem_cache_free(sn_cache, n);
1448 }
1449
1450 struct sp_node *
1451 sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
1452 {
1453         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1454
1455         if (!n)
1456                 return NULL;
1457         n->start = start;
1458         n->end = end;
1459         mpol_get(pol);
1460         n->policy = pol;
1461         return n;
1462 }
1463
1464 /* Replace a policy range. */
1465 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1466                                  unsigned long end, struct sp_node *new)
1467 {
1468         struct sp_node *n, *new2 = NULL;
1469
1470 restart:
1471         spin_lock(&sp->lock);
1472         n = sp_lookup(sp, start, end);
1473         /* Take care of old policies in the same range. */
1474         while (n && n->start < end) {
1475                 struct rb_node *next = rb_next(&n->nd);
1476                 if (n->start >= start) {
1477                         if (n->end <= end)
1478                                 sp_delete(sp, n);
1479                         else
1480                                 n->start = end;
1481                 } else {
1482                         /* Old policy spanning whole new range. */
1483                         if (n->end > end) {
1484                                 if (!new2) {
1485                                         spin_unlock(&sp->lock);
1486                                         new2 = sp_alloc(end, n->end, n->policy);
1487                                         if (!new2)
1488                                                 return -ENOMEM;
1489                                         goto restart;
1490                                 }
1491                                 n->end = start;
1492                                 sp_insert(sp, new2);
1493                                 new2 = NULL;
1494                                 break;
1495                         } else
1496                                 n->end = start;
1497                 }
1498                 if (!next)
1499                         break;
1500                 n = rb_entry(next, struct sp_node, nd);
1501         }
1502         if (new)
1503                 sp_insert(sp, new);
1504         spin_unlock(&sp->lock);
1505         if (new2) {
1506                 mpol_free(new2->policy);
1507                 kmem_cache_free(sn_cache, new2);
1508         }
1509         return 0;
1510 }
1511
1512 void mpol_shared_policy_init(struct shared_policy *info, int policy,
1513                                 nodemask_t *policy_nodes)
1514 {
1515         info->root = RB_ROOT;
1516         spin_lock_init(&info->lock);
1517
1518         if (policy != MPOL_DEFAULT) {
1519                 struct mempolicy *newpol;
1520
1521                 /* Falls back to MPOL_DEFAULT on any error */
1522                 newpol = mpol_new(policy, policy_nodes);
1523                 if (!IS_ERR(newpol)) {
1524                         /* Create pseudo-vma that contains just the policy */
1525                         struct vm_area_struct pvma;
1526
1527                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1528                         /* Policy covers entire file */
1529                         pvma.vm_end = TASK_SIZE;
1530                         mpol_set_shared_policy(info, &pvma, newpol);
1531                         mpol_free(newpol);
1532                 }
1533         }
1534 }
1535
1536 int mpol_set_shared_policy(struct shared_policy *info,
1537                         struct vm_area_struct *vma, struct mempolicy *npol)
1538 {
1539         int err;
1540         struct sp_node *new = NULL;
1541         unsigned long sz = vma_pages(vma);
1542
1543         PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
1544                  vma->vm_pgoff,
1545                  sz, npol? npol->policy : -1,
1546                 npol ? nodes_addr(npol->v.nodes)[0] : -1);
1547
1548         if (npol) {
1549                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1550                 if (!new)
1551                         return -ENOMEM;
1552         }
1553         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1554         if (err && new)
1555                 kmem_cache_free(sn_cache, new);
1556         return err;
1557 }
1558
1559 /* Free a backing policy store on inode delete. */
1560 void mpol_free_shared_policy(struct shared_policy *p)
1561 {
1562         struct sp_node *n;
1563         struct rb_node *next;
1564
1565         if (!p->root.rb_node)
1566                 return;
1567         spin_lock(&p->lock);
1568         next = rb_first(&p->root);
1569         while (next) {
1570                 n = rb_entry(next, struct sp_node, nd);
1571                 next = rb_next(&n->nd);
1572                 rb_erase(&n->nd, &p->root);
1573                 mpol_free(n->policy);
1574                 kmem_cache_free(sn_cache, n);
1575         }
1576         spin_unlock(&p->lock);
1577 }
1578
1579 /* assumes fs == KERNEL_DS */
1580 void __init numa_policy_init(void)
1581 {
1582         policy_cache = kmem_cache_create("numa_policy",
1583                                          sizeof(struct mempolicy),
1584                                          0, SLAB_PANIC, NULL, NULL);
1585
1586         sn_cache = kmem_cache_create("shared_policy_node",
1587                                      sizeof(struct sp_node),
1588                                      0, SLAB_PANIC, NULL, NULL);
1589
1590         /* Set interleaving policy for system init. This way not all
1591            the data structures allocated at system boot end up in node zero. */
1592
1593         if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
1594                 printk("numa_policy_init: interleaving failed\n");
1595 }
1596
1597 /* Reset policy of current process to default */
1598 void numa_default_policy(void)
1599 {
1600         do_set_mempolicy(MPOL_DEFAULT, NULL);
1601 }
1602
1603 /* Migrate a policy to a different set of nodes */
1604 void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
1605 {
1606         nodemask_t *mpolmask;
1607         nodemask_t tmp;
1608
1609         if (!pol)
1610                 return;
1611         mpolmask = &pol->cpuset_mems_allowed;
1612         if (nodes_equal(*mpolmask, *newmask))
1613                 return;
1614
1615         switch (pol->policy) {
1616         case MPOL_DEFAULT:
1617                 break;
1618         case MPOL_INTERLEAVE:
1619                 nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
1620                 pol->v.nodes = tmp;
1621                 *mpolmask = *newmask;
1622                 current->il_next = node_remap(current->il_next,
1623                                                 *mpolmask, *newmask);
1624                 break;
1625         case MPOL_PREFERRED:
1626                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
1627                                                 *mpolmask, *newmask);
1628                 *mpolmask = *newmask;
1629                 break;
1630         case MPOL_BIND: {
1631                 nodemask_t nodes;
1632                 struct zone **z;
1633                 struct zonelist *zonelist;
1634
1635                 nodes_clear(nodes);
1636                 for (z = pol->v.zonelist->zones; *z; z++)
1637                         node_set((*z)->zone_pgdat->node_id, nodes);
1638                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
1639                 nodes = tmp;
1640
1641                 zonelist = bind_zonelist(&nodes);
1642
1643                 /* If no mem, then zonelist is NULL and we keep old zonelist.
1644                  * If that old zonelist has no remaining mems_allowed nodes,
1645                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
1646                  */
1647
1648                 if (zonelist) {
1649                         /* Good - got mem - substitute new zonelist */
1650                         kfree(pol->v.zonelist);
1651                         pol->v.zonelist = zonelist;
1652                 }
1653                 *mpolmask = *newmask;
1654                 break;
1655         }
1656         default:
1657                 BUG();
1658                 break;
1659         }
1660 }
1661
1662 /*
1663  * Wrapper for mpol_rebind_policy() that just requires task
1664  * pointer, and updates task mempolicy.
1665  */
1666
1667 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1668 {
1669         mpol_rebind_policy(tsk->mempolicy, new);
1670 }
1671
1672 /*
1673  * Rebind each vma in mm to new nodemask.
1674  *
1675  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1676  */
1677
1678 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1679 {
1680         struct vm_area_struct *vma;
1681
1682         down_write(&mm->mmap_sem);
1683         for (vma = mm->mmap; vma; vma = vma->vm_next)
1684                 mpol_rebind_policy(vma->vm_policy, new);
1685         up_write(&mm->mmap_sem);
1686 }
1687
1688 /*
1689  * Display pages allocated per node and memory policy via /proc.
1690  */
1691
/*
 * Printable names of the memory policies.  mpol_to_str() indexes this
 * table with the policy mode (policy_types[mode]).  Neither the table
 * nor the strings are ever written, so both are const.
 */
static const char * const policy_types[] = {
        "default",
        "prefer",
        "bind",
        "interleave",
};
1694
1695 /*
1696  * Convert a mempolicy into a string.
1697  * Returns the number of characters in buffer (if positive)
1698  * or an error (negative)
1699  */
1700 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1701 {
1702         char *p = buffer;
1703         int l;
1704         nodemask_t nodes;
1705         int mode = pol ? pol->policy : MPOL_DEFAULT;
1706
1707         switch (mode) {
1708         case MPOL_DEFAULT:
1709                 nodes_clear(nodes);
1710                 break;
1711
1712         case MPOL_PREFERRED:
1713                 nodes_clear(nodes);
1714                 node_set(pol->v.preferred_node, nodes);
1715                 break;
1716
1717         case MPOL_BIND:
1718                 get_zonemask(pol, &nodes);
1719                 break;
1720
1721         case MPOL_INTERLEAVE:
1722                 nodes = pol->v.nodes;
1723                 break;
1724
1725         default:
1726                 BUG();
1727                 return -EFAULT;
1728         }
1729
1730         l = strlen(policy_types[mode]);
1731         if (buffer + maxlen < p + l + 1)
1732                 return -ENOSPC;
1733
1734         strcpy(p, policy_types[mode]);
1735         p += l;
1736
1737         if (!nodes_empty(nodes)) {
1738                 if (buffer + maxlen < p + 2)
1739                         return -ENOSPC;
1740                 *p++ = '=';
1741                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1742         }
1743         return p - buffer;
1744 }
1745
/*
 * Page statistics for one vma.  The counters are updated by
 * gather_stats() and printed by show_numa_map().
 */
struct numa_maps {
        unsigned long pages;            /* pages passed to gather_stats() */
        unsigned long anon;             /* pages with PageAnon() set */
        unsigned long active;           /* pages with PageActive() set */
        unsigned long writeback;        /* pages with PageWriteback() set */
        unsigned long mapcount_max;     /* largest page_mapcount() seen */
        unsigned long dirty;            /* dirty pte or PageDirty() set */
        unsigned long swapcache;        /* pages with PageSwapCache() set */
        unsigned long node[MAX_NUMNODES];       /* pages per node id */
};
1756
1757 static void gather_stats(struct page *page, void *private, int pte_dirty)
1758 {
1759         struct numa_maps *md = private;
1760         int count = page_mapcount(page);
1761
1762         md->pages++;
1763         if (pte_dirty || PageDirty(page))
1764                 md->dirty++;
1765
1766         if (PageSwapCache(page))
1767                 md->swapcache++;
1768
1769         if (PageActive(page))
1770                 md->active++;
1771
1772         if (PageWriteback(page))
1773                 md->writeback++;
1774
1775         if (PageAnon(page))
1776                 md->anon++;
1777
1778         if (count > md->mapcount_max)
1779                 md->mapcount_max = count;
1780
1781         md->node[page_to_nid(page)]++;
1782 }
1783
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Gather statistics for the huge pages mapped in [start, end) of vma.
 *
 * The range is walked in HPAGE_SIZE steps.  Addresses without a huge pte,
 * with an empty pte or without a page behind the pte are skipped; every
 * other page is handed to gather_stats() together with the pte dirty bit.
 */
static void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
        unsigned long va;

        for (va = start; va < end; va += HPAGE_SIZE) {
                pte_t *ptep = huge_pte_offset(vma->vm_mm, va & HPAGE_MASK);
                struct page *hpage;
                pte_t entry;

                if (ptep == NULL)
                        continue;

                entry = *ptep;
                if (pte_none(entry))
                        continue;

                hpage = pte_page(entry);
                /* The dirty bit is taken from *ptep, not from the copy. */
                if (hpage != NULL)
                        gather_stats(hpage, md, pte_dirty(*ptep));
        }
}
#else
/* Stub used when CONFIG_HUGETLB_PAGE is not set: nothing is counted. */
static inline void check_huge_range(struct vm_area_struct *vma,
                unsigned long start, unsigned long end,
                struct numa_maps *md)
{
}
#endif
1817
1818 int show_numa_map(struct seq_file *m, void *v)
1819 {
1820         struct task_struct *task = m->private;
1821         struct vm_area_struct *vma = v;
1822         struct numa_maps *md;
1823         struct file *file = vma->vm_file;
1824         struct mm_struct *mm = vma->vm_mm;
1825         int n;
1826         char buffer[50];
1827
1828         if (!mm)
1829                 return 0;
1830
1831         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
1832         if (!md)
1833                 return 0;
1834
1835         mpol_to_str(buffer, sizeof(buffer),
1836                         get_vma_policy(task, vma, vma->vm_start));
1837
1838         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
1839
1840         if (file) {
1841                 seq_printf(m, " file=");
1842                 seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
1843         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
1844                 seq_printf(m, " heap");
1845         } else if (vma->vm_start <= mm->start_stack &&
1846                         vma->vm_end >= mm->start_stack) {
1847                 seq_printf(m, " stack");
1848         }
1849
1850         if (is_vm_hugetlb_page(vma)) {
1851                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
1852                 seq_printf(m, " huge");
1853         } else {
1854                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
1855                                 &node_online_map, MPOL_MF_STATS, md);
1856         }
1857
1858         if (!md->pages)
1859                 goto out;
1860
1861         if (md->anon)
1862                 seq_printf(m," anon=%lu",md->anon);
1863
1864         if (md->dirty)
1865                 seq_printf(m," dirty=%lu",md->dirty);
1866
1867         if (md->pages != md->anon && md->pages != md->dirty)
1868                 seq_printf(m, " mapped=%lu", md->pages);
1869
1870         if (md->mapcount_max > 1)
1871                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
1872
1873         if (md->swapcache)
1874                 seq_printf(m," swapcache=%lu", md->swapcache);
1875
1876         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
1877                 seq_printf(m," active=%lu", md->active);
1878
1879         if (md->writeback)
1880                 seq_printf(m," writeback=%lu", md->writeback);
1881
1882         for_each_online_node(n)
1883                 if (md->node[n])
1884                         seq_printf(m, " N%d=%lu", n, md->node[n]);
1885 out:
1886         seq_putc(m, '\n');
1887         kfree(md);
1888
1889         if (m->count < m->size)
1890                 m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
1891         return 0;
1892 }
1893