SAFE public projects git trees. - safe/jmp/linux-2.6/blob - mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
   19  *                for anonymous memory. For process policy a per-process counter
   20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
   49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66    could replace all the switch()es with a mempolicy_ops structure.
  67 */
  68
  69 #include <linux/mempolicy.h>
  70 #include <linux/mm.h>
  71 #include <linux/highmem.h>
  72 #include <linux/hugetlb.h>
  73 #include <linux/kernel.h>
  74 #include <linux/sched.h>
  75 #include <linux/nodemask.h>
  76 #include <linux/cpuset.h>
  77 #include <linux/gfp.h>
  78 #include <linux/slab.h>
  79 #include <linux/string.h>
  80 #include <linux/module.h>
  81 #include <linux/nsproxy.h>
  82 #include <linux/interrupt.h>
  83 #include <linux/init.h>
  84 #include <linux/compat.h>
  85 #include <linux/swap.h>
  86 #include <linux/seq_file.h>
  87 #include <linux/proc_fs.h>
  88 #include <linux/migrate.h>
  89 #include <linux/rmap.h>
  90 #include <linux/security.h>
  91 #include <linux/syscalls.h>
  92
  93 #include <asm/tlbflush.h>
  94 #include <asm/uaccess.h>
  95
   96 /* Internal flags */
   97 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for contiguous vmas */
   98 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
   99 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
  100
      /* Slab cache that mpol_new() allocates struct mempolicy objects from. */
  101 static struct kmem_cache *policy_cache;
      /* NOTE(review): sn_cache has no user in this part of the file; confirm its role further down. */
  102 static struct kmem_cache *sn_cache;
  103
  104 /* Highest zone. A specific allocation for a zone below that is not
  105    policied. */
  106 enum zone_type policy_zone = 0;
  107
      /* Stand-in that do_get_mempolicy() reports when no task or VMA policy is set. */
  108 struct mempolicy default_policy = {
  109         .refcnt = ATOMIC_INIT(1), /* never free it */
  110         .policy = MPOL_DEFAULT,
  111 };
  112
      /* Defined later in the file. */
  113 static void mpol_rebind_policy(struct mempolicy *pol,
  114                                const nodemask_t *newmask);
 115
 116 /* Check that the nodemask contains at least one populated zone */
 117 static int is_valid_nodemask(nodemask_t *nodemask)
 118 {
 119         int nd, k;
 120
 121         /* Check that there is something useful in this mask */
 122         k = policy_zone;
 123
 124         for_each_node_mask(nd, *nodemask) {
 125                 struct zone *z;
 126
 127                 for (k = 0; k <= policy_zone; k++) {
 128                         z = &NODE_DATA(nd)->node_zones[k];
 129                         if (z->present_pages > 0)
 130                                 return 1;
 131                 }
 132         }
 133
 134         return 0;
 135 }
 136
 137 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 138 {
 139         return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 140 }
 141
 142 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 143                                    const nodemask_t *rel)
 144 {
 145         nodemask_t tmp;
 146         nodes_fold(tmp, *orig, nodes_weight(*rel));
 147         nodes_onto(*ret, tmp, *rel);
 148 }
 149
 150 /* Create a new policy */
 151 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 152                                   nodemask_t *nodes)
 153 {
 154         struct mempolicy *policy;
 155         nodemask_t cpuset_context_nmask;
 156
 157         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 158                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 159
 160         if (mode == MPOL_DEFAULT)
 161                 return (nodes && nodes_weight(*nodes)) ? ERR_PTR(-EINVAL) :
 162                                                          NULL;
 163         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 164         if (!policy)
 165                 return ERR_PTR(-ENOMEM);
 166         atomic_set(&policy->refcnt, 1);
 167         cpuset_update_task_memory_state();
 168         if (flags & MPOL_F_RELATIVE_NODES)
 169                 mpol_relative_nodemask(&cpuset_context_nmask, nodes,
 170                                        &cpuset_current_mems_allowed);
 171         else
 172                 nodes_and(cpuset_context_nmask, *nodes,
 173                           cpuset_current_mems_allowed);
 174         switch (mode) {
 175         case MPOL_INTERLEAVE:
 176                 if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask))
 177                         goto free;
 178                 policy->v.nodes = cpuset_context_nmask;
 179                 break;
 180         case MPOL_PREFERRED:
 181                 policy->v.preferred_node = first_node(cpuset_context_nmask);
 182                 if (policy->v.preferred_node >= MAX_NUMNODES)
 183                         goto free;
 184                 break;
 185         case MPOL_BIND:
 186                 if (!is_valid_nodemask(&cpuset_context_nmask))
 187                         goto free;
 188                 policy->v.nodes = cpuset_context_nmask;
 189                 break;
 190         default:
 191                 BUG();
 192         }
 193         policy->policy = mode;
 194         policy->flags = flags;
 195         if (mpol_store_user_nodemask(policy))
 196                 policy->w.user_nodemask = *nodes;
 197         else
 198                 policy->w.cpuset_mems_allowed = cpuset_mems_allowed(current);
 199         return policy;
 200
 201 free:
 202         kmem_cache_free(policy_cache, policy);
 203         return ERR_PTR(-EINVAL);
 204 }
 205
  206 static void gather_stats(struct page *, void *, int pte_dirty);
      /* migrate_page_add() has a real and an empty variant further down, selected by CONFIG_MIGRATION. */
  207 static void migrate_page_add(struct page *page, struct list_head *pagelist,
  208                                 unsigned long flags);
 209
  210 /*
       * Scan through pages checking if pages follow certain conditions.
       *
       * Walks the ptes under @pmd that cover [addr, end) with the pte lock
       * held.  A page counts as a hit when its membership in @nodes differs
       * from the MPOL_MF_INVERT setting.  Hits are passed to gather_stats()
       * (MPOL_MF_STATS) or migrate_page_add() (MPOL_MF_MOVE, MPOL_MF_MOVE_ALL)
       * together with @private; without any of those flags the first hit
       * ends the scan.
       *
       * Returns non-zero if the scan stopped before reaching @end.
       */
  211 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
  212                 unsigned long addr, unsigned long end,
  213                 const nodemask_t *nodes, unsigned long flags,
  214                 void *private)
  215 {
  216         pte_t *orig_pte;
  217         pte_t *pte;
  218         spinlock_t *ptl;
  219
  220         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
              /* Each "continue" below jumps to the while condition, which steps to the next pte. */
  221         do {
  222                 struct page *page;
  223                 int nid;
  224
  225                 if (!pte_present(*pte))
  226                         continue;
  227                 page = vm_normal_page(vma, addr, *pte);
  228                 if (!page)
  229                         continue;
  230                 /*
  231                  * The check for PageReserved here is important to avoid
  232                  * handling zero pages and other pages that may have been
  233                  * marked special by the system.
  234                  *
  235                  * If the PageReserved would not be checked here then f.e.
  236                  * the location of the zero page could have an influence
  237                  * on MPOL_MF_STRICT, zero pages would be counted for
  238                  * the per node stats, and there would be useless attempts
  239                  * to put zero pages on the migration list.
  240                  */
  241                 if (PageReserved(page))
  242                         continue;
  243                 nid = page_to_nid(page);
                      /* Not a hit: node membership equals the MPOL_MF_INVERT setting. */
  244                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
  245                         continue;
  246
  247                 if (flags & MPOL_MF_STATS)
  248                         gather_stats(page, private, pte_dirty(*pte));
  249                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
  250                         migrate_page_add(page, private, flags);
  251                 else
                              /* Pure check: stop here, leaving addr short of end. */
  252                         break;
  253         } while (pte++, addr += PAGE_SIZE, addr != end);
  254         pte_unmap_unlock(orig_pte, ptl);
              /* addr reaches end only when the loop was not left through break. */
  255         return addr != end;
  256 }
 257
 258 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 259                 unsigned long addr, unsigned long end,
 260                 const nodemask_t *nodes, unsigned long flags,
 261                 void *private)
 262 {
 263         pmd_t *pmd;
 264         unsigned long next;
 265
 266         pmd = pmd_offset(pud, addr);
 267         do {
 268                 next = pmd_addr_end(addr, end);
 269                 if (pmd_none_or_clear_bad(pmd))
 270                         continue;
 271                 if (check_pte_range(vma, pmd, addr, next, nodes,
 272                                     flags, private))
 273                         return -EIO;
 274         } while (pmd++, addr = next, addr != end);
 275         return 0;
 276 }
 277
 278 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 279                 unsigned long addr, unsigned long end,
 280                 const nodemask_t *nodes, unsigned long flags,
 281                 void *private)
 282 {
 283         pud_t *pud;
 284         unsigned long next;
 285
 286         pud = pud_offset(pgd, addr);
 287         do {
 288                 next = pud_addr_end(addr, end);
 289                 if (pud_none_or_clear_bad(pud))
 290                         continue;
 291                 if (check_pmd_range(vma, pud, addr, next, nodes,
 292                                     flags, private))
 293                         return -EIO;
 294         } while (pud++, addr = next, addr != end);
 295         return 0;
 296 }
 297
 298 static inline int check_pgd_range(struct vm_area_struct *vma,
 299                 unsigned long addr, unsigned long end,
 300                 const nodemask_t *nodes, unsigned long flags,
 301                 void *private)
 302 {
 303         pgd_t *pgd;
 304         unsigned long next;
 305
 306         pgd = pgd_offset(vma->vm_mm, addr);
 307         do {
 308                 next = pgd_addr_end(addr, end);
 309                 if (pgd_none_or_clear_bad(pgd))
 310                         continue;
 311                 if (check_pud_range(vma, pgd, addr, next, nodes,
 312                                     flags, private))
 313                         return -EIO;
 314         } while (pgd++, addr = next, addr != end);
 315         return 0;
 316 }
 317
  318 /*
  319  * Check if all pages in a range are on a set of nodes.
  320  * With MPOL_MF_MOVE or MPOL_MF_MOVE_ALL the pages that fail the check are
  321  * handed to migrate_page_add() together with @private; with MPOL_MF_STATS
       * they are handed to gather_stats() instead.
       *
       * Returns the vma found by find_vma(mm, start), or an ERR_PTR():
       * the error from migrate_prep(), -EFAULT when no vma is found or (unless
       * MPOL_MF_DISCONTIG_OK is set) the range has a hole, or the error from
       * the page table walk.
  322  */
  323 static struct vm_area_struct *
  324 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
  325                 const nodemask_t *nodes, unsigned long flags, void *private)
  326 {
  327         int err;
  328         struct vm_area_struct *first, *vma, *prev;
  329
  330         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
  331
  332                 err = migrate_prep();
  333                 if (err)
  334                         return ERR_PTR(err);
  335         }
  336
  337         first = find_vma(mm, start);
  338         if (!first)
  339                 return ERR_PTR(-EFAULT);
  340         prev = NULL;
  341         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
  342                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
                              /* The last vma must reach @end ... */
  343                         if (!vma->vm_next && vma->vm_end < end)
  344                                 return ERR_PTR(-EFAULT);
                              /* ... and consecutive vmas must not leave a gap. */
  345                         if (prev && prev->vm_end < vma->vm_start)
  346                                 return ERR_PTR(-EFAULT);
  347                 }
                      /*
                       * hugetlb vmas are never walked.  Other vmas are walked for
                       * MPOL_MF_STRICT, or for the MOVE flags when vma_migratable()
                       * allows it.
                       */
  348                 if (!is_vm_hugetlb_page(vma) &&
  349                     ((flags & MPOL_MF_STRICT) ||
  350                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
  351                                 vma_migratable(vma)))) {
  352                         unsigned long endvma = vma->vm_end;
  353
                              /* Clamp the walk to the part of this vma inside the range. */
  354                         if (endvma > end)
  355                                 endvma = end;
  356                         if (vma->vm_start > start)
  357                                 start = vma->vm_start;
  358                         err = check_pgd_range(vma, start, endvma, nodes,
  359                                                 flags, private);
  360                         if (err) {
  361                                 first = ERR_PTR(err);
  362                                 break;
  363                         }
  364                 }
  365                 prev = vma;
  366         }
  367         return first;
  368 }
 369
 370 /* Apply policy to a single VMA */
 371 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 372 {
 373         int err = 0;
 374         struct mempolicy *old = vma->vm_policy;
 375
 376         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 377                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 378                  vma->vm_ops, vma->vm_file,
 379                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 380
 381         if (vma->vm_ops && vma->vm_ops->set_policy)
 382                 err = vma->vm_ops->set_policy(vma, new);
 383         if (!err) {
 384                 mpol_get(new);
 385                 vma->vm_policy = new;
 386                 mpol_free(old);
 387         }
 388         return err;
 389 }
 390
 391 /* Step 2: apply policy to a range and do splits. */
 392 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 393                        unsigned long end, struct mempolicy *new)
 394 {
 395         struct vm_area_struct *next;
 396         int err;
 397
 398         err = 0;
 399         for (; vma && vma->vm_start < end; vma = next) {
 400                 next = vma->vm_next;
 401                 if (vma->vm_start < start)
 402                         err = split_vma(vma->vm_mm, vma, start, 1);
 403                 if (!err && vma->vm_end > end)
 404                         err = split_vma(vma->vm_mm, vma, end, 0);
 405                 if (!err)
 406                         err = policy_vma(vma, new);
 407                 if (err)
 408                         break;
 409         }
 410         return err;
 411 }
 412
 413 /*
 414  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 415  * mempolicy.  Allows more rapid checking of this (combined perhaps
 416  * with other PF_* flag bits) on memory allocation hot code paths.
 417  *
 418  * If called from outside this file, the task 'p' should -only- be
 419  * a newly forked child not yet visible on the task list, because
 420  * manipulating the task flags of a visible task is not safe.
 421  *
 422  * The above limitation is why this routine has the funny name
 423  * mpol_fix_fork_child_flag().
 424  *
 425  * It is also safe to call this with a task pointer of current,
 426  * which the static wrapper mpol_set_task_struct_flag() does,
 427  * for use within this file.
 428  */
 429
 430 void mpol_fix_fork_child_flag(struct task_struct *p)
 431 {
 432         if (p->mempolicy)
 433                 p->flags |= PF_MEMPOLICY;
 434         else
 435                 p->flags &= ~PF_MEMPOLICY;
 436 }
 437
 438 static void mpol_set_task_struct_flag(void)
 439 {
 440         mpol_fix_fork_child_flag(current);
 441 }
 442
 443 /* Set the process memory policy */
 444 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 445                              nodemask_t *nodes)
 446 {
 447         struct mempolicy *new;
 448
 449         new = mpol_new(mode, flags, nodes);
 450         if (IS_ERR(new))
 451                 return PTR_ERR(new);
 452         mpol_free(current->mempolicy);
 453         current->mempolicy = new;
 454         mpol_set_task_struct_flag();
 455         if (new && new->policy == MPOL_INTERLEAVE &&
 456             nodes_weight(new->v.nodes))
 457                 current->il_next = first_node(new->v.nodes);
 458         return 0;
 459 }
 460
 461 /* Fill a zone bitmap for a policy */
 462 static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
 463 {
 464         nodes_clear(*nodes);
 465         switch (p->policy) {
 466         case MPOL_DEFAULT:
 467                 break;
 468         case MPOL_BIND:
 469                 /* Fall through */
 470         case MPOL_INTERLEAVE:
 471                 *nodes = p->v.nodes;
 472                 break;
 473         case MPOL_PREFERRED:
 474                 /* or use current node instead of memory_map? */
 475                 if (p->v.preferred_node < 0)
 476                         *nodes = node_states[N_HIGH_MEMORY];
 477                 else
 478                         node_set(p->v.preferred_node, *nodes);
 479                 break;
 480         default:
 481                 BUG();
 482         }
 483 }
 484
 485 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 486 {
 487         struct page *p;
 488         int err;
 489
 490         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 491         if (err >= 0) {
 492                 err = page_to_nid(p);
 493                 put_page(p);
 494         }
 495         return err;
 496 }
 497
  498 /*
       * Retrieve NUMA policy.
       *
       * @policy: receives the policy mode ORed with its flags, or a node id
       *          when MPOL_F_NODE is set
       * @nmask:  receives the node set of the policy (or, with
       *          MPOL_F_MEMS_ALLOWED, cpuset_current_mems_allowed)
       * @addr:   address to look up with MPOL_F_ADDR; must be 0 otherwise
       * @flags:  any of MPOL_F_NODE, MPOL_F_ADDR, MPOL_F_MEMS_ALLOWED
       *
       * Returns 0, -EINVAL for an invalid flag/argument combination, -EFAULT
       * when no vma covers @addr, or the error from lookup_node().
       */
  499 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
  500                              unsigned long addr, unsigned long flags)
  501 {
  502         int err;
  503         struct mm_struct *mm = current->mm;
              /* A non-NULL vma also means that mmap_sem is held for reading. */
  504         struct vm_area_struct *vma = NULL;
  505         struct mempolicy *pol = current->mempolicy;
  506
  507         cpuset_update_task_memory_state();
  508         if (flags &
  509                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
  510                 return -EINVAL;
  511
  512         if (flags & MPOL_F_MEMS_ALLOWED) {
  513                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
  514                         return -EINVAL;
  515                 *policy = 0;    /* just so it's initialized */
                      /* NOTE(review): nmask is stored to without the NULL check used further down. */
  516                 *nmask  = cpuset_current_mems_allowed;
  517                 return 0;
  518         }
  519
  520         if (flags & MPOL_F_ADDR) {
  521                 down_read(&mm->mmap_sem);
  522                 vma = find_vma_intersection(mm, addr, addr+1);
  523                 if (!vma) {
  524                         up_read(&mm->mmap_sem);
  525                         return -EFAULT;
  526                 }
                      /* The vma's own get_policy hook takes precedence over vm_policy. */
  527                 if (vma->vm_ops && vma->vm_ops->get_policy)
  528                         pol = vma->vm_ops->get_policy(vma, addr);
  529                 else
  530                         pol = vma->vm_policy;
  531         } else if (addr)
  532                 return -EINVAL;
  533
  534         if (!pol)
  535                 pol = &default_policy;
  536
  537         if (flags & MPOL_F_NODE) {
  538                 if (flags & MPOL_F_ADDR) {
                              /* Report the node of the page at addr. */
  539                         err = lookup_node(mm, addr);
  540                         if (err < 0)
  541                                 goto out;
  542                         *policy = err;
  543                 } else if (pol == current->mempolicy &&
  544                                 pol->policy == MPOL_INTERLEAVE) {
                              /* Report the next interleave node of the task policy. */
  545                         *policy = current->il_next;
  546                 } else {
  547                         err = -EINVAL;
  548                         goto out;
  549                 }
  550         } else
  551                 *policy = pol->policy | pol->flags;
  552
              /* Drop the lock here; clearing vma keeps the out: path from dropping it again. */
  553         if (vma) {
  554                 up_read(&current->mm->mmap_sem);
  555                 vma = NULL;
  556         }
  557
  558         err = 0;
  559         if (nmask)
  560                 get_zonemask(pol, nmask);
  561
  562  out:
              /* Error exits taken while mmap_sem was still held arrive with vma set. */
  563         if (vma)
  564                 up_read(&current->mm->mmap_sem);
  565         return err;
  566 }
 567
 568 #ifdef CONFIG_MIGRATION
 569 /*
 570  * page migration
 571  */
 572 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 573                                 unsigned long flags)
 574 {
 575         /*
 576          * Avoid migrating a page that is shared with others.
 577          */
 578         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
 579                 isolate_lru_page(page, pagelist);
 580 }
 581
 582 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 583 {
 584         return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
 585 }
 586
 587 /*
 588  * Migrate pages from one node to a target node.
 589  * Returns error or the number of pages not migrated.
 590  */
 591 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 592                            int flags)
 593 {
 594         nodemask_t nmask;
 595         LIST_HEAD(pagelist);
 596         int err = 0;
 597
 598         nodes_clear(nmask);
 599         node_set(source, nmask);
 600
 601         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 602                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 603
 604         if (!list_empty(&pagelist))
 605                 err = migrate_pages(&pagelist, new_node_page, dest);
 606
 607         return err;
 608 }
 609
 610 /*
 611  * Move pages between the two nodesets so as to preserve the physical
 612  * layout as much as possible.
 613  *
 614  * Returns the number of page that could not be moved.
 615  */
 616 int do_migrate_pages(struct mm_struct *mm,
 617         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 618 {
 619         LIST_HEAD(pagelist);
 620         int busy = 0;
 621         int err = 0;
 622         nodemask_t tmp;
 623
 624         down_read(&mm->mmap_sem);
 625
 626         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 627         if (err)
 628                 goto out;
 629
 630 /*
 631  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 632  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 633  * bit in 'tmp', and return that <source, dest> pair for migration.
 634  * The pair of nodemasks 'to' and 'from' define the map.
 635  *
 636  * If no pair of bits is found that way, fallback to picking some
 637  * pair of 'source' and 'dest' bits that are not the same.  If the
 638  * 'source' and 'dest' bits are the same, this represents a node
 639  * that will be migrating to itself, so no pages need move.
 640  *
 641  * If no bits are left in 'tmp', or if all remaining bits left
 642  * in 'tmp' correspond to the same bit in 'to', return false
 643  * (nothing left to migrate).
 644  *
 645  * This lets us pick a pair of nodes to migrate between, such that
 646  * if possible the dest node is not already occupied by some other
 647  * source node, minimizing the risk of overloading the memory on a
 648  * node that would happen if we migrated incoming memory to a node
 649  * before migrating outgoing memory source that same node.
 650  *
 651  * A single scan of tmp is sufficient.  As we go, we remember the
 652  * most recent <s, d> pair that moved (s != d).  If we find a pair
 653  * that not only moved, but what's better, moved to an empty slot
 654  * (d is not set in tmp), then we break out then, with that pair.
 655  * Otherwise when we finish scannng from_tmp, we at least have the
 656  * most recent <s, d> pair that moved.  If we get all the way through
 657  * the scan of tmp without finding any node that moved, much less
 658  * moved to an empty node, then there is nothing left worth migrating.
 659  */
 660
 661         tmp = *from_nodes;
 662         while (!nodes_empty(tmp)) {
 663                 int s,d;
 664                 int source = -1;
 665                 int dest = 0;
 666
 667                 for_each_node_mask(s, tmp) {
 668                         d = node_remap(s, *from_nodes, *to_nodes);
 669                         if (s == d)
 670                                 continue;
 671
 672                         source = s;     /* Node moved. Memorize */
 673                         dest = d;
 674
 675                         /* dest not in remaining from nodes? */
 676                         if (!node_isset(dest, tmp))
 677                                 break;
 678                 }
 679                 if (source == -1)
 680                         break;
 681
 682                 node_clear(source, tmp);
 683                 err = migrate_to_node(mm, source, dest, flags);
 684                 if (err > 0)
 685                         busy += err;
 686                 if (err < 0)
 687                         break;
 688         }
 689 out:
 690         up_read(&mm->mmap_sem);
 691         if (err < 0)
 692                 return err;
 693         return busy;
 694
 695 }
 696
 697 /*
 698  * Allocate a new page for page migration based on vma policy.
 699  * Start assuming that page is mapped by vma pointed to by @private.
 700  * Search forward from there, if not.  N.B., this assumes that the
 701  * list of pages handed to migrate_pages()--which is how we get here--
 702  * is in virtual address order.
 703  */
 704 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 705 {
 706         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 707         unsigned long uninitialized_var(address);
 708
 709         while (vma) {
 710                 address = page_address_in_vma(page, vma);
 711                 if (address != -EFAULT)
 712                         break;
 713                 vma = vma->vm_next;
 714         }
 715
 716         /*
 717          * if !vma, alloc_page_vma() will use task or system default policy
 718          */
 719         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 720 }
 721 #else
 722
  723 static void migrate_page_add(struct page *page, struct list_head *pagelist,
  724                                 unsigned long flags)
  725 {
              /* Built without CONFIG_MIGRATION: nothing is ever queued on @pagelist. */
  726 }
 727
  728 int do_migrate_pages(struct mm_struct *mm,
  729         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
  730 {
              /* Built without CONFIG_MIGRATION: the request is always refused. */
  731         return -ENOSYS;
  732 }
 733
  734 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
  735 {
              /* Built without CONFIG_MIGRATION: no replacement page is ever provided. */
  736         return NULL;
  737 }
 738 #endif
 739
 740 static long do_mbind(unsigned long start, unsigned long len,
 741                      unsigned short mode, unsigned short mode_flags,
 742                      nodemask_t *nmask, unsigned long flags)
 743 {
 744         struct vm_area_struct *vma;
 745         struct mm_struct *mm = current->mm;
 746         struct mempolicy *new;
 747         unsigned long end;
 748         int err;
 749         LIST_HEAD(pagelist);
 750
 751         if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 752                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 753                 return -EINVAL;
 754         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 755                 return -EPERM;
 756
 757         if (start & ~PAGE_MASK)
 758                 return -EINVAL;
 759
 760         if (mode == MPOL_DEFAULT)
 761                 flags &= ~MPOL_MF_STRICT;
 762
 763         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 764         end = start + len;
 765
 766         if (end < start)
 767                 return -EINVAL;
 768         if (end == start)
 769                 return 0;
 770
 771         new = mpol_new(mode, mode_flags, nmask);
 772         if (IS_ERR(new))
 773                 return PTR_ERR(new);
 774
 775         /*
 776          * If we are using the default policy then operation
 777          * on discontinuous address spaces is okay after all
 778          */
 779         if (!new)
 780                 flags |= MPOL_MF_DISCONTIG_OK;
 781
 782         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
 783                  start, start + len, mode, mode_flags,
 784                  nmask ? nodes_addr(*nmask)[0] : -1);
 785
 786         down_write(&mm->mmap_sem);
 787         vma = check_range(mm, start, end, nmask,
 788                           flags | MPOL_MF_INVERT, &pagelist);
 789
 790         err = PTR_ERR(vma);
 791         if (!IS_ERR(vma)) {
 792                 int nr_failed = 0;
 793
 794                 err = mbind_range(vma, start, end, new);
 795
 796                 if (!list_empty(&pagelist))
 797                         nr_failed = migrate_pages(&pagelist, new_vma_page,
 798                                                 (unsigned long)vma);
 799
 800                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
 801                         err = -EIO;
 802         }
 803
 804         up_write(&mm->mmap_sem);
 805         mpol_free(new);
 806         return err;
 807 }
 808
 809 /*
 810  * User space interface with variable sized bitmaps for nodelists.
 811  */
 812
  813 /*
       * Copy a node mask from user space.
       *
       * @nodes:   kernel nodemask to fill; it is always cleared first
       * @nmask:   user bitmap; NULL yields an empty mask
       * @maxnode: size given by the caller; it is decremented before use, so
       *           maxnode - 1 bits of @nmask are considered
       *
       * Returns 0 on success, -EINVAL if the size is out of range or a bit is
       * set in the user longs beyond what a kernel nodemask holds, -EFAULT if
       * user memory cannot be read.
       */
  814 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
  815                      unsigned long maxnode)
  816 {
  817         unsigned long k;
  818         unsigned long nlongs;
  819         unsigned long endmask;
  820
              /* A maxnode of 0 wraps around here and is rejected by the range check below. */
  821         --maxnode;
  822         nodes_clear(*nodes);
  823         if (maxnode == 0 || !nmask)
  824                 return 0;
  825         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
  826                 return -EINVAL;
  827
  828         nlongs = BITS_TO_LONGS(maxnode);
              /* endmask selects the bits of the last long that belong to the mask. */
  829         if ((maxnode % BITS_PER_LONG) == 0)
  830                 endmask = ~0UL;
  831         else
  832                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
  833
  834         /* When the user specified more nodes than supported just check
  835            if the non supported part is all zero. */
  836         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
  837                 if (nlongs > PAGE_SIZE/sizeof(long))
  838                         return -EINVAL;
  839                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
  840                         unsigned long t;
  841                         if (get_user(t, nmask + k))
  842                                 return -EFAULT;
                              /* Only the endmask bits of the final long are checked. */
  843                         if (k == nlongs - 1) {
  844                                 if (t & endmask)
  845                                         return -EINVAL;
  846                         } else if (t)
  847                                 return -EINVAL;
  848                 }
                      /* From here on only the part that fits a kernel nodemask is copied. */
  849                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
  850                 endmask = ~0UL;
  851         }
  852
  853         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
  854                 return -EFAULT;
              /* Clear the bits of the last copied long that lie outside endmask. */
  855         nodes_addr(*nodes)[nlongs-1] &= endmask;
  856         return 0;
  857 }
 858
 859 /* Copy a kernel node mask to user space */
 860 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
 861                               nodemask_t *nodes)
 862 {
 863         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
 864         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
 865
 866         if (copy > nbytes) {
 867                 if (copy > PAGE_SIZE)
 868                         return -EINVAL;
 869                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
 870                         return -EFAULT;
 871                 copy = nbytes;
 872         }
 873         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
 874 }
 875
 876 asmlinkage long sys_mbind(unsigned long start, unsigned long len,
 877                         unsigned long mode,
 878                         unsigned long __user *nmask, unsigned long maxnode,
 879                         unsigned flags)
 880 {
 881         nodemask_t nodes;
 882         int err;
 883         unsigned short mode_flags;
 884
 885         mode_flags = mode & MPOL_MODE_FLAGS;
 886         mode &= ~MPOL_MODE_FLAGS;
 887         if (mode >= MPOL_MAX)
 888                 return -EINVAL;
 889         if ((mode_flags & MPOL_F_STATIC_NODES) &&
 890             (mode_flags & MPOL_F_RELATIVE_NODES))
 891                 return -EINVAL;
 892         err = get_nodes(&nodes, nmask, maxnode);
 893         if (err)
 894                 return err;
 895         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
 896 }
 897
 898 /* Set the process memory policy */
 899 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 900                 unsigned long maxnode)
 901 {
 902         int err;
 903         nodemask_t nodes;
 904         unsigned short flags;
 905
 906         flags = mode & MPOL_MODE_FLAGS;
 907         mode &= ~MPOL_MODE_FLAGS;
 908         if ((unsigned int)mode >= MPOL_MAX)
 909                 return -EINVAL;
 910         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
 911                 return -EINVAL;
 912         err = get_nodes(&nodes, nmask, maxnode);
 913         if (err)
 914                 return err;
 915         return do_set_mempolicy(mode, flags, &nodes);
 916 }
 917
 918 asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
 919                 const unsigned long __user *old_nodes,
 920                 const unsigned long __user *new_nodes)
 921 {
 922         struct mm_struct *mm;
 923         struct task_struct *task;
 924         nodemask_t old;
 925         nodemask_t new;
 926         nodemask_t task_nodes;
 927         int err;
 928
 929         err = get_nodes(&old, old_nodes, maxnode);
 930         if (err)
 931                 return err;
 932
 933         err = get_nodes(&new, new_nodes, maxnode);
 934         if (err)
 935                 return err;
 936
 937         /* Find the mm_struct */
 938         read_lock(&tasklist_lock);
 939         task = pid ? find_task_by_vpid(pid) : current;
 940         if (!task) {
 941                 read_unlock(&tasklist_lock);
 942                 return -ESRCH;
 943         }
 944         mm = get_task_mm(task);
 945         read_unlock(&tasklist_lock);
 946
 947         if (!mm)
 948                 return -EINVAL;
 949
 950         /*
 951          * Check if this process has the right to modify the specified
 952          * process. The right exists if the process has administrative
 953          * capabilities, superuser privileges or the same
 954          * userid as the target process.
 955          */
 956         if ((current->euid != task->suid) && (current->euid != task->uid) &&
 957             (current->uid != task->suid) && (current->uid != task->uid) &&
 958             !capable(CAP_SYS_NICE)) {
 959                 err = -EPERM;
 960                 goto out;
 961         }
 962
 963         task_nodes = cpuset_mems_allowed(task);
 964         /* Is the user allowed to access the target nodes? */
 965         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
 966                 err = -EPERM;
 967                 goto out;
 968         }
 969
 970         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
 971                 err = -EINVAL;
 972                 goto out;
 973         }
 974
 975         err = security_task_movememory(task);
 976         if (err)
 977                 goto out;
 978
 979         err = do_migrate_pages(mm, &old, &new,
 980                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
 981 out:
 982         mmput(mm);
 983         return err;
 984 }
 985
 986
 987 /* Retrieve NUMA policy */
 988 asmlinkage long sys_get_mempolicy(int __user *policy,
 989                                 unsigned long __user *nmask,
 990                                 unsigned long maxnode,
 991                                 unsigned long addr, unsigned long flags)
 992 {
 993         int err;
 994         int uninitialized_var(pval);
 995         nodemask_t nodes;
 996
 997         if (nmask != NULL && maxnode < MAX_NUMNODES)
 998                 return -EINVAL;
 999
1000         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1001
1002         if (err)
1003                 return err;
1004
1005         if (policy && put_user(pval, policy))
1006                 return -EFAULT;
1007
1008         if (nmask)
1009                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1010
1011         return err;
1012 }
1013
1014 #ifdef CONFIG_COMPAT
1015
1016 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1017                                      compat_ulong_t __user *nmask,
1018                                      compat_ulong_t maxnode,
1019                                      compat_ulong_t addr, compat_ulong_t flags)
1020 {
1021         long err;
1022         unsigned long __user *nm = NULL;
1023         unsigned long nr_bits, alloc_size;
1024         DECLARE_BITMAP(bm, MAX_NUMNODES);
1025
1026         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1027         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1028
1029         if (nmask)
1030                 nm = compat_alloc_user_space(alloc_size);
1031
1032         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1033
1034         if (!err && nmask) {
1035                 err = copy_from_user(bm, nm, alloc_size);
1036                 /* ensure entire bitmap is zeroed */
1037                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1038                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1039         }
1040
1041         return err;
1042 }
1043
1044 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1045                                      compat_ulong_t maxnode)
1046 {
1047         long err = 0;
1048         unsigned long __user *nm = NULL;
1049         unsigned long nr_bits, alloc_size;
1050         DECLARE_BITMAP(bm, MAX_NUMNODES);
1051
1052         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1053         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1054
1055         if (nmask) {
1056                 err = compat_get_bitmap(bm, nmask, nr_bits);
1057                 nm = compat_alloc_user_space(alloc_size);
1058                 err |= copy_to_user(nm, bm, alloc_size);
1059         }
1060
1061         if (err)
1062                 return -EFAULT;
1063
1064         return sys_set_mempolicy(mode, nm, nr_bits+1);
1065 }
1066
1067 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1068                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1069                              compat_ulong_t maxnode, compat_ulong_t flags)
1070 {
1071         long err = 0;
1072         unsigned long __user *nm = NULL;
1073         unsigned long nr_bits, alloc_size;
1074         nodemask_t bm;
1075
1076         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1077         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1078
1079         if (nmask) {
1080                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1081                 nm = compat_alloc_user_space(alloc_size);
1082                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1083         }
1084
1085         if (err)
1086                 return -EFAULT;
1087
1088         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1089 }
1090
1091 #endif
1092
1093 /*
1094  * get_vma_policy(@task, @vma, @addr)
1095  * @task - task for fallback if vma policy == default
1096  * @vma   - virtual memory area whose policy is sought
1097  * @addr  - address in @vma for shared policy lookup
1098  *
1099  * Returns effective policy for a VMA at specified address.
1100  * Falls back to @task or system default policy, as necessary.
1101  * Returned policy has extra reference count if shared, vma,
1102  * or some other task's policy [show_numa_maps() can pass
1103  * @task != current].  It is the caller's responsibility to
1104  * free the reference in these cases.
1105  */
1106 static struct mempolicy * get_vma_policy(struct task_struct *task,
1107                 struct vm_area_struct *vma, unsigned long addr)
1108 {
1109         struct mempolicy *pol = task->mempolicy;
1110         int shared_pol = 0;
1111
1112         if (vma) {
1113                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1114                         pol = vma->vm_ops->get_policy(vma, addr);
1115                         shared_pol = 1; /* if pol non-NULL, add ref below */
1116                 } else if (vma->vm_policy &&
1117                                 vma->vm_policy->policy != MPOL_DEFAULT)
1118                         pol = vma->vm_policy;
1119         }
1120         if (!pol)
1121                 pol = &default_policy;
1122         else if (!shared_pol && pol != current->mempolicy)
1123                 mpol_get(pol);  /* vma or other task's policy */
1124         return pol;
1125 }
1126
1127 /* Return a nodemask representing a mempolicy */
1128 static nodemask_t *nodemask_policy(gfp_t gfp, struct mempolicy *policy)
1129 {
1130         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1131         if (unlikely(policy->policy == MPOL_BIND) &&
1132                         gfp_zone(gfp) >= policy_zone &&
1133                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1134                 return &policy->v.nodes;
1135
1136         return NULL;
1137 }
1138
1139 /* Return a zonelist representing a mempolicy */
1140 static struct zonelist *zonelist_policy(gfp_t gfp, struct mempolicy *policy)
1141 {
1142         int nd;
1143
1144         switch (policy->policy) {
1145         case MPOL_PREFERRED:
1146                 nd = policy->v.preferred_node;
1147                 if (nd < 0)
1148                         nd = numa_node_id();
1149                 break;
1150         case MPOL_BIND:
1151                 /*
1152                  * Normally, MPOL_BIND allocations node-local are node-local
1153                  * within the allowed nodemask. However, if __GFP_THISNODE is
1154                  * set and the current node is part of the mask, we use the
1155                  * the zonelist for the first node in the mask instead.
1156                  */
1157                 nd = numa_node_id();
1158                 if (unlikely(gfp & __GFP_THISNODE) &&
1159                                 unlikely(!node_isset(nd, policy->v.nodes)))
1160                         nd = first_node(policy->v.nodes);
1161                 break;
1162         case MPOL_INTERLEAVE: /* should not happen */
1163         case MPOL_DEFAULT:
1164                 nd = numa_node_id();
1165                 break;
1166         default:
1167                 nd = 0;
1168                 BUG();
1169         }
1170         return node_zonelist(nd, gfp);
1171 }
1172
1173 /* Do dynamic interleaving for a process */
1174 static unsigned interleave_nodes(struct mempolicy *policy)
1175 {
1176         unsigned nid, next;
1177         struct task_struct *me = current;
1178
1179         nid = me->il_next;
1180         next = next_node(nid, policy->v.nodes);
1181         if (next >= MAX_NUMNODES)
1182                 next = first_node(policy->v.nodes);
1183         if (next < MAX_NUMNODES)
1184                 me->il_next = next;
1185         return nid;
1186 }
1187
1188 /*
1189  * Depending on the memory policy provide a node from which to allocate the
1190  * next slab entry.
1191  */
1192 unsigned slab_node(struct mempolicy *policy)
1193 {
1194         unsigned short pol = policy ? policy->policy : MPOL_DEFAULT;
1195
1196         switch (pol) {
1197         case MPOL_INTERLEAVE:
1198                 return interleave_nodes(policy);
1199
1200         case MPOL_BIND: {
1201                 /*
1202                  * Follow bind policy behavior and start allocation at the
1203                  * first node.
1204                  */
1205                 struct zonelist *zonelist;
1206                 struct zone *zone;
1207                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1208                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1209                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1210                                                         &policy->v.nodes,
1211                                                         &zone);
1212                 return zone->node;
1213         }
1214
1215         case MPOL_PREFERRED:
1216                 if (policy->v.preferred_node >= 0)
1217                         return policy->v.preferred_node;
1218                 /* Fall through */
1219
1220         default:
1221                 return numa_node_id();
1222         }
1223 }
1224
1225 /* Do static interleaving for a VMA with known offset. */
1226 static unsigned offset_il_node(struct mempolicy *pol,
1227                 struct vm_area_struct *vma, unsigned long off)
1228 {
1229         unsigned nnodes = nodes_weight(pol->v.nodes);
1230         unsigned target;
1231         int c;
1232         int nid = -1;
1233
1234         if (!nnodes)
1235                 return numa_node_id();
1236         target = (unsigned int)off % nnodes;
1237         c = 0;
1238         do {
1239                 nid = next_node(nid, pol->v.nodes);
1240                 c++;
1241         } while (c <= target);
1242         return nid;
1243 }
1244
1245 /* Determine a node number for interleave */
1246 static inline unsigned interleave_nid(struct mempolicy *pol,
1247                  struct vm_area_struct *vma, unsigned long addr, int shift)
1248 {
1249         if (vma) {
1250                 unsigned long off;
1251
1252                 /*
1253                  * for small pages, there is no difference between
1254                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1255                  * for huge pages, since vm_pgoff is in units of small
1256                  * pages, we need to shift off the always 0 bits to get
1257                  * a useful offset.
1258                  */
1259                 BUG_ON(shift < PAGE_SHIFT);
1260                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1261                 off += (addr - vma->vm_start) >> shift;
1262                 return offset_il_node(pol, vma, off);
1263         } else
1264                 return interleave_nodes(pol);
1265 }
1266
1267 #ifdef CONFIG_HUGETLBFS
1268 /*
1269  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1270  * @vma = virtual memory area whose policy is sought
1271  * @addr = address in @vma for shared policy lookup and interleave policy
1272  * @gfp_flags = for requested zone
1273  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1274  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1275  *
1276  * Returns a zonelist suitable for a huge page allocation.
1277  * If the effective policy is 'BIND, returns pointer to local node's zonelist,
1278  * and a pointer to the mempolicy's @nodemask for filtering the zonelist.
1279  * If it is also a policy for which get_vma_policy() returns an extra
1280  * reference, we must hold that reference until after the allocation.
1281  * In that case, return policy via @mpol so hugetlb allocation can drop
1282  * the reference. For non-'BIND referenced policies, we can/do drop the
1283  * reference here, so the caller doesn't need to know about the special case
1284  * for default and current task policy.
1285  */
1286 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1287                                 gfp_t gfp_flags, struct mempolicy **mpol,
1288                                 nodemask_t **nodemask)
1289 {
1290         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1291         struct zonelist *zl;
1292
1293         *mpol = NULL;           /* probably no unref needed */
1294         *nodemask = NULL;       /* assume !MPOL_BIND */
1295         if (pol->policy == MPOL_BIND) {
1296                         *nodemask = &pol->v.nodes;
1297         } else if (pol->policy == MPOL_INTERLEAVE) {
1298                 unsigned nid;
1299
1300                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
1301                 if (unlikely(pol != &default_policy &&
1302                                 pol != current->mempolicy))
1303                         __mpol_free(pol);       /* finished with pol */
1304                 return node_zonelist(nid, gfp_flags);
1305         }
1306
1307         zl = zonelist_policy(GFP_HIGHUSER, pol);
1308         if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
1309                 if (pol->policy != MPOL_BIND)
1310                         __mpol_free(pol);       /* finished with pol */
1311                 else
1312                         *mpol = pol;    /* unref needed after allocation */
1313         }
1314         return zl;
1315 }
1316 #endif
1317
1318 /* Allocate a page in interleaved policy.
1319    Own path because it needs to do special accounting. */
1320 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1321                                         unsigned nid)
1322 {
1323         struct zonelist *zl;
1324         struct page *page;
1325
1326         zl = node_zonelist(nid, gfp);
1327         page = __alloc_pages(gfp, order, zl);
1328         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1329                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1330         return page;
1331 }
1332
1333 /**
1334  *      alloc_page_vma  - Allocate a page for a VMA.
1335  *
1336  *      @gfp:
1337  *      %GFP_USER    user allocation.
1338  *      %GFP_KERNEL  kernel allocations,
1339  *      %GFP_HIGHMEM highmem/user allocations,
1340  *      %GFP_FS      allocation should not call back into a file system.
1341  *      %GFP_ATOMIC  don't sleep.
1342  *
1343  *      @vma:  Pointer to VMA or NULL if not available.
1344  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1345  *
1346  *      This function allocates a page from the kernel page pool and applies
1347  *      a NUMA policy associated with the VMA or the current process.
1348  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1349  *      mm_struct of the VMA to prevent it from going away. Should be used for
1350  *      all allocations for pages that will be mapped into
1351  *      user space. Returns NULL when no page can be allocated.
1352  *
1353  *      Should be called with the mm_sem of the vma hold.
1354  */
1355 struct page *
1356 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1357 {
1358         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1359         struct zonelist *zl;
1360
1361         cpuset_update_task_memory_state();
1362
1363         if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
1364                 unsigned nid;
1365
1366                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1367                 if (unlikely(pol != &default_policy &&
1368                                 pol != current->mempolicy))
1369                         __mpol_free(pol);       /* finished with pol */
1370                 return alloc_page_interleave(gfp, 0, nid);
1371         }
1372         zl = zonelist_policy(gfp, pol);
1373         if (pol != &default_policy && pol != current->mempolicy) {
1374                 /*
1375                  * slow path: ref counted policy -- shared or vma
1376                  */
1377                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1378                                                 zl, nodemask_policy(gfp, pol));
1379                 __mpol_free(pol);
1380                 return page;
1381         }
1382         /*
1383          * fast path:  default or task policy
1384          */
1385         return __alloc_pages_nodemask(gfp, 0, zl, nodemask_policy(gfp, pol));
1386 }
1387
1388 /**
1389  *      alloc_pages_current - Allocate pages.
1390  *
1391  *      @gfp:
1392  *              %GFP_USER   user allocation,
1393  *              %GFP_KERNEL kernel allocation,
1394  *              %GFP_HIGHMEM highmem allocation,
1395  *              %GFP_FS     don't call back into a file system.
1396  *              %GFP_ATOMIC don't sleep.
1397  *      @order: Power of two of allocation size in pages. 0 is a single page.
1398  *
1399  *      Allocate a page from the kernel page pool.  When not in
1400  *      interrupt context and apply the current process NUMA policy.
1401  *      Returns NULL when no page can be allocated.
1402  *
1403  *      Don't call cpuset_update_task_memory_state() unless
1404  *      1) it's ok to take cpuset_sem (can WAIT), and
1405  *      2) allocating for current task (not interrupt).
1406  */
1407 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1408 {
1409         struct mempolicy *pol = current->mempolicy;
1410
1411         if ((gfp & __GFP_WAIT) && !in_interrupt())
1412                 cpuset_update_task_memory_state();
1413         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1414                 pol = &default_policy;
1415         if (pol->policy == MPOL_INTERLEAVE)
1416                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1417         return __alloc_pages_nodemask(gfp, order,
1418                         zonelist_policy(gfp, pol), nodemask_policy(gfp, pol));
1419 }
1420 EXPORT_SYMBOL(alloc_pages_current);
1421
/*
 * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
 * rebinds the mempolicy it is copying by calling mpol_rebind_policy()
 * with the mems_allowed returned by cpuset_mems_allowed().  This
 * keeps mempolicies cpuset-relative after their cpuset moves.  See
 * further kernel/cpuset.c update_nodemask().
 */
1429
1430 /* Slow path of a mempolicy copy */
1431 struct mempolicy *__mpol_copy(struct mempolicy *old)
1432 {
1433         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1434
1435         if (!new)
1436                 return ERR_PTR(-ENOMEM);
1437         if (current_cpuset_is_being_rebound()) {
1438                 nodemask_t mems = cpuset_mems_allowed(current);
1439                 mpol_rebind_policy(old, &mems);
1440         }
1441         *new = *old;
1442         atomic_set(&new->refcnt, 1);
1443         return new;
1444 }
1445
1446 static int mpol_match_intent(const struct mempolicy *a,
1447                              const struct mempolicy *b)
1448 {
1449         if (a->flags != b->flags)
1450                 return 0;
1451         if (!mpol_store_user_nodemask(a))
1452                 return 1;
1453         return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1454 }
1455
1456 /* Slow path of a mempolicy comparison */
1457 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1458 {
1459         if (!a || !b)
1460                 return 0;
1461         if (a->policy != b->policy)
1462                 return 0;
1463         if (a->policy != MPOL_DEFAULT && !mpol_match_intent(a, b))
1464                 return 0;
1465         switch (a->policy) {
1466         case MPOL_DEFAULT:
1467                 return 1;
1468         case MPOL_BIND:
1469                 /* Fall through */
1470         case MPOL_INTERLEAVE:
1471                 return nodes_equal(a->v.nodes, b->v.nodes);
1472         case MPOL_PREFERRED:
1473                 return a->v.preferred_node == b->v.preferred_node;
1474         default:
1475                 BUG();
1476                 return 0;
1477         }
1478 }
1479
1480 /* Slow path of a mpol destructor. */
1481 void __mpol_free(struct mempolicy *p)
1482 {
1483         if (!atomic_dec_and_test(&p->refcnt))
1484                 return;
1485         p->policy = MPOL_DEFAULT;
1486         kmem_cache_free(policy_cache, p);
1487 }
1488
1489 /*
1490  * Shared memory backing store policy support.
1491  *
1492  * Remember policies even when nobody has shared memory mapped.
1493  * The policies are kept in Red-Black tree linked from the inode.
1494  * They are protected by the sp->lock spinlock, which should be held
1495  * for any accesses to the tree.
1496  */
1497
1498 /* lookup first element intersecting start-end */
1499 /* Caller holds sp->lock */
1500 static struct sp_node *
1501 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1502 {
1503         struct rb_node *n = sp->root.rb_node;
1504
1505         while (n) {
1506                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1507
1508                 if (start >= p->end)
1509                         n = n->rb_right;
1510                 else if (end <= p->start)
1511                         n = n->rb_left;
1512                 else
1513                         break;
1514         }
1515         if (!n)
1516                 return NULL;
1517         for (;;) {
1518                 struct sp_node *w = NULL;
1519                 struct rb_node *prev = rb_prev(n);
1520                 if (!prev)
1521                         break;
1522                 w = rb_entry(prev, struct sp_node, nd);
1523                 if (w->end <= start)
1524                         break;
1525                 n = prev;
1526         }
1527         return rb_entry(n, struct sp_node, nd);
1528 }
1529
1530 /* Insert a new shared policy into the list. */
1531 /* Caller holds sp->lock */
1532 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1533 {
1534         struct rb_node **p = &sp->root.rb_node;
1535         struct rb_node *parent = NULL;
1536         struct sp_node *nd;
1537
1538         while (*p) {
1539                 parent = *p;
1540                 nd = rb_entry(parent, struct sp_node, nd);
1541                 if (new->start < nd->start)
1542                         p = &(*p)->rb_left;
1543                 else if (new->end > nd->end)
1544                         p = &(*p)->rb_right;
1545                 else
1546                         BUG();
1547         }
1548         rb_link_node(&new->nd, parent, p);
1549         rb_insert_color(&new->nd, &sp->root);
1550         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1551                  new->policy ? new->policy->policy : 0);
1552 }
1553
1554 /* Find shared policy intersecting idx */
1555 struct mempolicy *
1556 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1557 {
1558         struct mempolicy *pol = NULL;
1559         struct sp_node *sn;
1560
1561         if (!sp->root.rb_node)
1562                 return NULL;
1563         spin_lock(&sp->lock);
1564         sn = sp_lookup(sp, idx, idx+1);
1565         if (sn) {
1566                 mpol_get(sn->policy);
1567                 pol = sn->policy;
1568         }
1569         spin_unlock(&sp->lock);
1570         return pol;
1571 }
1572
1573 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1574 {
1575         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1576         rb_erase(&n->nd, &sp->root);
1577         mpol_free(n->policy);
1578         kmem_cache_free(sn_cache, n);
1579 }
1580
1581 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1582                                 struct mempolicy *pol)
1583 {
1584         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1585
1586         if (!n)
1587                 return NULL;
1588         n->start = start;
1589         n->end = end;
1590         mpol_get(pol);
1591         n->policy = pol;
1592         return n;
1593 }
1594
1595 /* Replace a policy range. */
1596 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1597                                  unsigned long end, struct sp_node *new)
1598 {
1599         struct sp_node *n, *new2 = NULL;
1600
1601 restart:
1602         spin_lock(&sp->lock);
1603         n = sp_lookup(sp, start, end);
1604         /* Take care of old policies in the same range. */
1605         while (n && n->start < end) {
1606                 struct rb_node *next = rb_next(&n->nd);
1607                 if (n->start >= start) {
1608                         if (n->end <= end)
1609                                 sp_delete(sp, n);
1610                         else
1611                                 n->start = end;
1612                 } else {
1613                         /* Old policy spanning whole new range. */
1614                         if (n->end > end) {
1615                                 if (!new2) {
1616                                         spin_unlock(&sp->lock);
1617                                         new2 = sp_alloc(end, n->end, n->policy);
1618                                         if (!new2)
1619                                                 return -ENOMEM;
1620                                         goto restart;
1621                                 }
1622                                 n->end = start;
1623                                 sp_insert(sp, new2);
1624                                 new2 = NULL;
1625                                 break;
1626                         } else
1627                                 n->end = start;
1628                 }
1629                 if (!next)
1630                         break;
1631                 n = rb_entry(next, struct sp_node, nd);
1632         }
1633         if (new)
1634                 sp_insert(sp, new);
1635         spin_unlock(&sp->lock);
1636         if (new2) {
1637                 mpol_free(new2->policy);
1638                 kmem_cache_free(sn_cache, new2);
1639         }
1640         return 0;
1641 }
1642
1643 void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
1644                         unsigned short flags, nodemask_t *policy_nodes)
1645 {
1646         info->root = RB_ROOT;
1647         spin_lock_init(&info->lock);
1648
1649         if (policy != MPOL_DEFAULT) {
1650                 struct mempolicy *newpol;
1651
1652                 /* Falls back to MPOL_DEFAULT on any error */
1653                 newpol = mpol_new(policy, flags, policy_nodes);
1654                 if (!IS_ERR(newpol)) {
1655                         /* Create pseudo-vma that contains just the policy */
1656                         struct vm_area_struct pvma;
1657
1658                         memset(&pvma, 0, sizeof(struct vm_area_struct));
1659                         /* Policy covers entire file */
1660                         pvma.vm_end = TASK_SIZE;
1661                         mpol_set_shared_policy(info, &pvma, newpol);
1662                         mpol_free(newpol);
1663                 }
1664         }
1665 }
1666
1667 int mpol_set_shared_policy(struct shared_policy *info,
1668                         struct vm_area_struct *vma, struct mempolicy *npol)
1669 {
1670         int err;
1671         struct sp_node *new = NULL;
1672         unsigned long sz = vma_pages(vma);
1673
1674         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
1675                  vma->vm_pgoff,
1676                  sz, npol ? npol->policy : -1,
1677                  npol ? npol->flags : -1,
1678                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
1679
1680         if (npol) {
1681                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
1682                 if (!new)
1683                         return -ENOMEM;
1684         }
1685         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
1686         if (err && new)
1687                 kmem_cache_free(sn_cache, new);
1688         return err;
1689 }
1690
1691 /* Free a backing policy store on inode delete. */
1692 void mpol_free_shared_policy(struct shared_policy *p)
1693 {
1694         struct sp_node *n;
1695         struct rb_node *next;
1696
1697         if (!p->root.rb_node)
1698                 return;
1699         spin_lock(&p->lock);
1700         next = rb_first(&p->root);
1701         while (next) {
1702                 n = rb_entry(next, struct sp_node, nd);
1703                 next = rb_next(&n->nd);
1704                 rb_erase(&n->nd, &p->root);
1705                 mpol_free(n->policy);
1706                 kmem_cache_free(sn_cache, n);
1707         }
1708         spin_unlock(&p->lock);
1709 }
1710
1711 /* assumes fs == KERNEL_DS */
1712 void __init numa_policy_init(void)
1713 {
1714         nodemask_t interleave_nodes;
1715         unsigned long largest = 0;
1716         int nid, prefer = 0;
1717
1718         policy_cache = kmem_cache_create("numa_policy",
1719                                          sizeof(struct mempolicy),
1720                                          0, SLAB_PANIC, NULL);
1721
1722         sn_cache = kmem_cache_create("shared_policy_node",
1723                                      sizeof(struct sp_node),
1724                                      0, SLAB_PANIC, NULL);
1725
1726         /*
1727          * Set interleaving policy for system init. Interleaving is only
1728          * enabled across suitably sized nodes (default is >= 16MB), or
1729          * fall back to the largest node if they're all smaller.
1730          */
1731         nodes_clear(interleave_nodes);
1732         for_each_node_state(nid, N_HIGH_MEMORY) {
1733                 unsigned long total_pages = node_present_pages(nid);
1734
1735                 /* Preserve the largest node */
1736                 if (largest < total_pages) {
1737                         largest = total_pages;
1738                         prefer = nid;
1739                 }
1740
1741                 /* Interleave this node? */
1742                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
1743                         node_set(nid, interleave_nodes);
1744         }
1745
1746         /* All too small, use the largest */
1747         if (unlikely(nodes_empty(interleave_nodes)))
1748                 node_set(prefer, interleave_nodes);
1749
1750         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
1751                 printk("numa_policy_init: interleaving failed\n");
1752 }
1753
1754 /* Reset policy of current process to default */
1755 void numa_default_policy(void)
1756 {
1757         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
1758 }
1759
1760 /* Migrate a policy to a different set of nodes */
1761 static void mpol_rebind_policy(struct mempolicy *pol,
1762                                const nodemask_t *newmask)
1763 {
1764         nodemask_t tmp;
1765         int static_nodes;
1766         int relative_nodes;
1767
1768         if (!pol)
1769                 return;
1770         static_nodes = pol->flags & MPOL_F_STATIC_NODES;
1771         relative_nodes = pol->flags & MPOL_F_RELATIVE_NODES;
1772         if (!mpol_store_user_nodemask(pol) &&
1773             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
1774                 return;
1775
1776         switch (pol->policy) {
1777         case MPOL_DEFAULT:
1778                 break;
1779         case MPOL_BIND:
1780                 /* Fall through */
1781         case MPOL_INTERLEAVE:
1782                 if (static_nodes)
1783                         nodes_and(tmp, pol->w.user_nodemask, *newmask);
1784                 else if (relative_nodes)
1785                         mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
1786                                                newmask);
1787                 else {
1788                         nodes_remap(tmp, pol->v.nodes,
1789                                     pol->w.cpuset_mems_allowed, *newmask);
1790                         pol->w.cpuset_mems_allowed = *newmask;
1791                 }
1792                 pol->v.nodes = tmp;
1793                 if (!node_isset(current->il_next, tmp)) {
1794                         current->il_next = next_node(current->il_next, tmp);
1795                         if (current->il_next >= MAX_NUMNODES)
1796                                 current->il_next = first_node(tmp);
1797                         if (current->il_next >= MAX_NUMNODES)
1798                                 current->il_next = numa_node_id();
1799                 }
1800                 break;
1801         case MPOL_PREFERRED:
1802                 if (static_nodes) {
1803                         int node = first_node(pol->w.user_nodemask);
1804
1805                         if (node_isset(node, *newmask))
1806                                 pol->v.preferred_node = node;
1807                         else
1808                                 pol->v.preferred_node = -1;
1809                 } else if (relative_nodes) {
1810                         mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
1811                                                newmask);
1812                         pol->v.preferred_node = first_node(tmp);
1813                 } else {
1814                         pol->v.preferred_node = node_remap(pol->v.preferred_node,
1815                                         pol->w.cpuset_mems_allowed, *newmask);
1816                         pol->w.cpuset_mems_allowed = *newmask;
1817                 }
1818                 break;
1819         default:
1820                 BUG();
1821                 break;
1822         }
1823 }
1824
1825 /*
1826  * Wrapper for mpol_rebind_policy() that just requires task
1827  * pointer, and updates task mempolicy.
1828  */
1829
1830 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
1831 {
1832         mpol_rebind_policy(tsk->mempolicy, new);
1833 }
1834
1835 /*
1836  * Rebind each vma in mm to new nodemask.
1837  *
1838  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
1839  */
1840
1841 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
1842 {
1843         struct vm_area_struct *vma;
1844
1845         down_write(&mm->mmap_sem);
1846         for (vma = mm->mmap; vma; vma = vma->vm_next)
1847                 mpol_rebind_policy(vma->vm_policy, new);
1848         up_write(&mm->mmap_sem);
1849 }
1850
1851 /*
1852  * Display pages allocated per node and memory policy via /proc.
1853  */
1854
/* Policy names, indexed by policy mode (see the lookup in mpol_to_str()). */
static const char * const policy_types[] = {
	"default",
	"prefer",
	"bind",
	"interleave",
};
1857
1858 /*
1859  * Convert a mempolicy into a string.
1860  * Returns the number of characters in buffer (if positive)
1861  * or an error (negative)
1862  */
1863 static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
1864 {
1865         char *p = buffer;
1866         int l;
1867         nodemask_t nodes;
1868         unsigned short mode = pol ? pol->policy : MPOL_DEFAULT;
1869         unsigned short flags = pol ? pol->flags : 0;
1870
1871         switch (mode) {
1872         case MPOL_DEFAULT:
1873                 nodes_clear(nodes);
1874                 break;
1875
1876         case MPOL_PREFERRED:
1877                 nodes_clear(nodes);
1878                 node_set(pol->v.preferred_node, nodes);
1879                 break;
1880
1881         case MPOL_BIND:
1882                 /* Fall through */
1883         case MPOL_INTERLEAVE:
1884                 nodes = pol->v.nodes;
1885                 break;
1886
1887         default:
1888                 BUG();
1889                 return -EFAULT;
1890         }
1891
1892         l = strlen(policy_types[mode]);
1893         if (buffer + maxlen < p + l + 1)
1894                 return -ENOSPC;
1895
1896         strcpy(p, policy_types[mode]);
1897         p += l;
1898
1899         if (flags) {
1900                 int need_bar = 0;
1901
1902                 if (buffer + maxlen < p + 2)
1903                         return -ENOSPC;
1904                 *p++ = '=';
1905
1906                 if (flags & MPOL_F_STATIC_NODES)
1907                         p += sprintf(p, "%sstatic", need_bar++ ? "|" : "");
1908                 if (flags & MPOL_F_RELATIVE_NODES)
1909                         p += sprintf(p, "%srelative", need_bar++ ? "|" : "");
1910         }
1911
1912         if (!nodes_empty(nodes)) {
1913                 if (buffer + maxlen < p + 2)
1914                         return -ENOSPC;
1915                 *p++ = '=';
1916                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
1917         }
1918         return p - buffer;
1919 }
1920
/*
 * Page statistics collected for one vma when producing its numa_maps line.
 * All counters are filled in by gather_stats(), one call per page.
 */
struct numa_maps {
	unsigned long pages;		/* pages seen in total */
	unsigned long anon;		/* pages for which PageAnon() is set */
	unsigned long active;		/* pages for which PageActive() is set */
	unsigned long writeback;	/* pages for which PageWriteback() is set */
	unsigned long mapcount_max;	/* largest page_mapcount() seen */
	unsigned long dirty;		/* pages with a dirty pte or PageDirty() */
	unsigned long swapcache;	/* pages for which PageSwapCache() is set */
	unsigned long node[MAX_NUMNODES]; /* pages per node, by page_to_nid() */
};
1931
1932 static void gather_stats(struct page *page, void *private, int pte_dirty)
1933 {
1934         struct numa_maps *md = private;
1935         int count = page_mapcount(page);
1936
1937         md->pages++;
1938         if (pte_dirty || PageDirty(page))
1939                 md->dirty++;
1940
1941         if (PageSwapCache(page))
1942                 md->swapcache++;
1943
1944         if (PageActive(page))
1945                 md->active++;
1946
1947         if (PageWriteback(page))
1948                 md->writeback++;
1949
1950         if (PageAnon(page))
1951                 md->anon++;
1952
1953         if (count > md->mapcount_max)
1954                 md->mapcount_max = count;
1955
1956         md->node[page_to_nid(page)]++;
1957 }
1958
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Collect page statistics for the huge pages mapped in [start, end) of
 * @vma into @md.
 *
 * The range is walked in HPAGE_SIZE steps.  Addresses with no huge pte,
 * with an empty pte, or whose pte yields no page are skipped; every other
 * page is passed to gather_stats().
 */
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	unsigned long addr;
	struct page *page;

	for (addr = start; addr < end; addr += HPAGE_SIZE) {
		pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
		pte_t pte;

		if (!ptep)
			continue;

		/*
		 * Take one snapshot of the entry and use it for every check
		 * below, so that the page and its dirty state come from the
		 * same pte value.
		 */
		pte = *ptep;
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (!page)
			continue;

		gather_stats(page, md, pte_dirty(pte));
	}
}
#else
/* Without CONFIG_HUGETLB_PAGE there is nothing to collect. */
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif
1992
1993 int show_numa_map(struct seq_file *m, void *v)
1994 {
1995         struct proc_maps_private *priv = m->private;
1996         struct vm_area_struct *vma = v;
1997         struct numa_maps *md;
1998         struct file *file = vma->vm_file;
1999         struct mm_struct *mm = vma->vm_mm;
2000         struct mempolicy *pol;
2001         int n;
2002         char buffer[50];
2003
2004         if (!mm)
2005                 return 0;
2006
2007         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2008         if (!md)
2009                 return 0;
2010
2011         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2012         mpol_to_str(buffer, sizeof(buffer), pol);
2013         /*
2014          * unref shared or other task's mempolicy
2015          */
2016         if (pol != &default_policy && pol != current->mempolicy)
2017                 __mpol_free(pol);
2018
2019         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2020
2021         if (file) {
2022                 seq_printf(m, " file=");
2023                 seq_path(m, &file->f_path, "\n\t= ");
2024         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2025                 seq_printf(m, " heap");
2026         } else if (vma->vm_start <= mm->start_stack &&
2027                         vma->vm_end >= mm->start_stack) {
2028                 seq_printf(m, " stack");
2029         }
2030
2031         if (is_vm_hugetlb_page(vma)) {
2032                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2033                 seq_printf(m, " huge");
2034         } else {
2035                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2036                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2037         }
2038
2039         if (!md->pages)
2040                 goto out;
2041
2042         if (md->anon)
2043                 seq_printf(m," anon=%lu",md->anon);
2044
2045         if (md->dirty)
2046                 seq_printf(m," dirty=%lu",md->dirty);
2047
2048         if (md->pages != md->anon && md->pages != md->dirty)
2049                 seq_printf(m, " mapped=%lu", md->pages);
2050
2051         if (md->mapcount_max > 1)
2052                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2053
2054         if (md->swapcache)
2055                 seq_printf(m," swapcache=%lu", md->swapcache);
2056
2057         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2058                 seq_printf(m," active=%lu", md->active);
2059
2060         if (md->writeback)
2061                 seq_printf(m," writeback=%lu", md->writeback);
2062
2063         for_each_node_state(n, N_HIGH_MEMORY)
2064                 if (md->node[n])
2065                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2066 out:
2067         seq_putc(m, '\n');
2068         kfree(md);
2069
2070         if (m->count < m->size)
2071                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2072         return 0;
2073 }