SAFE public projects git trees. - safe/jmp/linux-2.6/blob - mm/mempolicy.c

   1 /*
   2  * Simple NUMA memory policy for the Linux kernel.
   3  *
   4  * Copyright 2003,2004 Andi Kleen, SuSE Labs.
   5  * (C) Copyright 2005 Christoph Lameter, Silicon Graphics, Inc.
   6  * Subject to the GNU Public License, version 2.
   7  *
   8  * NUMA policy allows the user to give hints in which node(s) memory should
   9  * be allocated.
  10  *
  11  * Support four policies per VMA and per process:
  12  *
  13  * The VMA policy has priority over the process policy for a page fault.
  14  *
  15  * interleave     Allocate memory interleaved over a set of nodes,
  16  *                with normal fallback if it fails.
  17  *                For VMA based allocations this interleaves based on the
  18  *                offset into the backing object or offset into the mapping
  19  *                for anonymous memory. For process policy a process counter
  20  *                is used.
  21  *
  22  * bind           Only allocate memory on a specific set of nodes,
  23  *                no fallback.
  24  *                FIXME: memory is allocated starting with the first node
  25  *                to the last. It would be better if bind would truly restrict
  26  *                the allocation to memory nodes instead
  27  *
  28  * preferred       Try a specific node first before normal fallback.
  29  *                As a special case node -1 here means do the allocation
  30  *                on the local CPU. This is normally identical to default,
  31  *                but useful to set in a VMA when you have a non default
  32  *                process policy.
  33  *
  34  * default        Allocate on the local node first, or when on a VMA
  35  *                use the process policy. This is what Linux always did
  36  *                in a NUMA aware kernel and still does by, ahem, default.
  37  *
  38  * The process policy is applied for most non interrupt memory allocations
  39  * in that process' context. Interrupts ignore the policies and always
  40  * try to allocate on the local CPU. The VMA policy is only applied for memory
  41  * allocations for a VMA in the VM.
  42  *
  43  * Currently there are a few corner cases in swapping where the policy
  44  * is not applied, but the majority should be handled. When process policy
  45  * is used it is not remembered over swap outs/swap ins.
  46  *
  47  * Only the highest zone in the zone hierarchy gets policied. Allocations
  48  * requesting a lower zone just use default policy. This implies that
  49  * on systems with highmem, kernel lowmem allocations don't get policied.
  50  * Same with GFP_DMA allocations.
  51  *
  52  * For shmfs/tmpfs/hugetlbfs shared memory the policy is shared between
  53  * all users and remembered even when nobody has memory mapped.
  54  */
  55
  56 /* Notebook:
  57    fix mmap readahead to honour policy and enable policy for any page cache
  58    object
  59    statistics for bigpages
  60    global policy for page cache? currently it uses process policy. Requires
  61    first item above.
  62    handle mremap for shared memory (currently ignored for the policy)
  63    grows down?
  64    make bind policy root only? It can trigger oom much faster and the
  65    kernel is not always grateful with that.
  66 */
  67
  68 #include <linux/mempolicy.h>
  69 #include <linux/mm.h>
  70 #include <linux/highmem.h>
  71 #include <linux/hugetlb.h>
  72 #include <linux/kernel.h>
  73 #include <linux/sched.h>
  74 #include <linux/nodemask.h>
  75 #include <linux/cpuset.h>
  76 #include <linux/gfp.h>
  77 #include <linux/slab.h>
  78 #include <linux/string.h>
  79 #include <linux/module.h>
  80 #include <linux/nsproxy.h>
  81 #include <linux/interrupt.h>
  82 #include <linux/init.h>
  83 #include <linux/compat.h>
  84 #include <linux/swap.h>
  85 #include <linux/seq_file.h>
  86 #include <linux/proc_fs.h>
  87 #include <linux/migrate.h>
  88 #include <linux/ksm.h>
  89 #include <linux/rmap.h>
  90 #include <linux/security.h>
  91 #include <linux/syscalls.h>
  92 #include <linux/ctype.h>
  93 #include <linux/mm_inline.h>
  94
  95 #include <asm/tlbflush.h>
  96 #include <asm/uaccess.h>
  97
  98 #include "internal.h"
  99
 100 /* Internal flags */
 101 #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0)    /* Skip checks for continuous vmas */
 102 #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)          /* Invert check for nodemask */
 103 #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)           /* Gather statistics */
 104
 105 static struct kmem_cache *policy_cache;
 106 static struct kmem_cache *sn_cache;
 107
 108 /* Highest zone. An specific allocation for a zone below that is not
 109    policied. */
 110 enum zone_type policy_zone = 0;
 111
 112 /*
 113  * run-time system-wide default policy => local allocation
 114  */
 115 struct mempolicy default_policy = {
 116         .refcnt = ATOMIC_INIT(1), /* never free it */
 117         .mode = MPOL_PREFERRED,
 118         .flags = MPOL_F_LOCAL,
 119 };
 120
 121 static const struct mempolicy_operations {
 122         int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
 123         void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
 124 } mpol_ops[MPOL_MAX];
 125
 126 /* Check that the nodemask contains at least one populated zone */
 127 static int is_valid_nodemask(const nodemask_t *nodemask)
 128 {
 129         int nd, k;
 130
 131         /* Check that there is something useful in this mask */
 132         k = policy_zone;
 133
 134         for_each_node_mask(nd, *nodemask) {
 135                 struct zone *z;
 136
 137                 for (k = 0; k <= policy_zone; k++) {
 138                         z = &NODE_DATA(nd)->node_zones[k];
 139                         if (z->present_pages > 0)
 140                                 return 1;
 141                 }
 142         }
 143
 144         return 0;
 145 }
 146
 147 static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
 148 {
 149         return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
 150 }
 151
 152 static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
 153                                    const nodemask_t *rel)
 154 {
 155         nodemask_t tmp;
 156         nodes_fold(tmp, *orig, nodes_weight(*rel));
 157         nodes_onto(*ret, tmp, *rel);
 158 }
 159
 160 static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
 161 {
 162         if (nodes_empty(*nodes))
 163                 return -EINVAL;
 164         pol->v.nodes = *nodes;
 165         return 0;
 166 }
 167
 168 static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
 169 {
 170         if (!nodes)
 171                 pol->flags |= MPOL_F_LOCAL;     /* local allocation */
 172         else if (nodes_empty(*nodes))
 173                 return -EINVAL;                 /*  no allowed nodes */
 174         else
 175                 pol->v.preferred_node = first_node(*nodes);
 176         return 0;
 177 }
 178
 179 static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
 180 {
 181         if (!is_valid_nodemask(nodes))
 182                 return -EINVAL;
 183         pol->v.nodes = *nodes;
 184         return 0;
 185 }
 186
 187 /*
 188  * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
 189  * any, for the new policy.  mpol_new() has already validated the nodes
 190  * parameter with respect to the policy mode and flags.  But, we need to
 191  * handle an empty nodemask with MPOL_PREFERRED here.
 192  *
 193  * Must be called holding task's alloc_lock to protect task's mems_allowed
 194  * and mempolicy.  May also be called holding the mmap_semaphore for write.
 195  */
 196 static int mpol_set_nodemask(struct mempolicy *pol,
 197                      const nodemask_t *nodes, struct nodemask_scratch *nsc)
 198 {
 199         int ret;
 200
 201         /* if mode is MPOL_DEFAULT, pol is NULL. This is right. */
 202         if (pol == NULL)
 203                 return 0;
 204         /* Check N_HIGH_MEMORY */
 205         nodes_and(nsc->mask1,
 206                   cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
 207
 208         VM_BUG_ON(!nodes);
 209         if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
 210                 nodes = NULL;   /* explicit local allocation */
 211         else {
 212                 if (pol->flags & MPOL_F_RELATIVE_NODES)
 213                         mpol_relative_nodemask(&nsc->mask2, nodes,&nsc->mask1);
 214                 else
 215                         nodes_and(nsc->mask2, *nodes, nsc->mask1);
 216
 217                 if (mpol_store_user_nodemask(pol))
 218                         pol->w.user_nodemask = *nodes;
 219                 else
 220                         pol->w.cpuset_mems_allowed =
 221                                                 cpuset_current_mems_allowed;
 222         }
 223
 224         if (nodes)
 225                 ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
 226         else
 227                 ret = mpol_ops[pol->mode].create(pol, NULL);
 228         return ret;
 229 }
 230
 231 /*
 232  * This function just creates a new policy, does some check and simple
 233  * initialization. You must invoke mpol_set_nodemask() to set nodes.
 234  */
 235 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
 236                                   nodemask_t *nodes)
 237 {
 238         struct mempolicy *policy;
 239
 240         pr_debug("setting mode %d flags %d nodes[0] %lx\n",
 241                  mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 242
 243         if (mode == MPOL_DEFAULT) {
 244                 if (nodes && !nodes_empty(*nodes))
 245                         return ERR_PTR(-EINVAL);
 246                 return NULL;    /* simply delete any existing policy */
 247         }
 248         VM_BUG_ON(!nodes);
 249
 250         /*
 251          * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
 252          * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
 253          * All other modes require a valid pointer to a non-empty nodemask.
 254          */
 255         if (mode == MPOL_PREFERRED) {
 256                 if (nodes_empty(*nodes)) {
 257                         if (((flags & MPOL_F_STATIC_NODES) ||
 258                              (flags & MPOL_F_RELATIVE_NODES)))
 259                                 return ERR_PTR(-EINVAL);
 260                 }
 261         } else if (nodes_empty(*nodes))
 262                 return ERR_PTR(-EINVAL);
 263         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 264         if (!policy)
 265                 return ERR_PTR(-ENOMEM);
 266         atomic_set(&policy->refcnt, 1);
 267         policy->mode = mode;
 268         policy->flags = flags;
 269
 270         return policy;
 271 }
 272
 273 /* Slow path of a mpol destructor. */
 274 void __mpol_put(struct mempolicy *p)
 275 {
 276         if (!atomic_dec_and_test(&p->refcnt))
 277                 return;
 278         kmem_cache_free(policy_cache, p);
 279 }
 280
 281 static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
 282 {
 283 }
 284
 285 static void mpol_rebind_nodemask(struct mempolicy *pol,
 286                                  const nodemask_t *nodes)
 287 {
 288         nodemask_t tmp;
 289
 290         if (pol->flags & MPOL_F_STATIC_NODES)
 291                 nodes_and(tmp, pol->w.user_nodemask, *nodes);
 292         else if (pol->flags & MPOL_F_RELATIVE_NODES)
 293                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 294         else {
 295                 nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
 296                             *nodes);
 297                 pol->w.cpuset_mems_allowed = *nodes;
 298         }
 299
 300         pol->v.nodes = tmp;
 301         if (!node_isset(current->il_next, tmp)) {
 302                 current->il_next = next_node(current->il_next, tmp);
 303                 if (current->il_next >= MAX_NUMNODES)
 304                         current->il_next = first_node(tmp);
 305                 if (current->il_next >= MAX_NUMNODES)
 306                         current->il_next = numa_node_id();
 307         }
 308 }
 309
 310 static void mpol_rebind_preferred(struct mempolicy *pol,
 311                                   const nodemask_t *nodes)
 312 {
 313         nodemask_t tmp;
 314
 315         if (pol->flags & MPOL_F_STATIC_NODES) {
 316                 int node = first_node(pol->w.user_nodemask);
 317
 318                 if (node_isset(node, *nodes)) {
 319                         pol->v.preferred_node = node;
 320                         pol->flags &= ~MPOL_F_LOCAL;
 321                 } else
 322                         pol->flags |= MPOL_F_LOCAL;
 323         } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
 324                 mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
 325                 pol->v.preferred_node = first_node(tmp);
 326         } else if (!(pol->flags & MPOL_F_LOCAL)) {
 327                 pol->v.preferred_node = node_remap(pol->v.preferred_node,
 328                                                    pol->w.cpuset_mems_allowed,
 329                                                    *nodes);
 330                 pol->w.cpuset_mems_allowed = *nodes;
 331         }
 332 }
 333
 334 /* Migrate a policy to a different set of nodes */
 335 static void mpol_rebind_policy(struct mempolicy *pol,
 336                                const nodemask_t *newmask)
 337 {
 338         if (!pol)
 339                 return;
 340         if (!mpol_store_user_nodemask(pol) &&
 341             nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
 342                 return;
 343         mpol_ops[pol->mode].rebind(pol, newmask);
 344 }
 345
 346 /*
 347  * Wrapper for mpol_rebind_policy() that just requires task
 348  * pointer, and updates task mempolicy.
 349  *
 350  * Called with task's alloc_lock held.
 351  */
 352
 353 void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
 354 {
 355         mpol_rebind_policy(tsk->mempolicy, new);
 356 }
 357
 358 /*
 359  * Rebind each vma in mm to new nodemask.
 360  *
 361  * Call holding a reference to mm.  Takes mm->mmap_sem during call.
 362  */
 363
 364 void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
 365 {
 366         struct vm_area_struct *vma;
 367
 368         down_write(&mm->mmap_sem);
 369         for (vma = mm->mmap; vma; vma = vma->vm_next)
 370                 mpol_rebind_policy(vma->vm_policy, new);
 371         up_write(&mm->mmap_sem);
 372 }
 373
 374 static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
 375         [MPOL_DEFAULT] = {
 376                 .rebind = mpol_rebind_default,
 377         },
 378         [MPOL_INTERLEAVE] = {
 379                 .create = mpol_new_interleave,
 380                 .rebind = mpol_rebind_nodemask,
 381         },
 382         [MPOL_PREFERRED] = {
 383                 .create = mpol_new_preferred,
 384                 .rebind = mpol_rebind_preferred,
 385         },
 386         [MPOL_BIND] = {
 387                 .create = mpol_new_bind,
 388                 .rebind = mpol_rebind_nodemask,
 389         },
 390 };
 391
 392 static void gather_stats(struct page *, void *, int pte_dirty);
 393 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 394                                 unsigned long flags);
 395
 396 /* Scan through pages checking if pages follow certain conditions. */
 397 static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 398                 unsigned long addr, unsigned long end,
 399                 const nodemask_t *nodes, unsigned long flags,
 400                 void *private)
 401 {
 402         pte_t *orig_pte;
 403         pte_t *pte;
 404         spinlock_t *ptl;
 405
 406         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
 407         do {
 408                 struct page *page;
 409                 int nid;
 410
 411                 if (!pte_present(*pte))
 412                         continue;
 413                 page = vm_normal_page(vma, addr, *pte);
 414                 if (!page)
 415                         continue;
 416                 /*
 417                  * vm_normal_page() filters out zero pages, but there might
 418                  * still be PageReserved pages to skip, perhaps in a VDSO.
 419                  * And we cannot move PageKsm pages sensibly or safely yet.
 420                  */
 421                 if (PageReserved(page) || PageKsm(page))
 422                         continue;
 423                 nid = page_to_nid(page);
 424                 if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
 425                         continue;
 426
 427                 if (flags & MPOL_MF_STATS)
 428                         gather_stats(page, private, pte_dirty(*pte));
 429                 else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 430                         migrate_page_add(page, private, flags);
 431                 else
 432                         break;
 433         } while (pte++, addr += PAGE_SIZE, addr != end);
 434         pte_unmap_unlock(orig_pte, ptl);
 435         return addr != end;
 436 }
 437
 438 static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud,
 439                 unsigned long addr, unsigned long end,
 440                 const nodemask_t *nodes, unsigned long flags,
 441                 void *private)
 442 {
 443         pmd_t *pmd;
 444         unsigned long next;
 445
 446         pmd = pmd_offset(pud, addr);
 447         do {
 448                 next = pmd_addr_end(addr, end);
 449                 if (pmd_none_or_clear_bad(pmd))
 450                         continue;
 451                 if (check_pte_range(vma, pmd, addr, next, nodes,
 452                                     flags, private))
 453                         return -EIO;
 454         } while (pmd++, addr = next, addr != end);
 455         return 0;
 456 }
 457
 458 static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd,
 459                 unsigned long addr, unsigned long end,
 460                 const nodemask_t *nodes, unsigned long flags,
 461                 void *private)
 462 {
 463         pud_t *pud;
 464         unsigned long next;
 465
 466         pud = pud_offset(pgd, addr);
 467         do {
 468                 next = pud_addr_end(addr, end);
 469                 if (pud_none_or_clear_bad(pud))
 470                         continue;
 471                 if (check_pmd_range(vma, pud, addr, next, nodes,
 472                                     flags, private))
 473                         return -EIO;
 474         } while (pud++, addr = next, addr != end);
 475         return 0;
 476 }
 477
 478 static inline int check_pgd_range(struct vm_area_struct *vma,
 479                 unsigned long addr, unsigned long end,
 480                 const nodemask_t *nodes, unsigned long flags,
 481                 void *private)
 482 {
 483         pgd_t *pgd;
 484         unsigned long next;
 485
 486         pgd = pgd_offset(vma->vm_mm, addr);
 487         do {
 488                 next = pgd_addr_end(addr, end);
 489                 if (pgd_none_or_clear_bad(pgd))
 490                         continue;
 491                 if (check_pud_range(vma, pgd, addr, next, nodes,
 492                                     flags, private))
 493                         return -EIO;
 494         } while (pgd++, addr = next, addr != end);
 495         return 0;
 496 }
 497
 498 /*
 499  * Check if all pages in a range are on a set of nodes.
 500  * If pagelist != NULL then isolate pages from the LRU and
 501  * put them on the pagelist.
 502  */
 503 static struct vm_area_struct *
 504 check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
 505                 const nodemask_t *nodes, unsigned long flags, void *private)
 506 {
 507         int err;
 508         struct vm_area_struct *first, *vma, *prev;
 509
 510
 511         first = find_vma(mm, start);
 512         if (!first)
 513                 return ERR_PTR(-EFAULT);
 514         prev = NULL;
 515         for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) {
 516                 if (!(flags & MPOL_MF_DISCONTIG_OK)) {
 517                         if (!vma->vm_next && vma->vm_end < end)
 518                                 return ERR_PTR(-EFAULT);
 519                         if (prev && prev->vm_end < vma->vm_start)
 520                                 return ERR_PTR(-EFAULT);
 521                 }
 522                 if (!is_vm_hugetlb_page(vma) &&
 523                     ((flags & MPOL_MF_STRICT) ||
 524                      ((flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) &&
 525                                 vma_migratable(vma)))) {
 526                         unsigned long endvma = vma->vm_end;
 527
 528                         if (endvma > end)
 529                                 endvma = end;
 530                         if (vma->vm_start > start)
 531                                 start = vma->vm_start;
 532                         err = check_pgd_range(vma, start, endvma, nodes,
 533                                                 flags, private);
 534                         if (err) {
 535                                 first = ERR_PTR(err);
 536                                 break;
 537                         }
 538                 }
 539                 prev = vma;
 540         }
 541         return first;
 542 }
 543
 544 /* Apply policy to a single VMA */
 545 static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
 546 {
 547         int err = 0;
 548         struct mempolicy *old = vma->vm_policy;
 549
 550         pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
 551                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
 552                  vma->vm_ops, vma->vm_file,
 553                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
 554
 555         if (vma->vm_ops && vma->vm_ops->set_policy)
 556                 err = vma->vm_ops->set_policy(vma, new);
 557         if (!err) {
 558                 mpol_get(new);
 559                 vma->vm_policy = new;
 560                 mpol_put(old);
 561         }
 562         return err;
 563 }
 564
 565 /* Step 2: apply policy to a range and do splits. */
 566 static int mbind_range(struct vm_area_struct *vma, unsigned long start,
 567                        unsigned long end, struct mempolicy *new)
 568 {
 569         struct vm_area_struct *next;
 570         int err;
 571
 572         err = 0;
 573         for (; vma && vma->vm_start < end; vma = next) {
 574                 next = vma->vm_next;
 575                 if (vma->vm_start < start)
 576                         err = split_vma(vma->vm_mm, vma, start, 1);
 577                 if (!err && vma->vm_end > end)
 578                         err = split_vma(vma->vm_mm, vma, end, 0);
 579                 if (!err)
 580                         err = policy_vma(vma, new);
 581                 if (err)
 582                         break;
 583         }
 584         return err;
 585 }
 586
 587 /*
 588  * Update task->flags PF_MEMPOLICY bit: set iff non-default
 589  * mempolicy.  Allows more rapid checking of this (combined perhaps
 590  * with other PF_* flag bits) on memory allocation hot code paths.
 591  *
 592  * If called from outside this file, the task 'p' should -only- be
 593  * a newly forked child not yet visible on the task list, because
 594  * manipulating the task flags of a visible task is not safe.
 595  *
 596  * The above limitation is why this routine has the funny name
 597  * mpol_fix_fork_child_flag().
 598  *
 599  * It is also safe to call this with a task pointer of current,
 600  * which the static wrapper mpol_set_task_struct_flag() does,
 601  * for use within this file.
 602  */
 603
 604 void mpol_fix_fork_child_flag(struct task_struct *p)
 605 {
 606         if (p->mempolicy)
 607                 p->flags |= PF_MEMPOLICY;
 608         else
 609                 p->flags &= ~PF_MEMPOLICY;
 610 }
 611
 612 static void mpol_set_task_struct_flag(void)
 613 {
 614         mpol_fix_fork_child_flag(current);
 615 }
 616
 617 /* Set the process memory policy */
 618 static long do_set_mempolicy(unsigned short mode, unsigned short flags,
 619                              nodemask_t *nodes)
 620 {
 621         struct mempolicy *new, *old;
 622         struct mm_struct *mm = current->mm;
 623         NODEMASK_SCRATCH(scratch);
 624         int ret;
 625
 626         if (!scratch)
 627                 return -ENOMEM;
 628
 629         new = mpol_new(mode, flags, nodes);
 630         if (IS_ERR(new)) {
 631                 ret = PTR_ERR(new);
 632                 goto out;
 633         }
 634         /*
 635          * prevent changing our mempolicy while show_numa_maps()
 636          * is using it.
 637          * Note:  do_set_mempolicy() can be called at init time
 638          * with no 'mm'.
 639          */
 640         if (mm)
 641                 down_write(&mm->mmap_sem);
 642         task_lock(current);
 643         ret = mpol_set_nodemask(new, nodes, scratch);
 644         if (ret) {
 645                 task_unlock(current);
 646                 if (mm)
 647                         up_write(&mm->mmap_sem);
 648                 mpol_put(new);
 649                 goto out;
 650         }
 651         old = current->mempolicy;
 652         current->mempolicy = new;
 653         mpol_set_task_struct_flag();
 654         if (new && new->mode == MPOL_INTERLEAVE &&
 655             nodes_weight(new->v.nodes))
 656                 current->il_next = first_node(new->v.nodes);
 657         task_unlock(current);
 658         if (mm)
 659                 up_write(&mm->mmap_sem);
 660
 661         mpol_put(old);
 662         ret = 0;
 663 out:
 664         NODEMASK_SCRATCH_FREE(scratch);
 665         return ret;
 666 }
 667
 668 /*
 669  * Return nodemask for policy for get_mempolicy() query
 670  *
 671  * Called with task's alloc_lock held
 672  */
 673 static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
 674 {
 675         nodes_clear(*nodes);
 676         if (p == &default_policy)
 677                 return;
 678
 679         switch (p->mode) {
 680         case MPOL_BIND:
 681                 /* Fall through */
 682         case MPOL_INTERLEAVE:
 683                 *nodes = p->v.nodes;
 684                 break;
 685         case MPOL_PREFERRED:
 686                 if (!(p->flags & MPOL_F_LOCAL))
 687                         node_set(p->v.preferred_node, *nodes);
 688                 /* else return empty node mask for local allocation */
 689                 break;
 690         default:
 691                 BUG();
 692         }
 693 }
 694
 695 static int lookup_node(struct mm_struct *mm, unsigned long addr)
 696 {
 697         struct page *p;
 698         int err;
 699
 700         err = get_user_pages(current, mm, addr & PAGE_MASK, 1, 0, 0, &p, NULL);
 701         if (err >= 0) {
 702                 err = page_to_nid(p);
 703                 put_page(p);
 704         }
 705         return err;
 706 }
 707
 708 /* Retrieve NUMA policy */
 709 static long do_get_mempolicy(int *policy, nodemask_t *nmask,
 710                              unsigned long addr, unsigned long flags)
 711 {
 712         int err;
 713         struct mm_struct *mm = current->mm;
 714         struct vm_area_struct *vma = NULL;
 715         struct mempolicy *pol = current->mempolicy;
 716
 717         if (flags &
 718                 ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
 719                 return -EINVAL;
 720
 721         if (flags & MPOL_F_MEMS_ALLOWED) {
 722                 if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
 723                         return -EINVAL;
 724                 *policy = 0;    /* just so it's initialized */
 725                 task_lock(current);
 726                 *nmask  = cpuset_current_mems_allowed;
 727                 task_unlock(current);
 728                 return 0;
 729         }
 730
 731         if (flags & MPOL_F_ADDR) {
 732                 /*
 733                  * Do NOT fall back to task policy if the
 734                  * vma/shared policy at addr is NULL.  We
 735                  * want to return MPOL_DEFAULT in this case.
 736                  */
 737                 down_read(&mm->mmap_sem);
 738                 vma = find_vma_intersection(mm, addr, addr+1);
 739                 if (!vma) {
 740                         up_read(&mm->mmap_sem);
 741                         return -EFAULT;
 742                 }
 743                 if (vma->vm_ops && vma->vm_ops->get_policy)
 744                         pol = vma->vm_ops->get_policy(vma, addr);
 745                 else
 746                         pol = vma->vm_policy;
 747         } else if (addr)
 748                 return -EINVAL;
 749
 750         if (!pol)
 751                 pol = &default_policy;  /* indicates default behavior */
 752
 753         if (flags & MPOL_F_NODE) {
 754                 if (flags & MPOL_F_ADDR) {
 755                         err = lookup_node(mm, addr);
 756                         if (err < 0)
 757                                 goto out;
 758                         *policy = err;
 759                 } else if (pol == current->mempolicy &&
 760                                 pol->mode == MPOL_INTERLEAVE) {
 761                         *policy = current->il_next;
 762                 } else {
 763                         err = -EINVAL;
 764                         goto out;
 765                 }
 766         } else {
 767                 *policy = pol == &default_policy ? MPOL_DEFAULT :
 768                                                 pol->mode;
 769                 /*
 770                  * Internal mempolicy flags must be masked off before exposing
 771                  * the policy to userspace.
 772                  */
 773                 *policy |= (pol->flags & MPOL_MODE_FLAGS);
 774         }
 775
 776         if (vma) {
 777                 up_read(&current->mm->mmap_sem);
 778                 vma = NULL;
 779         }
 780
 781         err = 0;
 782         if (nmask) {
 783                 task_lock(current);
 784                 get_policy_nodemask(pol, nmask);
 785                 task_unlock(current);
 786         }
 787
 788  out:
 789         mpol_cond_put(pol);
 790         if (vma)
 791                 up_read(&current->mm->mmap_sem);
 792         return err;
 793 }
 794
 795 #ifdef CONFIG_MIGRATION
 796 /*
 797  * page migration
 798  */
 799 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 800                                 unsigned long flags)
 801 {
 802         /*
 803          * Avoid migrating a page that is shared with others.
 804          */
 805         if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
 806                 if (!isolate_lru_page(page)) {
 807                         list_add_tail(&page->lru, pagelist);
 808                         inc_zone_page_state(page, NR_ISOLATED_ANON +
 809                                             page_is_file_cache(page));
 810                 }
 811         }
 812 }
 813
 814 static struct page *new_node_page(struct page *page, unsigned long node, int **x)
 815 {
 816         return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
 817 }
 818
 819 /*
 820  * Migrate pages from one node to a target node.
 821  * Returns error or the number of pages not migrated.
 822  */
 823 static int migrate_to_node(struct mm_struct *mm, int source, int dest,
 824                            int flags)
 825 {
 826         nodemask_t nmask;
 827         LIST_HEAD(pagelist);
 828         int err = 0;
 829
 830         nodes_clear(nmask);
 831         node_set(source, nmask);
 832
 833         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
 834                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
 835
 836         if (!list_empty(&pagelist))
 837                 err = migrate_pages(&pagelist, new_node_page, dest, 0);
 838
 839         return err;
 840 }
 841
 842 /*
 843  * Move pages between the two nodesets so as to preserve the physical
 844  * layout as much as possible.
 845  *
 846  * Returns the number of page that could not be moved.
 847  */
 848 int do_migrate_pages(struct mm_struct *mm,
 849         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 850 {
 851         int busy = 0;
 852         int err;
 853         nodemask_t tmp;
 854
 855         err = migrate_prep();
 856         if (err)
 857                 return err;
 858
 859         down_read(&mm->mmap_sem);
 860
 861         err = migrate_vmas(mm, from_nodes, to_nodes, flags);
 862         if (err)
 863                 goto out;
 864
 865 /*
 866  * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
 867  * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
 868  * bit in 'tmp', and return that <source, dest> pair for migration.
 869  * The pair of nodemasks 'to' and 'from' define the map.
 870  *
 871  * If no pair of bits is found that way, fallback to picking some
 872  * pair of 'source' and 'dest' bits that are not the same.  If the
 873  * 'source' and 'dest' bits are the same, this represents a node
 874  * that will be migrating to itself, so no pages need move.
 875  *
 876  * If no bits are left in 'tmp', or if all remaining bits left
 877  * in 'tmp' correspond to the same bit in 'to', return false
 878  * (nothing left to migrate).
 879  *
 880  * This lets us pick a pair of nodes to migrate between, such that
 881  * if possible the dest node is not already occupied by some other
 882  * source node, minimizing the risk of overloading the memory on a
 883  * node that would happen if we migrated incoming memory to a node
 884  * before migrating outgoing memory source that same node.
 885  *
 886  * A single scan of tmp is sufficient.  As we go, we remember the
 887  * most recent <s, d> pair that moved (s != d).  If we find a pair
 888  * that not only moved, but what's better, moved to an empty slot
 889  * (d is not set in tmp), then we break out then, with that pair.
 890  * Otherwise when we finish scannng from_tmp, we at least have the
 891  * most recent <s, d> pair that moved.  If we get all the way through
 892  * the scan of tmp without finding any node that moved, much less
 893  * moved to an empty node, then there is nothing left worth migrating.
 894  */
 895
 896         tmp = *from_nodes;
 897         while (!nodes_empty(tmp)) {
 898                 int s,d;
 899                 int source = -1;
 900                 int dest = 0;
 901
 902                 for_each_node_mask(s, tmp) {
 903                         d = node_remap(s, *from_nodes, *to_nodes);
 904                         if (s == d)
 905                                 continue;
 906
 907                         source = s;     /* Node moved. Memorize */
 908                         dest = d;
 909
 910                         /* dest not in remaining from nodes? */
 911                         if (!node_isset(dest, tmp))
 912                                 break;
 913                 }
 914                 if (source == -1)
 915                         break;
 916
 917                 node_clear(source, tmp);
 918                 err = migrate_to_node(mm, source, dest, flags);
 919                 if (err > 0)
 920                         busy += err;
 921                 if (err < 0)
 922                         break;
 923         }
 924 out:
 925         up_read(&mm->mmap_sem);
 926         if (err < 0)
 927                 return err;
 928         return busy;
 929
 930 }
 931
 932 /*
 933  * Allocate a new page for page migration based on vma policy.
 934  * Start assuming that page is mapped by vma pointed to by @private.
 935  * Search forward from there, if not.  N.B., this assumes that the
 936  * list of pages handed to migrate_pages()--which is how we get here--
 937  * is in virtual address order.
 938  */
 939 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 940 {
 941         struct vm_area_struct *vma = (struct vm_area_struct *)private;
 942         unsigned long uninitialized_var(address);
 943
 944         while (vma) {
 945                 address = page_address_in_vma(page, vma);
 946                 if (address != -EFAULT)
 947                         break;
 948                 vma = vma->vm_next;
 949         }
 950
 951         /*
 952          * if !vma, alloc_page_vma() will use task or system default policy
 953          */
 954         return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
 955 }
 956 #else
 957
 958 static void migrate_page_add(struct page *page, struct list_head *pagelist,
 959                                 unsigned long flags)
 960 {
 961 }
 962
 963 int do_migrate_pages(struct mm_struct *mm,
 964         const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
 965 {
 966         return -ENOSYS;
 967 }
 968
 969 static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
 970 {
 971         return NULL;
 972 }
 973 #endif
 974
 975 static long do_mbind(unsigned long start, unsigned long len,
 976                      unsigned short mode, unsigned short mode_flags,
 977                      nodemask_t *nmask, unsigned long flags)
 978 {
 979         struct vm_area_struct *vma;
 980         struct mm_struct *mm = current->mm;
 981         struct mempolicy *new;
 982         unsigned long end;
 983         int err;
 984         LIST_HEAD(pagelist);
 985
 986         if (flags & ~(unsigned long)(MPOL_MF_STRICT |
 987                                      MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
 988                 return -EINVAL;
 989         if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
 990                 return -EPERM;
 991
 992         if (start & ~PAGE_MASK)
 993                 return -EINVAL;
 994
 995         if (mode == MPOL_DEFAULT)
 996                 flags &= ~MPOL_MF_STRICT;
 997
 998         len = (len + PAGE_SIZE - 1) & PAGE_MASK;
 999         end = start + len;
1000
1001         if (end < start)
1002                 return -EINVAL;
1003         if (end == start)
1004                 return 0;
1005
1006         new = mpol_new(mode, mode_flags, nmask);
1007         if (IS_ERR(new))
1008                 return PTR_ERR(new);
1009
1010         /*
1011          * If we are using the default policy then operation
1012          * on discontinuous address spaces is okay after all
1013          */
1014         if (!new)
1015                 flags |= MPOL_MF_DISCONTIG_OK;
1016
1017         pr_debug("mbind %lx-%lx mode:%d flags:%d nodes:%lx\n",
1018                  start, start + len, mode, mode_flags,
1019                  nmask ? nodes_addr(*nmask)[0] : -1);
1020
1021         if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
1022
1023                 err = migrate_prep();
1024                 if (err)
1025                         goto mpol_out;
1026         }
1027         {
1028                 NODEMASK_SCRATCH(scratch);
1029                 if (scratch) {
1030                         down_write(&mm->mmap_sem);
1031                         task_lock(current);
1032                         err = mpol_set_nodemask(new, nmask, scratch);
1033                         task_unlock(current);
1034                         if (err)
1035                                 up_write(&mm->mmap_sem);
1036                 } else
1037                         err = -ENOMEM;
1038                 NODEMASK_SCRATCH_FREE(scratch);
1039         }
1040         if (err)
1041                 goto mpol_out;
1042
1043         vma = check_range(mm, start, end, nmask,
1044                           flags | MPOL_MF_INVERT, &pagelist);
1045
1046         err = PTR_ERR(vma);
1047         if (!IS_ERR(vma)) {
1048                 int nr_failed = 0;
1049
1050                 err = mbind_range(vma, start, end, new);
1051
1052                 if (!list_empty(&pagelist))
1053                         nr_failed = migrate_pages(&pagelist, new_vma_page,
1054                                                 (unsigned long)vma, 0);
1055
1056                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
1057                         err = -EIO;
1058         } else
1059                 putback_lru_pages(&pagelist);
1060
1061         up_write(&mm->mmap_sem);
1062  mpol_out:
1063         mpol_put(new);
1064         return err;
1065 }
1066
1067 /*
1068  * User space interface with variable sized bitmaps for nodelists.
1069  */
1070
1071 /* Copy a node mask from user space. */
1072 static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
1073                      unsigned long maxnode)
1074 {
1075         unsigned long k;
1076         unsigned long nlongs;
1077         unsigned long endmask;
1078
1079         --maxnode;
1080         nodes_clear(*nodes);
1081         if (maxnode == 0 || !nmask)
1082                 return 0;
1083         if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
1084                 return -EINVAL;
1085
1086         nlongs = BITS_TO_LONGS(maxnode);
1087         if ((maxnode % BITS_PER_LONG) == 0)
1088                 endmask = ~0UL;
1089         else
1090                 endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1;
1091
1092         /* When the user specified more nodes than supported just check
1093            if the non supported part is all zero. */
1094         if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) {
1095                 if (nlongs > PAGE_SIZE/sizeof(long))
1096                         return -EINVAL;
1097                 for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) {
1098                         unsigned long t;
1099                         if (get_user(t, nmask + k))
1100                                 return -EFAULT;
1101                         if (k == nlongs - 1) {
1102                                 if (t & endmask)
1103                                         return -EINVAL;
1104                         } else if (t)
1105                                 return -EINVAL;
1106                 }
1107                 nlongs = BITS_TO_LONGS(MAX_NUMNODES);
1108                 endmask = ~0UL;
1109         }
1110
1111         if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long)))
1112                 return -EFAULT;
1113         nodes_addr(*nodes)[nlongs-1] &= endmask;
1114         return 0;
1115 }
1116
1117 /* Copy a kernel node mask to user space */
1118 static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode,
1119                               nodemask_t *nodes)
1120 {
1121         unsigned long copy = ALIGN(maxnode-1, 64) / 8;
1122         const int nbytes = BITS_TO_LONGS(MAX_NUMNODES) * sizeof(long);
1123
1124         if (copy > nbytes) {
1125                 if (copy > PAGE_SIZE)
1126                         return -EINVAL;
1127                 if (clear_user((char __user *)mask + nbytes, copy - nbytes))
1128                         return -EFAULT;
1129                 copy = nbytes;
1130         }
1131         return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
1132 }
1133
1134 SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
1135                 unsigned long, mode, unsigned long __user *, nmask,
1136                 unsigned long, maxnode, unsigned, flags)
1137 {
1138         nodemask_t nodes;
1139         int err;
1140         unsigned short mode_flags;
1141
1142         mode_flags = mode & MPOL_MODE_FLAGS;
1143         mode &= ~MPOL_MODE_FLAGS;
1144         if (mode >= MPOL_MAX)
1145                 return -EINVAL;
1146         if ((mode_flags & MPOL_F_STATIC_NODES) &&
1147             (mode_flags & MPOL_F_RELATIVE_NODES))
1148                 return -EINVAL;
1149         err = get_nodes(&nodes, nmask, maxnode);
1150         if (err)
1151                 return err;
1152         return do_mbind(start, len, mode, mode_flags, &nodes, flags);
1153 }
1154
1155 /* Set the process memory policy */
1156 SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
1157                 unsigned long, maxnode)
1158 {
1159         int err;
1160         nodemask_t nodes;
1161         unsigned short flags;
1162
1163         flags = mode & MPOL_MODE_FLAGS;
1164         mode &= ~MPOL_MODE_FLAGS;
1165         if ((unsigned int)mode >= MPOL_MAX)
1166                 return -EINVAL;
1167         if ((flags & MPOL_F_STATIC_NODES) && (flags & MPOL_F_RELATIVE_NODES))
1168                 return -EINVAL;
1169         err = get_nodes(&nodes, nmask, maxnode);
1170         if (err)
1171                 return err;
1172         return do_set_mempolicy(mode, flags, &nodes);
1173 }
1174
1175 SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
1176                 const unsigned long __user *, old_nodes,
1177                 const unsigned long __user *, new_nodes)
1178 {
1179         const struct cred *cred = current_cred(), *tcred;
1180         struct mm_struct *mm;
1181         struct task_struct *task;
1182         nodemask_t old;
1183         nodemask_t new;
1184         nodemask_t task_nodes;
1185         int err;
1186
1187         err = get_nodes(&old, old_nodes, maxnode);
1188         if (err)
1189                 return err;
1190
1191         err = get_nodes(&new, new_nodes, maxnode);
1192         if (err)
1193                 return err;
1194
1195         /* Find the mm_struct */
1196         read_lock(&tasklist_lock);
1197         task = pid ? find_task_by_vpid(pid) : current;
1198         if (!task) {
1199                 read_unlock(&tasklist_lock);
1200                 return -ESRCH;
1201         }
1202         mm = get_task_mm(task);
1203         read_unlock(&tasklist_lock);
1204
1205         if (!mm)
1206                 return -EINVAL;
1207
1208         /*
1209          * Check if this process has the right to modify the specified
1210          * process. The right exists if the process has administrative
1211          * capabilities, superuser privileges or the same
1212          * userid as the target process.
1213          */
1214         rcu_read_lock();
1215         tcred = __task_cred(task);
1216         if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
1217             cred->uid  != tcred->suid && cred->uid  != tcred->uid &&
1218             !capable(CAP_SYS_NICE)) {
1219                 rcu_read_unlock();
1220                 err = -EPERM;
1221                 goto out;
1222         }
1223         rcu_read_unlock();
1224
1225         task_nodes = cpuset_mems_allowed(task);
1226         /* Is the user allowed to access the target nodes? */
1227         if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
1228                 err = -EPERM;
1229                 goto out;
1230         }
1231
1232         if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
1233                 err = -EINVAL;
1234                 goto out;
1235         }
1236
1237         err = security_task_movememory(task);
1238         if (err)
1239                 goto out;
1240
1241         err = do_migrate_pages(mm, &old, &new,
1242                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
1243 out:
1244         mmput(mm);
1245         return err;
1246 }
1247
1248
1249 /* Retrieve NUMA policy */
1250 SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
1251                 unsigned long __user *, nmask, unsigned long, maxnode,
1252                 unsigned long, addr, unsigned long, flags)
1253 {
1254         int err;
1255         int uninitialized_var(pval);
1256         nodemask_t nodes;
1257
1258         if (nmask != NULL && maxnode < MAX_NUMNODES)
1259                 return -EINVAL;
1260
1261         err = do_get_mempolicy(&pval, &nodes, addr, flags);
1262
1263         if (err)
1264                 return err;
1265
1266         if (policy && put_user(pval, policy))
1267                 return -EFAULT;
1268
1269         if (nmask)
1270                 err = copy_nodes_to_user(nmask, maxnode, &nodes);
1271
1272         return err;
1273 }
1274
1275 #ifdef CONFIG_COMPAT
1276
1277 asmlinkage long compat_sys_get_mempolicy(int __user *policy,
1278                                      compat_ulong_t __user *nmask,
1279                                      compat_ulong_t maxnode,
1280                                      compat_ulong_t addr, compat_ulong_t flags)
1281 {
1282         long err;
1283         unsigned long __user *nm = NULL;
1284         unsigned long nr_bits, alloc_size;
1285         DECLARE_BITMAP(bm, MAX_NUMNODES);
1286
1287         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1288         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1289
1290         if (nmask)
1291                 nm = compat_alloc_user_space(alloc_size);
1292
1293         err = sys_get_mempolicy(policy, nm, nr_bits+1, addr, flags);
1294
1295         if (!err && nmask) {
1296                 err = copy_from_user(bm, nm, alloc_size);
1297                 /* ensure entire bitmap is zeroed */
1298                 err |= clear_user(nmask, ALIGN(maxnode-1, 8) / 8);
1299                 err |= compat_put_bitmap(nmask, bm, nr_bits);
1300         }
1301
1302         return err;
1303 }
1304
1305 asmlinkage long compat_sys_set_mempolicy(int mode, compat_ulong_t __user *nmask,
1306                                      compat_ulong_t maxnode)
1307 {
1308         long err = 0;
1309         unsigned long __user *nm = NULL;
1310         unsigned long nr_bits, alloc_size;
1311         DECLARE_BITMAP(bm, MAX_NUMNODES);
1312
1313         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1314         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1315
1316         if (nmask) {
1317                 err = compat_get_bitmap(bm, nmask, nr_bits);
1318                 nm = compat_alloc_user_space(alloc_size);
1319                 err |= copy_to_user(nm, bm, alloc_size);
1320         }
1321
1322         if (err)
1323                 return -EFAULT;
1324
1325         return sys_set_mempolicy(mode, nm, nr_bits+1);
1326 }
1327
1328 asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
1329                              compat_ulong_t mode, compat_ulong_t __user *nmask,
1330                              compat_ulong_t maxnode, compat_ulong_t flags)
1331 {
1332         long err = 0;
1333         unsigned long __user *nm = NULL;
1334         unsigned long nr_bits, alloc_size;
1335         nodemask_t bm;
1336
1337         nr_bits = min_t(unsigned long, maxnode-1, MAX_NUMNODES);
1338         alloc_size = ALIGN(nr_bits, BITS_PER_LONG) / 8;
1339
1340         if (nmask) {
1341                 err = compat_get_bitmap(nodes_addr(bm), nmask, nr_bits);
1342                 nm = compat_alloc_user_space(alloc_size);
1343                 err |= copy_to_user(nm, nodes_addr(bm), alloc_size);
1344         }
1345
1346         if (err)
1347                 return -EFAULT;
1348
1349         return sys_mbind(start, len, mode, nm, nr_bits+1, flags);
1350 }
1351
1352 #endif
1353
1354 /*
1355  * get_vma_policy(@task, @vma, @addr)
1356  * @task - task for fallback if vma policy == default
1357  * @vma   - virtual memory area whose policy is sought
1358  * @addr  - address in @vma for shared policy lookup
1359  *
1360  * Returns effective policy for a VMA at specified address.
1361  * Falls back to @task or system default policy, as necessary.
1362  * Current or other task's task mempolicy and non-shared vma policies
1363  * are protected by the task's mmap_sem, which must be held for read by
1364  * the caller.
1365  * Shared policies [those marked as MPOL_F_SHARED] require an extra reference
1366  * count--added by the get_policy() vm_op, as appropriate--to protect against
1367  * freeing by another task.  It is the caller's responsibility to free the
1368  * extra reference for shared policies.
1369  */
1370 static struct mempolicy *get_vma_policy(struct task_struct *task,
1371                 struct vm_area_struct *vma, unsigned long addr)
1372 {
1373         struct mempolicy *pol = task->mempolicy;
1374
1375         if (vma) {
1376                 if (vma->vm_ops && vma->vm_ops->get_policy) {
1377                         struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
1378                                                                         addr);
1379                         if (vpol)
1380                                 pol = vpol;
1381                 } else if (vma->vm_policy)
1382                         pol = vma->vm_policy;
1383         }
1384         if (!pol)
1385                 pol = &default_policy;
1386         return pol;
1387 }
1388
1389 /*
1390  * Return a nodemask representing a mempolicy for filtering nodes for
1391  * page allocation
1392  */
1393 static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
1394 {
1395         /* Lower zones don't get a nodemask applied for MPOL_BIND */
1396         if (unlikely(policy->mode == MPOL_BIND) &&
1397                         gfp_zone(gfp) >= policy_zone &&
1398                         cpuset_nodemask_valid_mems_allowed(&policy->v.nodes))
1399                 return &policy->v.nodes;
1400
1401         return NULL;
1402 }
1403
1404 /* Return a zonelist indicated by gfp for node representing a mempolicy */
1405 static struct zonelist *policy_zonelist(gfp_t gfp, struct mempolicy *policy)
1406 {
1407         int nd = numa_node_id();
1408
1409         switch (policy->mode) {
1410         case MPOL_PREFERRED:
1411                 if (!(policy->flags & MPOL_F_LOCAL))
1412                         nd = policy->v.preferred_node;
1413                 break;
1414         case MPOL_BIND:
1415                 /*
1416                  * Normally, MPOL_BIND allocations are node-local within the
1417                  * allowed nodemask.  However, if __GFP_THISNODE is set and the
1418                  * current node is part of the mask, we use the zonelist for
1419                  * the first node in the mask instead.
1420                  */
1421                 if (unlikely(gfp & __GFP_THISNODE) &&
1422                                 unlikely(!node_isset(nd, policy->v.nodes)))
1423                         nd = first_node(policy->v.nodes);
1424                 break;
1425         case MPOL_INTERLEAVE: /* should not happen */
1426                 break;
1427         default:
1428                 BUG();
1429         }
1430         return node_zonelist(nd, gfp);
1431 }
1432
1433 /* Do dynamic interleaving for a process */
1434 static unsigned interleave_nodes(struct mempolicy *policy)
1435 {
1436         unsigned nid, next;
1437         struct task_struct *me = current;
1438
1439         nid = me->il_next;
1440         next = next_node(nid, policy->v.nodes);
1441         if (next >= MAX_NUMNODES)
1442                 next = first_node(policy->v.nodes);
1443         if (next < MAX_NUMNODES)
1444                 me->il_next = next;
1445         return nid;
1446 }
1447
1448 /*
1449  * Depending on the memory policy provide a node from which to allocate the
1450  * next slab entry.
1451  * @policy must be protected by freeing by the caller.  If @policy is
1452  * the current task's mempolicy, this protection is implicit, as only the
1453  * task can change it's policy.  The system default policy requires no
1454  * such protection.
1455  */
1456 unsigned slab_node(struct mempolicy *policy)
1457 {
1458         if (!policy || policy->flags & MPOL_F_LOCAL)
1459                 return numa_node_id();
1460
1461         switch (policy->mode) {
1462         case MPOL_PREFERRED:
1463                 /*
1464                  * handled MPOL_F_LOCAL above
1465                  */
1466                 return policy->v.preferred_node;
1467
1468         case MPOL_INTERLEAVE:
1469                 return interleave_nodes(policy);
1470
1471         case MPOL_BIND: {
1472                 /*
1473                  * Follow bind policy behavior and start allocation at the
1474                  * first node.
1475                  */
1476                 struct zonelist *zonelist;
1477                 struct zone *zone;
1478                 enum zone_type highest_zoneidx = gfp_zone(GFP_KERNEL);
1479                 zonelist = &NODE_DATA(numa_node_id())->node_zonelists[0];
1480                 (void)first_zones_zonelist(zonelist, highest_zoneidx,
1481                                                         &policy->v.nodes,
1482                                                         &zone);
1483                 return zone->node;
1484         }
1485
1486         default:
1487                 BUG();
1488         }
1489 }
1490
1491 /* Do static interleaving for a VMA with known offset. */
1492 static unsigned offset_il_node(struct mempolicy *pol,
1493                 struct vm_area_struct *vma, unsigned long off)
1494 {
1495         unsigned nnodes = nodes_weight(pol->v.nodes);
1496         unsigned target;
1497         int c;
1498         int nid = -1;
1499
1500         if (!nnodes)
1501                 return numa_node_id();
1502         target = (unsigned int)off % nnodes;
1503         c = 0;
1504         do {
1505                 nid = next_node(nid, pol->v.nodes);
1506                 c++;
1507         } while (c <= target);
1508         return nid;
1509 }
1510
1511 /* Determine a node number for interleave */
1512 static inline unsigned interleave_nid(struct mempolicy *pol,
1513                  struct vm_area_struct *vma, unsigned long addr, int shift)
1514 {
1515         if (vma) {
1516                 unsigned long off;
1517
1518                 /*
1519                  * for small pages, there is no difference between
1520                  * shift and PAGE_SHIFT, so the bit-shift is safe.
1521                  * for huge pages, since vm_pgoff is in units of small
1522                  * pages, we need to shift off the always 0 bits to get
1523                  * a useful offset.
1524                  */
1525                 BUG_ON(shift < PAGE_SHIFT);
1526                 off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
1527                 off += (addr - vma->vm_start) >> shift;
1528                 return offset_il_node(pol, vma, off);
1529         } else
1530                 return interleave_nodes(pol);
1531 }
1532
1533 #ifdef CONFIG_HUGETLBFS
1534 /*
1535  * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
1536  * @vma = virtual memory area whose policy is sought
1537  * @addr = address in @vma for shared policy lookup and interleave policy
1538  * @gfp_flags = for requested zone
1539  * @mpol = pointer to mempolicy pointer for reference counted mempolicy
1540  * @nodemask = pointer to nodemask pointer for MPOL_BIND nodemask
1541  *
1542  * Returns a zonelist suitable for a huge page allocation and a pointer
1543  * to the struct mempolicy for conditional unref after allocation.
1544  * If the effective policy is 'BIND, returns a pointer to the mempolicy's
1545  * @nodemask for filtering the zonelist.
1546  */
1547 struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
1548                                 gfp_t gfp_flags, struct mempolicy **mpol,
1549                                 nodemask_t **nodemask)
1550 {
1551         struct zonelist *zl;
1552
1553         *mpol = get_vma_policy(current, vma, addr);
1554         *nodemask = NULL;       /* assume !MPOL_BIND */
1555
1556         if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
1557                 zl = node_zonelist(interleave_nid(*mpol, vma, addr,
1558                                 huge_page_shift(hstate_vma(vma))), gfp_flags);
1559         } else {
1560                 zl = policy_zonelist(gfp_flags, *mpol);
1561                 if ((*mpol)->mode == MPOL_BIND)
1562                         *nodemask = &(*mpol)->v.nodes;
1563         }
1564         return zl;
1565 }
1566
1567 /*
1568  * init_nodemask_of_mempolicy
1569  *
1570  * If the current task's mempolicy is "default" [NULL], return 'false'
1571  * to indicate default policy.  Otherwise, extract the policy nodemask
1572  * for 'bind' or 'interleave' policy into the argument nodemask, or
1573  * initialize the argument nodemask to contain the single node for
1574  * 'preferred' or 'local' policy and return 'true' to indicate presence
1575  * of non-default mempolicy.
1576  *
1577  * We don't bother with reference counting the mempolicy [mpol_get/put]
1578  * because the current task is examining it's own mempolicy and a task's
1579  * mempolicy is only ever changed by the task itself.
1580  *
1581  * N.B., it is the caller's responsibility to free a returned nodemask.
1582  */
1583 bool init_nodemask_of_mempolicy(nodemask_t *mask)
1584 {
1585         struct mempolicy *mempolicy;
1586         int nid;
1587
1588         if (!(mask && current->mempolicy))
1589                 return false;
1590
1591         mempolicy = current->mempolicy;
1592         switch (mempolicy->mode) {
1593         case MPOL_PREFERRED:
1594                 if (mempolicy->flags & MPOL_F_LOCAL)
1595                         nid = numa_node_id();
1596                 else
1597                         nid = mempolicy->v.preferred_node;
1598                 init_nodemask_of_node(mask, nid);
1599                 break;
1600
1601         case MPOL_BIND:
1602                 /* Fall through */
1603         case MPOL_INTERLEAVE:
1604                 *mask =  mempolicy->v.nodes;
1605                 break;
1606
1607         default:
1608                 BUG();
1609         }
1610
1611         return true;
1612 }
1613 #endif
1614
1615 /* Allocate a page in interleaved policy.
1616    Own path because it needs to do special accounting. */
1617 static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
1618                                         unsigned nid)
1619 {
1620         struct zonelist *zl;
1621         struct page *page;
1622
1623         zl = node_zonelist(nid, gfp);
1624         page = __alloc_pages(gfp, order, zl);
1625         if (page && page_zone(page) == zonelist_zone(&zl->_zonerefs[0]))
1626                 inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
1627         return page;
1628 }
1629
1630 /**
1631  *      alloc_page_vma  - Allocate a page for a VMA.
1632  *
1633  *      @gfp:
1634  *      %GFP_USER    user allocation.
1635  *      %GFP_KERNEL  kernel allocations,
1636  *      %GFP_HIGHMEM highmem/user allocations,
1637  *      %GFP_FS      allocation should not call back into a file system.
1638  *      %GFP_ATOMIC  don't sleep.
1639  *
1640  *      @vma:  Pointer to VMA or NULL if not available.
1641  *      @addr: Virtual Address of the allocation. Must be inside the VMA.
1642  *
1643  *      This function allocates a page from the kernel page pool and applies
1644  *      a NUMA policy associated with the VMA or the current process.
1645  *      When VMA is not NULL caller must hold down_read on the mmap_sem of the
1646  *      mm_struct of the VMA to prevent it from going away. Should be used for
1647  *      all allocations for pages that will be mapped into
1648  *      user space. Returns NULL when no page can be allocated.
1649  *
1650  *      Should be called with the mm_sem of the vma hold.
1651  */
1652 struct page *
1653 alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
1654 {
1655         struct mempolicy *pol = get_vma_policy(current, vma, addr);
1656         struct zonelist *zl;
1657
1658         if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
1659                 unsigned nid;
1660
1661                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
1662                 mpol_cond_put(pol);
1663                 return alloc_page_interleave(gfp, 0, nid);
1664         }
1665         zl = policy_zonelist(gfp, pol);
1666         if (unlikely(mpol_needs_cond_ref(pol))) {
1667                 /*
1668                  * slow path: ref counted shared policy
1669                  */
1670                 struct page *page =  __alloc_pages_nodemask(gfp, 0,
1671                                                 zl, policy_nodemask(gfp, pol));
1672                 __mpol_put(pol);
1673                 return page;
1674         }
1675         /*
1676          * fast path:  default or task policy
1677          */
1678         return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
1679 }
1680
1681 /**
1682  *      alloc_pages_current - Allocate pages.
1683  *
1684  *      @gfp:
1685  *              %GFP_USER   user allocation,
1686  *              %GFP_KERNEL kernel allocation,
1687  *              %GFP_HIGHMEM highmem allocation,
1688  *              %GFP_FS     don't call back into a file system.
1689  *              %GFP_ATOMIC don't sleep.
1690  *      @order: Power of two of allocation size in pages. 0 is a single page.
1691  *
1692  *      Allocate a page from the kernel page pool.  When not in
1693  *      interrupt context and apply the current process NUMA policy.
1694  *      Returns NULL when no page can be allocated.
1695  *
1696  *      Don't call cpuset_update_task_memory_state() unless
1697  *      1) it's ok to take cpuset_sem (can WAIT), and
1698  *      2) allocating for current task (not interrupt).
1699  */
1700 struct page *alloc_pages_current(gfp_t gfp, unsigned order)
1701 {
1702         struct mempolicy *pol = current->mempolicy;
1703
1704         if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
1705                 pol = &default_policy;
1706
1707         /*
1708          * No reference counting needed for current->mempolicy
1709          * nor system default_policy
1710          */
1711         if (pol->mode == MPOL_INTERLEAVE)
1712                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
1713         return __alloc_pages_nodemask(gfp, order,
1714                         policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
1715 }
1716 EXPORT_SYMBOL(alloc_pages_current);
1717
1718 /*
1719  * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
1720  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
1721  * with the mems_allowed returned by cpuset_mems_allowed().  This
1722  * keeps mempolicies cpuset relative after its cpuset moves.  See
1723  * further kernel/cpuset.c update_nodemask().
1724  */
1725
1726 /* Slow path of a mempolicy duplicate */
1727 struct mempolicy *__mpol_dup(struct mempolicy *old)
1728 {
1729         struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
1730
1731         if (!new)
1732                 return ERR_PTR(-ENOMEM);
1733         rcu_read_lock();
1734         if (current_cpuset_is_being_rebound()) {
1735                 nodemask_t mems = cpuset_mems_allowed(current);
1736                 mpol_rebind_policy(old, &mems);
1737         }
1738         rcu_read_unlock();
1739         *new = *old;
1740         atomic_set(&new->refcnt, 1);
1741         return new;
1742 }
1743
1744 /*
1745  * If *frompol needs [has] an extra ref, copy *frompol to *tompol ,
1746  * eliminate the * MPOL_F_* flags that require conditional ref and
1747  * [NOTE!!!] drop the extra ref.  Not safe to reference *frompol directly
1748  * after return.  Use the returned value.
1749  *
1750  * Allows use of a mempolicy for, e.g., multiple allocations with a single
1751  * policy lookup, even if the policy needs/has extra ref on lookup.
1752  * shmem_readahead needs this.
1753  */
1754 struct mempolicy *__mpol_cond_copy(struct mempolicy *tompol,
1755                                                 struct mempolicy *frompol)
1756 {
1757         if (!mpol_needs_cond_ref(frompol))
1758                 return frompol;
1759
1760         *tompol = *frompol;
1761         tompol->flags &= ~MPOL_F_SHARED;        /* copy doesn't need unref */
1762         __mpol_put(frompol);
1763         return tompol;
1764 }
1765
1766 static int mpol_match_intent(const struct mempolicy *a,
1767                              const struct mempolicy *b)
1768 {
1769         if (a->flags != b->flags)
1770                 return 0;
1771         if (!mpol_store_user_nodemask(a))
1772                 return 1;
1773         return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
1774 }
1775
1776 /* Slow path of a mempolicy comparison */
1777 int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
1778 {
1779         if (!a || !b)
1780                 return 0;
1781         if (a->mode != b->mode)
1782                 return 0;
1783         if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
1784                 return 0;
1785         switch (a->mode) {
1786         case MPOL_BIND:
1787                 /* Fall through */
1788         case MPOL_INTERLEAVE:
1789                 return nodes_equal(a->v.nodes, b->v.nodes);
1790         case MPOL_PREFERRED:
1791                 return a->v.preferred_node == b->v.preferred_node &&
1792                         a->flags == b->flags;
1793         default:
1794                 BUG();
1795                 return 0;
1796         }
1797 }
1798
1799 /*
1800  * Shared memory backing store policy support.
1801  *
1802  * Remember policies even when nobody has shared memory mapped.
1803  * The policies are kept in Red-Black tree linked from the inode.
1804  * They are protected by the sp->lock spinlock, which should be held
1805  * for any accesses to the tree.
1806  */
1807
1808 /* lookup first element intersecting start-end */
1809 /* Caller holds sp->lock */
1810 static struct sp_node *
1811 sp_lookup(struct shared_policy *sp, unsigned long start, unsigned long end)
1812 {
1813         struct rb_node *n = sp->root.rb_node;
1814
1815         while (n) {
1816                 struct sp_node *p = rb_entry(n, struct sp_node, nd);
1817
1818                 if (start >= p->end)
1819                         n = n->rb_right;
1820                 else if (end <= p->start)
1821                         n = n->rb_left;
1822                 else
1823                         break;
1824         }
1825         if (!n)
1826                 return NULL;
1827         for (;;) {
1828                 struct sp_node *w = NULL;
1829                 struct rb_node *prev = rb_prev(n);
1830                 if (!prev)
1831                         break;
1832                 w = rb_entry(prev, struct sp_node, nd);
1833                 if (w->end <= start)
1834                         break;
1835                 n = prev;
1836         }
1837         return rb_entry(n, struct sp_node, nd);
1838 }
1839
1840 /* Insert a new shared policy into the list. */
1841 /* Caller holds sp->lock */
1842 static void sp_insert(struct shared_policy *sp, struct sp_node *new)
1843 {
1844         struct rb_node **p = &sp->root.rb_node;
1845         struct rb_node *parent = NULL;
1846         struct sp_node *nd;
1847
1848         while (*p) {
1849                 parent = *p;
1850                 nd = rb_entry(parent, struct sp_node, nd);
1851                 if (new->start < nd->start)
1852                         p = &(*p)->rb_left;
1853                 else if (new->end > nd->end)
1854                         p = &(*p)->rb_right;
1855                 else
1856                         BUG();
1857         }
1858         rb_link_node(&new->nd, parent, p);
1859         rb_insert_color(&new->nd, &sp->root);
1860         pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
1861                  new->policy ? new->policy->mode : 0);
1862 }
1863
1864 /* Find shared policy intersecting idx */
1865 struct mempolicy *
1866 mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
1867 {
1868         struct mempolicy *pol = NULL;
1869         struct sp_node *sn;
1870
1871         if (!sp->root.rb_node)
1872                 return NULL;
1873         spin_lock(&sp->lock);
1874         sn = sp_lookup(sp, idx, idx+1);
1875         if (sn) {
1876                 mpol_get(sn->policy);
1877                 pol = sn->policy;
1878         }
1879         spin_unlock(&sp->lock);
1880         return pol;
1881 }
1882
1883 static void sp_delete(struct shared_policy *sp, struct sp_node *n)
1884 {
1885         pr_debug("deleting %lx-l%lx\n", n->start, n->end);
1886         rb_erase(&n->nd, &sp->root);
1887         mpol_put(n->policy);
1888         kmem_cache_free(sn_cache, n);
1889 }
1890
1891 static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
1892                                 struct mempolicy *pol)
1893 {
1894         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
1895
1896         if (!n)
1897                 return NULL;
1898         n->start = start;
1899         n->end = end;
1900         mpol_get(pol);
1901         pol->flags |= MPOL_F_SHARED;    /* for unref */
1902         n->policy = pol;
1903         return n;
1904 }
1905
1906 /* Replace a policy range. */
1907 static int shared_policy_replace(struct shared_policy *sp, unsigned long start,
1908                                  unsigned long end, struct sp_node *new)
1909 {
1910         struct sp_node *n, *new2 = NULL;
1911
1912 restart:
1913         spin_lock(&sp->lock);
1914         n = sp_lookup(sp, start, end);
1915         /* Take care of old policies in the same range. */
1916         while (n && n->start < end) {
1917                 struct rb_node *next = rb_next(&n->nd);
1918                 if (n->start >= start) {
1919                         if (n->end <= end)
1920                                 sp_delete(sp, n);
1921                         else
1922                                 n->start = end;
1923                 } else {
1924                         /* Old policy spanning whole new range. */
1925                         if (n->end > end) {
1926                                 if (!new2) {
1927                                         spin_unlock(&sp->lock);
1928                                         new2 = sp_alloc(end, n->end, n->policy);
1929                                         if (!new2)
1930                                                 return -ENOMEM;
1931                                         goto restart;
1932                                 }
1933                                 n->end = start;
1934                                 sp_insert(sp, new2);
1935                                 new2 = NULL;
1936                                 break;
1937                         } else
1938                                 n->end = start;
1939                 }
1940                 if (!next)
1941                         break;
1942                 n = rb_entry(next, struct sp_node, nd);
1943         }
1944         if (new)
1945                 sp_insert(sp, new);
1946         spin_unlock(&sp->lock);
1947         if (new2) {
1948                 mpol_put(new2->policy);
1949                 kmem_cache_free(sn_cache, new2);
1950         }
1951         return 0;
1952 }
1953
1954 /**
1955  * mpol_shared_policy_init - initialize shared policy for inode
1956  * @sp: pointer to inode shared policy
1957  * @mpol:  struct mempolicy to install
1958  *
1959  * Install non-NULL @mpol in inode's shared policy rb-tree.
1960  * On entry, the current task has a reference on a non-NULL @mpol.
1961  * This must be released on exit.
1962  * This is called at get_inode() calls and we can use GFP_KERNEL.
1963  */
1964 void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
1965 {
1966         int ret;
1967
1968         sp->root = RB_ROOT;             /* empty tree == default mempolicy */
1969         spin_lock_init(&sp->lock);
1970
1971         if (mpol) {
1972                 struct vm_area_struct pvma;
1973                 struct mempolicy *new;
1974                 NODEMASK_SCRATCH(scratch);
1975
1976                 if (!scratch)
1977                         return;
1978                 /* contextualize the tmpfs mount point mempolicy */
1979                 new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
1980                 if (IS_ERR(new)) {
1981                         mpol_put(mpol); /* drop our ref on sb mpol */
1982                         NODEMASK_SCRATCH_FREE(scratch);
1983                         return;         /* no valid nodemask intersection */
1984                 }
1985
1986                 task_lock(current);
1987                 ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
1988                 task_unlock(current);
1989                 mpol_put(mpol); /* drop our ref on sb mpol */
1990                 if (ret) {
1991                         NODEMASK_SCRATCH_FREE(scratch);
1992                         mpol_put(new);
1993                         return;
1994                 }
1995
1996                 /* Create pseudo-vma that contains just the policy */
1997                 memset(&pvma, 0, sizeof(struct vm_area_struct));
1998                 pvma.vm_end = TASK_SIZE;        /* policy covers entire file */
1999                 mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
2000                 mpol_put(new);                  /* drop initial ref */
2001                 NODEMASK_SCRATCH_FREE(scratch);
2002         }
2003 }
2004
2005 int mpol_set_shared_policy(struct shared_policy *info,
2006                         struct vm_area_struct *vma, struct mempolicy *npol)
2007 {
2008         int err;
2009         struct sp_node *new = NULL;
2010         unsigned long sz = vma_pages(vma);
2011
2012         pr_debug("set_shared_policy %lx sz %lu %d %d %lx\n",
2013                  vma->vm_pgoff,
2014                  sz, npol ? npol->mode : -1,
2015                  npol ? npol->flags : -1,
2016                  npol ? nodes_addr(npol->v.nodes)[0] : -1);
2017
2018         if (npol) {
2019                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
2020                 if (!new)
2021                         return -ENOMEM;
2022         }
2023         err = shared_policy_replace(info, vma->vm_pgoff, vma->vm_pgoff+sz, new);
2024         if (err && new)
2025                 kmem_cache_free(sn_cache, new);
2026         return err;
2027 }
2028
2029 /* Free a backing policy store on inode delete. */
2030 void mpol_free_shared_policy(struct shared_policy *p)
2031 {
2032         struct sp_node *n;
2033         struct rb_node *next;
2034
2035         if (!p->root.rb_node)
2036                 return;
2037         spin_lock(&p->lock);
2038         next = rb_first(&p->root);
2039         while (next) {
2040                 n = rb_entry(next, struct sp_node, nd);
2041                 next = rb_next(&n->nd);
2042                 rb_erase(&n->nd, &p->root);
2043                 mpol_put(n->policy);
2044                 kmem_cache_free(sn_cache, n);
2045         }
2046         spin_unlock(&p->lock);
2047 }
2048
2049 /* assumes fs == KERNEL_DS */
2050 void __init numa_policy_init(void)
2051 {
2052         nodemask_t interleave_nodes;
2053         unsigned long largest = 0;
2054         int nid, prefer = 0;
2055
2056         policy_cache = kmem_cache_create("numa_policy",
2057                                          sizeof(struct mempolicy),
2058                                          0, SLAB_PANIC, NULL);
2059
2060         sn_cache = kmem_cache_create("shared_policy_node",
2061                                      sizeof(struct sp_node),
2062                                      0, SLAB_PANIC, NULL);
2063
2064         /*
2065          * Set interleaving policy for system init. Interleaving is only
2066          * enabled across suitably sized nodes (default is >= 16MB), or
2067          * fall back to the largest node if they're all smaller.
2068          */
2069         nodes_clear(interleave_nodes);
2070         for_each_node_state(nid, N_HIGH_MEMORY) {
2071                 unsigned long total_pages = node_present_pages(nid);
2072
2073                 /* Preserve the largest node */
2074                 if (largest < total_pages) {
2075                         largest = total_pages;
2076                         prefer = nid;
2077                 }
2078
2079                 /* Interleave this node? */
2080                 if ((total_pages << PAGE_SHIFT) >= (16 << 20))
2081                         node_set(nid, interleave_nodes);
2082         }
2083
2084         /* All too small, use the largest */
2085         if (unlikely(nodes_empty(interleave_nodes)))
2086                 node_set(prefer, interleave_nodes);
2087
2088         if (do_set_mempolicy(MPOL_INTERLEAVE, 0, &interleave_nodes))
2089                 printk("numa_policy_init: interleaving failed\n");
2090 }
2091
2092 /* Reset policy of current process to default */
2093 void numa_default_policy(void)
2094 {
2095         do_set_mempolicy(MPOL_DEFAULT, 0, NULL);
2096 }
2097
/*
 * Parse and format mempolicy from/to strings
 */

/*
 * "local" is pseudo-policy:  MPOL_PREFERRED with MPOL_F_LOCAL flag
 * Used only for mpol_parse_str() and mpol_to_str()
 */
#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
static const char * const policy_types[] =
	{ "default", "prefer", "bind", "interleave", "local" };


#ifdef CONFIG_TMPFS
/**
 * mpol_parse_str - parse string to mempolicy
 * @str:  string containing mempolicy to parse
 * @mpol:  pointer to struct mempolicy pointer, returned on success.
 * @no_context:  flag whether to "contextualize" the mempolicy
 *
 * Format of input:
 *	<mode>[=<flags>][:<nodelist>]
 *
 * if @no_context is true, save the input nodemask in w.user_nodemask in
 * the returned mempolicy.  This will be used to "clone" the mempolicy in
 * a specific context [cpuset] at a later time.  Used to parse tmpfs mpol
 * mount option.  Note that if 'static' or 'relative' mode flags were
 * specified, the input nodemask will already have been saved.  Saving
 * it again is redundant, but safe.
 *
 * @str is modified while parsing but restored before returning.
 *
 * "default" (without a nodelist) succeeds with *@mpol set to NULL; no
 * policy object is created for it.
 *
 * On success, returns 0, else 1.  *@mpol is written only on success, and
 * no policy object is left allocated on failure.
 */
int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
{
	struct mempolicy *new = NULL;
	unsigned short uninitialized_var(mode);
	unsigned short mode_flags = 0;
	nodemask_t nodes;
	char *nodelist = strchr(str, ':');
	char *flags = strchr(str, '=');
	int i;
	int err = 1;

	if (nodelist) {
		/* NUL-terminate mode or flags string */
		*nodelist++ = '\0';
		if (nodelist_parse(nodelist, nodes))
			goto out;
		if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
			goto out;
	} else
		nodes_clear(nodes);

	if (flags)
		*flags++ = '\0';	/* terminate mode string */

	for (i = 0; i <= MPOL_LOCAL; i++) {
		if (!strcmp(str, policy_types[i])) {
			mode = i;
			break;
		}
	}
	if (i > MPOL_LOCAL)
		goto out;

	switch (mode) {
	case MPOL_PREFERRED:
		/*
		 * Insist on a nodelist of one node only
		 */
		if (!nodelist)
			goto out;
		{
			char *rest = nodelist;

			while (isdigit(*rest))
				rest++;
			if (*rest)
				goto out;
		}
		break;
	case MPOL_INTERLEAVE:
		/*
		 * Default to online nodes with memory if no nodelist
		 */
		if (!nodelist)
			nodes = node_states[N_HIGH_MEMORY];
		break;
	case MPOL_LOCAL:
		/*
		 * Don't allow a nodelist;  mpol_new() checks flags
		 */
		if (nodelist)
			goto out;
		mode = MPOL_PREFERRED;
		break;
	case MPOL_DEFAULT:
		/*
		 * The default policy is represented by a NULL mempolicy,
		 * so there is nothing to create.  Don't allow a nodelist.
		 */
		if (!nodelist)
			err = 0;
		goto out;
	case MPOL_BIND:
		/*
		 * Insist on a nodelist; mpol_new() enforces non-empty
		 * nodemask.
		 */
		if (!nodelist)
			goto out;
		break;
	}

	if (flags) {
		/*
		 * Currently, we only support two mutually exclusive
		 * mode flags.
		 */
		if (!strcmp(flags, "static"))
			mode_flags |= MPOL_F_STATIC_NODES;
		else if (!strcmp(flags, "relative"))
			mode_flags |= MPOL_F_RELATIVE_NODES;
		else
			goto out;	/* unknown flag: create nothing */
	}

	new = mpol_new(mode, mode_flags, &nodes);
	if (IS_ERR(new))
		goto out;

	{
		int ret;
		NODEMASK_SCRATCH(scratch);

		if (scratch) {
			task_lock(current);
			ret = mpol_set_nodemask(new, &nodes, scratch);
			task_unlock(current);
		} else
			ret = -ENOMEM;
		NODEMASK_SCRATCH_FREE(scratch);
		if (ret) {
			mpol_put(new);
			goto out;
		}
	}

	/* Everything checked out; only now is the parse a success. */
	err = 0;
	if (no_context) {
		/* save for contextualization */
		new->w.user_nodemask = nodes;
	}

out:
	/* Restore string for error message */
	if (nodelist)
		*--nodelist = ':';
	if (flags)
		*--flags = '=';
	if (!err)
		*mpol = new;
	return err;
}
#endif /* CONFIG_TMPFS */
2246
2247 /**
2248  * mpol_to_str - format a mempolicy structure for printing
2249  * @buffer:  to contain formatted mempolicy string
2250  * @maxlen:  length of @buffer
2251  * @pol:  pointer to mempolicy to be formatted
2252  * @no_context:  "context free" mempolicy - use nodemask in w.user_nodemask
2253  *
2254  * Convert a mempolicy into a string.
2255  * Returns the number of characters in buffer (if positive)
2256  * or an error (negative)
2257  */
2258 int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
2259 {
2260         char *p = buffer;
2261         int l;
2262         nodemask_t nodes;
2263         unsigned short mode;
2264         unsigned short flags = pol ? pol->flags : 0;
2265
2266         /*
2267          * Sanity check:  room for longest mode, flag and some nodes
2268          */
2269         VM_BUG_ON(maxlen < strlen("interleave") + strlen("relative") + 16);
2270
2271         if (!pol || pol == &default_policy)
2272                 mode = MPOL_DEFAULT;
2273         else
2274                 mode = pol->mode;
2275
2276         switch (mode) {
2277         case MPOL_DEFAULT:
2278                 nodes_clear(nodes);
2279                 break;
2280
2281         case MPOL_PREFERRED:
2282                 nodes_clear(nodes);
2283                 if (flags & MPOL_F_LOCAL)
2284                         mode = MPOL_LOCAL;      /* pseudo-policy */
2285                 else
2286                         node_set(pol->v.preferred_node, nodes);
2287                 break;
2288
2289         case MPOL_BIND:
2290                 /* Fall through */
2291         case MPOL_INTERLEAVE:
2292                 if (no_context)
2293                         nodes = pol->w.user_nodemask;
2294                 else
2295                         nodes = pol->v.nodes;
2296                 break;
2297
2298         default:
2299                 BUG();
2300         }
2301
2302         l = strlen(policy_types[mode]);
2303         if (buffer + maxlen < p + l + 1)
2304                 return -ENOSPC;
2305
2306         strcpy(p, policy_types[mode]);
2307         p += l;
2308
2309         if (flags & MPOL_MODE_FLAGS) {
2310                 if (buffer + maxlen < p + 2)
2311                         return -ENOSPC;
2312                 *p++ = '=';
2313
2314                 /*
2315                  * Currently, the only defined flags are mutually exclusive
2316                  */
2317                 if (flags & MPOL_F_STATIC_NODES)
2318                         p += snprintf(p, buffer + maxlen - p, "static");
2319                 else if (flags & MPOL_F_RELATIVE_NODES)
2320                         p += snprintf(p, buffer + maxlen - p, "relative");
2321         }
2322
2323         if (!nodes_empty(nodes)) {
2324                 if (buffer + maxlen < p + 2)
2325                         return -ENOSPC;
2326                 *p++ = ':';
2327                 p += nodelist_scnprintf(p, buffer + maxlen - p, nodes);
2328         }
2329         return p - buffer;
2330 }
2331
2332 struct numa_maps {
2333         unsigned long pages;
2334         unsigned long anon;
2335         unsigned long active;
2336         unsigned long writeback;
2337         unsigned long mapcount_max;
2338         unsigned long dirty;
2339         unsigned long swapcache;
2340         unsigned long node[MAX_NUMNODES];
2341 };
2342
2343 static void gather_stats(struct page *page, void *private, int pte_dirty)
2344 {
2345         struct numa_maps *md = private;
2346         int count = page_mapcount(page);
2347
2348         md->pages++;
2349         if (pte_dirty || PageDirty(page))
2350                 md->dirty++;
2351
2352         if (PageSwapCache(page))
2353                 md->swapcache++;
2354
2355         if (PageActive(page) || PageUnevictable(page))
2356                 md->active++;
2357
2358         if (PageWriteback(page))
2359                 md->writeback++;
2360
2361         if (PageAnon(page))
2362                 md->anon++;
2363
2364         if (count > md->mapcount_max)
2365                 md->mapcount_max = count;
2366
2367         md->node[page_to_nid(page)]++;
2368 }
2369
#ifdef CONFIG_HUGETLB_PAGE
/*
 * Walk [start, end) of a hugetlb VMA in huge page sized steps and account
 * every populated huge page in @md.
 */
static void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
	struct hstate *h = hstate_vma(vma);
	unsigned long step = huge_page_size(h);
	unsigned long addr;

	for (addr = start; addr < end; addr += step) {
		pte_t *ptep = huge_pte_offset(vma->vm_mm,
						addr & huge_page_mask(h));
		struct page *page;
		pte_t pte;

		if (!ptep)
			continue;

		pte = *ptep;
		if (pte_none(pte))
			continue;

		page = pte_page(pte);
		if (page)
			gather_stats(page, md, pte_dirty(*ptep));
	}
}
#else
/* Without hugetlb support there is nothing to account. */
static inline void check_huge_range(struct vm_area_struct *vma,
		unsigned long start, unsigned long end,
		struct numa_maps *md)
{
}
#endif
2406
2407 /*
2408  * Display pages allocated per node and memory policy via /proc.
2409  */
2410 int show_numa_map(struct seq_file *m, void *v)
2411 {
2412         struct proc_maps_private *priv = m->private;
2413         struct vm_area_struct *vma = v;
2414         struct numa_maps *md;
2415         struct file *file = vma->vm_file;
2416         struct mm_struct *mm = vma->vm_mm;
2417         struct mempolicy *pol;
2418         int n;
2419         char buffer[50];
2420
2421         if (!mm)
2422                 return 0;
2423
2424         md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
2425         if (!md)
2426                 return 0;
2427
2428         pol = get_vma_policy(priv->task, vma, vma->vm_start);
2429         mpol_to_str(buffer, sizeof(buffer), pol, 0);
2430         mpol_cond_put(pol);
2431
2432         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
2433
2434         if (file) {
2435                 seq_printf(m, " file=");
2436                 seq_path(m, &file->f_path, "\n\t= ");
2437         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
2438                 seq_printf(m, " heap");
2439         } else if (vma->vm_start <= mm->start_stack &&
2440                         vma->vm_end >= mm->start_stack) {
2441                 seq_printf(m, " stack");
2442         }
2443
2444         if (is_vm_hugetlb_page(vma)) {
2445                 check_huge_range(vma, vma->vm_start, vma->vm_end, md);
2446                 seq_printf(m, " huge");
2447         } else {
2448                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
2449                         &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
2450         }
2451
2452         if (!md->pages)
2453                 goto out;
2454
2455         if (md->anon)
2456                 seq_printf(m," anon=%lu",md->anon);
2457
2458         if (md->dirty)
2459                 seq_printf(m," dirty=%lu",md->dirty);
2460
2461         if (md->pages != md->anon && md->pages != md->dirty)
2462                 seq_printf(m, " mapped=%lu", md->pages);
2463
2464         if (md->mapcount_max > 1)
2465                 seq_printf(m, " mapmax=%lu", md->mapcount_max);
2466
2467         if (md->swapcache)
2468                 seq_printf(m," swapcache=%lu", md->swapcache);
2469
2470         if (md->active < md->pages && !is_vm_hugetlb_page(vma))
2471                 seq_printf(m," active=%lu", md->active);
2472
2473         if (md->writeback)
2474                 seq_printf(m," writeback=%lu", md->writeback);
2475
2476         for_each_node_state(n, N_HIGH_MEMORY)
2477                 if (md->node[n])
2478                         seq_printf(m, " N%d=%lu", n, md->node[n]);
2479 out:
2480         seq_putc(m, '\n');
2481         kfree(md);
2482
2483         if (m->count < m->size)
2484                 m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
2485         return 0;
2486 }