dm: table use uninitialized_var

[safe/jmp/linux-2.6] / mm / mempolicy.c
diff --git a/mm/mempolicy.c b/mm/mempolicy.c

index 2a82060..83c69f8 100644 (file)
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -72,20 +72,23 @@
  #include <linux/hugetlb.h>
  #include <linux/kernel.h>
  #include <linux/sched.h>
-#include <linux/mm.h>
  #include <linux/nodemask.h>
  #include <linux/cpuset.h>
  #include <linux/gfp.h>
  #include <linux/slab.h>
  #include <linux/string.h>
  #include <linux/module.h>
+#include <linux/nsproxy.h>
  #include <linux/interrupt.h>
  #include <linux/init.h>
  #include <linux/compat.h>
-#include <linux/mempolicy.h>
  #include <linux/swap.h>
  #include <linux/seq_file.h>
  #include <linux/proc_fs.h>
+#include <linux/migrate.h>
+#include <linux/rmap.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
  
  #include <asm/tlbflush.h>
  #include <asm/uaccess.h>
@@ -95,23 +98,21 @@
  #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1)         /* Invert check for nodemask */
  #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2)          /* Gather statistics */
  
-/* The number of pages to migrate per call to migrate_pages() */
-#define MIGRATE_CHUNK_SIZE 256
-
-static kmem_cache_t *policy_cache;
-static kmem_cache_t *sn_cache;
-
-#define PDprintk(fmt...)
+static struct kmem_cache *policy_cache;
+static struct kmem_cache *sn_cache;
  
  /* Highest zone. An specific allocation for a zone below that is not
     policied. */
-int policy_zone = ZONE_DMA;
+enum zone_type policy_zone = 0;
  
  struct mempolicy default_policy = {
         .refcnt = ATOMIC_INIT(1), /* never free it */
         .policy = MPOL_DEFAULT,
  };
  
+static void mpol_rebind_policy(struct mempolicy *pol,
+                               const nodemask_t *newmask);
+
  /* Do sanity checking on a policy */
  static int mpol_check_policy(int mode, nodemask_t *nodes)
  {
@@ -130,30 +131,41 @@ static int mpol_check_policy(int mode, nodemask_t *nodes)
                         return -EINVAL;
                 break;
         }
-       return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
+       return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
  }
  
  /* Generate a custom zonelist for the BIND policy. */
  static struct zonelist *bind_zonelist(nodemask_t *nodes)
  {
         struct zonelist *zl;
-       int num, max, nd, k;
+       int num, max, nd;
+       enum zone_type k;
  
         max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+       max++;                  /* space for zlcache_ptr (see mmzone.h) */
         zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
         if (!zl)
-               return NULL;
+               return ERR_PTR(-ENOMEM);
+       zl->zlcache_ptr = NULL;
         num = 0;
         /* First put in the highest zones from all nodes, then all the next 
            lower zones etc. Avoid empty zones because the memory allocator
            doesn't like them. If you implement node hot removal you
            have to fix that. */
-       for (k = policy_zone; k >= 0; k--) { 
+       k = MAX_NR_ZONES - 1;
+       while (1) {
                 for_each_node_mask(nd, *nodes) { 
                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
                         if (z->present_pages > 0) 
                                 zl->zones[num++] = z;
                 }
+               if (k == 0)
+                       break;
+               k--;
+       }
+       if (num == 0) {
+               kfree(zl);
+               return ERR_PTR(-EINVAL);
         }
         zl->zones[num] = NULL;
         return zl;
@@ -164,7 +176,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
  {
         struct mempolicy *policy;
  
-       PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
+       pr_debug("setting mode %d nodes[0] %lx\n",
+                mode, nodes ? nodes_addr(*nodes)[0] : -1);
+
         if (mode == MPOL_DEFAULT)
                 return NULL;
         policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
@@ -174,7 +188,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
         switch (mode) {
         case MPOL_INTERLEAVE:
                 policy->v.nodes = *nodes;
-               if (nodes_weight(*nodes) == 0) {
+               nodes_and(policy->v.nodes, policy->v.nodes,
+                                       node_states[N_HIGH_MEMORY]);
+               if (nodes_weight(policy->v.nodes) == 0) {
                         kmem_cache_free(policy_cache, policy);
                         return ERR_PTR(-EINVAL);
                 }
@@ -186,9 +202,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes)
                 break;
         case MPOL_BIND:
                 policy->v.zonelist = bind_zonelist(nodes);
-               if (policy->v.zonelist == NULL) {
+               if (IS_ERR(policy->v.zonelist)) {
+                       void *error_code = policy->v.zonelist;
                         kmem_cache_free(policy_cache, policy);
-                       return ERR_PTR(-ENOMEM);
+                       return error_code;
                 }
                 break;
         }
@@ -214,7 +231,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
         orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
         do {
                 struct page *page;
-               unsigned int nid;
+               int nid;
  
                 if (!pte_present(*pte))
                         continue;
@@ -309,15 +326,6 @@ static inline int check_pgd_range(struct vm_area_struct *vma,
         return 0;
  }
  
-/* Check if a vma is migratable */
-static inline int vma_migratable(struct vm_area_struct *vma)
-{
-       if (vma->vm_flags & (
-               VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
-               return 0;
-       return 1;
-}
-
  /*
   * Check if all pages in a range are on a set of nodes.
   * If pagelist != NULL then isolate pages from the LRU and
@@ -330,9 +338,12 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end,
         int err;
         struct vm_area_struct *first, *vma, *prev;
  
-       /* Clear the LRU lists so pages can be isolated */
-       if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
-               lru_add_drain_all();
+       if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+
+               err = migrate_prep();
+               if (err)
+                       return ERR_PTR(err);
+       }
  
         first = find_vma(mm, start);
         if (!first)
@@ -373,7 +384,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
         int err = 0;
         struct mempolicy *old = vma->vm_policy;
  
-       PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+       pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
                  vma->vm_start, vma->vm_end, vma->vm_pgoff,
                  vma->vm_ops, vma->vm_file,
                  vma->vm_ops ? vma->vm_ops->set_policy : NULL);
@@ -421,8 +432,39 @@ static int contextualize_policy(int mode, nodemask_t *nodes)
         return mpol_check_policy(mode, nodes);
  }
  
+
+/*
+ * Keep the PF_MEMPOLICY bit of task->flags in step with the task's
+ * mempolicy pointer: the bit is on exactly when a non-default policy
+ * is installed, so allocation hot paths can test one flag (possibly
+ * together with other PF_* bits) instead of chasing the pointer.
+ *
+ * Callers outside this file must only pass a freshly forked child
+ * that is not yet visible on the task list; changing the flags of
+ * a task that others can already see is not safe.
+ *
+ * That restriction is where the odd name
+ * mpol_fix_fork_child_flag() comes from.
+ *
+ * Passing current is safe as well; inside this file that is done
+ * through the static wrapper mpol_set_task_struct_flag().
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+       if (!p->mempolicy)
+               p->flags &= ~PF_MEMPOLICY;
+       else
+               p->flags |= PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)    /* acts on current only */
+{
+       mpol_fix_fork_child_flag(current);      /* resync PF_MEMPOLICY */
+}
+
  /* Set the process memory policy */
-long do_set_mempolicy(int mode, nodemask_t *nodes)
+static long do_set_mempolicy(int mode, nodemask_t *nodes)
  {
         struct mempolicy *new;
  
@@ -433,6 +475,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes)
                 return PTR_ERR(new);
         mpol_free(current->mempolicy);
         current->mempolicy = new;
+       mpol_set_task_struct_flag();
         if (new && new->policy == MPOL_INTERLEAVE)
                 current->il_next = first_node(new->v.nodes);
         return 0;
@@ -447,7 +490,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
         switch (p->policy) {
         case MPOL_BIND:
                 for (i = 0; p->v.zonelist->zones[i]; i++)
-                       node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+                       node_set(zone_to_nid(p->v.zonelist->zones[i]),
                                 *nodes);
                 break;
         case MPOL_DEFAULT:
@@ -456,9 +499,9 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes)
                 *nodes = p->v.nodes;
                 break;
         case MPOL_PREFERRED:
-               /* or use current node instead of online map? */
+               /* or use current node instead of memory_map? */
                 if (p->v.preferred_node < 0)
-                       *nodes = node_online_map;
+                       *nodes = node_states[N_HIGH_MEMORY];
                 else
                         node_set(p->v.preferred_node, *nodes);
                 break;
@@ -481,8 +524,8 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
  }
  
  /* Retrieve NUMA policy */
-long do_get_mempolicy(int *policy, nodemask_t *nmask,
-                       unsigned long addr, unsigned long flags)
+static long do_get_mempolicy(int *policy, nodemask_t *nmask,
+                            unsigned long addr, unsigned long flags)
  {
         int err;
         struct mm_struct *mm = current->mm;
@@ -490,8 +533,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
         struct mempolicy *pol = current->mempolicy;
  
         cpuset_update_task_memory_state();
-       if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
+       if (flags &
+               ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
                 return -EINVAL;
+
+       if (flags & MPOL_F_MEMS_ALLOWED) {
+               if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
+                       return -EINVAL;
+               *policy = 0;    /* just so it's initialized */
+               *nmask  = cpuset_current_mems_allowed;
+               return 0;
+       }
+
         if (flags & MPOL_F_ADDR) {
                 down_read(&mm->mmap_sem);
                 vma = find_vma_intersection(mm, addr, addr+1);
@@ -540,99 +593,31 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
         return err;
  }
  
+#ifdef CONFIG_MIGRATION
  /*
   * page migration
   */
-
  static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                 unsigned long flags)
  {
         /*
          * Avoid migrating a page that is shared with others.
          */
-       if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
-               if (isolate_lru_page(page))
-                       list_add_tail(&page->lru, pagelist);
-       }
+       if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
+               isolate_lru_page(page, pagelist);
  }
  
-/*
- * Migrate the list 'pagelist' of pages to a certain destination.
- *
- * Specify destination with either non-NULL vma or dest_node >= 0
- * Return the number of pages not migrated or error code
- */
-static int migrate_pages_to(struct list_head *pagelist,
-                       struct vm_area_struct *vma, int dest)
+static struct page *new_node_page(struct page *page, unsigned long node, int **x)
  {
-       LIST_HEAD(newlist);
-       LIST_HEAD(moved);
-       LIST_HEAD(failed);
-       int err = 0;
-       unsigned long offset = 0;
-       int nr_pages;
-       struct page *page;
-       struct list_head *p;
-
-redo:
-       nr_pages = 0;
-       list_for_each(p, pagelist) {
-               if (vma) {
-                       /*
-                        * The address passed to alloc_page_vma is used to
-                        * generate the proper interleave behavior. We fake
-                        * the address here by an increasing offset in order
-                        * to get the proper distribution of pages.
-                        *
-                        * No decision has been made as to which page
-                        * a certain old page is moved to so we cannot
-                        * specify the correct address.
-                        */
-                       page = alloc_page_vma(GFP_HIGHUSER, vma,
-                                       offset + vma->vm_start);
-                       offset += PAGE_SIZE;
-               }
-               else
-                       page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
-
-               if (!page) {
-                       err = -ENOMEM;
-                       goto out;
-               }
-               list_add_tail(&page->lru, &newlist);
-               nr_pages++;
-               if (nr_pages > MIGRATE_CHUNK_SIZE)
-                       break;
-       }
-       err = migrate_pages(pagelist, &newlist, &moved, &failed);
-
-       putback_lru_pages(&moved);      /* Call release pages instead ?? */
-
-       if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
-               goto redo;
-out:
-       /* Return leftover allocated pages */
-       while (!list_empty(&newlist)) {
-               page = list_entry(newlist.next, struct page, lru);
-               list_del(&page->lru);
-               __free_page(page);
-       }
-       list_splice(&failed, pagelist);
-       if (err < 0)
-               return err;
-
-       /* Calculate number of leftover pages */
-       nr_pages = 0;
-       list_for_each(p, pagelist)
-               nr_pages++;
-       return nr_pages;
+       return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
  }
  
  /*
   * Migrate pages from one node to a target node.
   * Returns error or the number of pages not migrated.
   */
-int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
+static int migrate_to_node(struct mm_struct *mm, int source, int dest,
+                          int flags)
  {
         nodemask_t nmask;
         LIST_HEAD(pagelist);
@@ -644,11 +629,9 @@ int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
         check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
                         flags | MPOL_MF_DISCONTIG_OK, &pagelist);
  
-       if (!list_empty(&pagelist)) {
-               err = migrate_pages_to(&pagelist, NULL, dest);
-               if (!list_empty(&pagelist))
-                       putback_lru_pages(&pagelist);
-       }
+       if (!list_empty(&pagelist))
+               err = migrate_pages(&pagelist, new_node_page, dest);
+
         return err;
  }
  
@@ -668,6 +651,10 @@ int do_migrate_pages(struct mm_struct *mm,
  
         down_read(&mm->mmap_sem);
  
+       err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+       if (err)
+               goto out;
+
  /*
   * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
   * bit in 'to' is not also set in 'tmp'.  Clear the found 'source'
@@ -727,15 +714,60 @@ int do_migrate_pages(struct mm_struct *mm,
                 if (err < 0)
                         break;
         }
-
+out:
         up_read(&mm->mmap_sem);
         if (err < 0)
                 return err;
         return busy;
+
  }
  
-long do_mbind(unsigned long start, unsigned long len,
-               unsigned long mode, nodemask_t *nmask, unsigned long flags)
+/*
+ * Allocate a new page for page migration based on vma policy.
+ * @private carries the vma assumed to map @page; if it does not, walk
+ * forward along vm_next until one does.  N.B., this assumes that the
+ * list of pages handed to migrate_pages()--which is how we get here--
+ * is in virtual address order.  @x is unused here.
+ */
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+       struct vm_area_struct *vma = (struct vm_area_struct *)private;
+       unsigned long address = 0;      /* defined even if no vma is walked */
+
+       while (vma) {
+               address = page_address_in_vma(page, vma);
+               if (address != -EFAULT)         /* -EFAULT: try the next vma */
+                       break;
+               vma = vma->vm_next;
+       }
+
+       /*
+        * if !vma, alloc_page_vma() will use task or system default policy
+        */
+       return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+}
+#else
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+                               unsigned long flags)
+{
+}      /* no-op stub for builds without CONFIG_MIGRATION */
+
+int do_migrate_pages(struct mm_struct *mm,
+       const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+       return -ENOSYS;         /* page migration is compiled out */
+}
+
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+       return NULL;    /* stub without CONFIG_MIGRATION: never allocates */
+}
+#endif
+
+static long do_mbind(unsigned long start, unsigned long len,
+                    unsigned long mode, nodemask_t *nmask,
+                    unsigned long flags)
  {
         struct vm_area_struct *vma;
         struct mm_struct *mm = current->mm;
@@ -779,8 +811,8 @@ long do_mbind(unsigned long start, unsigned long len,
         if (!new)
                 flags |= MPOL_MF_DISCONTIG_OK;
  
-       PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
-                       mode,nodes_addr(nodes)[0]);
+       pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+                mode, nmask ? nodes_addr(*nmask)[0] : -1);
  
         down_write(&mm->mmap_sem);
         vma = check_range(mm, start, end, nmask,
@@ -793,13 +825,12 @@ long do_mbind(unsigned long start, unsigned long len,
                 err = mbind_range(vma, start, end, new);
  
                 if (!list_empty(&pagelist))
-                       nr_failed = migrate_pages_to(&pagelist, vma, -1);
+                       nr_failed = migrate_pages(&pagelist, new_vma_page,
+                                               (unsigned long)vma);
  
                 if (!err && nr_failed && (flags & MPOL_MF_STRICT))
                         err = -EIO;
         }
-       if (!list_empty(&pagelist))
-               putback_lru_pages(&pagelist);
  
         up_write(&mm->mmap_sem);
         mpol_free(new);
@@ -884,6 +915,10 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len,
         err = get_nodes(&nodes, nmask, maxnode);
         if (err)
                 return err;
+#ifdef CONFIG_CPUSETS
+       /* Restrict the nodes to the allowed nodes in the cpuset */
+       nodes_and(nodes, nodes, current->mems_allowed);
+#endif
         return do_mbind(start, len, mode, &nodes, flags);
  }
  
@@ -923,7 +958,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
  
         /* Find the mm_struct */
         read_lock(&tasklist_lock);
-       task = pid ? find_task_by_pid(pid) : current;
+       task = pid ? find_task_by_vpid(pid) : current;
         if (!task) {
                 read_unlock(&tasklist_lock);
                 return -ESRCH;
@@ -937,7 +972,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
         /*
          * Check if this process has the right to modify the specified
          * process. The right exists if the process has administrative
-        * capabilities, superuser priviledges or the same
+        * capabilities, superuser privileges or the same
          * userid as the target process.
          */
         if ((current->euid != task->suid) && (current->euid != task->uid) &&
@@ -954,6 +989,15 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
                 goto out;
         }
  
+       if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
+               err = -EINVAL;
+               goto out;
+       }
+
+       err = security_task_movememory(task);
+       if (err)
+               goto out;
+
         err = do_migrate_pages(mm, &old, &new,
                 capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
  out:
@@ -968,7 +1012,8 @@ asmlinkage long sys_get_mempolicy(int __user *policy,
                                 unsigned long maxnode,
                                 unsigned long addr, unsigned long flags)
  {
-       int err, pval;
+       int err;
+       int uninitialized_var(pval);
         nodemask_t nodes;
  
         if (nmask != NULL && maxnode < MAX_NUMNODES)
@@ -1067,21 +1112,37 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
  
  #endif
  
-/* Return effective policy for a VMA */
+/*
+ * get_vma_policy(@task, @vma, @addr)
+ * @task - task for fallback if vma policy == default
+ * @vma   - virtual memory area whose policy is sought
+ * @addr  - address in @vma for shared policy lookup
+ *
+ * Returns effective policy for a VMA at specified address.
+ * Falls back to @task or system default policy, as necessary.
+ * Returned policy has extra reference count if shared, vma,
+ * or some other task's policy [show_numa_maps() can pass
+ * @task != current].  It is the caller's responsibility to
+ * free the reference in these cases.
+ */
  static struct mempolicy * get_vma_policy(struct task_struct *task,
                 struct vm_area_struct *vma, unsigned long addr)
  {
         struct mempolicy *pol = task->mempolicy;
+       int shared_pol = 0;
  
         if (vma) {
-               if (vma->vm_ops && vma->vm_ops->get_policy)
+               if (vma->vm_ops && vma->vm_ops->get_policy) {
                         pol = vma->vm_ops->get_policy(vma, addr);
-               else if (vma->vm_policy &&
+                       shared_pol = 1; /* if pol non-NULL, add ref below */
+               } else if (vma->vm_policy &&
                                 vma->vm_policy->policy != MPOL_DEFAULT)
                         pol = vma->vm_policy;
         }
         if (!pol)
                 pol = &default_policy;
+       else if (!shared_pol && pol != current->mempolicy)
+               mpol_get(pol);  /* vma or other task's policy */
         return pol;
  }
  
@@ -1134,7 +1195,9 @@ static unsigned interleave_nodes(struct mempolicy *policy)
   */
  unsigned slab_node(struct mempolicy *policy)
  {
-       switch (policy->policy) {
+       int pol = policy ? policy->policy : MPOL_DEFAULT;
+
+       switch (pol) {
         case MPOL_INTERLEAVE:
                 return interleave_nodes(policy);
  
@@ -1143,7 +1206,7 @@ unsigned slab_node(struct mempolicy *policy)
                  * Follow bind policy behavior and start allocation at the
                  * first node.
                  */
-               return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+               return zone_to_nid(policy->v.zonelist->zones[0]);
  
         case MPOL_PREFERRED:
                 if (policy->v.preferred_node >= 0)
@@ -1179,7 +1242,15 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
         if (vma) {
                 unsigned long off;
  
-               off = vma->vm_pgoff;
+               /*
+                * for small pages, there is no difference between
+                * shift and PAGE_SHIFT, so the bit-shift is safe.
+                * for huge pages, since vm_pgoff is in units of small
+                * pages, we need to shift off the always 0 bits to get
+                * a useful offset.
+                */
+               BUG_ON(shift < PAGE_SHIFT);
+               off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
                 off += (addr - vma->vm_start) >> shift;
                 return offset_il_node(pol, vma, off);
         } else
@@ -1187,18 +1258,45 @@ static inline unsigned interleave_nid(struct mempolicy *pol,
  }
  
  #ifdef CONFIG_HUGETLBFS
-/* Return a zonelist suitable for a huge page allocation. */
-struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+/*
+ * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
+ * @vma = virtual memory area whose policy is sought
+ * @addr = address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags = for requested zone, used for every policy mode
+ * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ *
+ * Returns a zonelist suitable for a huge page allocation.
+ * If the effective policy is 'BIND, returns pointer to policy's zonelist.
+ * If it is also a policy for which get_vma_policy() returns an extra
+ * reference, we must hold that reference until after allocation.
+ * In that case, return policy via @mpol so hugetlb allocation can drop
+ * the reference.  For non-'BIND referenced policies, we can/do drop the
+ * reference here; default and current task policy carry no extra
+ * reference, so they are never unreferenced here.
+ */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
+                               gfp_t gfp_flags, struct mempolicy **mpol)
  {
         struct mempolicy *pol = get_vma_policy(current, vma, addr);
+       struct zonelist *zl;
  
+       *mpol = NULL;           /* probably no unref needed */
         if (pol->policy == MPOL_INTERLEAVE) {
                 unsigned nid;
  
                 nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
-               return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+               if (unlikely(pol != current->mempolicy))
+                       __mpol_free(pol);       /* drop the extra ref only */
+               return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
         }
-       return zonelist_policy(GFP_HIGHUSER, pol);
+       zl = zonelist_policy(gfp_flags, pol);
+       if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
+               if (pol->policy != MPOL_BIND)
+                       __mpol_free(pol);       /* finished with pol */
+               else
+                       *mpol = pol;    /* unref needed after allocation */
+       }
+       return zl;
  }
  #endif
  
@@ -1212,10 +1310,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
  
         zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
         page = __alloc_pages(gfp, order, zl);
-       if (page && page_zone(page) == zl->zones[0]) {
-               zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
-               put_cpu();
-       }
+       if (page && page_zone(page) == zl->zones[0])
+               inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
         return page;
  }
  
@@ -1245,6 +1341,7 @@ struct page *
  alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
  {
         struct mempolicy *pol = get_vma_policy(current, vma, addr);
+       struct zonelist *zl;
  
         cpuset_update_task_memory_state();
  
@@ -1254,7 +1351,19 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
                 nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
                 return alloc_page_interleave(gfp, 0, nid);
         }
-       return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
+       zl = zonelist_policy(gfp, pol);
+       if (pol != &default_policy && pol != current->mempolicy) {
+               /*
+                * slow path: ref counted policy -- shared or vma
+                */
+               struct page *page =  __alloc_pages(gfp, 0, zl);
+               __mpol_free(pol);
+               return page;
+       }
+       /*
+        * fast path:  default or task policy
+        */
+       return __alloc_pages(gfp, 0, zl);
  }
  
  /**
@@ -1282,7 +1391,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
  
         if ((gfp & __GFP_WAIT) && !in_interrupt())
                 cpuset_update_task_memory_state();
-       if (!pol || in_interrupt())
+       if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
                 pol = &default_policy;
         if (pol->policy == MPOL_INTERLEAVE)
                 return alloc_page_interleave(gfp, order, interleave_nodes(pol));
@@ -1297,7 +1406,6 @@ EXPORT_SYMBOL(alloc_pages_current);
   * keeps mempolicies cpuset relative after its cpuset moves.  See
   * further kernel/cpuset.c update_nodemask().
   */
-void *cpuset_being_rebound;
  
  /* Slow path of a mempolicy copy */
  struct mempolicy *__mpol_copy(struct mempolicy *old)
@@ -1314,12 +1422,11 @@ struct mempolicy *__mpol_copy(struct mempolicy *old)
         atomic_set(&new->refcnt, 1);
         if (new->policy == MPOL_BIND) {
                 int sz = ksize(old->v.zonelist);
-               new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
+               new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
                 if (!new->v.zonelist) {
                         kmem_cache_free(policy_cache, new);
                         return ERR_PTR(-ENOMEM);
                 }
-               memcpy(new->v.zonelist, old->v.zonelist, sz);
         }
         return new;
  }
@@ -1423,7 +1530,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new)
         }
         rb_link_node(&new->nd, parent, p);
         rb_insert_color(&new->nd, &sp->root);
-       PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
+       pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
                  new->policy ? new->policy->policy : 0);
  }
  
@@ -1448,14 +1555,14 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx)
  
  static void sp_delete(struct shared_policy *sp, struct sp_node *n)
  {
-       PDprintk("deleting %lx-l%x\n", n->start, n->end);
+       pr_debug("deleting %lx-l%lx\n", n->start, n->end);
         rb_erase(&n->nd, &sp->root);
         mpol_free(n->policy);
         kmem_cache_free(sn_cache, n);
  }
  
-struct sp_node *
-sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
+static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
+                               struct mempolicy *pol)
  {
         struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
  
@@ -1547,10 +1654,10 @@ int mpol_set_shared_policy(struct shared_policy *info,
         struct sp_node *new = NULL;
         unsigned long sz = vma_pages(vma);
  
-       PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
+       pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
                  vma->vm_pgoff,
                  sz, npol? npol->policy : -1,
-               npol ? nodes_addr(npol->v.nodes)[0] : -1);
+                npol ? nodes_addr(npol->v.nodes)[0] : -1);
  
         if (npol) {
                 new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
@@ -1586,18 +1693,43 @@ void mpol_free_shared_policy(struct shared_policy *p)
  /* assumes fs == KERNEL_DS */
  void __init numa_policy_init(void)
  {
+       nodemask_t interleave_nodes;
+       unsigned long largest = 0;
+       int nid, prefer = 0;
+
         policy_cache = kmem_cache_create("numa_policy",
                                          sizeof(struct mempolicy),
-                                        0, SLAB_PANIC, NULL, NULL);
+                                        0, SLAB_PANIC, NULL);
  
         sn_cache = kmem_cache_create("shared_policy_node",
                                      sizeof(struct sp_node),
-                                    0, SLAB_PANIC, NULL, NULL);
+                                    0, SLAB_PANIC, NULL);
  
-       /* Set interleaving policy for system init. This way not all
-          the data structures allocated at system boot end up in node zero. */
+       /*
+        * Set interleaving policy for system init. Interleaving is only
+        * enabled across suitably sized nodes (default is >= 16MB), or
+        * fall back to the largest node if they're all smaller.
+        */
+       nodes_clear(interleave_nodes);
+       for_each_node_state(nid, N_HIGH_MEMORY) {
+               unsigned long total_pages = node_present_pages(nid);
+
+               /* Preserve the largest node */
+               if (largest < total_pages) {
+                       largest = total_pages;
+                       prefer = nid;
+               }
+
+               /* Interleave this node? */
+               if ((total_pages << PAGE_SHIFT) >= (16 << 20))
+                       node_set(nid, interleave_nodes);
+       }
  
-       if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
+       /* All too small, use the largest */
+       if (unlikely(nodes_empty(interleave_nodes)))
+               node_set(prefer, interleave_nodes);
+
+       if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
                 printk("numa_policy_init: interleaving failed\n");
  }
  
@@ -1608,7 +1740,8 @@ void numa_default_policy(void)
  }
  
  /* Migrate a policy to a different set of nodes */
-void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
+static void mpol_rebind_policy(struct mempolicy *pol,
+                              const nodemask_t *newmask)
  {
         nodemask_t *mpolmask;
         nodemask_t tmp;
@@ -1641,7 +1774,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
  
                 nodes_clear(nodes);
                 for (z = pol->v.zonelist->zones; *z; z++)
-                       node_set((*z)->zone_pgdat->node_id, nodes);
+                       node_set(zone_to_nid(*z), nodes);
                 nodes_remap(tmp, nodes, *mpolmask, *newmask);
                 nodes = tmp;
  
@@ -1652,7 +1785,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
                  * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
                  */
  
-               if (zonelist) {
+               if (!IS_ERR(zonelist)) {
                         /* Good - got mem - substitute new zonelist */
                         kfree(pol->v.zonelist);
                         pol->v.zonelist = zonelist;
@@ -1696,8 +1829,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
   * Display pages allocated per node and memory policy via /proc.
   */
  
-static const char *policy_types[] = { "default", "prefer", "bind",
-                                     "interleave" };
+static const char * const policy_types[] =
+       { "default", "prefer", "bind", "interleave" };
  
  /*
   * Convert a mempolicy into a string.
@@ -1786,7 +1919,6 @@ static void gather_stats(struct page *page, void *private, int pte_dirty)
                 md->mapcount_max = count;
  
         md->node[page_to_nid(page)]++;
-       cond_resched();
  }
  
  #ifdef CONFIG_HUGETLB_PAGE
@@ -1825,11 +1957,12 @@ static inline void check_huge_range(struct vm_area_struct *vma,
  
  int show_numa_map(struct seq_file *m, void *v)
  {
-       struct task_struct *task = m->private;
+       struct proc_maps_private *priv = m->private;
         struct vm_area_struct *vma = v;
         struct numa_maps *md;
         struct file *file = vma->vm_file;
         struct mm_struct *mm = vma->vm_mm;
+       struct mempolicy *pol;
         int n;
         char buffer[50];
  
@@ -1840,14 +1973,19 @@ int show_numa_map(struct seq_file *m, void *v)
         if (!md)
                 return 0;
  
-       mpol_to_str(buffer, sizeof(buffer),
-                       get_vma_policy(task, vma, vma->vm_start));
+       pol = get_vma_policy(priv->task, vma, vma->vm_start);
+       mpol_to_str(buffer, sizeof(buffer), pol);
+       /*
+        * unref shared or other task's mempolicy
+        */
+       if (pol != &default_policy && pol != current->mempolicy)
+               __mpol_free(pol);
  
         seq_printf(m, "%08lx %s", vma->vm_start, buffer);
  
         if (file) {
                 seq_printf(m, " file=");
-               seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+               seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
         } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
                 seq_printf(m, " heap");
         } else if (vma->vm_start <= mm->start_stack &&
@@ -1860,7 +1998,7 @@ int show_numa_map(struct seq_file *m, void *v)
                 seq_printf(m, " huge");
         } else {
                 check_pgd_range(vma, vma->vm_start, vma->vm_end,
-                               &node_online_map, MPOL_MF_STATS, md);
+                       &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
         }
  
         if (!md->pages)
@@ -1887,7 +2025,7 @@ int show_numa_map(struct seq_file *m, void *v)
         if (md->writeback)
                 seq_printf(m," writeback=%lu", md->writeback);
  
-       for_each_online_node(n)
+       for_each_node_state(n, N_HIGH_MEMORY)
                 if (md->node[n])
                         seq_printf(m, " N%d=%lu", n, md->node[n]);
  out:
@@ -1895,7 +2033,6 @@ out:
         kfree(md);
  
         if (m->count < m->size)
-               m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+               m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
         return 0;
  }
-