mempolicy: fixup Fallback for Default Shmem Policy
[safe/jmp/linux-2.6] / mm / mempolicy.c
index d44c524..8924aaf 100644 (file)
@@ -63,7 +63,6 @@
    grows down?
    make bind policy root only? It can trigger oom much faster and the
    kernel is not always grateful with that.
-   could replace all the switch()es with a mempolicy_ops structure.
 */
 
 #include <linux/mempolicy.h>
@@ -110,8 +109,13 @@ struct mempolicy default_policy = {
        .policy = MPOL_DEFAULT,
 };
 
+static const struct mempolicy_operations {     /* per-mode hooks */
+       int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
+       void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
+} mpol_ops[MPOL_MAX];  /* forward declaration; table is filled in below */
+
 /* Check that the nodemask contains at least one populated zone */
-static int is_valid_nodemask(nodemask_t *nodemask)
+static int is_valid_nodemask(const nodemask_t *nodemask)
 {
        int nd, k;
 
@@ -144,125 +148,161 @@ static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
        nodes_onto(*ret, tmp, *rel);
 }
 
+static int mpol_new_interleave(struct mempolicy *pol, const nodemask_t *nodes)
+{      /* .create hook for MPOL_INTERLEAVE (see mpol_ops[]) */
+       if (nodes_empty(*nodes))        /* interleave needs at least one node */
+               return -EINVAL;
+       pol->v.nodes = *nodes;          /* copy the mask into the policy */
+       return 0;
+}
+
+static int mpol_new_preferred(struct mempolicy *pol, const nodemask_t *nodes)
+{      /* .create hook for MPOL_PREFERRED; NULL nodes selects local alloc */
+       if (!nodes)
+               pol->v.preferred_node = -1;     /* local allocation */
+       else if (nodes_empty(*nodes))
+               return -EINVAL;                 /*  no allowed nodes */
+       else    /* otherwise prefer the first node in the mask */
+               pol->v.preferred_node = first_node(*nodes);
+       return 0;
+}
+
+static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
+{      /* .create hook for MPOL_BIND (see mpol_ops[]) */
+       if (!is_valid_nodemask(nodes))  /* mask has no populated zone */
+               return -EINVAL;
+       pol->v.nodes = *nodes;          /* copy the mask into the policy */
+       return 0;
+}
+
 /* Create a new policy */
 static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
                                   nodemask_t *nodes)
 {
        struct mempolicy *policy;
        nodemask_t cpuset_context_nmask;
+       int ret;        /* status from the per-mode ->create hook */
 
        pr_debug("setting mode %d flags %d nodes[0] %lx\n",
                 mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
 
-       if (mode == MPOL_DEFAULT)
-               return (nodes && nodes_weight(*nodes)) ? ERR_PTR(-EINVAL) :
-                                                        NULL;
+       if (mode == MPOL_DEFAULT) {
+               if (nodes && !nodes_empty(*nodes))
+                       return ERR_PTR(-EINVAL);
+               return NULL;
+       }
+       VM_BUG_ON(!nodes);      /* non-default modes must pass a nodemask */
+
+       /*
+        * MPOL_PREFERRED cannot be used with MPOL_F_STATIC_NODES or
+        * MPOL_F_RELATIVE_NODES if the nodemask is empty (local allocation).
+        * All other modes require a valid pointer to a non-empty nodemask.
+        */
+       if (mode == MPOL_PREFERRED) {
+               if (nodes_empty(*nodes)) {
+                       if (((flags & MPOL_F_STATIC_NODES) ||
+                            (flags & MPOL_F_RELATIVE_NODES)))
+                               return ERR_PTR(-EINVAL);
+                       nodes = NULL;   /* flag local alloc */
+               }
+       } else if (nodes_empty(*nodes))
+               return ERR_PTR(-EINVAL);
        policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
        if (!policy)
                return ERR_PTR(-ENOMEM);
        atomic_set(&policy->refcnt, 1);
-       cpuset_update_task_memory_state();
-       if (flags & MPOL_F_RELATIVE_NODES)
-               mpol_relative_nodemask(&cpuset_context_nmask, nodes,
-                                      &cpuset_current_mems_allowed);
-       else
-               nodes_and(cpuset_context_nmask, *nodes,
-                         cpuset_current_mems_allowed);
-       switch (mode) {
-       case MPOL_INTERLEAVE:
-               if (nodes_empty(*nodes) || nodes_empty(cpuset_context_nmask))
-                       goto free;
-               policy->v.nodes = cpuset_context_nmask;
-               break;
-       case MPOL_PREFERRED:
-               policy->v.preferred_node = first_node(cpuset_context_nmask);
-               if (policy->v.preferred_node >= MAX_NUMNODES)
-                       goto free;
-               break;
-       case MPOL_BIND:
-               if (!is_valid_nodemask(&cpuset_context_nmask))
-                       goto free;
-               policy->v.nodes = cpuset_context_nmask;
-               break;
-       default:
-               BUG();
-       }
        policy->policy = mode;
        policy->flags = flags;
-       if (mpol_store_user_nodemask(policy))
-               policy->w.user_nodemask = *nodes;
-       else
-               policy->w.cpuset_mems_allowed = cpuset_mems_allowed(current);
+
+       if (nodes) {
+               /*
+                * cpuset related setup doesn't apply to local allocation
+                */
+               cpuset_update_task_memory_state();
+               if (flags & MPOL_F_RELATIVE_NODES)
+                       mpol_relative_nodemask(&cpuset_context_nmask, nodes,
+                                              &cpuset_current_mems_allowed);
+               else
+                       nodes_and(cpuset_context_nmask, *nodes,
+                                 cpuset_current_mems_allowed);
+               if (mpol_store_user_nodemask(policy))
+                       policy->w.user_nodemask = *nodes;
+               else
+                       policy->w.cpuset_mems_allowed =
+                                               cpuset_mems_allowed(current);
+       }
+
+       ret = mpol_ops[mode].create(policy,
+                               nodes ? &cpuset_context_nmask : NULL);
+       if (ret < 0) {          /* hook rejected the mask: free and bail */
+               kmem_cache_free(policy_cache, policy);
+               return ERR_PTR(ret);
+       }
        return policy;
+}
 
-free:
-       kmem_cache_free(policy_cache, policy);
-       return ERR_PTR(-EINVAL);
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
+{      /* intentionally empty: rebinding MPOL_DEFAULT is a no-op */
+}
+
+static void mpol_rebind_nodemask(struct mempolicy *pol,
+                                const nodemask_t *nodes)
+{      /* .rebind hook shared by MPOL_INTERLEAVE and MPOL_BIND */
+       nodemask_t tmp;         /* new effective mask, built below */
+
+       if (pol->flags & MPOL_F_STATIC_NODES)   /* clip user mask to *nodes */
+               nodes_and(tmp, pol->w.user_nodemask, *nodes);
+       else if (pol->flags & MPOL_F_RELATIVE_NODES)
+               mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
+       else {  /* neither flag: remap from the old mems_allowed */
+               nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
+                           *nodes);
+               pol->w.cpuset_mems_allowed = *nodes;
+       }
+
+       pol->v.nodes = tmp;
+       if (!node_isset(current->il_next, tmp)) { /* il_next left the mask */
+               current->il_next = next_node(current->il_next, tmp);
+               if (current->il_next >= MAX_NUMNODES)
+                       current->il_next = first_node(tmp);
+               if (current->il_next >= MAX_NUMNODES)
+                       current->il_next = numa_node_id();
+       }
+}
+
+static void mpol_rebind_preferred(struct mempolicy *pol,
+                                 const nodemask_t *nodes)
+{      /* .rebind hook for MPOL_PREFERRED */
+       nodemask_t tmp;         /* scratch, used only for relative nodes */
+
+       if (pol->flags & MPOL_F_STATIC_NODES) {
+               int node = first_node(pol->w.user_nodemask);
+
+               if (node_isset(node, *nodes))
+                       pol->v.preferred_node = node;
+               else    /* user's node not in *nodes: go local */
+                       pol->v.preferred_node = -1;
+       } else if (pol->flags & MPOL_F_RELATIVE_NODES) {
+               mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
+               pol->v.preferred_node = first_node(tmp);
+       } else if (pol->v.preferred_node != -1) { /* -1 (local): no remap */
+               pol->v.preferred_node = node_remap(pol->v.preferred_node,
+                                                  pol->w.cpuset_mems_allowed,
+                                                  *nodes);
+               pol->w.cpuset_mems_allowed = *nodes;
+       }
 }
 
 /* Migrate a policy to a different set of nodes */
 static void mpol_rebind_policy(struct mempolicy *pol,
                                const nodemask_t *newmask)
 {
-       nodemask_t tmp;
-       int static_nodes;
-       int relative_nodes;
-
        if (!pol)
                return;
-       static_nodes = pol->flags & MPOL_F_STATIC_NODES;
-       relative_nodes = pol->flags & MPOL_F_RELATIVE_NODES;
        if (!mpol_store_user_nodemask(pol) &&
            nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
                return;
-
-       switch (pol->policy) {
-       case MPOL_DEFAULT:
-               break;
-       case MPOL_BIND:
-               /* Fall through */
-       case MPOL_INTERLEAVE:
-               if (static_nodes)
-                       nodes_and(tmp, pol->w.user_nodemask, *newmask);
-               else if (relative_nodes)
-                       mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
-                                              newmask);
-               else {
-                       nodes_remap(tmp, pol->v.nodes,
-                                   pol->w.cpuset_mems_allowed, *newmask);
-                       pol->w.cpuset_mems_allowed = *newmask;
-               }
-               pol->v.nodes = tmp;
-               if (!node_isset(current->il_next, tmp)) {
-                       current->il_next = next_node(current->il_next, tmp);
-                       if (current->il_next >= MAX_NUMNODES)
-                               current->il_next = first_node(tmp);
-                       if (current->il_next >= MAX_NUMNODES)
-                               current->il_next = numa_node_id();
-               }
-               break;
-       case MPOL_PREFERRED:
-               if (static_nodes) {
-                       int node = first_node(pol->w.user_nodemask);
-
-                       if (node_isset(node, *newmask))
-                               pol->v.preferred_node = node;
-                       else
-                               pol->v.preferred_node = -1;
-               } else if (relative_nodes) {
-                       mpol_relative_nodemask(&tmp, &pol->w.user_nodemask,
-                                              newmask);
-                       pol->v.preferred_node = first_node(tmp);
-               } else {
-                       pol->v.preferred_node = node_remap(pol->v.preferred_node,
-                                       pol->w.cpuset_mems_allowed, *newmask);
-                       pol->w.cpuset_mems_allowed = *newmask;
-               }
-               break;
-       default:
-               BUG();
-               break;
-       }
+       mpol_ops[pol->policy].rebind(pol, newmask);     /* per-mode rebind */
 }
 
 /*
@@ -291,6 +331,24 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
        up_write(&mm->mmap_sem);
 }
 
+static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
+       [MPOL_DEFAULT] = {      /* no .create: mpol_new() returns before it */
+               .rebind = mpol_rebind_default,
+       },
+       [MPOL_INTERLEAVE] = {
+               .create = mpol_new_interleave,
+               .rebind = mpol_rebind_nodemask,
+       },
+       [MPOL_PREFERRED] = {
+               .create = mpol_new_preferred,
+               .rebind = mpol_rebind_preferred,
+       },
+       [MPOL_BIND] = {
+               .create = mpol_new_bind,
+               .rebind = mpol_rebind_nodemask, /* shared with interleave */
+       },
+};
+
 static void gather_stats(struct page *, void *, int pte_dirty);
 static void migrate_page_add(struct page *page, struct list_head *pagelist,
                                unsigned long flags);
@@ -471,7 +529,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new)
        if (!err) {
                mpol_get(new);
                vma->vm_policy = new;
-               mpol_free(old);
+               mpol_put(old);
        }
        return err;
 }
@@ -533,16 +591,29 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
                             nodemask_t *nodes)
 {
        struct mempolicy *new;
+       struct mm_struct *mm = current->mm;
 
        new = mpol_new(mode, flags, nodes);
        if (IS_ERR(new))
                return PTR_ERR(new);
-       mpol_free(current->mempolicy);
+
+       /*
+        * prevent changing our mempolicy while show_numa_maps()
+        * is using it.
+        * Note:  do_set_mempolicy() can be called at init time
+        * with no 'mm'.
+        */
+       if (mm)
+               down_write(&mm->mmap_sem);
+       mpol_put(current->mempolicy);
        current->mempolicy = new;
        mpol_set_task_struct_flag();
        if (new && new->policy == MPOL_INTERLEAVE &&
            nodes_weight(new->v.nodes))
                current->il_next = first_node(new->v.nodes);
+       if (mm)
+               up_write(&mm->mmap_sem);
+
        return 0;
 }
 
@@ -890,7 +961,7 @@ static long do_mbind(unsigned long start, unsigned long len,
        }
 
        up_write(&mm->mmap_sem);
-       mpol_free(new);
+       mpol_put(new);
        return err;
 }
 
@@ -1191,7 +1262,7 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len,
  * @task != current].  It is the caller's responsibility to
  * free the reference in these cases.
  */
-static struct mempolicy * get_vma_policy(struct task_struct *task,
+static struct mempolicy *get_vma_policy(struct task_struct *task,
                struct vm_area_struct *vma, unsigned long addr)
 {
        struct mempolicy *pol = task->mempolicy;
@@ -1199,7 +1270,10 @@ static struct mempolicy * get_vma_policy(struct task_struct *task,
 
        if (vma) {
                if (vma->vm_ops && vma->vm_ops->get_policy) {
-                       pol = vma->vm_ops->get_policy(vma, addr);
+                       struct mempolicy *vpol = vma->vm_ops->get_policy(vma,
+                                                                       addr);
+                       if (vpol)
+                               pol = vpol;
                        shared_pol = 1; /* if pol non-NULL, add ref below */
                } else if (vma->vm_policy &&
                                vma->vm_policy->policy != MPOL_DEFAULT)
@@ -1388,14 +1462,14 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
                nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
                if (unlikely(pol != &default_policy &&
                                pol != current->mempolicy))
-                       __mpol_free(pol);       /* finished with pol */
+                       __mpol_put(pol);        /* finished with pol */
                return node_zonelist(nid, gfp_flags);
        }
 
        zl = zonelist_policy(GFP_HIGHUSER, pol);
        if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
                if (pol->policy != MPOL_BIND)
-                       __mpol_free(pol);       /* finished with pol */
+                       __mpol_put(pol);        /* finished with pol */
                else
                        *mpol = pol;    /* unref needed after allocation */
        }
@@ -1454,7 +1528,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
                nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
                if (unlikely(pol != &default_policy &&
                                pol != current->mempolicy))
-                       __mpol_free(pol);       /* finished with pol */
+                       __mpol_put(pol);        /* finished with pol */
                return alloc_page_interleave(gfp, 0, nid);
        }
        zl = zonelist_policy(gfp, pol);
@@ -1464,7 +1538,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
                 */
                struct page *page =  __alloc_pages_nodemask(gfp, 0,
                                                zl, nodemask_policy(gfp, pol));
-               __mpol_free(pol);
+               __mpol_put(pol);
                return page;
        }
        /*
@@ -1508,15 +1582,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order)
 EXPORT_SYMBOL(alloc_pages_current);
 
 /*
- * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * If mpol_dup() sees current->cpuset == cpuset_being_rebound, then it
  * rebinds the mempolicy its copying by calling mpol_rebind_policy()
  * with the mems_allowed returned by cpuset_mems_allowed().  This
  * keeps mempolicies cpuset relative after its cpuset moves.  See
  * further kernel/cpuset.c update_nodemask().
  */
 
-/* Slow path of a mempolicy copy */
-struct mempolicy *__mpol_copy(struct mempolicy *old)
+/* Slow path of a mempolicy duplicate */
+struct mempolicy *__mpol_dup(struct mempolicy *old)
 {
        struct mempolicy *new = kmem_cache_alloc(policy_cache, GFP_KERNEL);
 
@@ -1566,7 +1640,7 @@ int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
 }
 
 /* Slow path of a mpol destructor. */
-void __mpol_free(struct mempolicy *p)
+void __mpol_put(struct mempolicy *p)
 {
        if (!atomic_dec_and_test(&p->refcnt))
                return;
@@ -1662,7 +1736,7 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n)
 {
        pr_debug("deleting %lx-l%lx\n", n->start, n->end);
        rb_erase(&n->nd, &sp->root);
-       mpol_free(n->policy);
+       mpol_put(n->policy);
        kmem_cache_free(sn_cache, n);
 }
 
@@ -1722,7 +1796,7 @@ restart:
                sp_insert(sp, new);
        spin_unlock(&sp->lock);
        if (new2) {
-               mpol_free(new2->policy);
+               mpol_put(new2->policy);
                kmem_cache_free(sn_cache, new2);
        }
        return 0;
@@ -1747,7 +1821,7 @@ void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
                        /* Policy covers entire file */
                        pvma.vm_end = TASK_SIZE;
                        mpol_set_shared_policy(info, &pvma, newpol);
-                       mpol_free(newpol);
+                       mpol_put(newpol);
                }
        }
 }
@@ -1790,7 +1864,7 @@ void mpol_free_shared_policy(struct shared_policy *p)
                n = rb_entry(next, struct sp_node, nd);
                next = rb_next(&n->nd);
                rb_erase(&n->nd, &p->root);
-               mpol_free(n->policy);
+               mpol_put(n->policy);
                kmem_cache_free(sn_cache, n);
        }
        spin_unlock(&p->lock);
@@ -1848,7 +1922,6 @@ void numa_default_policy(void)
 /*
  * Display pages allocated per node and memory policy via /proc.
  */
-
 static const char * const policy_types[] =
        { "default", "prefer", "bind", "interleave" };
 
@@ -2011,7 +2084,7 @@ int show_numa_map(struct seq_file *m, void *v)
         * unref shared or other task's mempolicy
         */
        if (pol != &default_policy && pol != current->mempolicy)
-               __mpol_free(pol);
+               __mpol_put(pol);
 
        seq_printf(m, "%08lx %s", vma->vm_start, buffer);