#include <linux/sched.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
-#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/migrate.h>
+#include <linux/ksm.h>
#include <linux/rmap.h>
#include <linux/security.h>
#include <linux/syscalls.h>
#include <linux/ctype.h>
+#include <linux/mm_inline.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
static const struct mempolicy_operations {
int (*create)(struct mempolicy *pol, const nodemask_t *nodes);
- void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes);
+ /*
+ * If the read-side task has no lock to protect task->mempolicy, the
+ * write-side task will rebind task->mempolicy in two steps: the first
+ * step sets all the newly allowed nodes, and the second step clears all
+ * the disallowed nodes. This way a lockless reader never sees an empty
+ * nodemask and can always find a node to allocate a page from.
+ * If we have a lock to protect task->mempolicy on the read side, we
+ * rebind directly.
+ *
+ * step:
+ * MPOL_REBIND_ONCE - do the rebind work in one pass
+ * MPOL_REBIND_STEP1 - set all the newly allowed nodes
+ * MPOL_REBIND_STEP2 - clear all the disallowed nodes
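+ *
+ * A writer with no read-side locking is expected to perform the two
+ * steps in order, roughly (pseudo-code, newmems being the new mask):
+ * mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP1);
+ * ... update the task's mems_allowed ...
+ * mpol_rebind_task(tsk, &newmems, MPOL_REBIND_STEP2);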
+ */
+ void (*rebind)(struct mempolicy *pol, const nodemask_t *nodes,
+ enum mpol_rebind_step step);
} mpol_ops[MPOL_MAX];
/* Check that the nodemask contains at least one populated zone */
{
int nd, k;
- /* Check that there is something useful in this mask */
- k = policy_zone;
-
for_each_node_mask(nd, *nodemask) {
struct zone *z;
static inline int mpol_store_user_nodemask(const struct mempolicy *pol)
{
- return pol->flags & (MPOL_F_STATIC_NODES | MPOL_F_RELATIVE_NODES);
+ return pol->flags & MPOL_MODE_FLAGS;
}
static void mpol_relative_nodemask(nodemask_t *ret, const nodemask_t *orig,
return 0;
}
-/* Create a new policy */
+/*
+ * mpol_set_nodemask is called after mpol_new() to set up the nodemask, if
+ * any, for the new policy. mpol_new() has already validated the nodes
+ * parameter with respect to the policy mode and flags. But, we need to
+ * handle an empty nodemask with MPOL_PREFERRED here.
+ *
+ * Must be called holding task's alloc_lock to protect task's mems_allowed
+ * and mempolicy. May also be called holding the mmap_semaphore for write.
+ */
+static int mpol_set_nodemask(struct mempolicy *pol,
+ const nodemask_t *nodes, struct nodemask_scratch *nsc)
+{
+ int ret;
+
+ /* if mode is MPOL_DEFAULT, pol is NULL and there is nothing to do */
+ if (pol == NULL)
+ return 0;
+ /* restrict nsc->mask1 to nodes with memory within the current cpuset */
+ nodes_and(nsc->mask1,
+ cpuset_current_mems_allowed, node_states[N_HIGH_MEMORY]);
+
+ VM_BUG_ON(!nodes);
+ if (pol->mode == MPOL_PREFERRED && nodes_empty(*nodes))
+ nodes = NULL; /* explicit local allocation */
+ else {
+ if (pol->flags & MPOL_F_RELATIVE_NODES)
+ mpol_relative_nodemask(&nsc->mask2, nodes, &nsc->mask1);
+ else
+ nodes_and(nsc->mask2, *nodes, nsc->mask1);
+
+ if (mpol_store_user_nodemask(pol))
+ pol->w.user_nodemask = *nodes;
+ else
+ pol->w.cpuset_mems_allowed =
+ cpuset_current_mems_allowed;
+ }
+
+ if (nodes)
+ ret = mpol_ops[pol->mode].create(pol, &nsc->mask2);
+ else
+ ret = mpol_ops[pol->mode].create(pol, NULL);
+ return ret;
+}
+
+/*
+ * This function just creates a new policy and does some basic checks and
+ * simple initialization. You must invoke mpol_set_nodemask() to set the nodes.
+ */
static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
struct mempolicy *policy;
- nodemask_t cpuset_context_nmask;
- int ret;
pr_debug("setting mode %d flags %d nodes[0] %lx\n",
mode, flags, nodes ? nodes_addr(*nodes)[0] : -1);
if (((flags & MPOL_F_STATIC_NODES) ||
(flags & MPOL_F_RELATIVE_NODES)))
return ERR_PTR(-EINVAL);
- nodes = NULL; /* flag local alloc */
}
} else if (nodes_empty(*nodes))
return ERR_PTR(-EINVAL);
policy->mode = mode;
policy->flags = flags;
- if (nodes) {
- /*
- * cpuset related setup doesn't apply to local allocation
- */
- cpuset_update_task_memory_state();
- if (flags & MPOL_F_RELATIVE_NODES)
- mpol_relative_nodemask(&cpuset_context_nmask, nodes,
- &cpuset_current_mems_allowed);
- else
- nodes_and(cpuset_context_nmask, *nodes,
- cpuset_current_mems_allowed);
- if (mpol_store_user_nodemask(policy))
- policy->w.user_nodemask = *nodes;
- else
- policy->w.cpuset_mems_allowed =
- cpuset_mems_allowed(current);
- }
-
- ret = mpol_ops[mode].create(policy,
- nodes ? &cpuset_context_nmask : NULL);
- if (ret < 0) {
- kmem_cache_free(policy_cache, policy);
- return ERR_PTR(ret);
- }
return policy;
}
kmem_cache_free(policy_cache, p);
}
-static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes)
+static void mpol_rebind_default(struct mempolicy *pol, const nodemask_t *nodes,
+ enum mpol_rebind_step step)
{
}
-static void mpol_rebind_nodemask(struct mempolicy *pol,
- const nodemask_t *nodes)
+/*
+ * step:
+ * MPOL_REBIND_ONCE - do the rebind work in one pass
+ * MPOL_REBIND_STEP1 - set all the newly allowed nodes
+ * MPOL_REBIND_STEP2 - clear all the disallowed nodes
+ */
+static void mpol_rebind_nodemask(struct mempolicy *pol, const nodemask_t *nodes,
+ enum mpol_rebind_step step)
{
nodemask_t tmp;
else if (pol->flags & MPOL_F_RELATIVE_NODES)
mpol_relative_nodemask(&tmp, &pol->w.user_nodemask, nodes);
else {
- nodes_remap(tmp, pol->v.nodes, pol->w.cpuset_mems_allowed,
- *nodes);
- pol->w.cpuset_mems_allowed = *nodes;
+ /*
+ * For MPOL_REBIND_ONCE, remember the new cpuset mems; for
+ * MPOL_REBIND_STEP1, cache the remap result in
+ * ->w.cpuset_mems_allowed so that STEP2 can pick it up below.
+ */
+ if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP1) {
+ nodes_remap(tmp, pol->v.nodes,
+ pol->w.cpuset_mems_allowed, *nodes);
+ pol->w.cpuset_mems_allowed = step ? tmp : *nodes;
+ } else if (step == MPOL_REBIND_STEP2) {
+ tmp = pol->w.cpuset_mems_allowed;
+ pol->w.cpuset_mems_allowed = *nodes;
+ } else
+ BUG();
}
- pol->v.nodes = tmp;
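+ /* never leave the policy with an empty nodemask: fall back to the new mems */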
+ if (nodes_empty(tmp))
+ tmp = *nodes;
+
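+ /*
+ * STEP1 may only grow the nodemask (old | new) so that a lockless
+ * reader always finds at least one allowed node; ONCE and STEP2
+ * install the final nodemask.
+ */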
+ if (step == MPOL_REBIND_STEP1)
+ nodes_or(pol->v.nodes, pol->v.nodes, tmp);
+ else if (step == MPOL_REBIND_ONCE || step == MPOL_REBIND_STEP2)
+ pol->v.nodes = tmp;
+ else
+ BUG();
+
if (!node_isset(current->il_next, tmp)) {
current->il_next = next_node(current->il_next, tmp);
if (current->il_next >= MAX_NUMNODES)
}
static void mpol_rebind_preferred(struct mempolicy *pol,
- const nodemask_t *nodes)
+ const nodemask_t *nodes,
+ enum mpol_rebind_step step)
{
nodemask_t tmp;
}
}
-/* Migrate a policy to a different set of nodes */
-static void mpol_rebind_policy(struct mempolicy *pol,
- const nodemask_t *newmask)
+/*
+ * mpol_rebind_policy - Migrate a policy to a different set of nodes
+ *
+ * If the read-side task has no lock to protect task->mempolicy, the
+ * write-side task will rebind task->mempolicy in two steps: the first
+ * step sets all the newly allowed nodes, and the second step clears all
+ * the disallowed nodes. This way a lockless reader never sees an empty
+ * nodemask and can always find a node to allocate a page from.
+ * If we have a lock to protect task->mempolicy on the read side, we
+ * rebind directly.
+ *
+ * step:
+ * MPOL_REBIND_ONCE - do the rebind work in one pass
+ * MPOL_REBIND_STEP1 - set all the newly allowed nodes
+ * MPOL_REBIND_STEP2 - clear all the disallowed nodes
+ */
+static void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask,
+ enum mpol_rebind_step step)
{
if (!pol)
return;
- if (!mpol_store_user_nodemask(pol) &&
+ if (!mpol_store_user_nodemask(pol) && step == MPOL_REBIND_ONCE &&
nodes_equal(pol->w.cpuset_mems_allowed, *newmask))
return;
- mpol_ops[pol->mode].rebind(pol, newmask);
+
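+ /*
+ * MPOL_F_REBINDING marks a policy whose STEP1 has already been
+ * applied: a repeated STEP1 is skipped, and STEP2 must only follow
+ * a completed STEP1.
+ */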
+ if (step == MPOL_REBIND_STEP1 && (pol->flags & MPOL_F_REBINDING))
+ return;
+
+ if (step == MPOL_REBIND_STEP2 && !(pol->flags & MPOL_F_REBINDING))
+ BUG();
+
+ if (step == MPOL_REBIND_STEP1)
+ pol->flags |= MPOL_F_REBINDING;
+ else if (step == MPOL_REBIND_STEP2)
+ pol->flags &= ~MPOL_F_REBINDING;
+ else if (step >= MPOL_REBIND_NSTEP)
+ BUG();
+
+ mpol_ops[pol->mode].rebind(pol, newmask, step);
}
/*
* Wrapper for mpol_rebind_policy() that just requires task
* pointer, and updates task mempolicy.
+ *
+ * Called with task's alloc_lock held.
*/
-void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new,
+ enum mpol_rebind_step step)
{
- mpol_rebind_policy(tsk->mempolicy, new);
+ mpol_rebind_policy(tsk->mempolicy, new, step);
}
/*
down_write(&mm->mmap_sem);
for (vma = mm->mmap; vma; vma = vma->vm_next)
- mpol_rebind_policy(vma->vm_policy, new);
+ mpol_rebind_policy(vma->vm_policy, new, MPOL_REBIND_ONCE);
up_write(&mm->mmap_sem);
}
if (!page)
continue;
/*
- * The check for PageReserved here is important to avoid
- * handling zero pages and other pages that may have been
- * marked special by the system.
- *
- * If the PageReserved would not be checked here then f.e.
- * the location of the zero page could have an influence
- * on MPOL_MF_STRICT, zero pages would be counted for
- * the per node stats, and there would be useless attempts
- * to put zero pages on the migration list.
+ * vm_normal_page() filters out zero pages, but there might
+ * still be PageReserved pages to skip, perhaps in a VDSO.
+ * And we cannot move PageKsm pages sensibly or safely yet.
*/
- if (PageReserved(page))
+ if (PageReserved(page) || PageKsm(page))
continue;
nid = page_to_nid(page);
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
}
/* Step 2: apply policy to a range and do splits. */
-static int mbind_range(struct vm_area_struct *vma, unsigned long start,
- unsigned long end, struct mempolicy *new)
+static int mbind_range(struct mm_struct *mm, unsigned long start,
+ unsigned long end, struct mempolicy *new_pol)
{
struct vm_area_struct *next;
- int err;
+ struct vm_area_struct *prev;
+ struct vm_area_struct *vma;
+ int err = 0;
+ pgoff_t pgoff;
+ unsigned long vmstart;
+ unsigned long vmend;
- err = 0;
- for (; vma && vma->vm_start < end; vma = next) {
+ vma = find_vma_prev(mm, start, &prev);
+ if (!vma || vma->vm_start > start)
+ return -EFAULT;
+
+ for (; vma && vma->vm_start < end; prev = vma, vma = next) {
next = vma->vm_next;
- if (vma->vm_start < start)
- err = split_vma(vma->vm_mm, vma, start, 1);
- if (!err && vma->vm_end > end)
- err = split_vma(vma->vm_mm, vma, end, 0);
- if (!err)
- err = policy_vma(vma, new);
+ vmstart = max(start, vma->vm_start);
+ vmend = min(end, vma->vm_end);
+
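+ /* first try to merge the policy into an adjacent vma; split only if that fails */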
+ pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
+ prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
+ vma->anon_vma, vma->vm_file, pgoff, new_pol);
+ if (prev) {
+ vma = prev;
+ next = vma->vm_next;
+ continue;
+ }
+ if (vma->vm_start != vmstart) {
+ err = split_vma(vma->vm_mm, vma, vmstart, 1);
+ if (err)
+ goto out;
+ }
+ if (vma->vm_end != vmend) {
+ err = split_vma(vma->vm_mm, vma, vmend, 0);
+ if (err)
+ goto out;
+ }
+ err = policy_vma(vma, new_pol);
if (err)
- break;
+ goto out;
}
+
+ out:
return err;
}
static long do_set_mempolicy(unsigned short mode, unsigned short flags,
nodemask_t *nodes)
{
- struct mempolicy *new;
+ struct mempolicy *new, *old;
struct mm_struct *mm = current->mm;
+ NODEMASK_SCRATCH(scratch);
+ int ret;
- new = mpol_new(mode, flags, nodes);
- if (IS_ERR(new))
- return PTR_ERR(new);
+ if (!scratch)
+ return -ENOMEM;
+ new = mpol_new(mode, flags, nodes);
+ if (IS_ERR(new)) {
+ ret = PTR_ERR(new);
+ goto out;
+ }
/*
* prevent changing our mempolicy while show_numa_maps()
* is using it.
*/
if (mm)
down_write(&mm->mmap_sem);
- mpol_put(current->mempolicy);
+ task_lock(current);
+ ret = mpol_set_nodemask(new, nodes, scratch);
+ if (ret) {
+ task_unlock(current);
+ if (mm)
+ up_write(&mm->mmap_sem);
+ mpol_put(new);
+ goto out;
+ }
+ old = current->mempolicy;
current->mempolicy = new;
mpol_set_task_struct_flag();
if (new && new->mode == MPOL_INTERLEAVE &&
nodes_weight(new->v.nodes))
current->il_next = first_node(new->v.nodes);
+ task_unlock(current);
if (mm)
up_write(&mm->mmap_sem);
- return 0;
+ mpol_put(old);
+ ret = 0;
+out:
+ NODEMASK_SCRATCH_FREE(scratch);
+ return ret;
}
/*
* Return nodemask for policy for get_mempolicy() query
+ *
+ * Called with task's alloc_lock held
*/
static void get_policy_nodemask(struct mempolicy *p, nodemask_t *nodes)
{
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
- cpuset_update_task_memory_state();
if (flags &
~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
*policy = 0; /* just so it's initialized */
+ task_lock(current);
*nmask = cpuset_current_mems_allowed;
+ task_unlock(current);
return 0;
}
}
err = 0;
- if (nmask)
- get_policy_nodemask(pol, nmask);
+ if (nmask) {
+ if (mpol_store_user_nodemask(pol)) {
+ *nmask = pol->w.user_nodemask;
+ } else {
+ task_lock(current);
+ get_policy_nodemask(pol, nmask);
+ task_unlock(current);
+ }
+ }
out:
mpol_cond_put(pol);
if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
if (!isolate_lru_page(page)) {
list_add_tail(&page->lru, pagelist);
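+ /* account the isolated page as NR_ISOLATED_ANON or NR_ISOLATED_FILE */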
+ inc_zone_page_state(page, NR_ISOLATED_ANON +
+ page_is_file_cache(page));
}
}
}
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
- return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
+ return alloc_pages_exact_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
/*
nodes_clear(nmask);
node_set(source, nmask);
- check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+ check_range(mm, mm->mmap->vm_start, mm->task_size, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist))
- err = migrate_pages(&pagelist, new_node_page, dest);
+ err = migrate_pages(&pagelist, new_node_page, dest, 0);
return err;
}
if (err)
goto out;
-/*
- * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
- * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
- * bit in 'tmp', and return that <source, dest> pair for migration.
- * The pair of nodemasks 'to' and 'from' define the map.
- *
- * If no pair of bits is found that way, fallback to picking some
- * pair of 'source' and 'dest' bits that are not the same. If the
- * 'source' and 'dest' bits are the same, this represents a node
- * that will be migrating to itself, so no pages need move.
- *
- * If no bits are left in 'tmp', or if all remaining bits left
- * in 'tmp' correspond to the same bit in 'to', return false
- * (nothing left to migrate).
- *
- * This lets us pick a pair of nodes to migrate between, such that
- * if possible the dest node is not already occupied by some other
- * source node, minimizing the risk of overloading the memory on a
- * node that would happen if we migrated incoming memory to a node
- * before migrating outgoing memory source that same node.
- *
- * A single scan of tmp is sufficient. As we go, we remember the
- * most recent <s, d> pair that moved (s != d). If we find a pair
- * that not only moved, but what's better, moved to an empty slot
- * (d is not set in tmp), then we break out then, with that pair.
- * Otherwise when we finish scannng from_tmp, we at least have the
- * most recent <s, d> pair that moved. If we get all the way through
- * the scan of tmp without finding any node that moved, much less
- * moved to an empty node, then there is nothing left worth migrating.
- */
+ /*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same. If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory source that same node.
+ *
+ * A single scan of tmp is sufficient. As we go, we remember the
+ * most recent <s, d> pair that moved (s != d). If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scanning from_tmp, we at least have the
+ * most recent <s, d> pair that moved. If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
tmp = *from_nodes;
while (!nodes_empty(tmp)) {
err = migrate_prep();
if (err)
- return err;
+ goto mpol_out;
}
- down_write(&mm->mmap_sem);
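+ /* contextualize the nodemask against the current cpuset, under mmap_sem and the task's alloc_lock */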
+ {
+ NODEMASK_SCRATCH(scratch);
+ if (scratch) {
+ down_write(&mm->mmap_sem);
+ task_lock(current);
+ err = mpol_set_nodemask(new, nmask, scratch);
+ task_unlock(current);
+ if (err)
+ up_write(&mm->mmap_sem);
+ } else
+ err = -ENOMEM;
+ NODEMASK_SCRATCH_FREE(scratch);
+ }
+ if (err)
+ goto mpol_out;
+
vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
if (!IS_ERR(vma)) {
int nr_failed = 0;
- err = mbind_range(vma, start, end, new);
+ err = mbind_range(mm, start, end, new);
if (!list_empty(&pagelist))
nr_failed = migrate_pages(&pagelist, new_vma_page,
- (unsigned long)vma);
+ (unsigned long)vma, 0);
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
- }
+ } else
+ putback_lru_pages(&pagelist); /* range check failed: release the isolated pages */
up_write(&mm->mmap_sem);
+ mpol_out:
mpol_put(new);
return err;
}
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
- unsigned long mode,
- unsigned long __user *nmask, unsigned long maxnode,
- unsigned flags)
+SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
+ unsigned long, mode, unsigned long __user *, nmask,
+ unsigned long, maxnode, unsigned, flags)
{
nodemask_t nodes;
int err;
}
/* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
- unsigned long maxnode)
+SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
+ unsigned long, maxnode)
{
int err;
nodemask_t nodes;
return do_set_mempolicy(mode, flags, &nodes);
}
-asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
- const unsigned long __user *old_nodes,
- const unsigned long __user *new_nodes)
+SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
+ const unsigned long __user *, old_nodes,
+ const unsigned long __user *, new_nodes)
{
+ const struct cred *cred = current_cred(), *tcred;
struct mm_struct *mm;
struct task_struct *task;
nodemask_t old;
* capabilities, superuser privileges or the same
* userid as the target process.
*/
- if ((current->euid != task->suid) && (current->euid != task->uid) &&
- (current->uid != task->suid) && (current->uid != task->uid) &&
+ rcu_read_lock();
+ tcred = __task_cred(task);
+ if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
+ cred->uid != tcred->suid && cred->uid != tcred->uid &&
!capable(CAP_SYS_NICE)) {
+ rcu_read_unlock();
err = -EPERM;
goto out;
}
+ rcu_read_unlock();
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
/* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
- unsigned long __user *nmask,
- unsigned long maxnode,
- unsigned long addr, unsigned long flags)
+SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
+ unsigned long __user *, nmask, unsigned long, maxnode,
+ unsigned long, addr, unsigned long, flags)
{
int err;
int uninitialized_var(pval);
/*
* Normally, MPOL_BIND allocations are node-local within the
* allowed nodemask. However, if __GFP_THISNODE is set and the
- * current node is part of the mask, we use the zonelist for
+ * current node isn't part of the mask, we use the zonelist for
* the first node in the mask instead.
*/
if (unlikely(gfp & __GFP_THISNODE) &&
unlikely(!node_isset(nd, policy->v.nodes)))
nd = first_node(policy->v.nodes);
break;
- case MPOL_INTERLEAVE: /* should not happen */
- break;
default:
BUG();
}
* to the struct mempolicy for conditional unref after allocation.
* If the effective policy is 'BIND, returns a pointer to the mempolicy's
* @nodemask for filtering the zonelist.
+ *
+ * Must be protected by get_mems_allowed()
*/
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
gfp_t gfp_flags, struct mempolicy **mpol,
}
return zl;
}
+
+/*
+ * init_nodemask_of_mempolicy
+ *
+ * If the current task's mempolicy is "default" [NULL], return 'false'
+ * to indicate default policy. Otherwise, extract the policy nodemask
+ * for 'bind' or 'interleave' policy into the argument nodemask, or
+ * initialize the argument nodemask to contain the single node for
+ * 'preferred' or 'local' policy and return 'true' to indicate presence
+ * of non-default mempolicy.
+ *
+ * We don't bother with reference counting the mempolicy [mpol_get/put]
+ * because the current task is examining its own mempolicy and a task's
+ * mempolicy is only ever changed by the task itself.
+ *
+ * N.B., it is the caller's responsibility to free a returned nodemask.
+ */
+bool init_nodemask_of_mempolicy(nodemask_t *mask)
+{
+ struct mempolicy *mempolicy;
+ int nid;
+
+ if (!(mask && current->mempolicy))
+ return false;
+
+ task_lock(current);
+ mempolicy = current->mempolicy;
+ switch (mempolicy->mode) {
+ case MPOL_PREFERRED:
+ if (mempolicy->flags & MPOL_F_LOCAL)
+ nid = numa_node_id();
+ else
+ nid = mempolicy->v.preferred_node;
+ init_nodemask_of_node(mask, nid);
+ break;
+
+ case MPOL_BIND:
+ /* Fall through */
+ case MPOL_INTERLEAVE:
+ *mask = mempolicy->v.nodes;
+ break;
+
+ default:
+ BUG();
+ }
+ task_unlock(current);
+
+ return true;
+}
#endif
/* Allocate a page in interleaved policy.
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
struct zonelist *zl;
+ struct page *page;
- cpuset_update_task_memory_state();
-
+ get_mems_allowed();
if (unlikely(pol->mode == MPOL_INTERLEAVE)) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
mpol_cond_put(pol);
- return alloc_page_interleave(gfp, 0, nid);
+ page = alloc_page_interleave(gfp, 0, nid);
+ put_mems_allowed();
+ return page;
}
zl = policy_zonelist(gfp, pol);
if (unlikely(mpol_needs_cond_ref(pol))) {
struct page *page = __alloc_pages_nodemask(gfp, 0,
zl, policy_nodemask(gfp, pol));
__mpol_put(pol);
+ put_mems_allowed();
return page;
}
/*
* fast path: default or task policy
*/
- return __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ page = __alloc_pages_nodemask(gfp, 0, zl, policy_nodemask(gfp, pol));
+ put_mems_allowed();
+ return page;
}
/**
struct page *alloc_pages_current(gfp_t gfp, unsigned order)
{
struct mempolicy *pol = current->mempolicy;
+ struct page *page;
- if ((gfp & __GFP_WAIT) && !in_interrupt())
- cpuset_update_task_memory_state();
if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
+ get_mems_allowed();
/*
* No reference counting needed for current->mempolicy
* nor system default_policy
*/
if (pol->mode == MPOL_INTERLEAVE)
- return alloc_page_interleave(gfp, order, interleave_nodes(pol));
- return __alloc_pages_nodemask(gfp, order,
+ page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
+ else
+ page = __alloc_pages_nodemask(gfp, order,
policy_zonelist(gfp, pol), policy_nodemask(gfp, pol));
+ put_mems_allowed();
+ return page;
}
EXPORT_SYMBOL(alloc_pages_current);
* with the mems_allowed returned by cpuset_mems_allowed(). This
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
+ *
+ * current's mempolicy may be rebound by another task (the task that changes
+ * the cpuset's mems), so we needn't do the rebind work for the current task.
*/
/* Slow path of a mempolicy duplicate */
if (!new)
return ERR_PTR(-ENOMEM);
+
+ /* task's mempolicy is protected by alloc_lock */
+ if (old == current->mempolicy) {
+ task_lock(current);
+ *new = *old;
+ task_unlock(current);
+ } else
+ *new = *old;
+
+ rcu_read_lock();
if (current_cpuset_is_being_rebound()) {
nodemask_t mems = cpuset_mems_allowed(current);
- mpol_rebind_policy(old, &mems);
+ if (new->flags & MPOL_F_REBINDING)
+ mpol_rebind_policy(new, &mems, MPOL_REBIND_STEP2);
+ else
+ mpol_rebind_policy(new, &mems, MPOL_REBIND_ONCE);
}
- *new = *old;
+ rcu_read_unlock();
atomic_set(&new->refcnt, 1);
return new;
}
return tompol;
}
-static int mpol_match_intent(const struct mempolicy *a,
- const struct mempolicy *b)
-{
- if (a->flags != b->flags)
- return 0;
- if (!mpol_store_user_nodemask(a))
- return 1;
- return nodes_equal(a->w.user_nodemask, b->w.user_nodemask);
-}
-
/* Slow path of a mempolicy comparison */
int __mpol_equal(struct mempolicy *a, struct mempolicy *b)
{
return 0;
if (a->mode != b->mode)
return 0;
- if (a->mode != MPOL_DEFAULT && !mpol_match_intent(a, b))
+ if (a->flags != b->flags)
return 0;
+ if (mpol_store_user_nodemask(a))
+ if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
+ return 0;
+
switch (a->mode) {
case MPOL_BIND:
/* Fall through */
* Install non-NULL @mpol in inode's shared policy rb-tree.
* On entry, the current task has a reference on a non-NULL @mpol.
* This must be released on exit.
+ * This is called from get_inode() paths, so we can use GFP_KERNEL.
*/
void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
{
+ int ret;
+
sp->root = RB_ROOT; /* empty tree == default mempolicy */
spin_lock_init(&sp->lock);
if (mpol) {
struct vm_area_struct pvma;
struct mempolicy *new;
+ NODEMASK_SCRATCH(scratch);
+ if (!scratch)
+ return;
/* contextualize the tmpfs mount point mempolicy */
new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
- mpol_put(mpol); /* drop our ref on sb mpol */
if (IS_ERR(new))
- return; /* no valid nodemask intersection */
+ goto free_scratch; /* no valid nodemask intersection */
+
+ task_lock(current);
+ ret = mpol_set_nodemask(new, &mpol->w.user_nodemask, scratch);
+ task_unlock(current);
+ mpol_put(mpol); /* drop our ref on sb mpol */
+ if (ret)
+ goto put_free;
/* Create pseudo-vma that contains just the policy */
memset(&pvma, 0, sizeof(struct vm_area_struct));
pvma.vm_end = TASK_SIZE; /* policy covers entire file */
mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
+
+put_free:
mpol_put(new); /* drop initial ref */
+free_scratch:
+ NODEMASK_SCRATCH_FREE(scratch);
}
}
* "local" is pseudo-policy: MPOL_PREFERRED with MPOL_F_LOCAL flag
* Used only for mpol_parse_str() and mpol_to_str()
*/
-#define MPOL_LOCAL (MPOL_INTERLEAVE + 1)
-static const char * const policy_types[] =
- { "default", "prefer", "bind", "interleave", "local" };
+#define MPOL_LOCAL MPOL_MAX
+static const char * const policy_modes[] =
+{
+ [MPOL_DEFAULT] = "default",
+ [MPOL_PREFERRED] = "prefer",
+ [MPOL_BIND] = "bind",
+ [MPOL_INTERLEAVE] = "interleave",
+ [MPOL_LOCAL] = "local"
+};
#ifdef CONFIG_TMPFS
int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
{
struct mempolicy *new = NULL;
- unsigned short uninitialized_var(mode);
+ unsigned short mode;
unsigned short uninitialized_var(mode_flags);
nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
- int i;
int err = 1;
if (nodelist) {
if (flags)
*flags++ = '\0'; /* terminate mode string */
- for (i = 0; i <= MPOL_LOCAL; i++) {
- if (!strcmp(str, policy_types[i])) {
- mode = i;
+ for (mode = 0; mode <= MPOL_LOCAL; mode++) {
+ if (!strcmp(str, policy_modes[mode])) {
break;
}
}
- if (i > MPOL_LOCAL)
+ if (mode > MPOL_LOCAL)
goto out;
switch (mode) {
char *rest = nodelist;
while (isdigit(*rest))
rest++;
- if (!*rest)
- err = 0;
+ if (*rest)
+ goto out;
}
break;
case MPOL_INTERLEAVE:
*/
if (!nodelist)
nodes = node_states[N_HIGH_MEMORY];
- err = 0;
break;
case MPOL_LOCAL:
/*
goto out;
mode = MPOL_PREFERRED;
break;
-
- /*
- * case MPOL_BIND: mpol_new() enforces non-empty nodemask.
- * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
- */
+ case MPOL_DEFAULT:
+ /*
+ * Insist on an empty nodelist
+ */
+ if (!nodelist)
+ err = 0;
+ goto out;
+ case MPOL_BIND:
+ /*
+ * Insist on a nodelist
+ */
+ if (!nodelist)
+ goto out;
}
mode_flags = 0;
else if (!strcmp(flags, "relative"))
mode_flags |= MPOL_F_RELATIVE_NODES;
else
- err = 1;
+ goto out;
}
new = mpol_new(mode, mode_flags, &nodes);
if (IS_ERR(new))
- err = 1;
- else if (no_context)
- new->w.user_nodemask = nodes; /* save for contextualization */
+ goto out;
+
+ if (no_context) {
+ /* save for contextualization */
+ new->w.user_nodemask = nodes;
+ } else {
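+ /* !no_context: apply the nodemask now, relative to the current cpuset */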
+ int ret;
+ NODEMASK_SCRATCH(scratch);
+ if (scratch) {
+ task_lock(current);
+ ret = mpol_set_nodemask(new, &nodes, scratch);
+ task_unlock(current);
+ } else
+ ret = -ENOMEM;
+ NODEMASK_SCRATCH_FREE(scratch);
+ if (ret) {
+ mpol_put(new);
+ goto out;
+ }
+ }
+ err = 0;
out:
/* Restore string for error message */
BUG();
}
- l = strlen(policy_types[mode]);
+ l = strlen(policy_modes[mode]);
if (buffer + maxlen < p + l + 1)
return -ENOSPC;
- strcpy(p, policy_types[mode]);
+ strcpy(p, policy_modes[mode]);
p += l;
if (flags & MPOL_MODE_FLAGS) {