#include <asm/tlbflush.h>
#include <asm/uaccess.h>
+#include "internal.h"
+
/* Internal flags */
#define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
int err;
struct vm_area_struct *first, *vma, *prev;
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
-
- err = migrate_prep();
- if (err)
- return ERR_PTR(err);
- }
first = find_vma(mm, start);
if (!first)
} else {
*policy = pol == &default_policy ? MPOL_DEFAULT :
pol->mode;
- *policy |= pol->flags;
+ /*
+ * Internal mempolicy flags must be masked off before exposing
+ * the policy to userspace.
+ */
+ *policy |= (pol->flags & MPOL_MODE_FLAGS);
}
if (vma) {
/*
* Avoid migrating a page that is shared with others.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
- isolate_lru_page(page, pagelist);
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
+ if (!isolate_lru_page(page)) {
+ list_add_tail(&page->lru, pagelist);
+ }
+ }
}
static struct page *new_node_page(struct page *page, unsigned long node, int **x)
int do_migrate_pages(struct mm_struct *mm,
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
- LIST_HEAD(pagelist);
int busy = 0;
- int err = 0;
+ int err;
nodemask_t tmp;
+ err = migrate_prep();
+ if (err)
+ return err;
+
down_read(&mm->mmap_sem);
err = migrate_vmas(mm, from_nodes, to_nodes, flags);
start, start + len, mode, mode_flags,
nmask ? nodes_addr(*nmask)[0] : -1);
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+
+ err = migrate_prep();
+ if (err)
+ return err;
+ }
down_write(&mm->mmap_sem);
vma = check_range(mm, start, end, nmask,
flags | MPOL_MF_INVERT, &pagelist);
return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0;
}
-asmlinkage long sys_mbind(unsigned long start, unsigned long len,
- unsigned long mode,
- unsigned long __user *nmask, unsigned long maxnode,
- unsigned flags)
+SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
+ unsigned long, mode, unsigned long __user *, nmask,
+ unsigned long, maxnode, unsigned, flags)
{
nodemask_t nodes;
int err;
}
/* Set the process memory policy */
-asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
- unsigned long maxnode)
+SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask,
+ unsigned long, maxnode)
{
int err;
nodemask_t nodes;
return do_set_mempolicy(mode, flags, &nodes);
}
-asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
- const unsigned long __user *old_nodes,
- const unsigned long __user *new_nodes)
+SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode,
+ const unsigned long __user *, old_nodes,
+ const unsigned long __user *, new_nodes)
{
+ const struct cred *cred = current_cred(), *tcred;
struct mm_struct *mm;
struct task_struct *task;
nodemask_t old;
* capabilities, superuser privileges or the same
* userid as the target process.
*/
- if ((current->euid != task->suid) && (current->euid != task->uid) &&
- (current->uid != task->suid) && (current->uid != task->uid) &&
+ rcu_read_lock();
+ tcred = __task_cred(task);
+ if (cred->euid != tcred->suid && cred->euid != tcred->uid &&
+ cred->uid != tcred->suid && cred->uid != tcred->uid &&
!capable(CAP_SYS_NICE)) {
+ rcu_read_unlock();
err = -EPERM;
goto out;
}
+ rcu_read_unlock();
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
/* Retrieve NUMA policy */
-asmlinkage long sys_get_mempolicy(int __user *policy,
- unsigned long __user *nmask,
- unsigned long maxnode,
- unsigned long addr, unsigned long flags)
+SYSCALL_DEFINE5(get_mempolicy, int __user *, policy,
+ unsigned long __user *, nmask, unsigned long, maxnode,
+ unsigned long, addr, unsigned long, flags)
{
int err;
int uninitialized_var(pval);
if (unlikely((*mpol)->mode == MPOL_INTERLEAVE)) {
zl = node_zonelist(interleave_nid(*mpol, vma, addr,
- HPAGE_SHIFT), gfp_flags);
+ huge_page_shift(hstate_vma(vma))), gfp_flags);
} else {
zl = policy_zonelist(gfp_flags, *mpol);
if ((*mpol)->mode == MPOL_BIND)
return 0;
}
-void mpol_shared_policy_init(struct shared_policy *info, unsigned short policy,
- unsigned short flags, nodemask_t *policy_nodes)
-{
- info->root = RB_ROOT;
- spin_lock_init(&info->lock);
-
- if (policy != MPOL_DEFAULT) {
- struct mempolicy *newpol;
-
- /* Falls back to NULL policy [MPOL_DEFAULT] on any error */
- newpol = mpol_new(policy, flags, policy_nodes);
- if (!IS_ERR(newpol)) {
- /* Create pseudo-vma that contains just the policy */
- struct vm_area_struct pvma;
-
- memset(&pvma, 0, sizeof(struct vm_area_struct));
- /* Policy covers entire file */
- pvma.vm_end = TASK_SIZE;
- mpol_set_shared_policy(info, &pvma, newpol);
- mpol_put(newpol);
- }
+/**
+ * mpol_shared_policy_init - initialize shared policy for inode
+ * @sp: pointer to inode shared policy
+ * @mpol: struct mempolicy to install
+ *
+ * Install non-NULL @mpol in inode's shared policy rb-tree.
+ * On entry, the current task has a reference on a non-NULL @mpol.
+ * This function drops that reference before returning.
+ */
+void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol)
+{
+ sp->root = RB_ROOT; /* empty tree == default mempolicy */
+ spin_lock_init(&sp->lock);
+
+ if (mpol) {
+ struct vm_area_struct pvma;
+ struct mempolicy *new;
+
+ /* contextualize the tmpfs mount point mempolicy */
+ new = mpol_new(mpol->mode, mpol->flags, &mpol->w.user_nodemask);
+ mpol_put(mpol); /* drop our ref on sb mpol */
+ if (IS_ERR(new))
+ return; /* no valid nodemask intersection */
+
+ /* Create pseudo-vma that contains just the policy */
+ memset(&pvma, 0, sizeof(struct vm_area_struct));
+ pvma.vm_end = TASK_SIZE; /* policy covers entire file */
+ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */
+ mpol_put(new); /* drop initial ref */
}
}
/**
* mpol_parse_str - parse string to mempolicy
* @str: string containing mempolicy to parse
- * @mode: pointer to returned policy mode
- * @mode_flags: pointer to returned flags
- * @policy_nodes: pointer to returned nodemask
+ * @mpol: pointer to struct mempolicy pointer, returned on success.
+ * @no_context: flag whether to "contextualize" the mempolicy
*
* Format of input:
* <mode>[=<flags>][:<nodelist>]
*
- * Currently only used for tmpfs/shmem mount options
+ * If @no_context is true, save the input nodemask in w.user_nodemask in
+ * the returned mempolicy. This will be used to "clone" the mempolicy in
+ * a specific context [cpuset] at a later time. Used to parse tmpfs mpol
+ * mount option. Note that if 'static' or 'relative' mode flags were
+ * specified, the input nodemask will already have been saved. Saving
+ * it again is redundant, but safe.
+ *
+ * Returns 0 on success, 1 on failure.
*/
-int mpol_parse_str(char *str, unsigned short *mode, unsigned short *mode_flags,
- nodemask_t *policy_nodes)
+int mpol_parse_str(char *str, struct mempolicy **mpol, int no_context)
{
+ struct mempolicy *new = NULL;
+ unsigned short uninitialized_var(mode);
+ unsigned short uninitialized_var(mode_flags);
+ nodemask_t nodes;
char *nodelist = strchr(str, ':');
char *flags = strchr(str, '=');
int i;
if (nodelist) {
/* NUL-terminate mode or flags string */
*nodelist++ = '\0';
- if (nodelist_parse(nodelist, *policy_nodes))
+ if (nodelist_parse(nodelist, nodes))
goto out;
- if (!nodes_subset(*policy_nodes, node_states[N_HIGH_MEMORY]))
+ if (!nodes_subset(nodes, node_states[N_HIGH_MEMORY]))
goto out;
- }
+ } else
+ nodes_clear(nodes);
+
if (flags)
*flags++ = '\0'; /* terminate mode string */
for (i = 0; i <= MPOL_LOCAL; i++) {
if (!strcmp(str, policy_types[i])) {
- *mode = i;
+ mode = i;
break;
}
}
if (i > MPOL_LOCAL)
goto out;
- switch (*mode) {
+ switch (mode) {
case MPOL_PREFERRED:
- /* Insist on a nodelist of one node only */
+ /*
+ * Insist on a nodelist of one node only
+ */
if (nodelist) {
char *rest = nodelist;
while (isdigit(*rest))
err = 0;
}
break;
- case MPOL_BIND:
- /* Insist on a nodelist */
- if (nodelist)
- err = 0;
- break;
case MPOL_INTERLEAVE:
/*
* Default to online nodes with memory if no nodelist
*/
if (!nodelist)
- *policy_nodes = node_states[N_HIGH_MEMORY];
+ nodes = node_states[N_HIGH_MEMORY];
err = 0;
break;
- default:
+ case MPOL_LOCAL:
/*
- * MPOL_DEFAULT or MPOL_LOCAL
- * Don't allow a nodelist nor flags
+ * Don't allow a nodelist; mpol_new() checks flags
*/
- if (!nodelist && !flags)
- err = 0;
- if (*mode == MPOL_DEFAULT)
+ if (nodelist)
goto out;
- /* else MPOL_LOCAL */
- *mode = MPOL_PREFERRED;
- nodes_clear(*policy_nodes);
+ mode = MPOL_PREFERRED;
break;
+
+ /*
+ * case MPOL_BIND: mpol_new() enforces non-empty nodemask.
+ * case MPOL_DEFAULT: mpol_new() enforces empty nodemask, ignores flags.
+ */
}
- *mode_flags = 0;
+ mode_flags = 0;
if (flags) {
/*
* Currently, we only support two mutually exclusive
* mode flags.
*/
if (!strcmp(flags, "static"))
- *mode_flags |= MPOL_F_STATIC_NODES;
+ mode_flags |= MPOL_F_STATIC_NODES;
else if (!strcmp(flags, "relative"))
- *mode_flags |= MPOL_F_RELATIVE_NODES;
+ mode_flags |= MPOL_F_RELATIVE_NODES;
else
err = 1;
}
+
+ new = mpol_new(mode, mode_flags, &nodes);
+ if (IS_ERR(new))
+ err = 1;
+ else if (no_context)
+ new->w.user_nodemask = nodes; /* save for contextualization */
+
out:
/* Restore string for error message */
if (nodelist)
*--nodelist = ':';
if (flags)
*--flags = '=';
+ if (!err)
+ *mpol = new;
return err;
}
#endif /* CONFIG_TMPFS */
-/*
+/**
+ * mpol_to_str - format a mempolicy structure for printing
+ * @buffer: to contain formatted mempolicy string
+ * @maxlen: length of @buffer
+ * @pol: pointer to mempolicy to be formatted
+ * @no_context: "context free" mempolicy - use nodemask in w.user_nodemask
+ *
* Convert a mempolicy into a string.
* Returns the number of characters in buffer (if positive)
* or an error (negative)
*/
-int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol)
+int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol, int no_context)
{
char *p = buffer;
int l;
case MPOL_BIND:
/* Fall through */
case MPOL_INTERLEAVE:
- nodes = pol->v.nodes;
+ if (no_context)
+ nodes = pol->w.user_nodemask;
+ else
+ nodes = pol->v.nodes;
break;
default:
if (PageSwapCache(page))
md->swapcache++;
- if (PageActive(page))
+ if (PageActive(page) || PageUnevictable(page))
md->active++;
if (PageWriteback(page))
{
unsigned long addr;
struct page *page;
+ struct hstate *h = hstate_vma(vma);
+ unsigned long sz = huge_page_size(h);
- for (addr = start; addr < end; addr += HPAGE_SIZE) {
- pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+ for (addr = start; addr < end; addr += sz) {
+ pte_t *ptep = huge_pte_offset(vma->vm_mm,
+ addr & huge_page_mask(h));
pte_t pte;
if (!ptep)
return 0;
pol = get_vma_policy(priv->task, vma, vma->vm_start);
- mpol_to_str(buffer, sizeof(buffer), pol);
+ mpol_to_str(buffer, sizeof(buffer), pol, 0);
mpol_cond_put(pol);
seq_printf(m, "%08lx %s", vma->vm_start, buffer);