#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/migrate.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
-static kmem_cache_t *policy_cache;
-static kmem_cache_t *sn_cache;
+static struct kmem_cache *policy_cache;
+static struct kmem_cache *sn_cache;
#define PDprintk(fmt...)
}
return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
}
+
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
struct zonelist *zl;
- int num, max, nd;
+ int num, max, nd, k;
max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
- zl = kmalloc(sizeof(void *) * max, GFP_KERNEL);
+ zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
if (!zl)
return NULL;
num = 0;
- for_each_node_mask(nd, *nodes)
- zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone];
+ /* First put in the highest zones from all nodes, then all the next
+ lower zones etc. Avoid empty zones because the memory allocator
+ doesn't like them. If you implement node hot removal you
+ have to fix that. */
+ for (k = policy_zone; k >= 0; k--) {
+ for_each_node_mask(nd, *nodes) {
+ struct zone *z = &NODE_DATA(nd)->node_zones[k];
+ if (z->present_pages > 0)
+ zl->zones[num++] = z;
+ }
+ }
zl->zones[num] = NULL;
return zl;
}
break;
}
policy->policy = mode;
+ policy->cpuset_mems_allowed = cpuset_mems_allowed(current);
return policy;
}
-/* Check if we are the only process mapping the page in question */
-static inline int single_mm_mapping(struct mm_struct *mm,
- struct address_space *mapping)
-{
- struct vm_area_struct *vma;
- struct prio_tree_iter iter;
- int rc = 1;
-
- spin_lock(&mapping->i_mmap_lock);
- vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX)
- if (mm != vma->vm_mm) {
- rc = 0;
- goto out;
- }
- list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list)
- if (mm != vma->vm_mm) {
- rc = 0;
- goto out;
- }
-out:
- spin_unlock(&mapping->i_mmap_lock);
- return rc;
-}
-
-/*
- * Add a page to be migrated to the pagelist
- */
-static void migrate_page_add(struct vm_area_struct *vma,
- struct page *page, struct list_head *pagelist, unsigned long flags)
-{
- /*
- * Avoid migrating a page that is shared by others and not writable.
- */
- if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) ||
- mapping_writably_mapped(page->mapping) ||
- single_mm_mapping(vma->vm_mm, page->mapping)) {
- int rc = isolate_lru_page(page);
-
- if (rc == 1)
- list_add(&page->lru, pagelist);
- /*
- * If the isolate attempt was not successful then we just
- * encountered an unswappable page. Something must be wrong.
- */
- WARN_ON(rc == 0);
- }
-}
-
-static void gather_stats(struct page *, void *);
+static void gather_stats(struct page *, void *, int pte_dirty);
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags);
/* Scan through pages checking if pages follow certain conditions. */
static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
page = vm_normal_page(vma, addr, *pte);
if (!page)
continue;
+ /*
+ * The check for PageReserved here is important to avoid
+ * handling zero pages and other pages that may have been
+ * marked special by the system.
+ *
+ * If the PageReserved would not be checked here then f.e.
+ * the location of the zero page could have an influence
+ * on MPOL_MF_STRICT, zero pages would be counted for
+ * the per node stats, and there would be useless attempts
+ * to put zero pages on the migration list.
+ */
+ if (PageReserved(page))
+ continue;
nid = page_to_nid(page);
if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT))
continue;
if (flags & MPOL_MF_STATS)
- gather_stats(page, private);
+ gather_stats(page, private, pte_dirty(*pte));
else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- migrate_page_add(vma, page, private, flags);
+ migrate_page_add(page, private, flags);
else
break;
} while (pte++, addr += PAGE_SIZE, addr != end);
static inline int vma_migratable(struct vm_area_struct *vma)
{
if (vma->vm_flags & (
- VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP))
+ VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
return 0;
return 1;
}
int err;
struct vm_area_struct *first, *vma, *prev;
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+
+ err = migrate_prep();
+ if (err)
+ return ERR_PTR(err);
+ }
+
first = find_vma(mm, start);
if (!first)
return ERR_PTR(-EFAULT);
if (!nodes)
return 0;
- /* Update current mems_allowed */
- cpuset_update_current_mems_allowed();
- /* Ignore nodes not set in current->mems_allowed */
- cpuset_restrict_to_mems_allowed(nodes->bits);
+ cpuset_update_task_memory_state();
+ if (!cpuset_nodes_subset_current_mems_allowed(*nodes))
+ return -EINVAL;
return mpol_check_policy(mode, nodes);
}
-static int swap_pages(struct list_head *pagelist)
-{
- LIST_HEAD(moved);
- LIST_HEAD(failed);
- int n;
- n = migrate_pages(pagelist, NULL, &moved, &failed);
- putback_lru_pages(&failed);
- putback_lru_pages(&moved);
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy. Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
- return n;
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+ if (p->mempolicy)
+ p->flags |= PF_MEMPOLICY;
+ else
+ p->flags &= ~PF_MEMPOLICY;
}
-long do_mbind(unsigned long start, unsigned long len,
- unsigned long mode, nodemask_t *nmask, unsigned long flags)
+static void mpol_set_task_struct_flag(void)
{
- struct vm_area_struct *vma;
- struct mm_struct *mm = current->mm;
- struct mempolicy *new;
- unsigned long end;
- int err;
- LIST_HEAD(pagelist);
-
- if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
- MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- || mode > MPOL_MAX)
- return -EINVAL;
- if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
- return -EPERM;
-
- if (start & ~PAGE_MASK)
- return -EINVAL;
-
- if (mode == MPOL_DEFAULT)
- flags &= ~MPOL_MF_STRICT;
-
- len = (len + PAGE_SIZE - 1) & PAGE_MASK;
- end = start + len;
-
- if (end < start)
- return -EINVAL;
- if (end == start)
- return 0;
-
- if (mpol_check_policy(mode, nmask))
- return -EINVAL;
-
- new = mpol_new(mode, nmask);
- if (IS_ERR(new))
- return PTR_ERR(new);
-
- /*
- * If we are using the default policy then operation
- * on discontinuous address spaces is okay after all
- */
- if (!new)
- flags |= MPOL_MF_DISCONTIG_OK;
-
- PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
- mode,nodes_addr(nodes)[0]);
-
- down_write(&mm->mmap_sem);
- vma = check_range(mm, start, end, nmask,
- flags | MPOL_MF_INVERT, &pagelist);
-
- err = PTR_ERR(vma);
- if (!IS_ERR(vma)) {
- int nr_failed = 0;
-
- err = mbind_range(vma, start, end, new);
- if (!list_empty(&pagelist))
- nr_failed = swap_pages(&pagelist);
-
- if (!err && nr_failed && (flags & MPOL_MF_STRICT))
- err = -EIO;
- }
- if (!list_empty(&pagelist))
- putback_lru_pages(&pagelist);
-
- up_write(&mm->mmap_sem);
- mpol_free(new);
- return err;
+ mpol_fix_fork_child_flag(current);
}
/* Set the process memory policy */
return PTR_ERR(new);
mpol_free(current->mempolicy);
current->mempolicy = new;
+ mpol_set_task_struct_flag();
if (new && new->policy == MPOL_INTERLEAVE)
current->il_next = first_node(new->v.nodes);
return 0;
struct vm_area_struct *vma = NULL;
struct mempolicy *pol = current->mempolicy;
- cpuset_update_current_mems_allowed();
+ cpuset_update_task_memory_state();
if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
return -EINVAL;
if (flags & MPOL_F_ADDR) {
return err;
}
+#ifdef CONFIG_MIGRATION
+/*
+ * page migration
+ */
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
+{
+ /*
+ * Avoid migrating a page that is shared with others.
+ */
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
+ isolate_lru_page(page, pagelist);
+}
+
+/*
+ * Migrate pages from one node to a target node.
+ * Returns error or the number of pages not migrated.
+ */
+int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
+{
+ nodemask_t nmask;
+ LIST_HEAD(pagelist);
+ int err = 0;
+
+ nodes_clear(nmask);
+ node_set(source, nmask);
+
+ check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
+ flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+
+ if (!list_empty(&pagelist)) {
+ err = migrate_pages_to(&pagelist, NULL, dest);
+ if (!list_empty(&pagelist))
+ putback_lru_pages(&pagelist);
+ }
+ return err;
+}
+
/*
- * For now migrate_pages simply swaps out the pages from nodes that are in
- * the source set but not in the target set. In the future, we would
- * want a function that moves pages between the two nodesets in such
- * a way as to preserve the physical layout as much as possible.
+ * Move pages between the two nodesets so as to preserve the physical
+ * layout as much as possible.
*
* Returns the number of page that could not be moved.
*/
const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
{
LIST_HEAD(pagelist);
- int count = 0;
- nodemask_t nodes;
+ int busy = 0;
+ int err = 0;
+ nodemask_t tmp;
- nodes_andnot(nodes, *from_nodes, *to_nodes);
+ down_read(&mm->mmap_sem);
- down_read(&mm->mmap_sem);
- check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
- flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+/*
+ * Find a 'source' bit set in 'tmp' whose corresponding 'dest'
+ * bit in 'to' is not also set in 'tmp'. Clear the found 'source'
+ * bit in 'tmp', and return that <source, dest> pair for migration.
+ * The pair of nodemasks 'to' and 'from' define the map.
+ *
+ * If no pair of bits is found that way, fallback to picking some
+ * pair of 'source' and 'dest' bits that are not the same. If the
+ * 'source' and 'dest' bits are the same, this represents a node
+ * that will be migrating to itself, so no pages need move.
+ *
+ * If no bits are left in 'tmp', or if all remaining bits left
+ * in 'tmp' correspond to the same bit in 'to', return false
+ * (nothing left to migrate).
+ *
+ * This lets us pick a pair of nodes to migrate between, such that
+ * if possible the dest node is not already occupied by some other
+ * source node, minimizing the risk of overloading the memory on a
+ * node that would happen if we migrated incoming memory to a node
+ * before migrating outgoing memory source that same node.
+ *
+ * A single scan of tmp is sufficient. As we go, we remember the
+ * most recent <s, d> pair that moved (s != d). If we find a pair
+ * that not only moved, but what's better, moved to an empty slot
+ * (d is not set in tmp), then we break out then, with that pair.
+ * Otherwise when we finish scannng from_tmp, we at least have the
+ * most recent <s, d> pair that moved. If we get all the way through
+ * the scan of tmp without finding any node that moved, much less
+ * moved to an empty node, then there is nothing left worth migrating.
+ */
- if (!list_empty(&pagelist)) {
- count = swap_pages(&pagelist);
- putback_lru_pages(&pagelist);
+ tmp = *from_nodes;
+ while (!nodes_empty(tmp)) {
+ int s,d;
+ int source = -1;
+ int dest = 0;
+
+ for_each_node_mask(s, tmp) {
+ d = node_remap(s, *from_nodes, *to_nodes);
+ if (s == d)
+ continue;
+
+ source = s; /* Node moved. Memorize */
+ dest = d;
+
+ /* dest not in remaining from nodes? */
+ if (!node_isset(dest, tmp))
+ break;
+ }
+ if (source == -1)
+ break;
+
+ node_clear(source, tmp);
+ err = migrate_to_node(mm, source, dest, flags);
+ if (err > 0)
+ busy += err;
+ if (err < 0)
+ break;
}
up_read(&mm->mmap_sem);
- return count;
+ if (err < 0)
+ return err;
+ return busy;
+
+}
+
+#else
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
+{
+}
+
+int do_migrate_pages(struct mm_struct *mm,
+ const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+ return -ENOSYS;
+}
+#endif
+
+long do_mbind(unsigned long start, unsigned long len,
+ unsigned long mode, nodemask_t *nmask, unsigned long flags)
+{
+ struct vm_area_struct *vma;
+ struct mm_struct *mm = current->mm;
+ struct mempolicy *new;
+ unsigned long end;
+ int err;
+ LIST_HEAD(pagelist);
+
+ if ((flags & ~(unsigned long)(MPOL_MF_STRICT |
+ MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
+ || mode > MPOL_MAX)
+ return -EINVAL;
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
+ return -EPERM;
+
+ if (start & ~PAGE_MASK)
+ return -EINVAL;
+
+ if (mode == MPOL_DEFAULT)
+ flags &= ~MPOL_MF_STRICT;
+
+ len = (len + PAGE_SIZE - 1) & PAGE_MASK;
+ end = start + len;
+
+ if (end < start)
+ return -EINVAL;
+ if (end == start)
+ return 0;
+
+ if (mpol_check_policy(mode, nmask))
+ return -EINVAL;
+
+ new = mpol_new(mode, nmask);
+ if (IS_ERR(new))
+ return PTR_ERR(new);
+
+ /*
+ * If we are using the default policy then operation
+ * on discontinuous address spaces is okay after all
+ */
+ if (!new)
+ flags |= MPOL_MF_DISCONTIG_OK;
+
+ PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+ mode,nodes_addr(nodes)[0]);
+
+ down_write(&mm->mmap_sem);
+ vma = check_range(mm, start, end, nmask,
+ flags | MPOL_MF_INVERT, &pagelist);
+
+ err = PTR_ERR(vma);
+ if (!IS_ERR(vma)) {
+ int nr_failed = 0;
+
+ err = mbind_range(vma, start, end, new);
+
+ if (!list_empty(&pagelist))
+ nr_failed = migrate_pages_to(&pagelist, vma, -1);
+
+ if (!err && nr_failed && (flags & MPOL_MF_STRICT))
+ err = -EIO;
+ }
+
+ if (!list_empty(&pagelist))
+ putback_lru_pages(&pagelist);
+
+ up_write(&mm->mmap_sem);
+ mpol_free(new);
+ return err;
}
/*
nodes_clear(*nodes);
if (maxnode == 0 || !nmask)
return 0;
+ if (maxnode > PAGE_SIZE*BITS_PER_BYTE)
+ return -EINVAL;
nlongs = BITS_TO_LONGS(maxnode);
if ((maxnode % BITS_PER_LONG) == 0)
return do_set_mempolicy(mode, &nodes);
}
-/* Macro needed until Paul implements this function in kernel/cpusets.c */
-#define cpuset_mems_allowed(task) node_online_map
-
asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
const unsigned long __user *old_nodes,
const unsigned long __user *new_nodes)
/*
* Check if this process has the right to modify the specified
* process. The right exists if the process has administrative
- * capabilities, superuser priviledges or the same
+ * capabilities, superuser privileges or the same
* userid as the target process.
*/
if ((current->euid != task->suid) && (current->euid != task->uid) &&
(current->uid != task->suid) && (current->uid != task->uid) &&
- !capable(CAP_SYS_ADMIN)) {
+ !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
}
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
- if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+ if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
}
- err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+ err = do_migrate_pages(mm, &old, &new,
+ capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
mmput(mm);
return err;
#endif
/* Return effective policy for a VMA */
-struct mempolicy *
-get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr)
+static struct mempolicy * get_vma_policy(struct task_struct *task,
+ struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = task->mempolicy;
return nid;
}
+/*
+ * Depending on the memory policy provide a node from which to allocate the
+ * next slab entry.
+ */
+unsigned slab_node(struct mempolicy *policy)
+{
+ switch (policy->policy) {
+ case MPOL_INTERLEAVE:
+ return interleave_nodes(policy);
+
+ case MPOL_BIND:
+ /*
+ * Follow bind policy behavior and start allocation at the
+ * first node.
+ */
+ return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+
+ case MPOL_PREFERRED:
+ if (policy->v.preferred_node >= 0)
+ return policy->v.preferred_node;
+ /* Fall through */
+
+ default:
+ return numa_node_id();
+ }
+}
+
/* Do static interleaving for a VMA with known offset. */
static unsigned offset_il_node(struct mempolicy *pol,
struct vm_area_struct *vma, unsigned long off)
return interleave_nodes(pol);
}
+#ifdef CONFIG_HUGETLBFS
/* Return a zonelist suitable for a huge page allocation. */
struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
{
}
return zonelist_policy(GFP_HIGHUSER, pol);
}
+#endif
/* Allocate a page in interleaved policy.
Own path because it needs to do special accounting. */
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
- cpuset_update_current_mems_allowed();
+ cpuset_update_task_memory_state();
if (unlikely(pol->policy == MPOL_INTERLEAVE)) {
unsigned nid;
* interrupt context and apply the current process NUMA policy.
* Returns NULL when no page can be allocated.
*
- * Don't call cpuset_update_current_mems_allowed() unless
+ * Don't call cpuset_update_task_memory_state() unless
* 1) it's ok to take cpuset_sem (can WAIT), and
* 2) allocating for current task (not interrupt).
*/
struct mempolicy *pol = current->mempolicy;
if ((gfp & __GFP_WAIT) && !in_interrupt())
- cpuset_update_current_mems_allowed();
+ cpuset_update_task_memory_state();
if (!pol || in_interrupt())
pol = &default_policy;
if (pol->policy == MPOL_INTERLEAVE)
}
EXPORT_SYMBOL(alloc_pages_current);
+/*
+ * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it
+ * rebinds the mempolicy its copying by calling mpol_rebind_policy()
+ * with the mems_allowed returned by cpuset_mems_allowed(). This
+ * keeps mempolicies cpuset relative after its cpuset moves. See
+ * further kernel/cpuset.c update_nodemask().
+ */
+void *cpuset_being_rebound;
+
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
{
if (!new)
return ERR_PTR(-ENOMEM);
+ if (current_cpuset_is_being_rebound()) {
+ nodemask_t mems = cpuset_mems_allowed(current);
+ mpol_rebind_policy(old, &mems);
+ }
*new = *old;
atomic_set(&new->refcnt, 1);
if (new->policy == MPOL_BIND) {
return 0;
}
+void mpol_shared_policy_init(struct shared_policy *info, int policy,
+ nodemask_t *policy_nodes)
+{
+ info->root = RB_ROOT;
+ spin_lock_init(&info->lock);
+
+ if (policy != MPOL_DEFAULT) {
+ struct mempolicy *newpol;
+
+ /* Falls back to MPOL_DEFAULT on any error */
+ newpol = mpol_new(policy, policy_nodes);
+ if (!IS_ERR(newpol)) {
+ /* Create pseudo-vma that contains just the policy */
+ struct vm_area_struct pvma;
+
+ memset(&pvma, 0, sizeof(struct vm_area_struct));
+ /* Policy covers entire file */
+ pvma.vm_end = TASK_SIZE;
+ mpol_set_shared_policy(info, &pvma, newpol);
+ mpol_free(newpol);
+ }
+ }
+}
+
int mpol_set_shared_policy(struct shared_policy *info,
struct vm_area_struct *vma, struct mempolicy *npol)
{
}
/* Migrate a policy to a different set of nodes */
-static void rebind_policy(struct mempolicy *pol, const nodemask_t *old,
- const nodemask_t *new)
+void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
{
+ nodemask_t *mpolmask;
nodemask_t tmp;
if (!pol)
return;
+ mpolmask = &pol->cpuset_mems_allowed;
+ if (nodes_equal(*mpolmask, *newmask))
+ return;
switch (pol->policy) {
case MPOL_DEFAULT:
break;
case MPOL_INTERLEAVE:
- nodes_remap(tmp, pol->v.nodes, *old, *new);
+ nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask);
pol->v.nodes = tmp;
- current->il_next = node_remap(current->il_next, *old, *new);
+ *mpolmask = *newmask;
+ current->il_next = node_remap(current->il_next,
+ *mpolmask, *newmask);
break;
case MPOL_PREFERRED:
pol->v.preferred_node = node_remap(pol->v.preferred_node,
- *old, *new);
+ *mpolmask, *newmask);
+ *mpolmask = *newmask;
break;
case MPOL_BIND: {
nodemask_t nodes;
nodes_clear(nodes);
for (z = pol->v.zonelist->zones; *z; z++)
node_set((*z)->zone_pgdat->node_id, nodes);
- nodes_remap(tmp, nodes, *old, *new);
+ nodes_remap(tmp, nodes, *mpolmask, *newmask);
nodes = tmp;
zonelist = bind_zonelist(&nodes);
kfree(pol->v.zonelist);
pol->v.zonelist = zonelist;
}
+ *mpolmask = *newmask;
break;
}
default:
}
/*
- * Someone moved this task to different nodes. Fixup mempolicies.
+ * Wrapper for mpol_rebind_policy() that just requires task
+ * pointer, and updates task mempolicy.
+ */
+
+void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
+{
+ mpol_rebind_policy(tsk->mempolicy, new);
+}
+
+/*
+ * Rebind each vma in mm to new nodemask.
*
- * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well,
- * once we have a cpuset mechanism to mark which cpuset subtree is migrating.
+ * Call holding a reference to mm. Takes mm->mmap_sem during call.
*/
-void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new)
+
+void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
- rebind_policy(current->mempolicy, old, new);
+ struct vm_area_struct *vma;
+
+ down_write(&mm->mmap_sem);
+ for (vma = mm->mmap; vma; vma = vma->vm_next)
+ mpol_rebind_policy(vma->vm_policy, new);
+ up_write(&mm->mmap_sem);
}
/*
struct numa_maps {
unsigned long pages;
unsigned long anon;
- unsigned long mapped;
+ unsigned long active;
+ unsigned long writeback;
unsigned long mapcount_max;
+ unsigned long dirty;
+ unsigned long swapcache;
unsigned long node[MAX_NUMNODES];
};
-static void gather_stats(struct page *page, void *private)
+static void gather_stats(struct page *page, void *private, int pte_dirty)
{
struct numa_maps *md = private;
int count = page_mapcount(page);
- if (count)
- md->mapped++;
+ md->pages++;
+ if (pte_dirty || PageDirty(page))
+ md->dirty++;
- if (count > md->mapcount_max)
- md->mapcount_max = count;
+ if (PageSwapCache(page))
+ md->swapcache++;
- md->pages++;
+ if (PageActive(page))
+ md->active++;
+
+ if (PageWriteback(page))
+ md->writeback++;
if (PageAnon(page))
md->anon++;
+ if (count > md->mapcount_max)
+ md->mapcount_max = count;
+
md->node[page_to_nid(page)]++;
cond_resched();
}
+#ifdef CONFIG_HUGETLB_PAGE
+static void check_huge_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct numa_maps *md)
+{
+ unsigned long addr;
+ struct page *page;
+
+ for (addr = start; addr < end; addr += HPAGE_SIZE) {
+ pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+ pte_t pte;
+
+ if (!ptep)
+ continue;
+
+ pte = *ptep;
+ if (pte_none(pte))
+ continue;
+
+ page = pte_page(pte);
+ if (!page)
+ continue;
+
+ gather_stats(page, md, pte_dirty(*ptep));
+ }
+}
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct numa_maps *md)
+{
+}
+#endif
+
int show_numa_map(struct seq_file *m, void *v)
{
struct task_struct *task = m->private;
struct vm_area_struct *vma = v;
struct numa_maps *md;
+ struct file *file = vma->vm_file;
+ struct mm_struct *mm = vma->vm_mm;
int n;
char buffer[50];
- if (!vma->vm_mm)
+ if (!mm)
return 0;
md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
if (!md)
return 0;
- check_pgd_range(vma, vma->vm_start, vma->vm_end,
- &node_online_map, MPOL_MF_STATS, md);
+ mpol_to_str(buffer, sizeof(buffer),
+ get_vma_policy(task, vma, vma->vm_start));
- if (md->pages) {
- mpol_to_str(buffer, sizeof(buffer),
- get_vma_policy(task, vma, vma->vm_start));
+ seq_printf(m, "%08lx %s", vma->vm_start, buffer);
- seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
- vma->vm_start, buffer, md->pages,
- md->mapped, md->mapcount_max);
+ if (file) {
+ seq_printf(m, " file=");
+ seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+ } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ seq_printf(m, " heap");
+ } else if (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack) {
+ seq_printf(m, " stack");
+ }
- if (md->anon)
- seq_printf(m," anon=%lu",md->anon);
+ if (is_vm_hugetlb_page(vma)) {
+ check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+ seq_printf(m, " huge");
+ } else {
+ check_pgd_range(vma, vma->vm_start, vma->vm_end,
+ &node_online_map, MPOL_MF_STATS, md);
+ }
- for_each_online_node(n)
- if (md->node[n])
- seq_printf(m, " N%d=%lu", n, md->node[n]);
+ if (!md->pages)
+ goto out;
- seq_putc(m, '\n');
- }
+ if (md->anon)
+ seq_printf(m," anon=%lu",md->anon);
+
+ if (md->dirty)
+ seq_printf(m," dirty=%lu",md->dirty);
+
+ if (md->pages != md->anon && md->pages != md->dirty)
+ seq_printf(m, " mapped=%lu", md->pages);
+
+ if (md->mapcount_max > 1)
+ seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+ if (md->swapcache)
+ seq_printf(m," swapcache=%lu", md->swapcache);
+
+ if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+ seq_printf(m," active=%lu", md->active);
+
+ if (md->writeback)
+ seq_printf(m," writeback=%lu", md->writeback);
+
+ for_each_online_node(n)
+ if (md->node[n])
+ seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+ seq_putc(m, '\n');
kfree(md);
if (m->count < m->size)