#include <linux/hugetlb.h>
#include <linux/kernel.h>
#include <linux/sched.h>
-#include <linux/mm.h>
#include <linux/nodemask.h>
#include <linux/cpuset.h>
#include <linux/gfp.h>
#include <linux/slab.h>
#include <linux/string.h>
#include <linux/module.h>
+#include <linux/nsproxy.h>
#include <linux/interrupt.h>
#include <linux/init.h>
#include <linux/compat.h>
-#include <linux/mempolicy.h>
#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/migrate.h>
+#include <linux/rmap.h>
+#include <linux/security.h>
+#include <linux/syscalls.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
-/* The number of pages to migrate per call to migrate_pages() */
-#define MIGRATE_CHUNK_SIZE 256
-
-static kmem_cache_t *policy_cache;
-static kmem_cache_t *sn_cache;
-
-#define PDprintk(fmt...)
+static struct kmem_cache *policy_cache;
+static struct kmem_cache *sn_cache;
/* Highest zone. An specific allocation for a zone below that is not
policied. */
-int policy_zone = ZONE_DMA;
+enum zone_type policy_zone = 0;
struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
.policy = MPOL_DEFAULT,
};
+static void mpol_rebind_policy(struct mempolicy *pol,
+ const nodemask_t *newmask);
+
/* Do sanity checking on a policy */
static int mpol_check_policy(int mode, nodemask_t *nodes)
{
return -EINVAL;
break;
}
- return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL;
+ return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL;
}
/* Generate a custom zonelist for the BIND policy. */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
struct zonelist *zl;
- int num, max, nd, k;
+ int num, max, nd;
+ enum zone_type k;
max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
+ max++; /* space for zlcache_ptr (see mmzone.h) */
zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
if (!zl)
- return NULL;
+ return ERR_PTR(-ENOMEM);
+ zl->zlcache_ptr = NULL;
num = 0;
/* First put in the highest zones from all nodes, then all the next
lower zones etc. Avoid empty zones because the memory allocator
doesn't like them. If you implement node hot removal you
have to fix that. */
- for (k = policy_zone; k >= 0; k--) {
+ k = MAX_NR_ZONES - 1;
+ while (1) {
for_each_node_mask(nd, *nodes) {
struct zone *z = &NODE_DATA(nd)->node_zones[k];
if (z->present_pages > 0)
zl->zones[num++] = z;
}
+ if (k == 0)
+ break;
+ k--;
+ }
+ if (num == 0) {
+ kfree(zl);
+ return ERR_PTR(-EINVAL);
}
zl->zones[num] = NULL;
return zl;
{
struct mempolicy *policy;
- PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]);
+ pr_debug("setting mode %d nodes[0] %lx\n",
+ mode, nodes ? nodes_addr(*nodes)[0] : -1);
+
if (mode == MPOL_DEFAULT)
return NULL;
policy = kmem_cache_alloc(policy_cache, GFP_KERNEL);
switch (mode) {
case MPOL_INTERLEAVE:
policy->v.nodes = *nodes;
- if (nodes_weight(*nodes) == 0) {
+ nodes_and(policy->v.nodes, policy->v.nodes,
+ node_states[N_HIGH_MEMORY]);
+ if (nodes_weight(policy->v.nodes) == 0) {
kmem_cache_free(policy_cache, policy);
return ERR_PTR(-EINVAL);
}
break;
case MPOL_BIND:
policy->v.zonelist = bind_zonelist(nodes);
- if (policy->v.zonelist == NULL) {
+ if (IS_ERR(policy->v.zonelist)) {
+ void *error_code = policy->v.zonelist;
kmem_cache_free(policy_cache, policy);
- return ERR_PTR(-ENOMEM);
+ return error_code;
}
break;
}
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
do {
struct page *page;
- unsigned int nid;
+ int nid;
if (!pte_present(*pte))
continue;
return 0;
}
-/* Check if a vma is migratable */
-static inline int vma_migratable(struct vm_area_struct *vma)
-{
- if (vma->vm_flags & (
- VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED))
- return 0;
- return 1;
-}
-
/*
* Check if all pages in a range are on a set of nodes.
* If pagelist != NULL then isolate pages from the LRU and
int err;
struct vm_area_struct *first, *vma, *prev;
- /* Clear the LRU lists so pages can be isolated */
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- lru_add_drain_all();
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+
+ err = migrate_prep();
+ if (err)
+ return ERR_PTR(err);
+ }
first = find_vma(mm, start);
if (!first)
int err = 0;
struct mempolicy *old = vma->vm_policy;
- PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
+ pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n",
vma->vm_start, vma->vm_end, vma->vm_pgoff,
vma->vm_ops, vma->vm_file,
vma->vm_ops ? vma->vm_ops->set_policy : NULL);
return mpol_check_policy(mode, nodes);
}
+
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy. Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+ if (p->mempolicy)
+ p->flags |= PF_MEMPOLICY;
+ else
+ p->flags &= ~PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)
+{
+ mpol_fix_fork_child_flag(current);
+}
+
/* Set the process memory policy */
-long do_set_mempolicy(int mode, nodemask_t *nodes)
+static long do_set_mempolicy(int mode, nodemask_t *nodes)
{
struct mempolicy *new;
return PTR_ERR(new);
mpol_free(current->mempolicy);
current->mempolicy = new;
+ mpol_set_task_struct_flag();
if (new && new->policy == MPOL_INTERLEAVE)
current->il_next = first_node(new->v.nodes);
return 0;
switch (p->policy) {
case MPOL_BIND:
for (i = 0; p->v.zonelist->zones[i]; i++)
- node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+ node_set(zone_to_nid(p->v.zonelist->zones[i]),
*nodes);
break;
case MPOL_DEFAULT:
*nodes = p->v.nodes;
break;
case MPOL_PREFERRED:
- /* or use current node instead of online map? */
+ /* or use current node instead of memory_map? */
if (p->v.preferred_node < 0)
- *nodes = node_online_map;
+ *nodes = node_states[N_HIGH_MEMORY];
else
node_set(p->v.preferred_node, *nodes);
break;
}
/* Retrieve NUMA policy */
-long do_get_mempolicy(int *policy, nodemask_t *nmask,
- unsigned long addr, unsigned long flags)
+static long do_get_mempolicy(int *policy, nodemask_t *nmask,
+ unsigned long addr, unsigned long flags)
{
int err;
struct mm_struct *mm = current->mm;
struct mempolicy *pol = current->mempolicy;
cpuset_update_task_memory_state();
- if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR))
+ if (flags &
+ ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED))
return -EINVAL;
+
+ if (flags & MPOL_F_MEMS_ALLOWED) {
+ if (flags & (MPOL_F_NODE|MPOL_F_ADDR))
+ return -EINVAL;
+ *policy = 0; /* just so it's initialized */
+ *nmask = cpuset_current_mems_allowed;
+ return 0;
+ }
+
if (flags & MPOL_F_ADDR) {
down_read(&mm->mmap_sem);
vma = find_vma_intersection(mm, addr, addr+1);
return err;
}
+#ifdef CONFIG_MIGRATION
/*
* page migration
*/
-
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
/*
* Avoid migrating a page that is shared with others.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
- if (isolate_lru_page(page))
- list_add_tail(&page->lru, pagelist);
- }
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
+ isolate_lru_page(page, pagelist);
}
-/*
- * Migrate the list 'pagelist' of pages to a certain destination.
- *
- * Specify destination with either non-NULL vma or dest_node >= 0
- * Return the number of pages not migrated or error code
- */
-static int migrate_pages_to(struct list_head *pagelist,
- struct vm_area_struct *vma, int dest)
+static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
- LIST_HEAD(newlist);
- LIST_HEAD(moved);
- LIST_HEAD(failed);
- int err = 0;
- unsigned long offset = 0;
- int nr_pages;
- struct page *page;
- struct list_head *p;
-
-redo:
- nr_pages = 0;
- list_for_each(p, pagelist) {
- if (vma) {
- /*
- * The address passed to alloc_page_vma is used to
- * generate the proper interleave behavior. We fake
- * the address here by an increasing offset in order
- * to get the proper distribution of pages.
- *
- * No decision has been made as to which page
- * a certain old page is moved to so we cannot
- * specify the correct address.
- */
- page = alloc_page_vma(GFP_HIGHUSER, vma,
- offset + vma->vm_start);
- offset += PAGE_SIZE;
- }
- else
- page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
-
- if (!page) {
- err = -ENOMEM;
- goto out;
- }
- list_add_tail(&page->lru, &newlist);
- nr_pages++;
- if (nr_pages > MIGRATE_CHUNK_SIZE)
- break;
- }
- err = migrate_pages(pagelist, &newlist, &moved, &failed);
-
- putback_lru_pages(&moved); /* Call release pages instead ?? */
-
- if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
- goto redo;
-out:
- /* Return leftover allocated pages */
- while (!list_empty(&newlist)) {
- page = list_entry(newlist.next, struct page, lru);
- list_del(&page->lru);
- __free_page(page);
- }
- list_splice(&failed, pagelist);
- if (err < 0)
- return err;
-
- /* Calculate number of leftover pages */
- nr_pages = 0;
- list_for_each(p, pagelist)
- nr_pages++;
- return nr_pages;
+ return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0);
}
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
*/
-int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags)
+static int migrate_to_node(struct mm_struct *mm, int source, int dest,
+ int flags)
{
nodemask_t nmask;
LIST_HEAD(pagelist);
check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
- if (!list_empty(&pagelist)) {
- err = migrate_pages_to(&pagelist, NULL, dest);
- if (!list_empty(&pagelist))
- putback_lru_pages(&pagelist);
- }
+ if (!list_empty(&pagelist))
+ err = migrate_pages(&pagelist, new_node_page, dest);
+
return err;
}
down_read(&mm->mmap_sem);
+ err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+ if (err)
+ goto out;
+
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
if (err < 0)
break;
}
-
+out:
up_read(&mm->mmap_sem);
if (err < 0)
return err;
return busy;
+
}
-long do_mbind(unsigned long start, unsigned long len,
- unsigned long mode, nodemask_t *nmask, unsigned long flags)
+/*
+ * Allocate a new page for page migration based on vma policy.
+ * Start assuming that page is mapped by vma pointed to by @private.
+ * Search forward from there, if not. N.B., this assumes that the
+ * list of pages handed to migrate_pages()--which is how we get here--
+ * is in virtual address order.
+ */
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+ struct vm_area_struct *vma = (struct vm_area_struct *)private;
+ unsigned long uninitialized_var(address);
+
+ while (vma) {
+ address = page_address_in_vma(page, vma);
+ if (address != -EFAULT)
+ break;
+ vma = vma->vm_next;
+ }
+
+ /*
+ * if !vma, alloc_page_vma() will use task or system default policy
+ */
+ return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+}
+#else
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
+{
+}
+
+int do_migrate_pages(struct mm_struct *mm,
+ const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+ return -ENOSYS;
+}
+
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+ return NULL;
+}
+#endif
+
+static long do_mbind(unsigned long start, unsigned long len,
+ unsigned long mode, nodemask_t *nmask,
+ unsigned long flags)
{
struct vm_area_struct *vma;
struct mm_struct *mm = current->mm;
if (!new)
flags |= MPOL_MF_DISCONTIG_OK;
- PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
- mode,nodes_addr(nodes)[0]);
+ pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len,
+ mode, nmask ? nodes_addr(*nmask)[0] : -1);
down_write(&mm->mmap_sem);
vma = check_range(mm, start, end, nmask,
err = mbind_range(vma, start, end, new);
if (!list_empty(&pagelist))
- nr_failed = migrate_pages_to(&pagelist, vma, -1);
+ nr_failed = migrate_pages(&pagelist, new_vma_page,
+ (unsigned long)vma);
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
}
- if (!list_empty(&pagelist))
- putback_lru_pages(&pagelist);
up_write(&mm->mmap_sem);
mpol_free(new);
err = get_nodes(&nodes, nmask, maxnode);
if (err)
return err;
+#ifdef CONFIG_CPUSETS
+ /* Restrict the nodes to the allowed nodes in the cpuset */
+ nodes_and(nodes, nodes, current->mems_allowed);
+#endif
return do_mbind(start, len, mode, &nodes, flags);
}
/* Find the mm_struct */
read_lock(&tasklist_lock);
- task = pid ? find_task_by_pid(pid) : current;
+ task = pid ? find_task_by_vpid(pid) : current;
if (!task) {
read_unlock(&tasklist_lock);
return -ESRCH;
/*
* Check if this process has the right to modify the specified
* process. The right exists if the process has administrative
- * capabilities, superuser priviledges or the same
+ * capabilities, superuser privileges or the same
* userid as the target process.
*/
if ((current->euid != task->suid) && (current->euid != task->uid) &&
goto out;
}
+ if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = security_task_movememory(task);
+ if (err)
+ goto out;
+
err = do_migrate_pages(mm, &old, &new,
capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
unsigned long maxnode,
unsigned long addr, unsigned long flags)
{
- int err, pval;
+ int err;
+ int uninitialized_var(pval);
nodemask_t nodes;
if (nmask != NULL && maxnode < MAX_NUMNODES)
#endif
-/* Return effective policy for a VMA */
+/*
+ * get_vma_policy(@task, @vma, @addr)
+ * @task - task for fallback if vma policy == default
+ * @vma - virtual memory area whose policy is sought
+ * @addr - address in @vma for shared policy lookup
+ *
+ * Returns effective policy for a VMA at specified address.
+ * Falls back to @task or system default policy, as necessary.
+ * Returned policy has extra reference count if shared, vma,
+ * or some other task's policy [show_numa_maps() can pass
+ * @task != current]. It is the caller's responsibility to
+ * free the reference in these cases.
+ */
static struct mempolicy * get_vma_policy(struct task_struct *task,
struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = task->mempolicy;
+ int shared_pol = 0;
if (vma) {
- if (vma->vm_ops && vma->vm_ops->get_policy)
+ if (vma->vm_ops && vma->vm_ops->get_policy) {
pol = vma->vm_ops->get_policy(vma, addr);
- else if (vma->vm_policy &&
+ shared_pol = 1; /* if pol non-NULL, add ref below */
+ } else if (vma->vm_policy &&
vma->vm_policy->policy != MPOL_DEFAULT)
pol = vma->vm_policy;
}
if (!pol)
pol = &default_policy;
+ else if (!shared_pol && pol != current->mempolicy)
+ mpol_get(pol); /* vma or other task's policy */
return pol;
}
*/
unsigned slab_node(struct mempolicy *policy)
{
- switch (policy->policy) {
+ int pol = policy ? policy->policy : MPOL_DEFAULT;
+
+ switch (pol) {
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
* Follow bind policy behavior and start allocation at the
* first node.
*/
- return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+ return zone_to_nid(policy->v.zonelist->zones[0]);
case MPOL_PREFERRED:
if (policy->v.preferred_node >= 0)
if (vma) {
unsigned long off;
- off = vma->vm_pgoff;
+ /*
+ * for small pages, there is no difference between
+ * shift and PAGE_SHIFT, so the bit-shift is safe.
+ * for huge pages, since vm_pgoff is in units of small
+ * pages, we need to shift off the always 0 bits to get
+ * a useful offset.
+ */
+ BUG_ON(shift < PAGE_SHIFT);
+ off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
off += (addr - vma->vm_start) >> shift;
return offset_il_node(pol, vma, off);
} else
}
#ifdef CONFIG_HUGETLBFS
-/* Return a zonelist suitable for a huge page allocation. */
-struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr)
+/*
+ * huge_zonelist(@vma, @addr, @gfp_flags, @mpol)
+ * @vma = virtual memory area whose policy is sought
+ * @addr = address in @vma for shared policy lookup and interleave policy
+ * @gfp_flags = for requested zone
+ * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy
+ *
+ * Returns a zonelist suitable for a huge page allocation.
+ * If the effective policy is 'BIND, returns pointer to policy's zonelist.
+ * If it is also a policy for which get_vma_policy() returns an extra
+ * reference, we must hold that reference until after allocation.
+ * In that case, return policy via @mpol so hugetlb allocation can drop
+ * the reference. For non-'BIND referenced policies, we can/do drop the
+ * reference here, so the caller doesn't need to know about the special case
+ * for default and current task policy.
+ */
+struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr,
+ gfp_t gfp_flags, struct mempolicy **mpol)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct zonelist *zl;
+ *mpol = NULL; /* probably no unref needed */
if (pol->policy == MPOL_INTERLEAVE) {
unsigned nid;
nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT);
- return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER);
+ __mpol_free(pol); /* finished with pol */
+ return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags);
}
- return zonelist_policy(GFP_HIGHUSER, pol);
+
+ zl = zonelist_policy(GFP_HIGHUSER, pol);
+ if (unlikely(pol != &default_policy && pol != current->mempolicy)) {
+ if (pol->policy != MPOL_BIND)
+ __mpol_free(pol); /* finished with pol */
+ else
+ *mpol = pol; /* unref needed after allocation */
+ }
+ return zl;
}
#endif
zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
page = __alloc_pages(gfp, order, zl);
- if (page && page_zone(page) == zl->zones[0]) {
- zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
- put_cpu();
- }
+ if (page && page_zone(page) == zl->zones[0])
+ inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
return page;
}
alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr)
{
struct mempolicy *pol = get_vma_policy(current, vma, addr);
+ struct zonelist *zl;
cpuset_update_task_memory_state();
nid = interleave_nid(pol, vma, addr, PAGE_SHIFT);
return alloc_page_interleave(gfp, 0, nid);
}
- return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol));
+ zl = zonelist_policy(gfp, pol);
+ if (pol != &default_policy && pol != current->mempolicy) {
+ /*
+ * slow path: ref counted policy -- shared or vma
+ */
+ struct page *page = __alloc_pages(gfp, 0, zl);
+ __mpol_free(pol);
+ return page;
+ }
+ /*
+ * fast path: default or task policy
+ */
+ return __alloc_pages(gfp, 0, zl);
}
/**
if ((gfp & __GFP_WAIT) && !in_interrupt())
cpuset_update_task_memory_state();
- if (!pol || in_interrupt())
+ if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
if (pol->policy == MPOL_INTERLEAVE)
return alloc_page_interleave(gfp, order, interleave_nodes(pol));
* keeps mempolicies cpuset relative after its cpuset moves. See
* further kernel/cpuset.c update_nodemask().
*/
-void *cpuset_being_rebound;
/* Slow path of a mempolicy copy */
struct mempolicy *__mpol_copy(struct mempolicy *old)
atomic_set(&new->refcnt, 1);
if (new->policy == MPOL_BIND) {
int sz = ksize(old->v.zonelist);
- new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
+ new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL);
if (!new->v.zonelist) {
kmem_cache_free(policy_cache, new);
return ERR_PTR(-ENOMEM);
}
- memcpy(new->v.zonelist, old->v.zonelist, sz);
}
return new;
}
}
rb_link_node(&new->nd, parent, p);
rb_insert_color(&new->nd, &sp->root);
- PDprintk("inserting %lx-%lx: %d\n", new->start, new->end,
+ pr_debug("inserting %lx-%lx: %d\n", new->start, new->end,
new->policy ? new->policy->policy : 0);
}
static void sp_delete(struct shared_policy *sp, struct sp_node *n)
{
- PDprintk("deleting %lx-l%x\n", n->start, n->end);
+ pr_debug("deleting %lx-l%lx\n", n->start, n->end);
rb_erase(&n->nd, &sp->root);
mpol_free(n->policy);
kmem_cache_free(sn_cache, n);
}
-struct sp_node *
-sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol)
+static struct sp_node *sp_alloc(unsigned long start, unsigned long end,
+ struct mempolicy *pol)
{
struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL);
struct sp_node *new = NULL;
unsigned long sz = vma_pages(vma);
- PDprintk("set_shared_policy %lx sz %lu %d %lx\n",
+ pr_debug("set_shared_policy %lx sz %lu %d %lx\n",
vma->vm_pgoff,
sz, npol? npol->policy : -1,
- npol ? nodes_addr(npol->v.nodes)[0] : -1);
+ npol ? nodes_addr(npol->v.nodes)[0] : -1);
if (npol) {
new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol);
/* assumes fs == KERNEL_DS */
void __init numa_policy_init(void)
{
+ nodemask_t interleave_nodes;
+ unsigned long largest = 0;
+ int nid, prefer = 0;
+
policy_cache = kmem_cache_create("numa_policy",
sizeof(struct mempolicy),
- 0, SLAB_PANIC, NULL, NULL);
+ 0, SLAB_PANIC, NULL);
sn_cache = kmem_cache_create("shared_policy_node",
sizeof(struct sp_node),
- 0, SLAB_PANIC, NULL, NULL);
+ 0, SLAB_PANIC, NULL);
- /* Set interleaving policy for system init. This way not all
- the data structures allocated at system boot end up in node zero. */
+ /*
+ * Set interleaving policy for system init. Interleaving is only
+ * enabled across suitably sized nodes (default is >= 16MB), or
+ * fall back to the largest node if they're all smaller.
+ */
+ nodes_clear(interleave_nodes);
+ for_each_node_state(nid, N_HIGH_MEMORY) {
+ unsigned long total_pages = node_present_pages(nid);
+
+ /* Preserve the largest node */
+ if (largest < total_pages) {
+ largest = total_pages;
+ prefer = nid;
+ }
+
+ /* Interleave this node? */
+ if ((total_pages << PAGE_SHIFT) >= (16 << 20))
+ node_set(nid, interleave_nodes);
+ }
- if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map))
+ /* All too small, use the largest */
+ if (unlikely(nodes_empty(interleave_nodes)))
+ node_set(prefer, interleave_nodes);
+
+ if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes))
printk("numa_policy_init: interleaving failed\n");
}
}
/* Migrate a policy to a different set of nodes */
-void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask)
+static void mpol_rebind_policy(struct mempolicy *pol,
+ const nodemask_t *newmask)
{
nodemask_t *mpolmask;
nodemask_t tmp;
nodes_clear(nodes);
for (z = pol->v.zonelist->zones; *z; z++)
- node_set((*z)->zone_pgdat->node_id, nodes);
+ node_set(zone_to_nid(*z), nodes);
nodes_remap(tmp, nodes, *mpolmask, *newmask);
nodes = tmp;
* then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT.
*/
- if (zonelist) {
+ if (!IS_ERR(zonelist)) {
/* Good - got mem - substitute new zonelist */
kfree(pol->v.zonelist);
pol->v.zonelist = zonelist;
* Display pages allocated per node and memory policy via /proc.
*/
-static const char *policy_types[] = { "default", "prefer", "bind",
- "interleave" };
+static const char * const policy_types[] =
+ { "default", "prefer", "bind", "interleave" };
/*
* Convert a mempolicy into a string.
md->mapcount_max = count;
md->node[page_to_nid(page)]++;
- cond_resched();
}
#ifdef CONFIG_HUGETLB_PAGE
int show_numa_map(struct seq_file *m, void *v)
{
- struct task_struct *task = m->private;
+ struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
struct numa_maps *md;
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
+ struct mempolicy *pol;
int n;
char buffer[50];
if (!md)
return 0;
- mpol_to_str(buffer, sizeof(buffer),
- get_vma_policy(task, vma, vma->vm_start));
+ pol = get_vma_policy(priv->task, vma, vma->vm_start);
+ mpol_to_str(buffer, sizeof(buffer), pol);
+ /*
+ * unref shared or other task's mempolicy
+ */
+ if (pol != &default_policy && pol != current->mempolicy)
+ __mpol_free(pol);
seq_printf(m, "%08lx %s", vma->vm_start, buffer);
if (file) {
seq_printf(m, " file=");
- seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+ seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= ");
} else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
seq_printf(m, " heap");
} else if (vma->vm_start <= mm->start_stack &&
seq_printf(m, " huge");
} else {
check_pgd_range(vma, vma->vm_start, vma->vm_end,
- &node_online_map, MPOL_MF_STATS, md);
+ &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md);
}
if (!md->pages)
if (md->writeback)
seq_printf(m," writeback=%lu", md->writeback);
- for_each_online_node(n)
+ for_each_node_state(n, N_HIGH_MEMORY)
if (md->node[n])
seq_printf(m, " N%d=%lu", n, md->node[n]);
out:
kfree(md);
if (m->count < m->size)
- m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+ m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
return 0;
}
-