#include <linux/swap.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/migrate.h>
+#include <linux/rmap.h>
+#include <linux/security.h>
#include <asm/tlbflush.h>
#include <asm/uaccess.h>
#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */
#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */
-/* The number of pages to migrate per call to migrate_pages() */
-#define MIGRATE_CHUNK_SIZE 256
-
-static kmem_cache_t *policy_cache;
-static kmem_cache_t *sn_cache;
+static struct kmem_cache *policy_cache;
+static struct kmem_cache *sn_cache;
#define PDprintk(fmt...)
/* Highest zone. An specific allocation for a zone below that is not
policied. */
-int policy_zone = ZONE_DMA;
+enum zone_type policy_zone = ZONE_DMA;
struct mempolicy default_policy = {
.refcnt = ATOMIC_INIT(1), /* never free it */
static struct zonelist *bind_zonelist(nodemask_t *nodes)
{
struct zonelist *zl;
- int num, max, nd, k;
+ int num, max, nd;
+ enum zone_type k;
max = 1 + MAX_NR_ZONES * nodes_weight(*nodes);
zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL);
lower zones etc. Avoid empty zones because the memory allocator
doesn't like them. If you implement node hot removal you
have to fix that. */
- for (k = policy_zone; k >= 0; k--) {
+ k = policy_zone;
+ while (1) {
for_each_node_mask(nd, *nodes) {
struct zone *z = &NODE_DATA(nd)->node_zones[k];
if (z->present_pages > 0)
zl->zones[num++] = z;
}
+ if (k == 0)
+ break;
+ k--;
}
zl->zones[num] = NULL;
return zl;
return policy;
}
-static void gather_stats(struct page *, void *);
+static void gather_stats(struct page *, void *, int pte_dirty);
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags);
continue;
if (flags & MPOL_MF_STATS)
- gather_stats(page, private);
+ gather_stats(page, private, pte_dirty(*pte));
else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
migrate_page_add(page, private, flags);
else
int err;
struct vm_area_struct *first, *vma, *prev;
- /* Clear the LRU lists so pages can be isolated */
- if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
- lru_add_drain_all();
+ if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
+
+ err = migrate_prep();
+ if (err)
+ return ERR_PTR(err);
+ }
first = find_vma(mm, start);
if (!first)
return mpol_check_policy(mode, nodes);
}
+
+/*
+ * Update task->flags PF_MEMPOLICY bit: set iff non-default
+ * mempolicy. Allows more rapid checking of this (combined perhaps
+ * with other PF_* flag bits) on memory allocation hot code paths.
+ *
+ * If called from outside this file, the task 'p' should -only- be
+ * a newly forked child not yet visible on the task list, because
+ * manipulating the task flags of a visible task is not safe.
+ *
+ * The above limitation is why this routine has the funny name
+ * mpol_fix_fork_child_flag().
+ *
+ * It is also safe to call this with a task pointer of current,
+ * which the static wrapper mpol_set_task_struct_flag() does,
+ * for use within this file.
+ */
+
+void mpol_fix_fork_child_flag(struct task_struct *p)
+{
+ if (p->mempolicy)
+ p->flags |= PF_MEMPOLICY;
+ else
+ p->flags &= ~PF_MEMPOLICY;
+}
+
+static void mpol_set_task_struct_flag(void)
+{
+ mpol_fix_fork_child_flag(current);
+}
+
/* Set the process memory policy */
long do_set_mempolicy(int mode, nodemask_t *nodes)
{
return PTR_ERR(new);
mpol_free(current->mempolicy);
current->mempolicy = new;
+ mpol_set_task_struct_flag();
if (new && new->policy == MPOL_INTERLEAVE)
current->il_next = first_node(new->v.nodes);
return 0;
switch (p->policy) {
case MPOL_BIND:
for (i = 0; p->v.zonelist->zones[i]; i++)
- node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id,
+ node_set(zone_to_nid(p->v.zonelist->zones[i]),
*nodes);
break;
case MPOL_DEFAULT:
return err;
}
+#ifdef CONFIG_MIGRATION
/*
* page migration
*/
-
static void migrate_page_add(struct page *page, struct list_head *pagelist,
unsigned long flags)
{
/*
* Avoid migrating a page that is shared with others.
*/
- if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) {
- if (isolate_lru_page(page))
- list_add_tail(&page->lru, pagelist);
- }
+ if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1)
+ isolate_lru_page(page, pagelist);
}
-/*
- * Migrate the list 'pagelist' of pages to a certain destination.
- *
- * Specify destination with either non-NULL vma or dest_node >= 0
- * Return the number of pages not migrated or error code
- */
-static int migrate_pages_to(struct list_head *pagelist,
- struct vm_area_struct *vma, int dest)
+static struct page *new_node_page(struct page *page, unsigned long node, int **x)
{
- LIST_HEAD(newlist);
- LIST_HEAD(moved);
- LIST_HEAD(failed);
- int err = 0;
- unsigned long offset = 0;
- int nr_pages;
- struct page *page;
- struct list_head *p;
-
-redo:
- nr_pages = 0;
- list_for_each(p, pagelist) {
- if (vma) {
- /*
- * The address passed to alloc_page_vma is used to
- * generate the proper interleave behavior. We fake
- * the address here by an increasing offset in order
- * to get the proper distribution of pages.
- *
- * No decision has been made as to which page
- * a certain old page is moved to so we cannot
- * specify the correct address.
- */
- page = alloc_page_vma(GFP_HIGHUSER, vma,
- offset + vma->vm_start);
- offset += PAGE_SIZE;
- }
- else
- page = alloc_pages_node(dest, GFP_HIGHUSER, 0);
-
- if (!page) {
- err = -ENOMEM;
- goto out;
- }
- list_add_tail(&page->lru, &newlist);
- nr_pages++;
- if (nr_pages > MIGRATE_CHUNK_SIZE)
- break;
- }
- err = migrate_pages(pagelist, &newlist, &moved, &failed);
-
- putback_lru_pages(&moved); /* Call release pages instead ?? */
-
- if (err >= 0 && list_empty(&newlist) && !list_empty(pagelist))
- goto redo;
-out:
- /* Return leftover allocated pages */
- while (!list_empty(&newlist)) {
- page = list_entry(newlist.next, struct page, lru);
- list_del(&page->lru);
- __free_page(page);
- }
- list_splice(&failed, pagelist);
- if (err < 0)
- return err;
-
- /* Calculate number of leftover pages */
- nr_pages = 0;
- list_for_each(p, pagelist)
- nr_pages++;
- return nr_pages;
+ return alloc_pages_node(node, GFP_HIGHUSER, 0);
}
/*
check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
- if (!list_empty(&pagelist)) {
- err = migrate_pages_to(&pagelist, NULL, dest);
- if (!list_empty(&pagelist))
- putback_lru_pages(&pagelist);
- }
+ if (!list_empty(&pagelist))
+ err = migrate_pages(&pagelist, new_node_page, dest);
+
return err;
}
down_read(&mm->mmap_sem);
+ err = migrate_vmas(mm, from_nodes, to_nodes, flags);
+ if (err)
+ goto out;
+
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
* bit in 'to' is not also set in 'tmp'. Clear the found 'source'
if (err < 0)
break;
}
-
+out:
up_read(&mm->mmap_sem);
if (err < 0)
return err;
return busy;
+
+}
+
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+ struct vm_area_struct *vma = (struct vm_area_struct *)private;
+
+ return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma));
}
+#else
+
+static void migrate_page_add(struct page *page, struct list_head *pagelist,
+ unsigned long flags)
+{
+}
+
+int do_migrate_pages(struct mm_struct *mm,
+ const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+ return -ENOSYS;
+}
+
+static struct page *new_vma_page(struct page *page, unsigned long private, int **x)
+{
+ return NULL;
+}
+#endif
long do_mbind(unsigned long start, unsigned long len,
unsigned long mode, nodemask_t *nmask, unsigned long flags)
MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))
|| mode > MPOL_MAX)
return -EINVAL;
- if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE))
+ if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE))
return -EPERM;
if (start & ~PAGE_MASK)
err = mbind_range(vma, start, end, new);
if (!list_empty(&pagelist))
- nr_failed = migrate_pages_to(&pagelist, vma, -1);
+ nr_failed = migrate_pages(&pagelist, new_vma_page,
+ (unsigned long)vma);
if (!err && nr_failed && (flags & MPOL_MF_STRICT))
err = -EIO;
}
- if (!list_empty(&pagelist))
- putback_lru_pages(&pagelist);
up_write(&mm->mmap_sem);
mpol_free(new);
/*
* Check if this process has the right to modify the specified
* process. The right exists if the process has administrative
- * capabilities, superuser priviledges or the same
+ * capabilities, superuser privileges or the same
* userid as the target process.
*/
if ((current->euid != task->suid) && (current->euid != task->uid) &&
(current->uid != task->suid) && (current->uid != task->uid) &&
- !capable(CAP_SYS_ADMIN)) {
+ !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
}
task_nodes = cpuset_mems_allowed(task);
/* Is the user allowed to access the target nodes? */
- if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+ if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) {
err = -EPERM;
goto out;
}
+ err = security_task_movememory(task);
+ if (err)
+ goto out;
+
err = do_migrate_pages(mm, &old, &new,
- capable(CAP_SYS_ADMIN) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
+ capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE);
out:
mmput(mm);
return err;
*/
unsigned slab_node(struct mempolicy *policy)
{
- switch (policy->policy) {
+ int pol = policy ? policy->policy : MPOL_DEFAULT;
+
+ switch (pol) {
case MPOL_INTERLEAVE:
return interleave_nodes(policy);
* Follow bind policy behavior and start allocation at the
* first node.
*/
- return policy->v.zonelist->zones[0]->zone_pgdat->node_id;
+ return zone_to_nid(policy->v.zonelist->zones[0]);
case MPOL_PREFERRED:
if (policy->v.preferred_node >= 0)
if (vma) {
unsigned long off;
- off = vma->vm_pgoff;
+ /*
+ * for small pages, there is no difference between
+ * shift and PAGE_SHIFT, so the bit-shift is safe.
+ * for huge pages, since vm_pgoff is in units of small
+ * pages, we need to shift off the always 0 bits to get
+ * a useful offset.
+ */
+ BUG_ON(shift < PAGE_SHIFT);
+ off = vma->vm_pgoff >> (shift - PAGE_SHIFT);
off += (addr - vma->vm_start) >> shift;
return offset_il_node(pol, vma, off);
} else
zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp);
page = __alloc_pages(gfp, order, zl);
- if (page && page_zone(page) == zl->zones[0]) {
- zone_pcp(zl->zones[0],get_cpu())->interleave_hit++;
- put_cpu();
- }
+ if (page && page_zone(page) == zl->zones[0])
+ inc_zone_page_state(page, NUMA_INTERLEAVE_HIT);
return page;
}
if ((gfp & __GFP_WAIT) && !in_interrupt())
cpuset_update_task_memory_state();
- if (!pol || in_interrupt())
+ if (!pol || in_interrupt() || (gfp & __GFP_THISNODE))
pol = &default_policy;
if (pol->policy == MPOL_INTERLEAVE)
return alloc_page_interleave(gfp, order, interleave_nodes(pol));
atomic_set(&new->refcnt, 1);
if (new->policy == MPOL_BIND) {
int sz = ksize(old->v.zonelist);
- new->v.zonelist = kmalloc(sz, SLAB_KERNEL);
+ new->v.zonelist = kmemdup(old->v.zonelist, sz, SLAB_KERNEL);
if (!new->v.zonelist) {
kmem_cache_free(policy_cache, new);
return ERR_PTR(-ENOMEM);
}
- memcpy(new->v.zonelist, old->v.zonelist, sz);
}
return new;
}
nodes_clear(nodes);
for (z = pol->v.zonelist->zones; *z; z++)
- node_set((*z)->zone_pgdat->node_id, nodes);
+ node_set(zone_to_nid(*z), nodes);
nodes_remap(tmp, nodes, *mpolmask, *newmask);
nodes = tmp;
struct numa_maps {
unsigned long pages;
unsigned long anon;
- unsigned long mapped;
+ unsigned long active;
+ unsigned long writeback;
unsigned long mapcount_max;
+ unsigned long dirty;
+ unsigned long swapcache;
unsigned long node[MAX_NUMNODES];
};
-static void gather_stats(struct page *page, void *private)
+static void gather_stats(struct page *page, void *private, int pte_dirty)
{
struct numa_maps *md = private;
int count = page_mapcount(page);
- if (count)
- md->mapped++;
+ md->pages++;
+ if (pte_dirty || PageDirty(page))
+ md->dirty++;
- if (count > md->mapcount_max)
- md->mapcount_max = count;
+ if (PageSwapCache(page))
+ md->swapcache++;
- md->pages++;
+ if (PageActive(page))
+ md->active++;
+
+ if (PageWriteback(page))
+ md->writeback++;
if (PageAnon(page))
md->anon++;
+ if (count > md->mapcount_max)
+ md->mapcount_max = count;
+
md->node[page_to_nid(page)]++;
- cond_resched();
}
+#ifdef CONFIG_HUGETLB_PAGE
+static void check_huge_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct numa_maps *md)
+{
+ unsigned long addr;
+ struct page *page;
+
+ for (addr = start; addr < end; addr += HPAGE_SIZE) {
+ pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK);
+ pte_t pte;
+
+ if (!ptep)
+ continue;
+
+ pte = *ptep;
+ if (pte_none(pte))
+ continue;
+
+ page = pte_page(pte);
+ if (!page)
+ continue;
+
+ gather_stats(page, md, pte_dirty(*ptep));
+ }
+}
+#else
+static inline void check_huge_range(struct vm_area_struct *vma,
+ unsigned long start, unsigned long end,
+ struct numa_maps *md)
+{
+}
+#endif
+
int show_numa_map(struct seq_file *m, void *v)
{
- struct task_struct *task = m->private;
+ struct proc_maps_private *priv = m->private;
struct vm_area_struct *vma = v;
struct numa_maps *md;
+ struct file *file = vma->vm_file;
+ struct mm_struct *mm = vma->vm_mm;
int n;
char buffer[50];
- if (!vma->vm_mm)
+ if (!mm)
return 0;
md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL);
if (!md)
return 0;
- check_pgd_range(vma, vma->vm_start, vma->vm_end,
- &node_online_map, MPOL_MF_STATS, md);
+ mpol_to_str(buffer, sizeof(buffer),
+ get_vma_policy(priv->task, vma, vma->vm_start));
- if (md->pages) {
- mpol_to_str(buffer, sizeof(buffer),
- get_vma_policy(task, vma, vma->vm_start));
+ seq_printf(m, "%08lx %s", vma->vm_start, buffer);
- seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu",
- vma->vm_start, buffer, md->pages,
- md->mapped, md->mapcount_max);
+ if (file) {
+ seq_printf(m, " file=");
+ seq_path(m, file->f_vfsmnt, file->f_dentry, "\n\t= ");
+ } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) {
+ seq_printf(m, " heap");
+ } else if (vma->vm_start <= mm->start_stack &&
+ vma->vm_end >= mm->start_stack) {
+ seq_printf(m, " stack");
+ }
- if (md->anon)
- seq_printf(m," anon=%lu",md->anon);
+ if (is_vm_hugetlb_page(vma)) {
+ check_huge_range(vma, vma->vm_start, vma->vm_end, md);
+ seq_printf(m, " huge");
+ } else {
+ check_pgd_range(vma, vma->vm_start, vma->vm_end,
+ &node_online_map, MPOL_MF_STATS, md);
+ }
- for_each_online_node(n)
- if (md->node[n])
- seq_printf(m, " N%d=%lu", n, md->node[n]);
+ if (!md->pages)
+ goto out;
- seq_putc(m, '\n');
- }
+ if (md->anon)
+ seq_printf(m," anon=%lu",md->anon);
+
+ if (md->dirty)
+ seq_printf(m," dirty=%lu",md->dirty);
+
+ if (md->pages != md->anon && md->pages != md->dirty)
+ seq_printf(m, " mapped=%lu", md->pages);
+
+ if (md->mapcount_max > 1)
+ seq_printf(m, " mapmax=%lu", md->mapcount_max);
+
+ if (md->swapcache)
+ seq_printf(m," swapcache=%lu", md->swapcache);
+
+ if (md->active < md->pages && !is_vm_hugetlb_page(vma))
+ seq_printf(m," active=%lu", md->active);
+
+ if (md->writeback)
+ seq_printf(m," writeback=%lu", md->writeback);
+
+ for_each_online_node(n)
+ if (md->node[n])
+ seq_printf(m, " N%d=%lu", n, md->node[n]);
+out:
+ seq_putc(m, '\n');
kfree(md);
if (m->count < m->size)
- m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0;
+ m->version = (vma != priv->tail_vma) ? vma->vm_start : 0;
return 0;
}