X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fmempolicy.c;h=da9463946556311e9e0945583fd76cdd432298e5;hb=5f8442edfb214908e9c6ca1142bf882c9bc364e5;hp=9cc6d962831dd362c0f22c2a2588a4a907f51f64;hpb=dc9aa5b9d65fd11b1f5246b46ec610ee8b83c6dd;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9cc6d96..da94639 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -84,21 +84,28 @@ #include #include #include +#include +#include +#include +#include +#include #include #include -/* Internal MPOL_MF_xxx flags */ +/* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ +#define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ +#define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ -static kmem_cache_t *policy_cache; -static kmem_cache_t *sn_cache; +static struct kmem_cache *policy_cache; +static struct kmem_cache *sn_cache; #define PDprintk(fmt...) /* Highest zone. An specific allocation for a zone below that is not policied. */ -int policy_zone = ZONE_DMA; +enum zone_type policy_zone = ZONE_DMA; struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ @@ -125,19 +132,36 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) } return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; } + /* Generate a custom zonelist for the BIND policy. */ static struct zonelist *bind_zonelist(nodemask_t *nodes) { struct zonelist *zl; int num, max, nd; + enum zone_type k; max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); - zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); + max++; /* space for zlcache_ptr (see mmzone.h) */ + zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); if (!zl) return NULL; + zl->zlcache_ptr = NULL; num = 0; - for_each_node_mask(nd, *nodes) - zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; + /* First put in the highest zones from all nodes, then all the next + lower zones etc. Avoid empty zones because the memory allocator + doesn't like them. If you implement node hot removal you + have to fix that. */ + k = policy_zone; + while (1) { + for_each_node_mask(nd, *nodes) { + struct zone *z = &NODE_DATA(nd)->node_zones[k]; + if (z->present_pages > 0) + zl->zones[num++] = z; + } + if (k == 0) + break; + k--; + } zl->zones[num] = NULL; return zl; } @@ -176,62 +200,19 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) break; } policy->policy = mode; + policy->cpuset_mems_allowed = cpuset_mems_allowed(current); return policy; } -/* Check if we are the only process mapping the page in question */ -static inline int single_mm_mapping(struct mm_struct *mm, - struct address_space *mapping) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - int rc = 1; - - spin_lock(&mapping->i_mmap_lock); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } -out: - spin_unlock(&mapping->i_mmap_lock); - return rc; -} +static void gather_stats(struct page *, void *, int pte_dirty); +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags); -/* - * Add a page to be migrated to the pagelist - */ -static void migrate_page_add(struct vm_area_struct *vma, - struct page *page, struct list_head *pagelist, unsigned long flags) -{ - /* - * Avoid migrating a page that is shared by others and not writable. 
- */ - if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || - mapping_writably_mapped(page->mapping) || - single_mm_mapping(vma->vm_mm, page->mapping)) { - int rc = isolate_lru_page(page); - - if (rc == 1) - list_add(&page->lru, pagelist); - /* - * If the isolate attempt was not successful then we just - * encountered an unswappable page. Something must be wrong. - */ - WARN_ON(rc == 0); - } -} - -/* Ensure all existing pages follow the policy. */ +/* Scan through pages checking if pages follow certain conditions. */ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pte_t *orig_pte; pte_t *pte; @@ -240,20 +221,36 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { struct page *page; - unsigned int nid; + int nid; if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); if (!page) continue; + /* + * The check for PageReserved here is important to avoid + * handling zero pages and other pages that may have been + * marked special by the system. + * + * If the PageReserved would not be checked here then f.e. + * the location of the zero page could have an influence + * on MPOL_MF_STRICT, zero pages would be counted for + * the per node stats, and there would be useless attempts + * to put zero pages on the migration list. + */ + if (PageReserved(page)) + continue; nid = page_to_nid(page); - if (!node_isset(nid, *nodes)) { - if (pagelist) - migrate_page_add(vma, page, pagelist, flags); - else - break; - } + if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) + continue; + + if (flags & MPOL_MF_STATS) + gather_stats(page, private, pte_dirty(*pte)); + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(page, private, flags); + else + break; } while (pte++, addr += PAGE_SIZE, addr != end); pte_unmap_unlock(orig_pte, ptl); return addr != end; @@ -262,7 +259,7 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pmd_t *pmd; unsigned long next; @@ -273,7 +270,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, if (pmd_none_or_clear_bad(pmd)) continue; if (check_pte_range(vma, pmd, addr, next, nodes, - flags, pagelist)) + flags, private)) return -EIO; } while (pmd++, addr = next, addr != end); return 0; @@ -282,7 +279,7 @@ static inline int check_pmd_range(struct vm_area_struct *vma, pud_t *pud, static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + void *private) { pud_t *pud; unsigned long next; @@ -293,7 +290,7 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, if (pud_none_or_clear_bad(pud)) continue; if (check_pmd_range(vma, pud, addr, next, nodes, - flags, pagelist)) + flags, private)) return -EIO; } while (pud++, addr = next, addr != end); return 0; @@ -302,7 +299,7 @@ static inline int check_pud_range(struct vm_area_struct *vma, pgd_t *pgd, static inline int check_pgd_range(struct vm_area_struct *vma, unsigned long addr, unsigned long end, const nodemask_t *nodes, unsigned long flags, - 
struct list_head *pagelist) + void *private) { pgd_t *pgd; unsigned long next; @@ -313,7 +310,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma, if (pgd_none_or_clear_bad(pgd)) continue; if (check_pud_range(vma, pgd, addr, next, nodes, - flags, pagelist)) + flags, private)) return -EIO; } while (pgd++, addr = next, addr != end); return 0; @@ -323,7 +320,7 @@ static inline int check_pgd_range(struct vm_area_struct *vma, static inline int vma_migratable(struct vm_area_struct *vma) { if (vma->vm_flags & ( - VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP)) + VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED)) return 0; return 1; } @@ -335,12 +332,18 @@ static inline int vma_migratable(struct vm_area_struct *vma) */ static struct vm_area_struct * check_range(struct mm_struct *mm, unsigned long start, unsigned long end, - const nodemask_t *nodes, unsigned long flags, - struct list_head *pagelist) + const nodemask_t *nodes, unsigned long flags, void *private) { int err; struct vm_area_struct *first, *vma, *prev; + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + + err = migrate_prep(); + if (err) + return ERR_PTR(err); + } + first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); @@ -363,7 +366,7 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, if (vma->vm_start > start) start = vma->vm_start; err = check_pgd_range(vma, start, endvma, nodes, - flags, pagelist); + flags, private); if (err) { first = ERR_PTR(err); break; @@ -422,77 +425,41 @@ static int contextualize_policy(int mode, nodemask_t *nodes) if (!nodes) return 0; - /* Update current mems_allowed */ - cpuset_update_current_mems_allowed(); - /* Ignore nodes not set in current->mems_allowed */ - cpuset_restrict_to_mems_allowed(nodes->bits); + cpuset_update_task_memory_state(); + if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) + return -EINVAL; return mpol_check_policy(mode, nodes); } -long do_mbind(unsigned long start, unsigned long len, - unsigned long mode, nodemask_t *nmask, unsigned long flags) -{ - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; - struct mempolicy *new; - unsigned long end; - int err; - LIST_HEAD(pagelist); - - if ((flags & ~(unsigned long)(MPOL_MF_STRICT|MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) - || mode > MPOL_MAX) - return -EINVAL; - if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - if (start & ~PAGE_MASK) - return -EINVAL; - - if (mode == MPOL_DEFAULT) - flags &= ~MPOL_MF_STRICT; - - len = (len + PAGE_SIZE - 1) & PAGE_MASK; - end = start + len; - - if (end < start) - return -EINVAL; - if (end == start) - return 0; - - if (mpol_check_policy(mode, nmask)) - return -EINVAL; - - new = mpol_new(mode, nmask); - if (IS_ERR(new)) - return PTR_ERR(new); - - /* - * If we are using the default policy then operation - * on discontinuous address spaces is okay after all - */ - if (!new) - flags |= MPOL_MF_DISCONTIG_OK; - - PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes_addr(nodes)[0]); +/* + * Update task->flags PF_MEMPOLICY bit: set iff non-default + * mempolicy. Allows more rapid checking of this (combined perhaps + * with other PF_* flag bits) on memory allocation hot code paths. + * + * If called from outside this file, the task 'p' should -only- be + * a newly forked child not yet visible on the task list, because + * manipulating the task flags of a visible task is not safe. + * + * The above limitation is why this routine has the funny name + * mpol_fix_fork_child_flag(). 
+ * + * It is also safe to call this with a task pointer of current, + * which the static wrapper mpol_set_task_struct_flag() does, + * for use within this file. + */ - down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, flags, - (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) ? &pagelist : NULL); - err = PTR_ERR(vma); - if (!IS_ERR(vma)) { - err = mbind_range(vma, start, end, new); - if (!list_empty(&pagelist)) - migrate_pages(&pagelist, NULL); - if (!err && !list_empty(&pagelist) && (flags & MPOL_MF_STRICT)) - err = -EIO; - } - if (!list_empty(&pagelist)) - putback_lru_pages(&pagelist); +void mpol_fix_fork_child_flag(struct task_struct *p) +{ + if (p->mempolicy) + p->flags |= PF_MEMPOLICY; + else + p->flags &= ~PF_MEMPOLICY; +} - up_write(&mm->mmap_sem); - mpol_free(new); - return err; +static void mpol_set_task_struct_flag(void) +{ + mpol_fix_fork_child_flag(current); } /* Set the process memory policy */ @@ -507,6 +474,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes) return PTR_ERR(new); mpol_free(current->mempolicy); current->mempolicy = new; + mpol_set_task_struct_flag(); if (new && new->policy == MPOL_INTERLEAVE) current->il_next = first_node(new->v.nodes); return 0; @@ -521,7 +489,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) switch (p->policy) { case MPOL_BIND: for (i = 0; p->v.zonelist->zones[i]; i++) - node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, + node_set(zone_to_nid(p->v.zonelist->zones[i]), *nodes); break; case MPOL_DEFAULT: @@ -563,7 +531,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; if (flags & MPOL_F_ADDR) { @@ -614,12 +582,236 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +#ifdef CONFIG_MIGRATION +/* + * page migration + */ +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ + /* + * Avoid migrating a page that is shared with others. + */ + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) + isolate_lru_page(page, pagelist); +} + +static struct page *new_node_page(struct page *page, unsigned long node, int **x) +{ + return alloc_pages_node(node, GFP_HIGHUSER, 0); +} + +/* + * Migrate pages from one node to a target node. + * Returns error or the number of pages not migrated. + */ +int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) +{ + nodemask_t nmask; + LIST_HEAD(pagelist); + int err = 0; + + nodes_clear(nmask); + node_set(source, nmask); + + check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_node_page, dest); + + return err; +} + +/* + * Move pages between the two nodesets so as to preserve the physical + * layout as much as possible. + * + * Returns the number of page that could not be moved. + */ +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + LIST_HEAD(pagelist); + int busy = 0; + int err = 0; + nodemask_t tmp; + + down_read(&mm->mmap_sem); + + err = migrate_vmas(mm, from_nodes, to_nodes, flags); + if (err) + goto out; + +/* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. 
Clear the found 'source' + * bit in 'tmp', and return that pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scannng from_tmp, we at least have the + * most recent pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ + + tmp = *from_nodes; + while (!nodes_empty(tmp)) { + int s,d; + int source = -1; + int dest = 0; + + for_each_node_mask(s, tmp) { + d = node_remap(s, *from_nodes, *to_nodes); + if (s == d) + continue; + + source = s; /* Node moved. Memorize */ + dest = d; + + /* dest not in remaining from nodes? */ + if (!node_isset(dest, tmp)) + break; + } + if (source == -1) + break; + + node_clear(source, tmp); + err = migrate_to_node(mm, source, dest, flags); + if (err > 0) + busy += err; + if (err < 0) + break; + } +out: + up_read(&mm->mmap_sem); + if (err < 0) + return err; + return busy; + +} + +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +{ + struct vm_area_struct *vma = (struct vm_area_struct *)private; + + return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma)); +} +#else + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ +} + +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + return -ENOSYS; +} + +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +{ + return NULL; +} +#endif + +long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct mempolicy *new; + unsigned long end; + int err; + LIST_HEAD(pagelist); + + if ((flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + || mode > MPOL_MAX) + return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + if (start & ~PAGE_MASK) + return -EINVAL; + + if (mode == MPOL_DEFAULT) + flags &= ~MPOL_MF_STRICT; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + + if (mpol_check_policy(mode, nmask)) + return -EINVAL; + + new = mpol_new(mode, nmask); + if (IS_ERR(new)) + return PTR_ERR(new); + + /* + * If we are using the default policy then operation + * on 
discontinuous address spaces is okay after all + */ + if (!new) + flags |= MPOL_MF_DISCONTIG_OK; + + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode,nodes_addr(nodes)[0]); + + down_write(&mm->mmap_sem); + vma = check_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + + err = PTR_ERR(vma); + if (!IS_ERR(vma)) { + int nr_failed = 0; + + err = mbind_range(vma, start, end, new); + + if (!list_empty(&pagelist)) + nr_failed = migrate_pages(&pagelist, new_vma_page, + (unsigned long)vma); + + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; + } + + up_write(&mm->mmap_sem); + mpol_free(new); + return err; +} + /* * User space interface with variable sized bitmaps for nodelists. */ /* Copy a node mask from user space. */ -static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, +static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long k; @@ -630,6 +822,8 @@ static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask, nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; + if (maxnode > PAGE_SIZE*BITS_PER_BYTE) + return -EINVAL; nlongs = BITS_TO_LONGS(maxnode); if ((maxnode % BITS_PER_LONG) == 0) @@ -708,6 +902,70 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, return do_set_mempolicy(mode, &nodes); } +asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, + const unsigned long __user *old_nodes, + const unsigned long __user *new_nodes) +{ + struct mm_struct *mm; + struct task_struct *task; + nodemask_t old; + nodemask_t new; + nodemask_t task_nodes; + int err; + + err = get_nodes(&old, old_nodes, maxnode); + if (err) + return err; + + err = get_nodes(&new, new_nodes, maxnode); + if (err) + return err; + + /* Find the mm_struct */ + read_lock(&tasklist_lock); + task = pid ? find_task_by_pid(pid) : current; + if (!task) { + read_unlock(&tasklist_lock); + return -ESRCH; + } + mm = get_task_mm(task); + read_unlock(&tasklist_lock); + + if (!mm) + return -EINVAL; + + /* + * Check if this process has the right to modify the specified + * process. The right exists if the process has administrative + * capabilities, superuser privileges or the same + * userid as the target process. + */ + if ((current->euid != task->suid) && (current->euid != task->uid) && + (current->uid != task->suid) && (current->uid != task->uid) && + !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out; + } + + task_nodes = cpuset_mems_allowed(task); + /* Is the user allowed to access the target nodes? */ + if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { + err = -EPERM; + goto out; + } + + err = security_task_movememory(task); + if (err) + goto out; + + err = do_migrate_pages(mm, &old, &new, + capable(CAP_SYS_NICE) ? 
MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); +out: + mmput(mm); + return err; +} + + /* Retrieve NUMA policy */ asmlinkage long sys_get_mempolicy(int __user *policy, unsigned long __user *nmask, @@ -814,8 +1072,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif /* Return effective policy for a VMA */ -struct mempolicy * -get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) +static struct mempolicy * get_vma_policy(struct task_struct *task, + struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; @@ -874,6 +1132,35 @@ static unsigned interleave_nodes(struct mempolicy *policy) return nid; } +/* + * Depending on the memory policy provide a node from which to allocate the + * next slab entry. + */ +unsigned slab_node(struct mempolicy *policy) +{ + int pol = policy ? policy->policy : MPOL_DEFAULT; + + switch (pol) { + case MPOL_INTERLEAVE: + return interleave_nodes(policy); + + case MPOL_BIND: + /* + * Follow bind policy behavior and start allocation at the + * first node. + */ + return zone_to_nid(policy->v.zonelist->zones[0]); + + case MPOL_PREFERRED: + if (policy->v.preferred_node >= 0) + return policy->v.preferred_node; + /* Fall through */ + + default: + return numa_node_id(); + } +} + /* Do static interleaving for a VMA with known offset. */ static unsigned offset_il_node(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long off) @@ -898,13 +1185,22 @@ static inline unsigned interleave_nid(struct mempolicy *pol, if (vma) { unsigned long off; - off = vma->vm_pgoff; + /* + * for small pages, there is no difference between + * shift and PAGE_SHIFT, so the bit-shift is safe. + * for huge pages, since vm_pgoff is in units of small + * pages, we need to shift off the always 0 bits to get + * a useful offset. + */ + BUG_ON(shift < PAGE_SHIFT); + off = vma->vm_pgoff >> (shift - PAGE_SHIFT); off += (addr - vma->vm_start) >> shift; return offset_il_node(pol, vma, off); } else return interleave_nodes(pol); } +#ifdef CONFIG_HUGETLBFS /* Return a zonelist suitable for a huge page allocation. */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) { @@ -918,6 +1214,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) } return zonelist_policy(GFP_HIGHUSER, pol); } +#endif /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ @@ -929,10 +1226,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zl->zones[0]) { - zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; - put_cpu(); - } + if (page && page_zone(page) == zl->zones[0]) + inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -963,7 +1258,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (unlikely(pol->policy == MPOL_INTERLEAVE)) { unsigned nid; @@ -989,7 +1284,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. 
* - * Don't call cpuset_update_current_mems_allowed() unless + * Don't call cpuset_update_task_memory_state() unless * 1) it's ok to take cpuset_sem (can WAIT), and * 2) allocating for current task (not interrupt). */ @@ -998,8 +1293,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) struct mempolicy *pol = current->mempolicy; if ((gfp & __GFP_WAIT) && !in_interrupt()) - cpuset_update_current_mems_allowed(); - if (!pol || in_interrupt()) + cpuset_update_task_memory_state(); + if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; if (pol->policy == MPOL_INTERLEAVE) return alloc_page_interleave(gfp, order, interleave_nodes(pol)); @@ -1007,6 +1302,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) } EXPORT_SYMBOL(alloc_pages_current); +/* + * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it + * rebinds the mempolicy its copying by calling mpol_rebind_policy() + * with the mems_allowed returned by cpuset_mems_allowed(). This + * keeps mempolicies cpuset relative after its cpuset moves. See + * further kernel/cpuset.c update_nodemask(). + */ +void *cpuset_being_rebound; + /* Slow path of a mempolicy copy */ struct mempolicy *__mpol_copy(struct mempolicy *old) { @@ -1014,16 +1318,19 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + if (current_cpuset_is_being_rebound()) { + nodemask_t mems = cpuset_mems_allowed(current); + mpol_rebind_policy(old, &mems); + } *new = *old; atomic_set(&new->refcnt, 1); if (new->policy == MPOL_BIND) { int sz = ksize(old->v.zonelist); - new->v.zonelist = kmalloc(sz, SLAB_KERNEL); + new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); if (!new->v.zonelist) { kmem_cache_free(policy_cache, new); return ERR_PTR(-ENOMEM); } - memcpy(new->v.zonelist, old->v.zonelist, sz); } return new; } @@ -1220,6 +1527,30 @@ restart: return 0; } +void mpol_shared_policy_init(struct shared_policy *info, int policy, + nodemask_t *policy_nodes) +{ + info->root = RB_ROOT; + spin_lock_init(&info->lock); + + if (policy != MPOL_DEFAULT) { + struct mempolicy *newpol; + + /* Falls back to MPOL_DEFAULT on any error */ + newpol = mpol_new(policy, policy_nodes); + if (!IS_ERR(newpol)) { + /* Create pseudo-vma that contains just the policy */ + struct vm_area_struct pvma; + + memset(&pvma, 0, sizeof(struct vm_area_struct)); + /* Policy covers entire file */ + pvma.vm_end = TASK_SIZE; + mpol_set_shared_policy(info, &pvma, newpol); + mpol_free(newpol); + } + } +} + int mpol_set_shared_policy(struct shared_policy *info, struct vm_area_struct *vma, struct mempolicy *npol) { @@ -1288,25 +1619,31 @@ void numa_default_policy(void) } /* Migrate a policy to a different set of nodes */ -static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, - const nodemask_t *new) +void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { + nodemask_t *mpolmask; nodemask_t tmp; if (!pol) return; + mpolmask = &pol->cpuset_mems_allowed; + if (nodes_equal(*mpolmask, *newmask)) + return; switch (pol->policy) { case MPOL_DEFAULT: break; case MPOL_INTERLEAVE: - nodes_remap(tmp, pol->v.nodes, *old, *new); + nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); pol->v.nodes = tmp; - current->il_next = node_remap(current->il_next, *old, *new); + *mpolmask = *newmask; + current->il_next = node_remap(current->il_next, + *mpolmask, *newmask); break; case MPOL_PREFERRED: pol->v.preferred_node = node_remap(pol->v.preferred_node, - *old, *new); + *mpolmask, *newmask); + *mpolmask = 
*newmask; break; case MPOL_BIND: { nodemask_t nodes; @@ -1315,8 +1652,8 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, nodes_clear(nodes); for (z = pol->v.zonelist->zones; *z; z++) - node_set((*z)->zone_pgdat->node_id, nodes); - nodes_remap(tmp, nodes, *old, *new); + node_set(zone_to_nid(*z), nodes); + nodes_remap(tmp, nodes, *mpolmask, *newmask); nodes = tmp; zonelist = bind_zonelist(&nodes); @@ -1331,6 +1668,7 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, kfree(pol->v.zonelist); pol->v.zonelist = zonelist; } + *mpolmask = *newmask; break; } default: @@ -1340,12 +1678,234 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, } /* - * Someone moved this task to different nodes. Fixup mempolicies. + * Wrapper for mpol_rebind_policy() that just requires task + * pointer, and updates task mempolicy. + */ + +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +{ + mpol_rebind_policy(tsk->mempolicy, new); +} + +/* + * Rebind each vma in mm to new nodemask. * - * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, - * once we have a cpuset mechanism to mark which cpuset subtree is migrating. + * Call holding a reference to mm. Takes mm->mmap_sem during call. */ -void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) + +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { - rebind_policy(current->mempolicy, old, new); + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + mpol_rebind_policy(vma->vm_policy, new); + up_write(&mm->mmap_sem); } + +/* + * Display pages allocated per node and memory policy via /proc. + */ + +static const char * const policy_types[] = + { "default", "prefer", "bind", "interleave" }; + +/* + * Convert a mempolicy into a string. + * Returns the number of characters in buffer (if positive) + * or an error (negative) + */ +static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) +{ + char *p = buffer; + int l; + nodemask_t nodes; + int mode = pol ? 
pol->policy : MPOL_DEFAULT; + + switch (mode) { + case MPOL_DEFAULT: + nodes_clear(nodes); + break; + + case MPOL_PREFERRED: + nodes_clear(nodes); + node_set(pol->v.preferred_node, nodes); + break; + + case MPOL_BIND: + get_zonemask(pol, &nodes); + break; + + case MPOL_INTERLEAVE: + nodes = pol->v.nodes; + break; + + default: + BUG(); + return -EFAULT; + } + + l = strlen(policy_types[mode]); + if (buffer + maxlen < p + l + 1) + return -ENOSPC; + + strcpy(p, policy_types[mode]); + p += l; + + if (!nodes_empty(nodes)) { + if (buffer + maxlen < p + 2) + return -ENOSPC; + *p++ = '='; + p += nodelist_scnprintf(p, buffer + maxlen - p, nodes); + } + return p - buffer; +} + +struct numa_maps { + unsigned long pages; + unsigned long anon; + unsigned long active; + unsigned long writeback; + unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; + unsigned long node[MAX_NUMNODES]; +}; + +static void gather_stats(struct page *page, void *private, int pte_dirty) +{ + struct numa_maps *md = private; + int count = page_mapcount(page); + + md->pages++; + if (pte_dirty || PageDirty(page)) + md->dirty++; + + if (PageSwapCache(page)) + md->swapcache++; + + if (PageActive(page)) + md->active++; + + if (PageWriteback(page)) + md->writeback++; + + if (PageAnon(page)) + md->anon++; + + if (count > md->mapcount_max) + md->mapcount_max = count; + + md->node[page_to_nid(page)]++; +} + +#ifdef CONFIG_HUGETLB_PAGE +static void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ + unsigned long addr; + struct page *page; + + for (addr = start; addr < end; addr += HPAGE_SIZE) { + pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); + pte_t pte; + + if (!ptep) + continue; + + pte = *ptep; + if (pte_none(pte)) + continue; + + page = pte_page(pte); + if (!page) + continue; + + gather_stats(page, md, pte_dirty(*ptep)); + } +} +#else +static inline void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ +} +#endif + +int show_numa_map(struct seq_file *m, void *v) +{ + struct proc_maps_private *priv = m->private; + struct vm_area_struct *vma = v; + struct numa_maps *md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; + int n; + char buffer[50]; + + if (!mm) + return 0; + + md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); + if (!md) + return 0; + + mpol_to_str(buffer, sizeof(buffer), + get_vma_policy(priv->task, vma, vma->vm_start)); + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_printf(m, " file="); + seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + seq_printf(m, " stack"); + } + + if (is_vm_hugetlb_page(vma)) { + check_huge_range(vma, vma->vm_start, vma->vm_end, md); + seq_printf(m, " huge"); + } else { + check_pgd_range(vma, vma->vm_start, vma->vm_end, + &node_online_map, MPOL_MF_STATS, md); + } + + if (!md->pages) + goto out; + + if (md->anon) + seq_printf(m," anon=%lu",md->anon); + + if (md->dirty) + seq_printf(m," dirty=%lu",md->dirty); + + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m," swapcache=%lu", md->swapcache); + + if (md->active < md->pages 
&& !is_vm_hugetlb_page(vma)) + seq_printf(m," active=%lu", md->active); + + if (md->writeback) + seq_printf(m," writeback=%lu", md->writeback); + + for_each_online_node(n) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); + kfree(md); + + if (m->count < m->size) + m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; + return 0; +} +
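
A minimal user-space sketch of the (source, dest) node-pair selection that the long comment in do_migrate_pages() above describes, under simplifying assumptions: plain unsigned long bitmasks stand in for nodemask_t, node_remap_sim() is a reduced re-creation of node_remap(), and every *_sim helper is an invention of this sketch rather than a kernel interface.

/*
 * Sketch of the pair-selection loop in do_migrate_pages(): scan the
 * remaining source nodes, remember the most recent pair that actually
 * moves (s != d), and break early if the destination is not itself a
 * pending source.
 */
#include <stdio.h>

/* How many set bits of 'mask' lie below bit position 'bit'. */
static int bit_ord_sim(unsigned long mask, int bit)
{
	return __builtin_popcountl(mask & ((1UL << bit) - 1));
}

/* Return the n-th set bit of 'mask', counting n modulo the mask weight. */
static int nth_set_bit_sim(unsigned long mask, int n)
{
	int w = __builtin_popcountl(mask);
	int i, seen = 0;

	if (!w)
		return -1;
	n %= w;
	for (i = 0; i < (int)(8 * sizeof(mask)); i++)
		if ((mask & (1UL << i)) && seen++ == n)
			return i;
	return -1;
}

/* Map node 's' by its ordinal in 'from' onto the corresponding bit of 'to'. */
static int node_remap_sim(int s, unsigned long from, unsigned long to)
{
	return nth_set_bit_sim(to, bit_ord_sim(from, s));
}

int main(void)
{
	unsigned long from = 0x07;		/* source nodes 0, 1, 2 */
	unsigned long to   = 0x38;		/* target nodes 3, 4, 5 */
	unsigned long tmp  = from;		/* sources not yet emptied */

	while (tmp) {
		int s, source = -1, dest = 0;

		for (s = 0; s < (int)(8 * sizeof(tmp)); s++) {
			int d;

			if (!(tmp & (1UL << s)))
				continue;
			d = node_remap_sim(s, from, to);
			if (s == d)
				continue;	/* node would migrate to itself */
			source = s;		/* remember last pair that moved */
			dest = d;
			if (!(tmp & (1UL << dest)))
				break;		/* dest is not a pending source */
		}
		if (source == -1)
			break;			/* nothing left worth migrating */
		tmp &= ~(1UL << source);
		printf("migrate node %d -> node %d\n", source, dest);
	}
	return 0;
}

With from = {0, 1, 2} and to = {3, 4, 5} the sketch prints 0 -> 3, 1 -> 4, 2 -> 5; each remapped destination lies outside the remaining source set, so the early break fires on every pass, matching the "moved to an empty slot" case the kernel comment favors.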