X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fmempolicy.c;h=d76e8eb342d0df2e0468f014076a720866ae2277;hb=d1a78c32edcabea8c7c6449c967191a47f83a77f;hp=4c0510e9e7f660223d069b9578f4b07e92a90f39;hpb=132beacf9785d2e6e8aecb59aa078f3ca5668fa6;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4c0510e..d76e8eb 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -86,6 +86,9 @@ #include #include #include +#include +#include +#include #include #include @@ -95,14 +98,14 @@ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ -static kmem_cache_t *policy_cache; -static kmem_cache_t *sn_cache; +static struct kmem_cache *policy_cache; +static struct kmem_cache *sn_cache; #define PDprintk(fmt...) /* Highest zone. An specific allocation for a zone below that is not policied. */ -int policy_zone = ZONE_DMA; +enum zone_type policy_zone = 0; struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ @@ -129,19 +132,40 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) } return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; } + /* Generate a custom zonelist for the BIND policy. */ static struct zonelist *bind_zonelist(nodemask_t *nodes) { struct zonelist *zl; int num, max, nd; + enum zone_type k; max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); - zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); + max++; /* space for zlcache_ptr (see mmzone.h) */ + zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); if (!zl) - return NULL; + return ERR_PTR(-ENOMEM); + zl->zlcache_ptr = NULL; num = 0; - for_each_node_mask(nd, *nodes) - zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; + /* First put in the highest zones from all nodes, then all the next + lower zones etc. Avoid empty zones because the memory allocator + doesn't like them. If you implement node hot removal you + have to fix that. 
*/ + k = policy_zone; + while (1) { + for_each_node_mask(nd, *nodes) { + struct zone *z = &NODE_DATA(nd)->node_zones[k]; + if (z->present_pages > 0) + zl->zones[num++] = z; + } + if (k == 0) + break; + k--; + } + if (num == 0) { + kfree(zl); + return ERR_PTR(-EINVAL); + } zl->zones[num] = NULL; return zl; } @@ -173,65 +197,21 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) break; case MPOL_BIND: policy->v.zonelist = bind_zonelist(nodes); - if (policy->v.zonelist == NULL) { + if (IS_ERR(policy->v.zonelist)) { + void *error_code = policy->v.zonelist; kmem_cache_free(policy_cache, policy); - return ERR_PTR(-ENOMEM); + return error_code; } break; } policy->policy = mode; + policy->cpuset_mems_allowed = cpuset_mems_allowed(current); return policy; } -/* Check if we are the only process mapping the page in question */ -static inline int single_mm_mapping(struct mm_struct *mm, - struct address_space *mapping) -{ - struct vm_area_struct *vma; - struct prio_tree_iter iter; - int rc = 1; - - spin_lock(&mapping->i_mmap_lock); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } -out: - spin_unlock(&mapping->i_mmap_lock); - return rc; -} - -/* - * Add a page to be migrated to the pagelist - */ -static void migrate_page_add(struct vm_area_struct *vma, - struct page *page, struct list_head *pagelist, unsigned long flags) -{ - /* - * Avoid migrating a page that is shared by others and not writable. - */ - if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || - mapping_writably_mapped(page->mapping) || - single_mm_mapping(vma->vm_mm, page->mapping)) { - int rc = isolate_lru_page(page); - - if (rc == 1) - list_add(&page->lru, pagelist); - /* - * If the isolate attempt was not successful then we just - * encountered an unswappable page. Something must be wrong. - */ - WARN_ON(rc == 0); - } -} - -static void gather_stats(struct page *, void *); +static void gather_stats(struct page *, void *, int pte_dirty); +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags); /* Scan through pages checking if pages follow certain conditions. */ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, @@ -246,24 +226,34 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { struct page *page; - unsigned int nid; + int nid; if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); if (!page) continue; + /* + * The check for PageReserved here is important to avoid + * handling zero pages and other pages that may have been + * marked special by the system. + * + * If the PageReserved would not be checked here then f.e. + * the location of the zero page could have an influence + * on MPOL_MF_STRICT, zero pages would be counted for + * the per node stats, and there would be useless attempts + * to put zero pages on the migration list. 
+ */ + if (PageReserved(page)) + continue; nid = page_to_nid(page); if (node_isset(nid, *nodes) == !!(flags & MPOL_MF_INVERT)) continue; if (flags & MPOL_MF_STATS) - gather_stats(page, private); - else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - spin_unlock(ptl); - migrate_page_add(vma, page, private, flags); - spin_lock(ptl); - } + gather_stats(page, private, pte_dirty(*pte)); + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(page, private, flags); else break; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -331,15 +321,6 @@ static inline int check_pgd_range(struct vm_area_struct *vma, return 0; } -/* Check if a vma is migratable */ -static inline int vma_migratable(struct vm_area_struct *vma) -{ - if (vma->vm_flags & ( - VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP)) - return 0; - return 1; -} - /* * Check if all pages in a range are on a set of nodes. * If pagelist != NULL then isolate pages from the LRU and @@ -352,6 +333,13 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, int err; struct vm_area_struct *first, *vma, *prev; + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + + err = migrate_prep(); + if (err) + return ERR_PTR(err); + } + first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); @@ -433,95 +421,41 @@ static int contextualize_policy(int mode, nodemask_t *nodes) if (!nodes) return 0; - /* Update current mems_allowed */ - cpuset_update_current_mems_allowed(); - /* Ignore nodes not set in current->mems_allowed */ - cpuset_restrict_to_mems_allowed(nodes->bits); + cpuset_update_task_memory_state(); + if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) + return -EINVAL; return mpol_check_policy(mode, nodes); } -static int swap_pages(struct list_head *pagelist) -{ - LIST_HEAD(moved); - LIST_HEAD(failed); - int n; - n = migrate_pages(pagelist, NULL, &moved, &failed); - putback_lru_pages(&failed); - putback_lru_pages(&moved); +/* + * Update task->flags PF_MEMPOLICY bit: set iff non-default + * mempolicy. Allows more rapid checking of this (combined perhaps + * with other PF_* flag bits) on memory allocation hot code paths. + * + * If called from outside this file, the task 'p' should -only- be + * a newly forked child not yet visible on the task list, because + * manipulating the task flags of a visible task is not safe. + * + * The above limitation is why this routine has the funny name + * mpol_fix_fork_child_flag(). + * + * It is also safe to call this with a task pointer of current, + * which the static wrapper mpol_set_task_struct_flag() does, + * for use within this file. 
+ */ - return n; +void mpol_fix_fork_child_flag(struct task_struct *p) +{ + if (p->mempolicy) + p->flags |= PF_MEMPOLICY; + else + p->flags &= ~PF_MEMPOLICY; } -long do_mbind(unsigned long start, unsigned long len, - unsigned long mode, nodemask_t *nmask, unsigned long flags) +static void mpol_set_task_struct_flag(void) { - struct vm_area_struct *vma; - struct mm_struct *mm = current->mm; - struct mempolicy *new; - unsigned long end; - int err; - LIST_HEAD(pagelist); - - if ((flags & ~(unsigned long)(MPOL_MF_STRICT | - MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) - || mode > MPOL_MAX) - return -EINVAL; - if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) - return -EPERM; - - if (start & ~PAGE_MASK) - return -EINVAL; - - if (mode == MPOL_DEFAULT) - flags &= ~MPOL_MF_STRICT; - - len = (len + PAGE_SIZE - 1) & PAGE_MASK; - end = start + len; - - if (end < start) - return -EINVAL; - if (end == start) - return 0; - - if (mpol_check_policy(mode, nmask)) - return -EINVAL; - - new = mpol_new(mode, nmask); - if (IS_ERR(new)) - return PTR_ERR(new); - - /* - * If we are using the default policy then operation - * on discontinuous address spaces is okay after all - */ - if (!new) - flags |= MPOL_MF_DISCONTIG_OK; - - PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes_addr(nodes)[0]); - - down_write(&mm->mmap_sem); - vma = check_range(mm, start, end, nmask, - flags | MPOL_MF_INVERT, &pagelist); - - err = PTR_ERR(vma); - if (!IS_ERR(vma)) { - int nr_failed = 0; - - err = mbind_range(vma, start, end, new); - if (!list_empty(&pagelist)) - nr_failed = swap_pages(&pagelist); - - if (!err && nr_failed && (flags & MPOL_MF_STRICT)) - err = -EIO; - } - if (!list_empty(&pagelist)) - putback_lru_pages(&pagelist); - - up_write(&mm->mmap_sem); - mpol_free(new); - return err; + mpol_fix_fork_child_flag(current); } /* Set the process memory policy */ @@ -536,6 +470,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes) return PTR_ERR(new); mpol_free(current->mempolicy); current->mempolicy = new; + mpol_set_task_struct_flag(); if (new && new->policy == MPOL_INTERLEAVE) current->il_next = first_node(new->v.nodes); return 0; @@ -550,7 +485,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) switch (p->policy) { case MPOL_BIND: for (i = 0; p->v.zonelist->zones[i]; i++) - node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, + node_set(zone_to_nid(p->v.zonelist->zones[i]), *nodes); break; case MPOL_DEFAULT: @@ -592,7 +527,7 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, struct vm_area_struct *vma = NULL; struct mempolicy *pol = current->mempolicy; - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) return -EINVAL; if (flags & MPOL_F_ADDR) { @@ -643,11 +578,50 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +#ifdef CONFIG_MIGRATION +/* + * page migration + */ +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ + /* + * Avoid migrating a page that is shared with others. + */ + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) + isolate_lru_page(page, pagelist); +} + +static struct page *new_node_page(struct page *page, unsigned long node, int **x) +{ + return alloc_pages_node(node, GFP_HIGHUSER, 0); +} + +/* + * Migrate pages from one node to a target node. + * Returns error or the number of pages not migrated. 
+ */ +int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) +{ + nodemask_t nmask; + LIST_HEAD(pagelist); + int err = 0; + + nodes_clear(nmask); + node_set(source, nmask); + + check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); + + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_node_page, dest); + + return err; +} + /* - * For now migrate_pages simply swaps out the pages from nodes that are in - * the source set but not in the target set. In the future, we would - * want a function that moves pages between the two nodesets in such - * a way as to preserve the physical layout as much as possible. + * Move pages between the two nodesets so as to preserve the physical + * layout as much as possible. * * Returns the number of page that could not be moved. */ @@ -655,22 +629,177 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) { LIST_HEAD(pagelist); - int count = 0; - nodemask_t nodes; + int busy = 0; + int err = 0; + nodemask_t tmp; - nodes_andnot(nodes, *from_nodes, *to_nodes); + down_read(&mm->mmap_sem); - down_read(&mm->mmap_sem); - check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, - flags | MPOL_MF_DISCONTIG_OK, &pagelist); + err = migrate_vmas(mm, from_nodes, to_nodes, flags); + if (err) + goto out; - if (!list_empty(&pagelist)) { - count = swap_pages(&pagelist); - putback_lru_pages(&pagelist); - } +/* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scannng from_tmp, we at least have the + * most recent pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ + tmp = *from_nodes; + while (!nodes_empty(tmp)) { + int s,d; + int source = -1; + int dest = 0; + + for_each_node_mask(s, tmp) { + d = node_remap(s, *from_nodes, *to_nodes); + if (s == d) + continue; + + source = s; /* Node moved. Memorize */ + dest = d; + + /* dest not in remaining from nodes? 
*/ + if (!node_isset(dest, tmp)) + break; + } + if (source == -1) + break; + + node_clear(source, tmp); + err = migrate_to_node(mm, source, dest, flags); + if (err > 0) + busy += err; + if (err < 0) + break; + } +out: up_read(&mm->mmap_sem); - return count; + if (err < 0) + return err; + return busy; + +} + +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +{ + struct vm_area_struct *vma = (struct vm_area_struct *)private; + + return alloc_page_vma(GFP_HIGHUSER, vma, page_address_in_vma(page, vma)); +} +#else + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ +} + +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + return -ENOSYS; +} + +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +{ + return NULL; +} +#endif + +long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + struct mempolicy *new; + unsigned long end; + int err; + LIST_HEAD(pagelist); + + if ((flags & ~(unsigned long)(MPOL_MF_STRICT | + MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + || mode > MPOL_MAX) + return -EINVAL; + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) + return -EPERM; + + if (start & ~PAGE_MASK) + return -EINVAL; + + if (mode == MPOL_DEFAULT) + flags &= ~MPOL_MF_STRICT; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + + if (mpol_check_policy(mode, nmask)) + return -EINVAL; + + new = mpol_new(mode, nmask); + if (IS_ERR(new)) + return PTR_ERR(new); + + /* + * If we are using the default policy then operation + * on discontinuous address spaces is okay after all + */ + if (!new) + flags |= MPOL_MF_DISCONTIG_OK; + + PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode,nodes_addr(nodes)[0]); + + down_write(&mm->mmap_sem); + vma = check_range(mm, start, end, nmask, + flags | MPOL_MF_INVERT, &pagelist); + + err = PTR_ERR(vma); + if (!IS_ERR(vma)) { + int nr_failed = 0; + + err = mbind_range(vma, start, end, new); + + if (!list_empty(&pagelist)) + nr_failed = migrate_pages(&pagelist, new_vma_page, + (unsigned long)vma); + + if (!err && nr_failed && (flags & MPOL_MF_STRICT)) + err = -EIO; + } + + up_write(&mm->mmap_sem); + mpol_free(new); + return err; } /* @@ -689,6 +818,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; + if (maxnode > PAGE_SIZE*BITS_PER_BYTE) + return -EINVAL; nlongs = BITS_TO_LONGS(maxnode); if ((maxnode % BITS_PER_LONG) == 0) @@ -749,6 +880,10 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, err = get_nodes(&nodes, nmask, maxnode); if (err) return err; +#ifdef CONFIG_CPUSETS + /* Restrict the nodes to the allowed nodes in the cpuset */ + nodes_and(nodes, nodes, current->mems_allowed); +#endif return do_mbind(start, len, mode, &nodes, flags); } @@ -767,9 +902,6 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, return do_set_mempolicy(mode, &nodes); } -/* Macro needed until Paul implements this function in kernel/cpusets.c */ -#define cpuset_mems_allowed(task) node_online_map - asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, const unsigned long __user *old_nodes, const unsigned long __user *new_nodes) @@ -805,24 +937,29 @@ 
asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, /* * Check if this process has the right to modify the specified * process. The right exists if the process has administrative - * capabilities, superuser priviledges or the same + * capabilities, superuser privileges or the same * userid as the target process. */ if ((current->euid != task->suid) && (current->euid != task->uid) && (current->uid != task->suid) && (current->uid != task->uid) && - !capable(CAP_SYS_ADMIN)) { + !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; } task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ - if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { + if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; } - err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); + err = security_task_movememory(task); + if (err) + goto out; + + err = do_migrate_pages(mm, &old, &new, + capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); out: mmput(mm); return err; @@ -935,8 +1072,8 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif /* Return effective policy for a VMA */ -struct mempolicy * -get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) +static struct mempolicy * get_vma_policy(struct task_struct *task, + struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; @@ -995,6 +1132,35 @@ static unsigned interleave_nodes(struct mempolicy *policy) return nid; } +/* + * Depending on the memory policy provide a node from which to allocate the + * next slab entry. + */ +unsigned slab_node(struct mempolicy *policy) +{ + int pol = policy ? policy->policy : MPOL_DEFAULT; + + switch (pol) { + case MPOL_INTERLEAVE: + return interleave_nodes(policy); + + case MPOL_BIND: + /* + * Follow bind policy behavior and start allocation at the + * first node. + */ + return zone_to_nid(policy->v.zonelist->zones[0]); + + case MPOL_PREFERRED: + if (policy->v.preferred_node >= 0) + return policy->v.preferred_node; + /* Fall through */ + + default: + return numa_node_id(); + } +} + /* Do static interleaving for a VMA with known offset. */ static unsigned offset_il_node(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long off) @@ -1019,13 +1185,22 @@ static inline unsigned interleave_nid(struct mempolicy *pol, if (vma) { unsigned long off; - off = vma->vm_pgoff; + /* + * for small pages, there is no difference between + * shift and PAGE_SHIFT, so the bit-shift is safe. + * for huge pages, since vm_pgoff is in units of small + * pages, we need to shift off the always 0 bits to get + * a useful offset. + */ + BUG_ON(shift < PAGE_SHIFT); + off = vma->vm_pgoff >> (shift - PAGE_SHIFT); off += (addr - vma->vm_start) >> shift; return offset_il_node(pol, vma, off); } else return interleave_nodes(pol); } +#ifdef CONFIG_HUGETLBFS /* Return a zonelist suitable for a huge page allocation. */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) { @@ -1039,6 +1214,7 @@ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) } return zonelist_policy(GFP_HIGHUSER, pol); } +#endif /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. 
*/ @@ -1050,10 +1226,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zl->zones[0]) { - zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; - put_cpu(); - } + if (page && page_zone(page) == zl->zones[0]) + inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -1084,7 +1258,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); - cpuset_update_current_mems_allowed(); + cpuset_update_task_memory_state(); if (unlikely(pol->policy == MPOL_INTERLEAVE)) { unsigned nid; @@ -1110,7 +1284,7 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) * interrupt context and apply the current process NUMA policy. * Returns NULL when no page can be allocated. * - * Don't call cpuset_update_current_mems_allowed() unless + * Don't call cpuset_update_task_memory_state() unless * 1) it's ok to take cpuset_sem (can WAIT), and * 2) allocating for current task (not interrupt). */ @@ -1119,8 +1293,8 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) struct mempolicy *pol = current->mempolicy; if ((gfp & __GFP_WAIT) && !in_interrupt()) - cpuset_update_current_mems_allowed(); - if (!pol || in_interrupt()) + cpuset_update_task_memory_state(); + if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; if (pol->policy == MPOL_INTERLEAVE) return alloc_page_interleave(gfp, order, interleave_nodes(pol)); @@ -1128,6 +1302,15 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) } EXPORT_SYMBOL(alloc_pages_current); +/* + * If mpol_copy() sees current->cpuset == cpuset_being_rebound, then it + * rebinds the mempolicy its copying by calling mpol_rebind_policy() + * with the mems_allowed returned by cpuset_mems_allowed(). This + * keeps mempolicies cpuset relative after its cpuset moves. See + * further kernel/cpuset.c update_nodemask(). 
+ */ +void *cpuset_being_rebound; + /* Slow path of a mempolicy copy */ struct mempolicy *__mpol_copy(struct mempolicy *old) { @@ -1135,16 +1318,19 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) if (!new) return ERR_PTR(-ENOMEM); + if (current_cpuset_is_being_rebound()) { + nodemask_t mems = cpuset_mems_allowed(current); + mpol_rebind_policy(old, &mems); + } *new = *old; atomic_set(&new->refcnt, 1); if (new->policy == MPOL_BIND) { int sz = ksize(old->v.zonelist); - new->v.zonelist = kmalloc(sz, SLAB_KERNEL); + new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); if (!new->v.zonelist) { kmem_cache_free(policy_cache, new); return ERR_PTR(-ENOMEM); } - memcpy(new->v.zonelist, old->v.zonelist, sz); } return new; } @@ -1341,6 +1527,30 @@ restart: return 0; } +void mpol_shared_policy_init(struct shared_policy *info, int policy, + nodemask_t *policy_nodes) +{ + info->root = RB_ROOT; + spin_lock_init(&info->lock); + + if (policy != MPOL_DEFAULT) { + struct mempolicy *newpol; + + /* Falls back to MPOL_DEFAULT on any error */ + newpol = mpol_new(policy, policy_nodes); + if (!IS_ERR(newpol)) { + /* Create pseudo-vma that contains just the policy */ + struct vm_area_struct pvma; + + memset(&pvma, 0, sizeof(struct vm_area_struct)); + /* Policy covers entire file */ + pvma.vm_end = TASK_SIZE; + mpol_set_shared_policy(info, &pvma, newpol); + mpol_free(newpol); + } + } +} + int mpol_set_shared_policy(struct shared_policy *info, struct vm_area_struct *vma, struct mempolicy *npol) { @@ -1409,25 +1619,31 @@ void numa_default_policy(void) } /* Migrate a policy to a different set of nodes */ -static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, - const nodemask_t *new) +void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) { + nodemask_t *mpolmask; nodemask_t tmp; if (!pol) return; + mpolmask = &pol->cpuset_mems_allowed; + if (nodes_equal(*mpolmask, *newmask)) + return; switch (pol->policy) { case MPOL_DEFAULT: break; case MPOL_INTERLEAVE: - nodes_remap(tmp, pol->v.nodes, *old, *new); + nodes_remap(tmp, pol->v.nodes, *mpolmask, *newmask); pol->v.nodes = tmp; - current->il_next = node_remap(current->il_next, *old, *new); + *mpolmask = *newmask; + current->il_next = node_remap(current->il_next, + *mpolmask, *newmask); break; case MPOL_PREFERRED: pol->v.preferred_node = node_remap(pol->v.preferred_node, - *old, *new); + *mpolmask, *newmask); + *mpolmask = *newmask; break; case MPOL_BIND: { nodemask_t nodes; @@ -1436,8 +1652,8 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, nodes_clear(nodes); for (z = pol->v.zonelist->zones; *z; z++) - node_set((*z)->zone_pgdat->node_id, nodes); - nodes_remap(tmp, nodes, *old, *new); + node_set(zone_to_nid(*z), nodes); + nodes_remap(tmp, nodes, *mpolmask, *newmask); nodes = tmp; zonelist = bind_zonelist(&nodes); @@ -1447,11 +1663,12 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT. */ - if (zonelist) { + if (!IS_ERR(zonelist)) { /* Good - got mem - substitute new zonelist */ kfree(pol->v.zonelist); pol->v.zonelist = zonelist; } + *mpolmask = *newmask; break; } default: @@ -1461,22 +1678,37 @@ static void rebind_policy(struct mempolicy *pol, const nodemask_t *old, } /* - * Someone moved this task to different nodes. Fixup mempolicies. + * Wrapper for mpol_rebind_policy() that just requires task + * pointer, and updates task mempolicy. 
+ */ + +void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) +{ + mpol_rebind_policy(tsk->mempolicy, new); +} + +/* + * Rebind each vma in mm to new nodemask. * - * TODO - fixup current->mm->vma and shmfs/tmpfs/hugetlbfs policies as well, - * once we have a cpuset mechanism to mark which cpuset subtree is migrating. + * Call holding a reference to mm. Takes mm->mmap_sem during call. */ -void numa_policy_rebind(const nodemask_t *old, const nodemask_t *new) + +void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { - rebind_policy(current->mempolicy, old, new); + struct vm_area_struct *vma; + + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) + mpol_rebind_policy(vma->vm_policy, new); + up_write(&mm->mmap_sem); } /* * Display pages allocated per node and memory policy via /proc. */ -static const char *policy_types[] = { "default", "prefer", "bind", - "interleave" }; +static const char * const policy_types[] = + { "default", "prefer", "bind", "interleave" }; /* * Convert a mempolicy into a string. @@ -1532,70 +1764,148 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) struct numa_maps { unsigned long pages; unsigned long anon; - unsigned long mapped; + unsigned long active; + unsigned long writeback; unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; unsigned long node[MAX_NUMNODES]; }; -static void gather_stats(struct page *page, void *private) +static void gather_stats(struct page *page, void *private, int pte_dirty) { struct numa_maps *md = private; int count = page_mapcount(page); - if (count) - md->mapped++; + md->pages++; + if (pte_dirty || PageDirty(page)) + md->dirty++; - if (count > md->mapcount_max) - md->mapcount_max = count; + if (PageSwapCache(page)) + md->swapcache++; - md->pages++; + if (PageActive(page)) + md->active++; + + if (PageWriteback(page)) + md->writeback++; if (PageAnon(page)) md->anon++; + if (count > md->mapcount_max) + md->mapcount_max = count; + md->node[page_to_nid(page)]++; - cond_resched(); } +#ifdef CONFIG_HUGETLB_PAGE +static void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ + unsigned long addr; + struct page *page; + + for (addr = start; addr < end; addr += HPAGE_SIZE) { + pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); + pte_t pte; + + if (!ptep) + continue; + + pte = *ptep; + if (pte_none(pte)) + continue; + + page = pte_page(pte); + if (!page) + continue; + + gather_stats(page, md, pte_dirty(*ptep)); + } +} +#else +static inline void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ +} +#endif + int show_numa_map(struct seq_file *m, void *v) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; struct numa_maps *md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; int n; char buffer[50]; - if (!vma->vm_mm) + if (!mm) return 0; md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); if (!md) return 0; - check_pgd_range(vma, vma->vm_start, vma->vm_end, - &node_online_map, MPOL_MF_STATS, md); + mpol_to_str(buffer, sizeof(buffer), + get_vma_policy(priv->task, vma, vma->vm_start)); + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_printf(m, " file="); + seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + 
seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + seq_printf(m, " stack"); + } - if (md->pages) { - mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(task, vma, vma->vm_start)); + if (is_vm_hugetlb_page(vma)) { + check_huge_range(vma, vma->vm_start, vma->vm_end, md); + seq_printf(m, " huge"); + } else { + check_pgd_range(vma, vma->vm_start, vma->vm_end, + &node_online_map, MPOL_MF_STATS, md); + } - seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", - vma->vm_start, buffer, md->pages, - md->mapped, md->mapcount_max); + if (!md->pages) + goto out; - if (md->anon) - seq_printf(m," anon=%lu",md->anon); + if (md->anon) + seq_printf(m," anon=%lu",md->anon); - for_each_online_node(n) - if (md->node[n]) - seq_printf(m, " N%d=%lu", n, md->node[n]); + if (md->dirty) + seq_printf(m," dirty=%lu",md->dirty); - seq_putc(m, '\n'); - } + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m," swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m," active=%lu", md->active); + + if (md->writeback) + seq_printf(m," writeback=%lu", md->writeback); + + for_each_online_node(n) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); kfree(md); if (m->count < m->size) - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; + m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; return 0; }