X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fmempolicy.c;h=83c69f8a64c29dd0878050b3912e5f0c2b1af3ac;hb=8c0863403f109a43d7000b4646da4818220d501f;hp=3171f884d2459a30ad113d9008b82b04d833671c;hpb=7339ff8302fd70aabf5f1ae26e0c4905fa74a495;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 3171f88..83c69f8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -72,20 +72,23 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include #include -#include #include #include #include +#include +#include +#include +#include #include #include @@ -95,20 +98,21 @@ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ #define MPOL_MF_STATS (MPOL_MF_INTERNAL << 2) /* Gather statistics */ -static kmem_cache_t *policy_cache; -static kmem_cache_t *sn_cache; - -#define PDprintk(fmt...) +static struct kmem_cache *policy_cache; +static struct kmem_cache *sn_cache; /* Highest zone. An specific allocation for a zone below that is not policied. */ -int policy_zone = ZONE_DMA; +enum zone_type policy_zone = 0; struct mempolicy default_policy = { .refcnt = ATOMIC_INIT(1), /* never free it */ .policy = MPOL_DEFAULT, }; +static void mpol_rebind_policy(struct mempolicy *pol, + const nodemask_t *newmask); + /* Do sanity checking on a policy */ static int mpol_check_policy(int mode, nodemask_t *nodes) { @@ -127,21 +131,42 @@ static int mpol_check_policy(int mode, nodemask_t *nodes) return -EINVAL; break; } - return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; + return nodes_subset(*nodes, node_states[N_HIGH_MEMORY]) ? 0 : -EINVAL; } + /* Generate a custom zonelist for the BIND policy. */ static struct zonelist *bind_zonelist(nodemask_t *nodes) { struct zonelist *zl; int num, max, nd; + enum zone_type k; max = 1 + MAX_NR_ZONES * nodes_weight(*nodes); - zl = kmalloc(sizeof(void *) * max, GFP_KERNEL); + max++; /* space for zlcache_ptr (see mmzone.h) */ + zl = kmalloc(sizeof(struct zone *) * max, GFP_KERNEL); if (!zl) - return NULL; + return ERR_PTR(-ENOMEM); + zl->zlcache_ptr = NULL; num = 0; - for_each_node_mask(nd, *nodes) - zl->zones[num++] = &NODE_DATA(nd)->node_zones[policy_zone]; + /* First put in the highest zones from all nodes, then all the next + lower zones etc. Avoid empty zones because the memory allocator + doesn't like them. If you implement node hot removal you + have to fix that. */ + k = MAX_NR_ZONES - 1; + while (1) { + for_each_node_mask(nd, *nodes) { + struct zone *z = &NODE_DATA(nd)->node_zones[k]; + if (z->present_pages > 0) + zl->zones[num++] = z; + } + if (k == 0) + break; + k--; + } + if (num == 0) { + kfree(zl); + return ERR_PTR(-EINVAL); + } zl->zones[num] = NULL; return zl; } @@ -151,7 +176,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) { struct mempolicy *policy; - PDprintk("setting mode %d nodes[0] %lx\n", mode, nodes_addr(*nodes)[0]); + pr_debug("setting mode %d nodes[0] %lx\n", + mode, nodes ? nodes_addr(*nodes)[0] : -1); + if (mode == MPOL_DEFAULT) return NULL; policy = kmem_cache_alloc(policy_cache, GFP_KERNEL); @@ -161,7 +188,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) switch (mode) { case MPOL_INTERLEAVE: policy->v.nodes = *nodes; - if (nodes_weight(*nodes) == 0) { + nodes_and(policy->v.nodes, policy->v.nodes, + node_states[N_HIGH_MEMORY]); + if (nodes_weight(policy->v.nodes) == 0) { kmem_cache_free(policy_cache, policy); return ERR_PTR(-EINVAL); } @@ -173,9 +202,10 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) break; case MPOL_BIND: policy->v.zonelist = bind_zonelist(nodes); - if (policy->v.zonelist == NULL) { + if (IS_ERR(policy->v.zonelist)) { + void *error_code = policy->v.zonelist; kmem_cache_free(policy_cache, policy); - return ERR_PTR(-ENOMEM); + return error_code; } break; } @@ -184,9 +214,9 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) return policy; } -static void gather_stats(struct page *, void *); -static void migrate_page_add(struct vm_area_struct *vma, - struct page *page, struct list_head *pagelist, unsigned long flags); +static void gather_stats(struct page *, void *, int pte_dirty); +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags); /* Scan through pages checking if pages follow certain conditions. */ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, @@ -201,13 +231,24 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { struct page *page; - unsigned int nid; + int nid; if (!pte_present(*pte)) continue; page = vm_normal_page(vma, addr, *pte); if (!page) continue; + /* + * The check for PageReserved here is important to avoid + * handling zero pages and other pages that may have been + * marked special by the system. + * + * If the PageReserved would not be checked here then f.e. + * the location of the zero page could have an influence + * on MPOL_MF_STRICT, zero pages would be counted for + * the per node stats, and there would be useless attempts + * to put zero pages on the migration list. + */ if (PageReserved(page)) continue; nid = page_to_nid(page); @@ -215,12 +256,9 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, continue; if (flags & MPOL_MF_STATS) - gather_stats(page, private); - else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { - spin_unlock(ptl); - migrate_page_add(vma, page, private, flags); - spin_lock(ptl); - } + gather_stats(page, private, pte_dirty(*pte)); + else if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) + migrate_page_add(page, private, flags); else break; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -288,15 +326,6 @@ static inline int check_pgd_range(struct vm_area_struct *vma, return 0; } -/* Check if a vma is migratable */ -static inline int vma_migratable(struct vm_area_struct *vma) -{ - if (vma->vm_flags & ( - VM_LOCKED|VM_IO|VM_HUGETLB|VM_PFNMAP|VM_RESERVED)) - return 0; - return 1; -} - /* * Check if all pages in a range are on a set of nodes. * If pagelist != NULL then isolate pages from the LRU and @@ -309,6 +338,13 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, int err; struct vm_area_struct *first, *vma, *prev; + if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) { + + err = migrate_prep(); + if (err) + return ERR_PTR(err); + } + first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); @@ -348,7 +384,7 @@ static int policy_vma(struct vm_area_struct *vma, struct mempolicy *new) int err = 0; struct mempolicy *old = vma->vm_policy; - PDprintk("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", + pr_debug("vma %lx-%lx/%lx vm_ops %p vm_file %p set_policy %p\n", vma->vm_start, vma->vm_end, vma->vm_pgoff, vma->vm_ops, vma->vm_file, vma->vm_ops ? vma->vm_ops->set_policy : NULL); @@ -396,8 +432,39 @@ static int contextualize_policy(int mode, nodemask_t *nodes) return mpol_check_policy(mode, nodes); } + +/* + * Update task->flags PF_MEMPOLICY bit: set iff non-default + * mempolicy. Allows more rapid checking of this (combined perhaps + * with other PF_* flag bits) on memory allocation hot code paths. + * + * If called from outside this file, the task 'p' should -only- be + * a newly forked child not yet visible on the task list, because + * manipulating the task flags of a visible task is not safe. + * + * The above limitation is why this routine has the funny name + * mpol_fix_fork_child_flag(). + * + * It is also safe to call this with a task pointer of current, + * which the static wrapper mpol_set_task_struct_flag() does, + * for use within this file. + */ + +void mpol_fix_fork_child_flag(struct task_struct *p) +{ + if (p->mempolicy) + p->flags |= PF_MEMPOLICY; + else + p->flags &= ~PF_MEMPOLICY; +} + +static void mpol_set_task_struct_flag(void) +{ + mpol_fix_fork_child_flag(current); +} + /* Set the process memory policy */ -long do_set_mempolicy(int mode, nodemask_t *nodes) +static long do_set_mempolicy(int mode, nodemask_t *nodes) { struct mempolicy *new; @@ -408,6 +475,7 @@ long do_set_mempolicy(int mode, nodemask_t *nodes) return PTR_ERR(new); mpol_free(current->mempolicy); current->mempolicy = new; + mpol_set_task_struct_flag(); if (new && new->policy == MPOL_INTERLEAVE) current->il_next = first_node(new->v.nodes); return 0; @@ -422,7 +490,7 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) switch (p->policy) { case MPOL_BIND: for (i = 0; p->v.zonelist->zones[i]; i++) - node_set(p->v.zonelist->zones[i]->zone_pgdat->node_id, + node_set(zone_to_nid(p->v.zonelist->zones[i]), *nodes); break; case MPOL_DEFAULT: @@ -431,9 +499,9 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) *nodes = p->v.nodes; break; case MPOL_PREFERRED: - /* or use current node instead of online map? */ + /* or use current node instead of memory_map? */ if (p->v.preferred_node < 0) - *nodes = node_online_map; + *nodes = node_states[N_HIGH_MEMORY]; else node_set(p->v.preferred_node, *nodes); break; @@ -456,8 +524,8 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) } /* Retrieve NUMA policy */ -long do_get_mempolicy(int *policy, nodemask_t *nmask, - unsigned long addr, unsigned long flags) +static long do_get_mempolicy(int *policy, nodemask_t *nmask, + unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; @@ -465,8 +533,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, struct mempolicy *pol = current->mempolicy; cpuset_update_task_memory_state(); - if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) + if (flags & + ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; + + if (flags & MPOL_F_MEMS_ALLOWED) { + if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) + return -EINVAL; + *policy = 0; /* just so it's initialized */ + *nmask = cpuset_current_mems_allowed; + return 0; + } + if (flags & MPOL_F_ADDR) { down_read(&mm->mmap_sem); vma = find_vma_intersection(mm, addr, addr+1); @@ -515,76 +593,51 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, return err; } +#ifdef CONFIG_MIGRATION /* * page migration */ - -/* Check if we are the only process mapping the page in question */ -static inline int single_mm_mapping(struct mm_struct *mm, - struct address_space *mapping) +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) { - struct vm_area_struct *vma; - struct prio_tree_iter iter; - int rc = 1; + /* + * Avoid migrating a page that is shared with others. + */ + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) + isolate_lru_page(page, pagelist); +} - spin_lock(&mapping->i_mmap_lock); - vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, 0, ULONG_MAX) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) - if (mm != vma->vm_mm) { - rc = 0; - goto out; - } -out: - spin_unlock(&mapping->i_mmap_lock); - return rc; +static struct page *new_node_page(struct page *page, unsigned long node, int **x) +{ + return alloc_pages_node(node, GFP_HIGHUSER_MOVABLE, 0); } /* - * Add a page to be migrated to the pagelist + * Migrate pages from one node to a target node. + * Returns error or the number of pages not migrated. */ -static void migrate_page_add(struct vm_area_struct *vma, - struct page *page, struct list_head *pagelist, unsigned long flags) +static int migrate_to_node(struct mm_struct *mm, int source, int dest, + int flags) { - /* - * Avoid migrating a page that is shared by others and not writable. - */ - if ((flags & MPOL_MF_MOVE_ALL) || !page->mapping || PageAnon(page) || - mapping_writably_mapped(page->mapping) || - single_mm_mapping(vma->vm_mm, page->mapping)) { - int rc = isolate_lru_page(page); + nodemask_t nmask; + LIST_HEAD(pagelist); + int err = 0; - if (rc == 1) - list_add(&page->lru, pagelist); - /* - * If the isolate attempt was not successful then we just - * encountered an unswappable page. Something must be wrong. - */ - WARN_ON(rc == 0); - } -} + nodes_clear(nmask); + node_set(source, nmask); -static int swap_pages(struct list_head *pagelist) -{ - LIST_HEAD(moved); - LIST_HEAD(failed); - int n; + check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nmask, + flags | MPOL_MF_DISCONTIG_OK, &pagelist); - n = migrate_pages(pagelist, NULL, &moved, &failed); - putback_lru_pages(&failed); - putback_lru_pages(&moved); + if (!list_empty(&pagelist)) + err = migrate_pages(&pagelist, new_node_page, dest); - return n; + return err; } /* - * For now migrate_pages simply swaps out the pages from nodes that are in - * the source set but not in the target set. In the future, we would - * want a function that moves pages between the two nodesets in such - * a way as to preserve the physical layout as much as possible. + * Move pages between the two nodesets so as to preserve the physical + * layout as much as possible. * * Returns the number of page that could not be moved. */ @@ -592,26 +645,129 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) { LIST_HEAD(pagelist); - int count = 0; - nodemask_t nodes; + int busy = 0; + int err = 0; + nodemask_t tmp; - nodes_andnot(nodes, *from_nodes, *to_nodes); + down_read(&mm->mmap_sem); - down_read(&mm->mmap_sem); - check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes, - flags | MPOL_MF_DISCONTIG_OK, &pagelist); + err = migrate_vmas(mm, from_nodes, to_nodes, flags); + if (err) + goto out; - if (!list_empty(&pagelist)) { - count = swap_pages(&pagelist); - putback_lru_pages(&pagelist); - } +/* + * Find a 'source' bit set in 'tmp' whose corresponding 'dest' + * bit in 'to' is not also set in 'tmp'. Clear the found 'source' + * bit in 'tmp', and return that pair for migration. + * The pair of nodemasks 'to' and 'from' define the map. + * + * If no pair of bits is found that way, fallback to picking some + * pair of 'source' and 'dest' bits that are not the same. If the + * 'source' and 'dest' bits are the same, this represents a node + * that will be migrating to itself, so no pages need move. + * + * If no bits are left in 'tmp', or if all remaining bits left + * in 'tmp' correspond to the same bit in 'to', return false + * (nothing left to migrate). + * + * This lets us pick a pair of nodes to migrate between, such that + * if possible the dest node is not already occupied by some other + * source node, minimizing the risk of overloading the memory on a + * node that would happen if we migrated incoming memory to a node + * before migrating outgoing memory source that same node. + * + * A single scan of tmp is sufficient. As we go, we remember the + * most recent pair that moved (s != d). If we find a pair + * that not only moved, but what's better, moved to an empty slot + * (d is not set in tmp), then we break out then, with that pair. + * Otherwise when we finish scannng from_tmp, we at least have the + * most recent pair that moved. If we get all the way through + * the scan of tmp without finding any node that moved, much less + * moved to an empty node, then there is nothing left worth migrating. + */ + + tmp = *from_nodes; + while (!nodes_empty(tmp)) { + int s,d; + int source = -1; + int dest = 0; + + for_each_node_mask(s, tmp) { + d = node_remap(s, *from_nodes, *to_nodes); + if (s == d) + continue; + + source = s; /* Node moved. Memorize */ + dest = d; + /* dest not in remaining from nodes? */ + if (!node_isset(dest, tmp)) + break; + } + if (source == -1) + break; + + node_clear(source, tmp); + err = migrate_to_node(mm, source, dest, flags); + if (err > 0) + busy += err; + if (err < 0) + break; + } +out: up_read(&mm->mmap_sem); - return count; + if (err < 0) + return err; + return busy; + +} + +/* + * Allocate a new page for page migration based on vma policy. + * Start assuming that page is mapped by vma pointed to by @private. + * Search forward from there, if not. N.B., this assumes that the + * list of pages handed to migrate_pages()--which is how we get here-- + * is in virtual address order. + */ +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +{ + struct vm_area_struct *vma = (struct vm_area_struct *)private; + unsigned long uninitialized_var(address); + + while (vma) { + address = page_address_in_vma(page, vma); + if (address != -EFAULT) + break; + vma = vma->vm_next; + } + + /* + * if !vma, alloc_page_vma() will use task or system default policy + */ + return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); +} +#else + +static void migrate_page_add(struct page *page, struct list_head *pagelist, + unsigned long flags) +{ +} + +int do_migrate_pages(struct mm_struct *mm, + const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags) +{ + return -ENOSYS; +} + +static struct page *new_vma_page(struct page *page, unsigned long private, int **x) +{ + return NULL; } +#endif -long do_mbind(unsigned long start, unsigned long len, - unsigned long mode, nodemask_t *nmask, unsigned long flags) +static long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, + unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; @@ -624,7 +780,7 @@ long do_mbind(unsigned long start, unsigned long len, MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) || mode > MPOL_MAX) return -EINVAL; - if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_RESOURCE)) + if ((flags & MPOL_MF_MOVE_ALL) && !capable(CAP_SYS_NICE)) return -EPERM; if (start & ~PAGE_MASK) @@ -655,8 +811,8 @@ long do_mbind(unsigned long start, unsigned long len, if (!new) flags |= MPOL_MF_DISCONTIG_OK; - PDprintk("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, - mode,nodes_addr(nodes)[0]); + pr_debug("mbind %lx-%lx mode:%ld nodes:%lx\n",start,start+len, + mode, nmask ? nodes_addr(*nmask)[0] : -1); down_write(&mm->mmap_sem); vma = check_range(mm, start, end, nmask, @@ -667,14 +823,14 @@ long do_mbind(unsigned long start, unsigned long len, int nr_failed = 0; err = mbind_range(vma, start, end, new); + if (!list_empty(&pagelist)) - nr_failed = swap_pages(&pagelist); + nr_failed = migrate_pages(&pagelist, new_vma_page, + (unsigned long)vma); if (!err && nr_failed && (flags & MPOL_MF_STRICT)) err = -EIO; } - if (!list_empty(&pagelist)) - putback_lru_pages(&pagelist); up_write(&mm->mmap_sem); mpol_free(new); @@ -697,6 +853,8 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, nodes_clear(*nodes); if (maxnode == 0 || !nmask) return 0; + if (maxnode > PAGE_SIZE*BITS_PER_BYTE) + return -EINVAL; nlongs = BITS_TO_LONGS(maxnode); if ((maxnode % BITS_PER_LONG) == 0) @@ -757,6 +915,10 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, err = get_nodes(&nodes, nmask, maxnode); if (err) return err; +#ifdef CONFIG_CPUSETS + /* Restrict the nodes to the allowed nodes in the cpuset */ + nodes_and(nodes, nodes, current->mems_allowed); +#endif return do_mbind(start, len, mode, &nodes, flags); } @@ -796,7 +958,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, /* Find the mm_struct */ read_lock(&tasklist_lock); - task = pid ? find_task_by_pid(pid) : current; + task = pid ? find_task_by_vpid(pid) : current; if (!task) { read_unlock(&tasklist_lock); return -ESRCH; @@ -810,24 +972,34 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, /* * Check if this process has the right to modify the specified * process. The right exists if the process has administrative - * capabilities, superuser priviledges or the same + * capabilities, superuser privileges or the same * userid as the target process. */ if ((current->euid != task->suid) && (current->euid != task->uid) && (current->uid != task->suid) && (current->uid != task->uid) && - !capable(CAP_SYS_ADMIN)) { + !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; } task_nodes = cpuset_mems_allowed(task); /* Is the user allowed to access the target nodes? */ - if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) { + if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_NICE)) { err = -EPERM; goto out; } - err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE); + if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { + err = -EINVAL; + goto out; + } + + err = security_task_movememory(task); + if (err) + goto out; + + err = do_migrate_pages(mm, &old, &new, + capable(CAP_SYS_NICE) ? MPOL_MF_MOVE_ALL : MPOL_MF_MOVE); out: mmput(mm); return err; @@ -840,7 +1012,8 @@ asmlinkage long sys_get_mempolicy(int __user *policy, unsigned long maxnode, unsigned long addr, unsigned long flags) { - int err, pval; + int err; + int uninitialized_var(pval); nodemask_t nodes; if (nmask != NULL && maxnode < MAX_NUMNODES) @@ -939,21 +1112,37 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif -/* Return effective policy for a VMA */ +/* + * get_vma_policy(@task, @vma, @addr) + * @task - task for fallback if vma policy == default + * @vma - virtual memory area whose policy is sought + * @addr - address in @vma for shared policy lookup + * + * Returns effective policy for a VMA at specified address. + * Falls back to @task or system default policy, as necessary. + * Returned policy has extra reference count if shared, vma, + * or some other task's policy [show_numa_maps() can pass + * @task != current]. It is the caller's responsibility to + * free the reference in these cases. + */ static struct mempolicy * get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; + int shared_pol = 0; if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) + if (vma->vm_ops && vma->vm_ops->get_policy) { pol = vma->vm_ops->get_policy(vma, addr); - else if (vma->vm_policy && + shared_pol = 1; /* if pol non-NULL, add ref below */ + } else if (vma->vm_policy && vma->vm_policy->policy != MPOL_DEFAULT) pol = vma->vm_policy; } if (!pol) pol = &default_policy; + else if (!shared_pol && pol != current->mempolicy) + mpol_get(pol); /* vma or other task's policy */ return pol; } @@ -1000,6 +1189,35 @@ static unsigned interleave_nodes(struct mempolicy *policy) return nid; } +/* + * Depending on the memory policy provide a node from which to allocate the + * next slab entry. + */ +unsigned slab_node(struct mempolicy *policy) +{ + int pol = policy ? policy->policy : MPOL_DEFAULT; + + switch (pol) { + case MPOL_INTERLEAVE: + return interleave_nodes(policy); + + case MPOL_BIND: + /* + * Follow bind policy behavior and start allocation at the + * first node. + */ + return zone_to_nid(policy->v.zonelist->zones[0]); + + case MPOL_PREFERRED: + if (policy->v.preferred_node >= 0) + return policy->v.preferred_node; + /* Fall through */ + + default: + return numa_node_id(); + } +} + /* Do static interleaving for a VMA with known offset. */ static unsigned offset_il_node(struct mempolicy *pol, struct vm_area_struct *vma, unsigned long off) @@ -1024,26 +1242,63 @@ static inline unsigned interleave_nid(struct mempolicy *pol, if (vma) { unsigned long off; - off = vma->vm_pgoff; + /* + * for small pages, there is no difference between + * shift and PAGE_SHIFT, so the bit-shift is safe. + * for huge pages, since vm_pgoff is in units of small + * pages, we need to shift off the always 0 bits to get + * a useful offset. + */ + BUG_ON(shift < PAGE_SHIFT); + off = vma->vm_pgoff >> (shift - PAGE_SHIFT); off += (addr - vma->vm_start) >> shift; return offset_il_node(pol, vma, off); } else return interleave_nodes(pol); } -/* Return a zonelist suitable for a huge page allocation. */ -struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr) +#ifdef CONFIG_HUGETLBFS +/* + * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * @vma = virtual memory area whose policy is sought + * @addr = address in @vma for shared policy lookup and interleave policy + * @gfp_flags = for requested zone + * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy + * + * Returns a zonelist suitable for a huge page allocation. + * If the effective policy is 'BIND, returns pointer to policy's zonelist. + * If it is also a policy for which get_vma_policy() returns an extra + * reference, we must hold that reference until after allocation. + * In that case, return policy via @mpol so hugetlb allocation can drop + * the reference. For non-'BIND referenced policies, we can/do drop the + * reference here, so the caller doesn't need to know about the special case + * for default and current task policy. + */ +struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, + gfp_t gfp_flags, struct mempolicy **mpol) { struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct zonelist *zl; + *mpol = NULL; /* probably no unref needed */ if (pol->policy == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); - return NODE_DATA(nid)->node_zonelists + gfp_zone(GFP_HIGHUSER); + __mpol_free(pol); /* finished with pol */ + return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags); } - return zonelist_policy(GFP_HIGHUSER, pol); + + zl = zonelist_policy(GFP_HIGHUSER, pol); + if (unlikely(pol != &default_policy && pol != current->mempolicy)) { + if (pol->policy != MPOL_BIND) + __mpol_free(pol); /* finished with pol */ + else + *mpol = pol; /* unref needed after allocation */ + } + return zl; } +#endif /* Allocate a page in interleaved policy. Own path because it needs to do special accounting. */ @@ -1055,10 +1310,8 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order, zl = NODE_DATA(nid)->node_zonelists + gfp_zone(gfp); page = __alloc_pages(gfp, order, zl); - if (page && page_zone(page) == zl->zones[0]) { - zone_pcp(zl->zones[0],get_cpu())->interleave_hit++; - put_cpu(); - } + if (page && page_zone(page) == zl->zones[0]) + inc_zone_page_state(page, NUMA_INTERLEAVE_HIT); return page; } @@ -1088,6 +1341,7 @@ struct page * alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct zonelist *zl; cpuset_update_task_memory_state(); @@ -1097,7 +1351,19 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); return alloc_page_interleave(gfp, 0, nid); } - return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); + zl = zonelist_policy(gfp, pol); + if (pol != &default_policy && pol != current->mempolicy) { + /* + * slow path: ref counted policy -- shared or vma + */ + struct page *page = __alloc_pages(gfp, 0, zl); + __mpol_free(pol); + return page; + } + /* + * fast path: default or task policy + */ + return __alloc_pages(gfp, 0, zl); } /** @@ -1125,7 +1391,7 @@ struct page *alloc_pages_current(gfp_t gfp, unsigned order) if ((gfp & __GFP_WAIT) && !in_interrupt()) cpuset_update_task_memory_state(); - if (!pol || in_interrupt()) + if (!pol || in_interrupt() || (gfp & __GFP_THISNODE)) pol = &default_policy; if (pol->policy == MPOL_INTERLEAVE) return alloc_page_interleave(gfp, order, interleave_nodes(pol)); @@ -1140,7 +1406,6 @@ EXPORT_SYMBOL(alloc_pages_current); * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). */ -void *cpuset_being_rebound; /* Slow path of a mempolicy copy */ struct mempolicy *__mpol_copy(struct mempolicy *old) @@ -1157,12 +1422,11 @@ struct mempolicy *__mpol_copy(struct mempolicy *old) atomic_set(&new->refcnt, 1); if (new->policy == MPOL_BIND) { int sz = ksize(old->v.zonelist); - new->v.zonelist = kmalloc(sz, SLAB_KERNEL); + new->v.zonelist = kmemdup(old->v.zonelist, sz, GFP_KERNEL); if (!new->v.zonelist) { kmem_cache_free(policy_cache, new); return ERR_PTR(-ENOMEM); } - memcpy(new->v.zonelist, old->v.zonelist, sz); } return new; } @@ -1266,7 +1530,7 @@ static void sp_insert(struct shared_policy *sp, struct sp_node *new) } rb_link_node(&new->nd, parent, p); rb_insert_color(&new->nd, &sp->root); - PDprintk("inserting %lx-%lx: %d\n", new->start, new->end, + pr_debug("inserting %lx-%lx: %d\n", new->start, new->end, new->policy ? new->policy->policy : 0); } @@ -1291,14 +1555,14 @@ mpol_shared_policy_lookup(struct shared_policy *sp, unsigned long idx) static void sp_delete(struct shared_policy *sp, struct sp_node *n) { - PDprintk("deleting %lx-l%x\n", n->start, n->end); + pr_debug("deleting %lx-l%lx\n", n->start, n->end); rb_erase(&n->nd, &sp->root); mpol_free(n->policy); kmem_cache_free(sn_cache, n); } -struct sp_node * -sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) +static struct sp_node *sp_alloc(unsigned long start, unsigned long end, + struct mempolicy *pol) { struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); @@ -1390,10 +1654,10 @@ int mpol_set_shared_policy(struct shared_policy *info, struct sp_node *new = NULL; unsigned long sz = vma_pages(vma); - PDprintk("set_shared_policy %lx sz %lu %d %lx\n", + pr_debug("set_shared_policy %lx sz %lu %d %lx\n", vma->vm_pgoff, sz, npol? npol->policy : -1, - npol ? nodes_addr(npol->v.nodes)[0] : -1); + npol ? nodes_addr(npol->v.nodes)[0] : -1); if (npol) { new = sp_alloc(vma->vm_pgoff, vma->vm_pgoff + sz, npol); @@ -1429,18 +1693,43 @@ void mpol_free_shared_policy(struct shared_policy *p) /* assumes fs == KERNEL_DS */ void __init numa_policy_init(void) { + nodemask_t interleave_nodes; + unsigned long largest = 0; + int nid, prefer = 0; + policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); + + /* + * Set interleaving policy for system init. Interleaving is only + * enabled across suitably sized nodes (default is >= 16MB), or + * fall back to the largest node if they're all smaller. + */ + nodes_clear(interleave_nodes); + for_each_node_state(nid, N_HIGH_MEMORY) { + unsigned long total_pages = node_present_pages(nid); + + /* Preserve the largest node */ + if (largest < total_pages) { + largest = total_pages; + prefer = nid; + } + + /* Interleave this node? */ + if ((total_pages << PAGE_SHIFT) >= (16 << 20)) + node_set(nid, interleave_nodes); + } - /* Set interleaving policy for system init. This way not all - the data structures allocated at system boot end up in node zero. */ + /* All too small, use the largest */ + if (unlikely(nodes_empty(interleave_nodes))) + node_set(prefer, interleave_nodes); - if (do_set_mempolicy(MPOL_INTERLEAVE, &node_online_map)) + if (do_set_mempolicy(MPOL_INTERLEAVE, &interleave_nodes)) printk("numa_policy_init: interleaving failed\n"); } @@ -1451,7 +1740,8 @@ void numa_default_policy(void) } /* Migrate a policy to a different set of nodes */ -void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) +static void mpol_rebind_policy(struct mempolicy *pol, + const nodemask_t *newmask) { nodemask_t *mpolmask; nodemask_t tmp; @@ -1484,7 +1774,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) nodes_clear(nodes); for (z = pol->v.zonelist->zones; *z; z++) - node_set((*z)->zone_pgdat->node_id, nodes); + node_set(zone_to_nid(*z), nodes); nodes_remap(tmp, nodes, *mpolmask, *newmask); nodes = tmp; @@ -1495,7 +1785,7 @@ void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) * then zonelist_policy() will "FALL THROUGH" to MPOL_DEFAULT. */ - if (zonelist) { + if (!IS_ERR(zonelist)) { /* Good - got mem - substitute new zonelist */ kfree(pol->v.zonelist); pol->v.zonelist = zonelist; @@ -1539,8 +1829,8 @@ void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) * Display pages allocated per node and memory policy via /proc. */ -static const char *policy_types[] = { "default", "prefer", "bind", - "interleave" }; +static const char * const policy_types[] = + { "default", "prefer", "bind", "interleave" }; /* * Convert a mempolicy into a string. @@ -1596,70 +1886,153 @@ static inline int mpol_to_str(char *buffer, int maxlen, struct mempolicy *pol) struct numa_maps { unsigned long pages; unsigned long anon; - unsigned long mapped; + unsigned long active; + unsigned long writeback; unsigned long mapcount_max; + unsigned long dirty; + unsigned long swapcache; unsigned long node[MAX_NUMNODES]; }; -static void gather_stats(struct page *page, void *private) +static void gather_stats(struct page *page, void *private, int pte_dirty) { struct numa_maps *md = private; int count = page_mapcount(page); - if (count) - md->mapped++; + md->pages++; + if (pte_dirty || PageDirty(page)) + md->dirty++; - if (count > md->mapcount_max) - md->mapcount_max = count; + if (PageSwapCache(page)) + md->swapcache++; - md->pages++; + if (PageActive(page)) + md->active++; + + if (PageWriteback(page)) + md->writeback++; if (PageAnon(page)) md->anon++; + if (count > md->mapcount_max) + md->mapcount_max = count; + md->node[page_to_nid(page)]++; - cond_resched(); } +#ifdef CONFIG_HUGETLB_PAGE +static void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ + unsigned long addr; + struct page *page; + + for (addr = start; addr < end; addr += HPAGE_SIZE) { + pte_t *ptep = huge_pte_offset(vma->vm_mm, addr & HPAGE_MASK); + pte_t pte; + + if (!ptep) + continue; + + pte = *ptep; + if (pte_none(pte)) + continue; + + page = pte_page(pte); + if (!page) + continue; + + gather_stats(page, md, pte_dirty(*ptep)); + } +} +#else +static inline void check_huge_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + struct numa_maps *md) +{ +} +#endif + int show_numa_map(struct seq_file *m, void *v) { - struct task_struct *task = m->private; + struct proc_maps_private *priv = m->private; struct vm_area_struct *vma = v; struct numa_maps *md; + struct file *file = vma->vm_file; + struct mm_struct *mm = vma->vm_mm; + struct mempolicy *pol; int n; char buffer[50]; - if (!vma->vm_mm) + if (!mm) return 0; md = kzalloc(sizeof(struct numa_maps), GFP_KERNEL); if (!md) return 0; - check_pgd_range(vma, vma->vm_start, vma->vm_end, - &node_online_map, MPOL_MF_STATS, md); + pol = get_vma_policy(priv->task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol); + /* + * unref shared or other task's mempolicy + */ + if (pol != &default_policy && pol != current->mempolicy) + __mpol_free(pol); + + seq_printf(m, "%08lx %s", vma->vm_start, buffer); + + if (file) { + seq_printf(m, " file="); + seq_path(m, file->f_path.mnt, file->f_path.dentry, "\n\t= "); + } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { + seq_printf(m, " heap"); + } else if (vma->vm_start <= mm->start_stack && + vma->vm_end >= mm->start_stack) { + seq_printf(m, " stack"); + } - if (md->pages) { - mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(task, vma, vma->vm_start)); + if (is_vm_hugetlb_page(vma)) { + check_huge_range(vma, vma->vm_start, vma->vm_end, md); + seq_printf(m, " huge"); + } else { + check_pgd_range(vma, vma->vm_start, vma->vm_end, + &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md); + } - seq_printf(m, "%08lx %s pages=%lu mapped=%lu maxref=%lu", - vma->vm_start, buffer, md->pages, - md->mapped, md->mapcount_max); + if (!md->pages) + goto out; - if (md->anon) - seq_printf(m," anon=%lu",md->anon); + if (md->anon) + seq_printf(m," anon=%lu",md->anon); - for_each_online_node(n) - if (md->node[n]) - seq_printf(m, " N%d=%lu", n, md->node[n]); + if (md->dirty) + seq_printf(m," dirty=%lu",md->dirty); - seq_putc(m, '\n'); - } + if (md->pages != md->anon && md->pages != md->dirty) + seq_printf(m, " mapped=%lu", md->pages); + + if (md->mapcount_max > 1) + seq_printf(m, " mapmax=%lu", md->mapcount_max); + + if (md->swapcache) + seq_printf(m," swapcache=%lu", md->swapcache); + + if (md->active < md->pages && !is_vm_hugetlb_page(vma)) + seq_printf(m," active=%lu", md->active); + + if (md->writeback) + seq_printf(m," writeback=%lu", md->writeback); + + for_each_node_state(n, N_HIGH_MEMORY) + if (md->node[n]) + seq_printf(m, " N%d=%lu", n, md->node[n]); +out: + seq_putc(m, '\n'); kfree(md); if (m->count < m->size) - m->version = (vma != get_gate_vma(task)) ? vma->vm_start : 0; + m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; return 0; } -