X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fmempolicy.c;h=8d246c3b340f5cd635e8e7177d23abbb38efec64;hb=31f1de46b90ad360a16e7af3e277d104961df923;hp=9f4e9b95e8f2238597f88b362221527160bf98f9;hpb=396faf0303d273219db5d7eb4a2879ad977ed185;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 9f4e9b9..8d246c3 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -72,23 +72,23 @@ #include #include #include -#include #include #include #include #include #include #include +#include #include #include #include -#include #include #include #include #include #include #include +#include #include #include @@ -110,25 +110,57 @@ struct mempolicy default_policy = { .policy = MPOL_DEFAULT, }; +static void mpol_rebind_policy(struct mempolicy *pol, + const nodemask_t *newmask); + /* Do sanity checking on a policy */ static int mpol_check_policy(int mode, nodemask_t *nodes) { - int empty = nodes_empty(*nodes); + int was_empty, is_empty; + + if (!nodes) + return 0; + + /* + * "Contextualize" the in-coming nodemast for cpusets: + * Remember whether in-coming nodemask was empty, If not, + * restrict the nodes to the allowed nodes in the cpuset. + * This is guaranteed to be a subset of nodes with memory. + */ + cpuset_update_task_memory_state(); + is_empty = was_empty = nodes_empty(*nodes); + if (!was_empty) { + nodes_and(*nodes, *nodes, cpuset_current_mems_allowed); + is_empty = nodes_empty(*nodes); /* after "contextualization" */ + } switch (mode) { case MPOL_DEFAULT: - if (!empty) + /* + * require caller to specify an empty nodemask + * before "contextualization" + */ + if (!was_empty) return -EINVAL; break; case MPOL_BIND: case MPOL_INTERLEAVE: - /* Preferred will only use the first bit, but allow - more for now. */ - if (empty) + /* + * require at least 1 valid node after "contextualization" + */ + if (is_empty) + return -EINVAL; + break; + case MPOL_PREFERRED: + /* + * Did caller specify invalid nodes? + * Don't silently accept this as "local allocation". + */ + if (!was_empty && is_empty) return -EINVAL; break; } - return nodes_subset(*nodes, node_online_map) ? 0 : -EINVAL; + return 0; } /* Generate a custom zonelist for the BIND policy. */ @@ -149,7 +181,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes) lower zones etc. Avoid empty zones because the memory allocator doesn't like them. If you implement node hot removal you have to fix that. */ - k = policy_zone; + k = MAX_NR_ZONES - 1; while (1) { for_each_node_mask(nd, *nodes) { struct zone *z = &NODE_DATA(nd)->node_zones[k]; @@ -185,7 +217,7 @@ static struct mempolicy *mpol_new(int mode, nodemask_t *nodes) switch (mode) { case MPOL_INTERLEAVE: policy->v.nodes = *nodes; - if (nodes_weight(*nodes) == 0) { + if (nodes_weight(policy->v.nodes) == 0) { kmem_cache_free(policy_cache, policy); return ERR_PTR(-EINVAL); } @@ -416,18 +448,6 @@ static int mbind_range(struct vm_area_struct *vma, unsigned long start, return err; } -static int contextualize_policy(int mode, nodemask_t *nodes) -{ - if (!nodes) - return 0; - - cpuset_update_task_memory_state(); - if (!cpuset_nodes_subset_current_mems_allowed(*nodes)) - return -EINVAL; - return mpol_check_policy(mode, nodes); -} - - /* * Update task->flags PF_MEMPOLICY bit: set iff non-default * mempolicy. 
Allows more rapid checking of this (combined perhaps @@ -459,11 +479,11 @@ static void mpol_set_task_struct_flag(void) } /* Set the process memory policy */ -long do_set_mempolicy(int mode, nodemask_t *nodes) +static long do_set_mempolicy(int mode, nodemask_t *nodes) { struct mempolicy *new; - if (contextualize_policy(mode, nodes)) + if (mpol_check_policy(mode, nodes)) return -EINVAL; new = mpol_new(mode, nodes); if (IS_ERR(new)) @@ -494,9 +514,9 @@ static void get_zonemask(struct mempolicy *p, nodemask_t *nodes) *nodes = p->v.nodes; break; case MPOL_PREFERRED: - /* or use current node instead of online map? */ + /* or use current node instead of memory_map? */ if (p->v.preferred_node < 0) - *nodes = node_online_map; + *nodes = node_states[N_HIGH_MEMORY]; else node_set(p->v.preferred_node, *nodes); break; @@ -519,8 +539,8 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr) } /* Retrieve NUMA policy */ -long do_get_mempolicy(int *policy, nodemask_t *nmask, - unsigned long addr, unsigned long flags) +static long do_get_mempolicy(int *policy, nodemask_t *nmask, + unsigned long addr, unsigned long flags) { int err; struct mm_struct *mm = current->mm; @@ -528,8 +548,18 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask, struct mempolicy *pol = current->mempolicy; cpuset_update_task_memory_state(); - if (flags & ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR)) + if (flags & + ~(unsigned long)(MPOL_F_NODE|MPOL_F_ADDR|MPOL_F_MEMS_ALLOWED)) return -EINVAL; + + if (flags & MPOL_F_MEMS_ALLOWED) { + if (flags & (MPOL_F_NODE|MPOL_F_ADDR)) + return -EINVAL; + *policy = 0; /* just so it's initialized */ + *nmask = cpuset_current_mems_allowed; + return 0; + } + if (flags & MPOL_F_ADDR) { down_read(&mm->mmap_sem); vma = find_vma_intersection(mm, addr, addr+1); @@ -601,7 +631,8 @@ static struct page *new_node_page(struct page *page, unsigned long node, int **x * Migrate pages from one node to a target node. * Returns error or the number of pages not migrated. */ -int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) +static int migrate_to_node(struct mm_struct *mm, int source, int dest, + int flags) { nodemask_t nmask; LIST_HEAD(pagelist); @@ -706,12 +737,29 @@ out: } +/* + * Allocate a new page for page migration based on vma policy. + * Start assuming that page is mapped by vma pointed to by @private. + * Search forward from there, if not. N.B., this assumes that the + * list of pages handed to migrate_pages()--which is how we get here-- + * is in virtual address order. 
+ */ static struct page *new_vma_page(struct page *page, unsigned long private, int **x) { struct vm_area_struct *vma = (struct vm_area_struct *)private; + unsigned long uninitialized_var(address); - return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - page_address_in_vma(page, vma)); + while (vma) { + address = page_address_in_vma(page, vma); + if (address != -EFAULT) + break; + vma = vma->vm_next; + } + + /* + * if !vma, alloc_page_vma() will use task or system default policy + */ + return alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); } #else @@ -732,8 +780,9 @@ static struct page *new_vma_page(struct page *page, unsigned long private, int * } #endif -long do_mbind(unsigned long start, unsigned long len, - unsigned long mode, nodemask_t *nmask, unsigned long flags) +static long do_mbind(unsigned long start, unsigned long len, + unsigned long mode, nodemask_t *nmask, + unsigned long flags) { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; @@ -881,10 +930,6 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, err = get_nodes(&nodes, nmask, maxnode); if (err) return err; -#ifdef CONFIG_CPUSETS - /* Restrict the nodes to the allowed nodes in the cpuset */ - nodes_and(nodes, nodes, current->mems_allowed); -#endif return do_mbind(start, len, mode, &nodes, flags); } @@ -924,7 +969,7 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, /* Find the mm_struct */ read_lock(&tasklist_lock); - task = pid ? find_task_by_pid(pid) : current; + task = pid ? find_task_by_vpid(pid) : current; if (!task) { read_unlock(&tasklist_lock); return -ESRCH; @@ -955,6 +1000,11 @@ asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, goto out; } + if (!nodes_subset(new, node_states[N_HIGH_MEMORY])) { + err = -EINVAL; + goto out; + } + err = security_task_movememory(task); if (err) goto out; @@ -973,7 +1023,8 @@ asmlinkage long sys_get_mempolicy(int __user *policy, unsigned long maxnode, unsigned long addr, unsigned long flags) { - int err, pval; + int err; + int uninitialized_var(pval); nodemask_t nodes; if (nmask != NULL && maxnode < MAX_NUMNODES) @@ -1072,21 +1123,37 @@ asmlinkage long compat_sys_mbind(compat_ulong_t start, compat_ulong_t len, #endif -/* Return effective policy for a VMA */ +/* + * get_vma_policy(@task, @vma, @addr) + * @task - task for fallback if vma policy == default + * @vma - virtual memory area whose policy is sought + * @addr - address in @vma for shared policy lookup + * + * Returns effective policy for a VMA at specified address. + * Falls back to @task or system default policy, as necessary. + * Returned policy has extra reference count if shared, vma, + * or some other task's policy [show_numa_maps() can pass + * @task != current]. It is the caller's responsibility to + * free the reference in these cases. 
+ */ static struct mempolicy * get_vma_policy(struct task_struct *task, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = task->mempolicy; + int shared_pol = 0; if (vma) { - if (vma->vm_ops && vma->vm_ops->get_policy) + if (vma->vm_ops && vma->vm_ops->get_policy) { pol = vma->vm_ops->get_policy(vma, addr); - else if (vma->vm_policy && + shared_pol = 1; /* if pol non-NULL, add ref below */ + } else if (vma->vm_policy && vma->vm_policy->policy != MPOL_DEFAULT) pol = vma->vm_policy; } if (!pol) pol = &default_policy; + else if (!shared_pol && pol != current->mempolicy) + mpol_get(pol); /* vma or other task's policy */ return pol; } @@ -1202,19 +1269,45 @@ static inline unsigned interleave_nid(struct mempolicy *pol, } #ifdef CONFIG_HUGETLBFS -/* Return a zonelist suitable for a huge page allocation. */ +/* + * huge_zonelist(@vma, @addr, @gfp_flags, @mpol) + * @vma = virtual memory area whose policy is sought + * @addr = address in @vma for shared policy lookup and interleave policy + * @gfp_flags = for requested zone + * @mpol = pointer to mempolicy pointer for reference counted 'BIND policy + * + * Returns a zonelist suitable for a huge page allocation. + * If the effective policy is 'BIND, returns pointer to policy's zonelist. + * If it is also a policy for which get_vma_policy() returns an extra + * reference, we must hold that reference until after allocation. + * In that case, return policy via @mpol so hugetlb allocation can drop + * the reference. For non-'BIND referenced policies, we can/do drop the + * reference here, so the caller doesn't need to know about the special case + * for default and current task policy. + */ struct zonelist *huge_zonelist(struct vm_area_struct *vma, unsigned long addr, - gfp_t gfp_flags) + gfp_t gfp_flags, struct mempolicy **mpol) { struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct zonelist *zl; + *mpol = NULL; /* probably no unref needed */ if (pol->policy == MPOL_INTERLEAVE) { unsigned nid; nid = interleave_nid(pol, vma, addr, HPAGE_SHIFT); + __mpol_free(pol); /* finished with pol */ return NODE_DATA(nid)->node_zonelists + gfp_zone(gfp_flags); } - return zonelist_policy(GFP_HIGHUSER, pol); + + zl = zonelist_policy(GFP_HIGHUSER, pol); + if (unlikely(pol != &default_policy && pol != current->mempolicy)) { + if (pol->policy != MPOL_BIND) + __mpol_free(pol); /* finished with pol */ + else + *mpol = pol; /* unref needed after allocation */ + } + return zl; } #endif @@ -1259,6 +1352,7 @@ struct page * alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) { struct mempolicy *pol = get_vma_policy(current, vma, addr); + struct zonelist *zl; cpuset_update_task_memory_state(); @@ -1268,7 +1362,19 @@ alloc_page_vma(gfp_t gfp, struct vm_area_struct *vma, unsigned long addr) nid = interleave_nid(pol, vma, addr, PAGE_SHIFT); return alloc_page_interleave(gfp, 0, nid); } - return __alloc_pages(gfp, 0, zonelist_policy(gfp, pol)); + zl = zonelist_policy(gfp, pol); + if (pol != &default_policy && pol != current->mempolicy) { + /* + * slow path: ref counted policy -- shared or vma + */ + struct page *page = __alloc_pages(gfp, 0, zl); + __mpol_free(pol); + return page; + } + /* + * fast path: default or task policy + */ + return __alloc_pages(gfp, 0, zl); } /** @@ -1311,7 +1417,6 @@ EXPORT_SYMBOL(alloc_pages_current); * keeps mempolicies cpuset relative after its cpuset moves. See * further kernel/cpuset.c update_nodemask(). 
*/ -void *cpuset_being_rebound; /* Slow path of a mempolicy copy */ struct mempolicy *__mpol_copy(struct mempolicy *old) @@ -1467,8 +1572,8 @@ static void sp_delete(struct shared_policy *sp, struct sp_node *n) kmem_cache_free(sn_cache, n); } -struct sp_node * -sp_alloc(unsigned long start, unsigned long end, struct mempolicy *pol) +static struct sp_node *sp_alloc(unsigned long start, unsigned long end, + struct mempolicy *pol) { struct sp_node *n = kmem_cache_alloc(sn_cache, GFP_KERNEL); @@ -1605,11 +1710,11 @@ void __init numa_policy_init(void) policy_cache = kmem_cache_create("numa_policy", sizeof(struct mempolicy), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); sn_cache = kmem_cache_create("shared_policy_node", sizeof(struct sp_node), - 0, SLAB_PANIC, NULL, NULL); + 0, SLAB_PANIC, NULL); /* * Set interleaving policy for system init. Interleaving is only @@ -1617,7 +1722,7 @@ void __init numa_policy_init(void) * fall back to the largest node if they're all smaller. */ nodes_clear(interleave_nodes); - for_each_online_node(nid) { + for_each_node_state(nid, N_HIGH_MEMORY) { unsigned long total_pages = node_present_pages(nid); /* Preserve the largest node */ @@ -1646,7 +1751,8 @@ void numa_default_policy(void) } /* Migrate a policy to a different set of nodes */ -void mpol_rebind_policy(struct mempolicy *pol, const nodemask_t *newmask) +static void mpol_rebind_policy(struct mempolicy *pol, + const nodemask_t *newmask) { nodemask_t *mpolmask; nodemask_t tmp; @@ -1867,6 +1973,7 @@ int show_numa_map(struct seq_file *m, void *v) struct numa_maps *md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; + struct mempolicy *pol; int n; char buffer[50]; @@ -1877,8 +1984,13 @@ int show_numa_map(struct seq_file *m, void *v) if (!md) return 0; - mpol_to_str(buffer, sizeof(buffer), - get_vma_policy(priv->task, vma, vma->vm_start)); + pol = get_vma_policy(priv->task, vma, vma->vm_start); + mpol_to_str(buffer, sizeof(buffer), pol); + /* + * unref shared or other task's mempolicy + */ + if (pol != &default_policy && pol != current->mempolicy) + __mpol_free(pol); seq_printf(m, "%08lx %s", vma->vm_start, buffer); @@ -1897,7 +2009,7 @@ int show_numa_map(struct seq_file *m, void *v) seq_printf(m, " huge"); } else { check_pgd_range(vma, vma->vm_start, vma->vm_end, - &node_online_map, MPOL_MF_STATS, md); + &node_states[N_HIGH_MEMORY], MPOL_MF_STATS, md); } if (!md->pages) @@ -1924,7 +2036,7 @@ int show_numa_map(struct seq_file *m, void *v) if (md->writeback) seq_printf(m," writeback=%lu", md->writeback); - for_each_online_node(n) + for_each_node_state(n, N_HIGH_MEMORY) if (md->node[n]) seq_printf(m, " N%d=%lu", n, md->node[n]); out: @@ -1935,4 +2047,3 @@ out: m->version = (vma != priv->tail_vma) ? vma->vm_start : 0; return 0; } -
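
The do_get_mempolicy() hunk above adds an MPOL_F_MEMS_ALLOWED query mode that returns the caller's cpuset-allowed memory nodes instead of a policy. Below is a minimal userspace sketch of exercising that flag through the raw syscall; the MPOL_F_MEMS_ALLOWED value and the 1024-bit mask size are assumptions taken from 2.6.24-era headers, not part of this patch.

/* Hedged sketch: query the cpuset-allowed memory nodes with the new flag. */
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef MPOL_F_MEMS_ALLOWED
#define MPOL_F_MEMS_ALLOWED (1 << 2)	/* assumed value from linux/mempolicy.h */
#endif

#define MAX_NODES	1024		/* assumed upper bound on node ids */
#define BITS_PER_ULONG	(8 * sizeof(unsigned long))

int main(void)
{
	unsigned long mask[MAX_NODES / BITS_PER_ULONG];
	int mode = 0;
	unsigned int node;

	memset(mask, 0, sizeof(mask));
	/* mode is only zero-filled and addr must be NULL for this flag */
	if (syscall(SYS_get_mempolicy, &mode, mask, (unsigned long)MAX_NODES,
		    NULL, MPOL_F_MEMS_ALLOWED) != 0) {
		perror("get_mempolicy(MPOL_F_MEMS_ALLOWED)");
		return 1;
	}

	for (node = 0; node < MAX_NODES; node++)
		if (mask[node / BITS_PER_ULONG] & (1UL << (node % BITS_PER_ULONG)))
			printf("node %u allowed\n", node);
	return 0;
}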
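
The rewritten mpol_check_policy() silently intersects the requested nodemask with the caller's cpuset instead of rejecting it outright; EINVAL is now returned only when the intersection ends up empty (or when invalid nodes are passed with MPOL_PREFERRED). A small userspace sketch of that behaviour follows, again via the raw syscall; the MPOL_BIND value and the example node numbers are assumptions, not taken from this patch.

/* Hedged sketch: nodes outside the cpuset are now masked off, not rejected. */
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>

#ifndef MPOL_BIND
#define MPOL_BIND 2			/* assumed value from linux/mempolicy.h */
#endif

int main(void)
{
	/* request nodes 0 and 3; node 3 may lie outside the current cpuset */
	unsigned long mask = (1UL << 0) | (1UL << 3);

	if (syscall(SYS_set_mempolicy, MPOL_BIND, &mask, 8 * sizeof(mask)) == 0)
		printf("policy set; disallowed nodes were dropped silently\n");
	else if (errno == EINVAL)
		printf("EINVAL: no requested node is allowed or has memory\n");
	else
		perror("set_mempolicy");
	return 0;
}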
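
Finally, the new reference-counting rules in get_vma_policy() and huge_zonelist() put the unref burden on the caller for shared, vma, and other-task policies. The fragment below is only a sketch of that calling convention, not the actual mm/hugetlb.c change; pick_huge_page() is a placeholder name and GFP_HIGHUSER stands in for hugetlb's real allocation mask.

/*
 * Sketch of the convention implied by the new huge_zonelist() signature:
 * *mpol is set only when a ref-counted MPOL_BIND policy is returned, and
 * the caller must drop that reference once the zonelist is no longer used.
 */
static struct page *example_alloc_huge_page(struct vm_area_struct *vma,
					     unsigned long addr)
{
	struct mempolicy *mpol;
	struct zonelist *zl = huge_zonelist(vma, addr, GFP_HIGHUSER, &mpol);
	struct page *page = pick_huge_page(zl);		/* placeholder helper */

	if (mpol)					/* ref-counted 'BIND policy */
		__mpol_free(mpol);
	return page;
}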