From 6ae11b278bca1cd41651bae49a8c69de2f6a6262 Mon Sep 17 00:00:00 2001 From: Lee Schermerhorn Date: Mon, 14 Dec 2009 17:58:16 -0800 Subject: [PATCH] hugetlb: add nodemask arg to huge page alloc, free and surplus adjust functions In preparation for constraining huge page allocation and freeing by the controlling task's numa mempolicy, add a "nodes_allowed" nodemask pointer to the allocate, free and surplus adjustment functions. For now, pass NULL to indicate default behavior--i.e., use node_online_map. A subsequent patch will derive a non-default mask from the controlling task's numa mempolicy. Note that this method of updating the global hstate nr_hugepages under the constraint of a nodemask simplifies keeping the global state consistent--especially the number of persistent and surplus pages relative to reservations and overcommit limits. There are undoubtedly other ways to do this, but this works for both interfaces: mempolicy and per node attributes. [rientjes@google.com: fix HIGHMEM compile error] Signed-off-by: Lee Schermerhorn Reviewed-by: Mel Gorman Acked-by: David Rientjes Reviewed-by: Andi Kleen Cc: KAMEZAWA Hiroyuki Cc: Randy Dunlap Cc: Nishanth Aravamudan Cc: Andi Kleen Cc: Adam Litke Cc: Andy Whitcroft Cc: Eric Whitney Cc: Christoph Lameter Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 125 ++++++++++++++++++++++++++++++++++------------------------- 1 file changed, 72 insertions(+), 53 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bffcf77..324d1ab 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -622,48 +622,56 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) } /* - * common helper function for hstate_next_node_to_{alloc|free}. - * return next node in node_online_map, wrapping at end. + * common helper functions for hstate_next_node_to_{alloc|free}. 
+ * We may have allocated or freed a huge page based on a different + * nodes_allowed previously, so h->next_node_to_{alloc|free} might + * be outside of *nodes_allowed. Ensure that we use an allowed + * node for alloc or free. */ -static int next_node_allowed(int nid) +static int next_node_allowed(int nid, nodemask_t *nodes_allowed) { - nid = next_node(nid, node_online_map); + nid = next_node(nid, *nodes_allowed); if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); + nid = first_node(*nodes_allowed); VM_BUG_ON(nid >= MAX_NUMNODES); return nid; } +static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed) +{ + if (!node_isset(nid, *nodes_allowed)) + nid = next_node_allowed(nid, nodes_allowed); + return nid; +} + /* - * Use a helper variable to find the next node and then - * copy it back to next_nid_to_alloc afterwards: - * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. - * But we don't need to use a spin_lock here: it really - * doesn't matter if occasionally a racer chooses the - * same nid as we do. Move nid forward in the mask even - * if we just successfully allocated a hugepage so that - * the next caller gets hugepages on the next node. + * returns the previously saved node ["this node"] from which to + * allocate a persistent huge page for the pool and advance the + * next node from which to allocate, handling wrap at end of node + * mask. 
*/ -static int hstate_next_node_to_alloc(struct hstate *h) +static int hstate_next_node_to_alloc(struct hstate *h, + nodemask_t *nodes_allowed) { - int nid, next_nid; + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_alloc, nodes_allowed); + h->next_nid_to_alloc = next_node_allowed(nid, nodes_allowed); - nid = h->next_nid_to_alloc; - next_nid = next_node_allowed(nid); - h->next_nid_to_alloc = next_nid; return nid; } -static int alloc_fresh_huge_page(struct hstate *h) +static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; int start_nid; int next_nid; int ret = 0; - start_nid = hstate_next_node_to_alloc(h); + start_nid = hstate_next_node_to_alloc(h, nodes_allowed); next_nid = start_nid; do { @@ -672,7 +680,7 @@ static int alloc_fresh_huge_page(struct hstate *h) ret = 1; break; } - next_nid = hstate_next_node_to_alloc(h); + next_nid = hstate_next_node_to_alloc(h, nodes_allowed); } while (next_nid != start_nid); if (ret) @@ -684,18 +692,20 @@ static int alloc_fresh_huge_page(struct hstate *h) } /* - * helper for free_pool_huge_page() - return the next node - * from which to free a huge page. Advance the next node id - * whether or not we find a free huge page to free so that the - * next attempt to free addresses the next node. + * helper for free_pool_huge_page() - return the previously saved + * node ["this node"] from which to free a huge page. Advance the + * next node id whether or not we find a free huge page to free so + * that the next attempt to free addresses the next node. 
*/ -static int hstate_next_node_to_free(struct hstate *h) +static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed) { - int nid, next_nid; + int nid; + + VM_BUG_ON(!nodes_allowed); + + nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed); + h->next_nid_to_free = next_node_allowed(nid, nodes_allowed); - nid = h->next_nid_to_free; - next_nid = next_node_allowed(nid); - h->next_nid_to_free = next_nid; return nid; } @@ -705,13 +715,14 @@ static int hstate_next_node_to_free(struct hstate *h) * balanced over allowed nodes. * Called with hugetlb_lock locked. */ -static int free_pool_huge_page(struct hstate *h, bool acct_surplus) +static int free_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, + bool acct_surplus) { int start_nid; int next_nid; int ret = 0; - start_nid = hstate_next_node_to_free(h); + start_nid = hstate_next_node_to_free(h, nodes_allowed); next_nid = start_nid; do { @@ -735,7 +746,7 @@ static int free_pool_huge_page(struct hstate *h, bool acct_surplus) ret = 1; break; } - next_nid = hstate_next_node_to_free(h); + next_nid = hstate_next_node_to_free(h, nodes_allowed); } while (next_nid != start_nid); return ret; @@ -937,7 +948,7 @@ static void return_unused_surplus_pages(struct hstate *h, * on-line nodes for us and will handle the hstate accounting. 
*/ while (nr_pages--) { - if (!free_pool_huge_page(h, 1)) + if (!free_pool_huge_page(h, &node_online_map, 1)) break; } } @@ -1047,7 +1058,8 @@ int __weak alloc_bootmem_huge_page(struct hstate *h) void *addr; addr = __alloc_bootmem_node_nopanic( - NODE_DATA(hstate_next_node_to_alloc(h)), + NODE_DATA(hstate_next_node_to_alloc(h, + &node_online_map)), huge_page_size(h), huge_page_size(h), 0); if (addr) { @@ -1102,7 +1114,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (h->order >= MAX_ORDER) { if (!alloc_bootmem_huge_page(h)) break; - } else if (!alloc_fresh_huge_page(h)) + } else if (!alloc_fresh_huge_page(h, &node_online_map)) break; } h->max_huge_pages = i; @@ -1144,14 +1156,15 @@ static void __init report_hugepages(void) } #ifdef CONFIG_HIGHMEM -static void try_to_free_low(struct hstate *h, unsigned long count) +static void try_to_free_low(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { int i; if (h->order >= MAX_ORDER) return; - for (i = 0; i < MAX_NUMNODES; ++i) { + for_each_node_mask(i, *nodes_allowed) { struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; list_for_each_entry_safe(page, next, freel, lru) { @@ -1167,7 +1180,8 @@ static void try_to_free_low(struct hstate *h, unsigned long count) } } #else -static inline void try_to_free_low(struct hstate *h, unsigned long count) +static inline void try_to_free_low(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { } #endif @@ -1177,7 +1191,8 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) * balanced by operating on them in a round-robin fashion. * Returns 1 if an adjustment was made. 
*/ -static int adjust_pool_surplus(struct hstate *h, int delta) +static int adjust_pool_surplus(struct hstate *h, nodemask_t *nodes_allowed, + int delta) { int start_nid, next_nid; int ret = 0; @@ -1185,9 +1200,9 @@ static int adjust_pool_surplus(struct hstate *h, int delta) VM_BUG_ON(delta != -1 && delta != 1); if (delta < 0) - start_nid = hstate_next_node_to_alloc(h); + start_nid = hstate_next_node_to_alloc(h, nodes_allowed); else - start_nid = hstate_next_node_to_free(h); + start_nid = hstate_next_node_to_free(h, nodes_allowed); next_nid = start_nid; do { @@ -1197,7 +1212,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta) * To shrink on this node, there must be a surplus page */ if (!h->surplus_huge_pages_node[nid]) { - next_nid = hstate_next_node_to_alloc(h); + next_nid = hstate_next_node_to_alloc(h, + nodes_allowed); continue; } } @@ -1207,7 +1223,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta) */ if (h->surplus_huge_pages_node[nid] >= h->nr_huge_pages_node[nid]) { - next_nid = hstate_next_node_to_free(h); + next_nid = hstate_next_node_to_free(h, + nodes_allowed); continue; } } @@ -1222,7 +1239,8 @@ static int adjust_pool_surplus(struct hstate *h, int delta) } #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) -static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) +static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, + nodemask_t *nodes_allowed) { unsigned long min_count, ret; @@ -1242,7 +1260,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) */ spin_lock(&hugetlb_lock); while (h->surplus_huge_pages && count > persistent_huge_pages(h)) { - if (!adjust_pool_surplus(h, -1)) + if (!adjust_pool_surplus(h, nodes_allowed, -1)) break; } @@ -1253,7 +1271,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) * and reducing the surplus. 
*/ spin_unlock(&hugetlb_lock); - ret = alloc_fresh_huge_page(h); + ret = alloc_fresh_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -1277,13 +1295,13 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) */ min_count = h->resv_huge_pages + h->nr_huge_pages - h->free_huge_pages; min_count = max(count, min_count); - try_to_free_low(h, min_count); + try_to_free_low(h, min_count, nodes_allowed); while (min_count < persistent_huge_pages(h)) { - if (!free_pool_huge_page(h, 0)) + if (!free_pool_huge_page(h, nodes_allowed, 0)) break; } while (count < persistent_huge_pages(h)) { - if (!adjust_pool_surplus(h, 1)) + if (!adjust_pool_surplus(h, nodes_allowed, 1)) break; } out: @@ -1329,7 +1347,7 @@ static ssize_t nr_hugepages_store(struct kobject *kobj, if (err) return 0; - h->max_huge_pages = set_max_huge_pages(h, input); + h->max_huge_pages = set_max_huge_pages(h, input, &node_online_map); return count; } @@ -1571,7 +1589,8 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) - h->max_huge_pages = set_max_huge_pages(h, tmp); + h->max_huge_pages = set_max_huge_pages(h, tmp, + &node_online_map); return 0; } -- 1.8.2.3