X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fhugetlb.c;h=28c655ba935396bb14a73957cb07ac58cb5e63fa;hb=88c3bd707c2552bcef93cc3724647903aece159d;hp=82378d44a0c5aded21a84686ac56052d588cea37;hpb=e5ff215941d59f8ae6bf58f6428dc5c26745a612;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 82378d4..28c655b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -7,16 +7,21 @@ #include #include #include +#include #include #include +#include #include #include #include #include #include +#include +#include #include #include +#include #include #include "internal.h" @@ -29,9 +34,12 @@ static int max_hstate; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; +__initdata LIST_HEAD(huge_boot_pages); + /* for command line parsing */ static struct hstate * __initdata parsed_hstate; static unsigned long __initdata default_hstate_max_huge_pages; +static unsigned long __initdata default_hstate_size; #define for_each_hstate(h) \ for ((h) = hstates; (h) < &hstates[max_hstate]; (h)++) @@ -212,6 +220,35 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, } /* + * Return the size of the pages allocated when backing a VMA. In the majority + * cases this will be same size as used by the page table entries. + */ +unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + struct hstate *hstate; + + if (!is_vm_hugetlb_page(vma)) + return PAGE_SIZE; + + hstate = hstate_vma(vma); + + return 1UL << (hstate->order + PAGE_SHIFT); +} + +/* + * Return the page size being used by the MMU to back a VMA. In the majority + * of cases, the page size used by the kernel matches the MMU size. On + * architectures where it differs, an architecture-specific version of this + * function is required. + */ +#ifndef vma_mmu_pagesize +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + return vma_kernel_pagesize(vma); +} +#endif + +/* * Flags for MAP_PRIVATE reservations. These are stored in the bottom * bits of the reservation map pointer, which are always clear due to * alignment. 
@@ -255,7 +292,7 @@ struct resv_map { struct list_head regions; }; -struct resv_map *resv_map_alloc(void) +static struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); if (!resv_map) @@ -267,7 +304,7 @@ struct resv_map *resv_map_alloc(void) return resv_map; } -void resv_map_release(struct kref *ref) +static void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); @@ -282,7 +319,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); - return 0; + return NULL; } static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) @@ -337,20 +374,37 @@ void reset_vma_resv_huge_pages(struct vm_area_struct *vma) } /* Returns true if the VMA has associated reserve pages */ -static int vma_has_private_reserves(struct vm_area_struct *vma) +static int vma_has_reserves(struct vm_area_struct *vma) { if (vma->vm_flags & VM_SHARED) - return 0; - if (!is_vma_resv_set(vma, HPAGE_RESV_OWNER)) - return 0; - return 1; + return 1; + if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) + return 1; + return 0; } +static void clear_gigantic_page(struct page *page, + unsigned long addr, unsigned long sz) +{ + int i; + struct page *p = page; + + might_sleep(); + for (i = 0; i < sz/PAGE_SIZE; i++, p = mem_map_next(p, page, i)) { + cond_resched(); + clear_user_highpage(p, addr + i * PAGE_SIZE); + } +} static void clear_huge_page(struct page *page, unsigned long addr, unsigned long sz) { int i; + if (unlikely(sz > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, sz); + return; + } + might_sleep(); for (i = 0; i < sz/PAGE_SIZE; i++) { cond_resched(); @@ -358,12 +412,34 @@ static void clear_huge_page(struct page *page, } } +static void copy_gigantic_page(struct page *dst, struct page *src, + unsigned long addr, struct vm_area_struct *vma) +{ + int i; + struct hstate *h = hstate_vma(vma); + struct page *dst_base = dst; + struct page *src_base = src; + might_sleep(); + for (i = 0; i < pages_per_huge_page(h); ) { + cond_resched(); + copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); + + i++; + dst = mem_map_next(dst, dst_base, i); + src = mem_map_next(src, src_base, i); + } +} static void copy_huge_page(struct page *dst, struct page *src, unsigned long addr, struct vm_area_struct *vma) { int i; struct hstate *h = hstate_vma(vma); + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { + copy_gigantic_page(dst, src, addr, vma); + return; + } + might_sleep(); for (i = 0; i < pages_per_huge_page(h); i++) { cond_resched(); @@ -415,7 +491,7 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, * have no page reserves. This check ensures that reservations are * not "stolen". 
The child may still get SIGKILLed */ - if (!vma_has_private_reserves(vma) && + if (!vma_has_reserves(vma) && h->free_huge_pages - h->resv_huge_pages == 0) return NULL; @@ -448,6 +524,8 @@ static void update_and_free_page(struct hstate *h, struct page *page) { int i; + VM_BUG_ON(h->order >= MAX_ORDER); + h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++) { @@ -488,7 +566,7 @@ static void free_huge_page(struct page *page) INIT_LIST_HEAD(&page->lru); spin_lock(&hugetlb_lock); - if (h->surplus_huge_pages_node[nid]) { + if (h->surplus_huge_pages_node[nid] && huge_page_order(h) < MAX_ORDER) { update_and_free_page(h, page); h->surplus_huge_pages--; h->surplus_huge_pages_node[nid]--; @@ -549,13 +627,16 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; + if (h->order >= MAX_ORDER) + return NULL; + page = alloc_pages_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); if (page) { if (arch_prepare_hugepage(page)) { - __free_pages(page, HUGETLB_PAGE_ORDER); + __free_pages(page, huge_page_order(h)); return NULL; } prep_new_huge_page(h, page, nid); @@ -564,6 +645,27 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) return page; } +/* + * Use a helper variable to find the next node and then + * copy it back to hugetlb_next_nid afterwards: + * otherwise there's a window in which a racer might + * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * But we don't need to use a spin_lock here: it really + * doesn't matter if occasionally a racer chooses the + * same nid as we do. Move nid forward in the mask even + * if we just successfully allocated a hugepage so that + * the next caller gets hugepages on the next node. + */ +static int hstate_next_node(struct hstate *h) +{ + int next_nid; + next_nid = next_node(h->hugetlb_next_nid, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + h->hugetlb_next_nid = next_nid; + return next_nid; +} + static int alloc_fresh_huge_page(struct hstate *h) { struct page *page; @@ -577,21 +679,7 @@ static int alloc_fresh_huge_page(struct hstate *h) page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); if (page) ret = 1; - /* - * Use a helper variable to find the next node and then - * copy it back to hugetlb_next_nid afterwards: - * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. - * But we don't need to use a spin_lock here: it really - * doesn't matter if occasionally a racer chooses the - * same nid as we do. Move nid forward in the mask even - * if we just successfully allocated a hugepage so that - * the next caller gets hugepages on the next node. 
- */ - next_nid = next_node(h->hugetlb_next_nid, node_online_map); - if (next_nid == MAX_NUMNODES) - next_nid = first_node(node_online_map); - h->hugetlb_next_nid = next_nid; + next_nid = hstate_next_node(h); } while (!page && h->hugetlb_next_nid != start_nid); if (ret) @@ -608,6 +696,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, struct page *page; unsigned int nid; + if (h->order >= MAX_ORDER) + return NULL; + /* * Assume we will successfully allocate the surplus page to * prevent racing processes from causing the surplus to exceed @@ -645,6 +736,11 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); + if (page && arch_prepare_hugepage(page)) { + __free_pages(page, huge_page_order(h)); + return NULL; + } + spin_lock(&hugetlb_lock); if (page) { /* @@ -784,6 +880,10 @@ static void return_unused_surplus_pages(struct hstate *h, /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; + /* Cannot return gigantic pages currently */ + if (h->order >= MAX_ORDER) + return; + nr_pages = min(unused_resv_pages, h->surplus_huge_pages); while (remaining_iterations-- && nr_pages) { @@ -818,7 +918,7 @@ static void return_unused_surplus_pages(struct hstate *h, * an instantiated the change should be committed via vma_commit_reservation. * No action is required on failure. */ -static int vma_needs_reservation(struct hstate *h, +static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; @@ -833,7 +933,7 @@ static int vma_needs_reservation(struct hstate *h, return 1; } else { - int err; + long err; pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); @@ -869,7 +969,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned int chg; + long chg; /* * Processes that did not create the mapping will have no reserves and @@ -905,115 +1005,120 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } -static void __init hugetlb_init_one_hstate(struct hstate *h) +int __weak alloc_bootmem_huge_page(struct hstate *h) { - unsigned long i; + struct huge_bootmem_page *m; + int nr_nodes = nodes_weight(node_online_map); - for (i = 0; i < MAX_NUMNODES; ++i) - INIT_LIST_HEAD(&h->hugepage_freelists[i]); + while (nr_nodes) { + void *addr; - h->hugetlb_next_nid = first_node(node_online_map); + addr = __alloc_bootmem_node_nopanic( + NODE_DATA(h->hugetlb_next_nid), + huge_page_size(h), huge_page_size(h), 0); - for (i = 0; i < h->max_huge_pages; ++i) { - if (!alloc_fresh_huge_page(h)) - break; + if (addr) { + /* + * Use the beginning of the huge page to store the + * huge_bootmem_page struct (until gather_bootmem + * puts them into the mem_map). 
+ */ + m = addr; + goto found; + } + hstate_next_node(h); + nr_nodes--; } - h->max_huge_pages = h->free_huge_pages = h->nr_huge_pages = i; + return 0; + +found: + BUG_ON((unsigned long)virt_to_phys(m) & (huge_page_size(h) - 1)); + /* Put them into a private list first because mem_map is not up yet */ + list_add(&m->list, &huge_boot_pages); + m->hstate = h; + return 1; } -static void __init hugetlb_init_hstates(void) +static void prep_compound_huge_page(struct page *page, int order) { - struct hstate *h; - - for_each_hstate(h) { - hugetlb_init_one_hstate(h); - } + if (unlikely(order > (MAX_ORDER - 1))) + prep_compound_gigantic_page(page, order); + else + prep_compound_page(page, order); } -static void __init report_hugepages(void) +/* Put bootmem huge pages into the standard lists after mem_map is up */ +static void __init gather_bootmem_prealloc(void) { - struct hstate *h; + struct huge_bootmem_page *m; - for_each_hstate(h) { - printk(KERN_INFO "Total HugeTLB memory allocated, " - "%ld %dMB pages\n", - h->free_huge_pages, - 1 << (h->order + PAGE_SHIFT - 20)); + list_for_each_entry(m, &huge_boot_pages, list) { + struct page *page = virt_to_page(m); + struct hstate *h = m->hstate; + __ClearPageReserved(page); + WARN_ON(page_count(page) != 1); + prep_compound_huge_page(page, h->order); + prep_new_huge_page(h, page, page_to_nid(page)); } } -static int __init hugetlb_init(void) +static void __init hugetlb_hstate_alloc_pages(struct hstate *h) { - BUILD_BUG_ON(HPAGE_SHIFT == 0); + unsigned long i; - if (!size_to_hstate(HPAGE_SIZE)) { - hugetlb_add_hstate(HUGETLB_PAGE_ORDER); - parsed_hstate->max_huge_pages = default_hstate_max_huge_pages; + for (i = 0; i < h->max_huge_pages; ++i) { + if (h->order >= MAX_ORDER) { + if (!alloc_bootmem_huge_page(h)) + break; + } else if (!alloc_fresh_huge_page(h)) + break; } - default_hstate_idx = size_to_hstate(HPAGE_SIZE) - hstates; - - hugetlb_init_hstates(); - - report_hugepages(); - - return 0; + h->max_huge_pages = i; } -module_init(hugetlb_init); -/* Should be called on processing a hugepagesz=... option */ -void __init hugetlb_add_hstate(unsigned order) +static void __init hugetlb_init_hstates(void) { struct hstate *h; - if (size_to_hstate(PAGE_SIZE << order)) { - printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); - return; + + for_each_hstate(h) { + /* oversize hugepages were init'ed in early boot */ + if (h->order < MAX_ORDER) + hugetlb_hstate_alloc_pages(h); } - BUG_ON(max_hstate >= HUGE_MAX_HSTATE); - BUG_ON(order == 0); - h = &hstates[max_hstate++]; - h->order = order; - h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); - hugetlb_init_one_hstate(h); - parsed_hstate = h; } -static int __init hugetlb_setup(char *s) +static char * __init memfmt(char *buf, unsigned long n) { - unsigned long *mhp; - - /* - * !max_hstate means we haven't parsed a hugepagesz= parameter yet, - * so this hugepages= parameter goes to the "default hstate". 
- */ - if (!max_hstate) - mhp = &default_hstate_max_huge_pages; + if (n >= (1UL << 30)) + sprintf(buf, "%lu GB", n >> 30); + else if (n >= (1UL << 20)) + sprintf(buf, "%lu MB", n >> 20); else - mhp = &parsed_hstate->max_huge_pages; - - if (sscanf(s, "%lu", mhp) <= 0) - *mhp = 0; - - return 1; + sprintf(buf, "%lu KB", n >> 10); + return buf; } -__setup("hugepages=", hugetlb_setup); -static unsigned int cpuset_mems_nr(unsigned int *array) +static void __init report_hugepages(void) { - int node; - unsigned int nr = 0; - - for_each_node_mask(node, cpuset_current_mems_allowed) - nr += array[node]; + struct hstate *h; - return nr; + for_each_hstate(h) { + char buf[32]; + printk(KERN_INFO "HugeTLB registered %s page size, " + "pre-allocated %ld pages\n", + memfmt(buf, huge_page_size(h)), + h->free_huge_pages); + } } -#ifdef CONFIG_SYSCTL #ifdef CONFIG_HIGHMEM static void try_to_free_low(struct hstate *h, unsigned long count) { int i; + if (h->order >= MAX_ORDER) + return; + for (i = 0; i < MAX_NUMNODES; ++i) { struct page *page, *next; struct list_head *freel = &h->hugepage_freelists[i]; @@ -1040,6 +1145,9 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { unsigned long min_count, ret; + if (h->order >= MAX_ORDER) + return h->max_huge_pages; + /* * Increase the pool size * First take pages out of surplus state. Then make up the @@ -1105,6 +1213,269 @@ out: return ret; } +#define HSTATE_ATTR_RO(_name) \ + static struct kobj_attribute _name##_attr = __ATTR_RO(_name) + +#define HSTATE_ATTR(_name) \ + static struct kobj_attribute _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +static struct kobject *hugepages_kobj; +static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE]; + +static struct hstate *kobj_to_hstate(struct kobject *kobj) +{ + int i; + for (i = 0; i < HUGE_MAX_HSTATE; i++) + if (hstate_kobjs[i] == kobj) + return &hstates[i]; + BUG(); + return NULL; +} + +static ssize_t nr_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_huge_pages); +} +static ssize_t nr_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + h->max_huge_pages = set_max_huge_pages(h, input); + + return count; +} +HSTATE_ATTR(nr_hugepages); + +static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages); +} +static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, size_t count) +{ + int err; + unsigned long input; + struct hstate *h = kobj_to_hstate(kobj); + + err = strict_strtoul(buf, 10, &input); + if (err) + return 0; + + spin_lock(&hugetlb_lock); + h->nr_overcommit_huge_pages = input; + spin_unlock(&hugetlb_lock); + + return count; +} +HSTATE_ATTR(nr_overcommit_hugepages); + +static ssize_t free_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->free_huge_pages); +} +HSTATE_ATTR_RO(free_hugepages); + +static ssize_t resv_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return 
sprintf(buf, "%lu\n", h->resv_huge_pages); +} +HSTATE_ATTR_RO(resv_hugepages); + +static ssize_t surplus_hugepages_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct hstate *h = kobj_to_hstate(kobj); + return sprintf(buf, "%lu\n", h->surplus_huge_pages); +} +HSTATE_ATTR_RO(surplus_hugepages); + +static struct attribute *hstate_attrs[] = { + &nr_hugepages_attr.attr, + &nr_overcommit_hugepages_attr.attr, + &free_hugepages_attr.attr, + &resv_hugepages_attr.attr, + &surplus_hugepages_attr.attr, + NULL, +}; + +static struct attribute_group hstate_attr_group = { + .attrs = hstate_attrs, +}; + +static int __init hugetlb_sysfs_add_hstate(struct hstate *h) +{ + int retval; + + hstate_kobjs[h - hstates] = kobject_create_and_add(h->name, + hugepages_kobj); + if (!hstate_kobjs[h - hstates]) + return -ENOMEM; + + retval = sysfs_create_group(hstate_kobjs[h - hstates], + &hstate_attr_group); + if (retval) + kobject_put(hstate_kobjs[h - hstates]); + + return retval; +} + +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h); + if (err) + printk(KERN_ERR "Hugetlb: Unable to add hstate %s", + h->name); + } +} + +static void __exit hugetlb_exit(void) +{ + struct hstate *h; + + for_each_hstate(h) { + kobject_put(hstate_kobjs[h - hstates]); + } + + kobject_put(hugepages_kobj); +} +module_exit(hugetlb_exit); + +static int __init hugetlb_init(void) +{ + /* Some platform decide whether they support huge pages at boot + * time. On these, such as powerpc, HPAGE_SHIFT is set to 0 when + * there is no such support + */ + if (HPAGE_SHIFT == 0) + return 0; + + if (!size_to_hstate(default_hstate_size)) { + default_hstate_size = HPAGE_SIZE; + if (!size_to_hstate(default_hstate_size)) + hugetlb_add_hstate(HUGETLB_PAGE_ORDER); + } + default_hstate_idx = size_to_hstate(default_hstate_size) - hstates; + if (default_hstate_max_huge_pages) + default_hstate.max_huge_pages = default_hstate_max_huge_pages; + + hugetlb_init_hstates(); + + gather_bootmem_prealloc(); + + report_hugepages(); + + hugetlb_sysfs_init(); + + return 0; +} +module_init(hugetlb_init); + +/* Should be called on processing a hugepagesz=... option */ +void __init hugetlb_add_hstate(unsigned order) +{ + struct hstate *h; + unsigned long i; + + if (size_to_hstate(PAGE_SIZE << order)) { + printk(KERN_WARNING "hugepagesz= specified twice, ignoring\n"); + return; + } + BUG_ON(max_hstate >= HUGE_MAX_HSTATE); + BUG_ON(order == 0); + h = &hstates[max_hstate++]; + h->order = order; + h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1); + h->nr_huge_pages = 0; + h->free_huge_pages = 0; + for (i = 0; i < MAX_NUMNODES; ++i) + INIT_LIST_HEAD(&h->hugepage_freelists[i]); + h->hugetlb_next_nid = first_node(node_online_map); + snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", + huge_page_size(h)/1024); + + parsed_hstate = h; +} + +static int __init hugetlb_nrpages_setup(char *s) +{ + unsigned long *mhp; + static unsigned long *last_mhp; + + /* + * !max_hstate means we haven't parsed a hugepagesz= parameter yet, + * so this hugepages= parameter goes to the "default hstate". 
+ */ + if (!max_hstate) + mhp = &default_hstate_max_huge_pages; + else + mhp = &parsed_hstate->max_huge_pages; + + if (mhp == last_mhp) { + printk(KERN_WARNING "hugepages= specified twice without " + "interleaving hugepagesz=, ignoring\n"); + return 1; + } + + if (sscanf(s, "%lu", mhp) <= 0) + *mhp = 0; + + /* + * Global state is always initialized later in hugetlb_init. + * But we need to allocate >= MAX_ORDER hstates here early to still + * use the bootmem allocator. + */ + if (max_hstate && parsed_hstate->order >= MAX_ORDER) + hugetlb_hstate_alloc_pages(parsed_hstate); + + last_mhp = mhp; + + return 1; +} +__setup("hugepages=", hugetlb_nrpages_setup); + +static int __init hugetlb_default_setup(char *s) +{ + default_hstate_size = memparse(s, &s); + return 1; +} +__setup("default_hugepagesz=", hugetlb_default_setup); + +static unsigned int cpuset_mems_nr(unsigned int *array) +{ + int node; + unsigned int nr = 0; + + for_each_node_mask(node, cpuset_current_mems_allowed) + nr += array[node]; + + return nr; +} + +#ifdef CONFIG_SYSCTL int hugetlb_sysctl_handler(struct ctl_table *table, int write, struct file *file, void __user *buffer, size_t *length, loff_t *ppos) @@ -1162,15 +1533,15 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, #endif /* CONFIG_SYSCTL */ -int hugetlb_report_meminfo(char *buf) +void hugetlb_report_meminfo(struct seq_file *m) { struct hstate *h = &default_hstate; - return sprintf(buf, - "HugePages_Total: %5lu\n" - "HugePages_Free: %5lu\n" - "HugePages_Rsvd: %5lu\n" - "HugePages_Surp: %5lu\n" - "Hugepagesize: %5lu kB\n", + seq_printf(m, + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %8lu kB\n", h->nr_huge_pages, h->free_huge_pages, h->resv_huge_pages, @@ -1271,8 +1642,10 @@ static void hugetlb_vm_op_close(struct vm_area_struct *vma) kref_put(&reservations->refs, resv_map_release); - if (reserve) + if (reserve) { hugetlb_acct_memory(h, -reserve); + hugetlb_put_quota(vma->vm_file->f_mapping, reserve); + } } } @@ -1389,6 +1762,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, BUG_ON(start & ~huge_page_mask(h)); BUG_ON(end & ~huge_page_mask(h)); + mmu_notifier_invalidate_range_start(mm, start, end); spin_lock(&mm->page_table_lock); for (address = start; address < end; address += sz) { ptep = huge_pte_offset(mm, address); @@ -1430,6 +1804,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, } spin_unlock(&mm->page_table_lock); flush_tlb_range(vma, start, end); + mmu_notifier_invalidate_range_end(mm, start, end); list_for_each_entry_safe(page, tmp, &page_list, lru) { list_del(&page->lru); put_page(page); @@ -1439,19 +1814,9 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, unsigned long end, struct page *ref_page) { - /* - * It is undesirable to test vma->vm_file as it should be non-null - * for valid hugetlb area. However, vm_file will be NULL in the error - * cleanup path of do_mmap_pgoff. When hugetlbfs ->mmap method fails, - * do_mmap_pgoff() nullifies vma->vm_file before calling this function - * to clean up. Since no pte has actually been setup, it is safe to - * do nothing in this case. 
- */ - if (vma->vm_file) { - spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); - __unmap_hugepage_range(vma, start, end, ref_page); - spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); - } + spin_lock(&vma->vm_file->f_mapping->i_mmap_lock); + __unmap_hugepage_range(vma, start, end, ref_page); + spin_unlock(&vma->vm_file->f_mapping->i_mmap_lock); } /* @@ -1460,11 +1825,10 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ -int unmap_ref_private(struct mm_struct *mm, - struct vm_area_struct *vma, - struct page *page, - unsigned long address) +static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) { + struct hstate *h = hstate_vma(vma); struct vm_area_struct *iter_vma; struct address_space *mapping; struct prio_tree_iter iter; @@ -1474,7 +1838,7 @@ int unmap_ref_private(struct mm_struct *mm, * vm_pgoff is in PAGE_SIZE units, hence the different calculation * from page cache lookup which is in HPAGE_SIZE units. */ - address = address & huge_page_mask(hstate_vma(vma)); + address = address & huge_page_mask(h); pgoff = ((address - vma->vm_start) >> PAGE_SHIFT) + (vma->vm_pgoff >> PAGE_SHIFT); mapping = (struct address_space *)page_private(page); @@ -1493,7 +1857,7 @@ int unmap_ref_private(struct mm_struct *mm, */ if (!is_vma_resv_set(iter_vma, HPAGE_RESV_OWNER)) unmap_hugepage_range(iter_vma, - address, address + HPAGE_SIZE, + address, address + huge_page_size(h), page); } @@ -1655,6 +2019,18 @@ retry: lock_page(page); } + /* + * If we are going to COW a private mapping later, we examine the + * pending reservations for this page now. This will ensure that + * any allocations necessary to record that reservation occur outside + * the spinlock. + */ + if (write_access && !(vma->vm_flags & VM_SHARED)) + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto backout_unlocked; + } + spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> huge_page_shift(h); if (idx >= size) @@ -1680,6 +2056,7 @@ out: backout: spin_unlock(&mm->page_table_lock); +backout_unlocked: unlock_page(page); put_page(page); goto out; @@ -1691,6 +2068,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *ptep; pte_t entry; int ret; + struct page *pagecache_page = NULL; static DEFINE_MUTEX(hugetlb_instantiation_mutex); struct hstate *h = hstate_vma(vma); @@ -1707,30 +2085,79 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { ret = hugetlb_no_page(mm, vma, address, ptep, write_access); - mutex_unlock(&hugetlb_instantiation_mutex); - return ret; + goto out_mutex; } ret = 0; + /* + * If we are going to COW the mapping later, we examine the pending + * reservations for this page now. This will ensure that any + * allocations necessary to record that reservation occur outside the + * spinlock. For private mappings, we also lookup the pagecache + * page now as it is used to determine if a reservation has been + * consumed. 
+ */ + if (write_access && !pte_write(entry)) { + if (vma_needs_reservation(h, vma, address) < 0) { + ret = VM_FAULT_OOM; + goto out_mutex; + } + + if (!(vma->vm_flags & VM_SHARED)) + pagecache_page = hugetlbfs_pagecache_page(h, + vma, address); + } + spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ - if (likely(pte_same(entry, huge_ptep_get(ptep)))) - if (write_access && !pte_write(entry)) { - struct page *page; - page = hugetlbfs_pagecache_page(h, vma, address); - ret = hugetlb_cow(mm, vma, address, ptep, entry, page); - if (page) { - unlock_page(page); - put_page(page); - } + if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + goto out_page_table_lock; + + + if (write_access) { + if (!pte_write(entry)) { + ret = hugetlb_cow(mm, vma, address, ptep, entry, + pagecache_page); + goto out_page_table_lock; } + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) + update_mmu_cache(vma, address, entry); + +out_page_table_lock: spin_unlock(&mm->page_table_lock); + + if (pagecache_page) { + unlock_page(pagecache_page); + put_page(pagecache_page); + } + +out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); return ret; } +/* Can be overriden by architectures */ +__attribute__((weak)) struct page * +follow_huge_pud(struct mm_struct *mm, unsigned long address, + pud_t *pud, int write) +{ + BUG(); + return NULL; +} + +static int huge_zeropage_ok(pte_t *ptep, int write, int shared) +{ + if (!ptep || write || shared) + return 0; + else + return huge_pte_none(huge_ptep_get(ptep)); +} + int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i, @@ -1740,6 +2167,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr = *position; int remainder = *length; struct hstate *h = hstate_vma(vma); + int zeropage_ok = 0; + int shared = vma->vm_flags & VM_SHARED; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { @@ -1752,8 +2181,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * first, for the page indexing below to work. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + if (huge_zeropage_ok(pte, write, shared)) + zeropage_ok = 1; - if (!pte || huge_pte_none(huge_ptep_get(pte)) || + if (!pte || + (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || (write && !pte_write(huge_ptep_get(pte)))) { int ret; @@ -1773,8 +2205,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { - get_page(page); - pages[i] = page + pfn_offset; + if (zeropage_ok) + pages[i] = ZERO_PAGE(0); + else + pages[i] = mem_map_offset(page, pfn_offset); + get_page(pages[i]); } if (vmas) @@ -1834,12 +2269,18 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int acctflag) { long ret, chg; struct hstate *h = hstate_inode(inode); - if (vma && vma->vm_flags & VM_NORESERVE) + /* + * Only apply hugepage reservation if asked. 
At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page + * and filesystem quota without using reserves + */ + if (acctflag & VM_NORESERVE) return 0; /* @@ -1864,13 +2305,31 @@ int hugetlb_reserve_pages(struct inode *inode, if (chg < 0) return chg; + /* There must be enough filesystem quota for the mapping */ if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; + + /* + * Check enough hugepages are available for the reservation. + * Hand back the quota if there are not + */ ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); return ret; } + + /* + * Account for the reservations made. Shared mappings record regions + * that have reservations as they are shared by multiple VMAs. + * When the last VMA disappears, the region map says how much + * the reservation was and the page cache tells how much of + * the reservation was consumed. Private mappings are per-VMA and + * only the consumed reservations are tracked. When the VMA + * disappears, the original reservation is the VMA size and the + * consumed reservations are stored in the map. Hence, nothing + * else has to be done for private mappings here + */ if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); return 0;
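 if (!vma || vma->vm_flags & VM_SHARED) region_add(&inode->i_mapping->private_list, from, to); return 0;

---

Editor's note (not part of the patch): the hunks above register a "hugepages" kobject under mm_kobj and a per-hstate "hugepages-%lukB" directory exposing nr_hugepages, free_hugepages, resv_hugepages, surplus_hugepages and nr_overcommit_hugepages. The sketch below is a minimal userspace illustration of reading two of those counters; the "hugepages-2048kB" directory name is an assumption (it depends on the architecture's default huge page size and on which hstates were registered at boot).

/*
 * Minimal sketch: read per-hstate counters from the sysfs directory
 * created by this patch under /sys/kernel/mm/hugepages/.
 * The hstate directory name below is an assumption.
 */
#include <stdio.h>

static long read_counter(const char *path)
{
	FILE *f = fopen(path, "r");
	long val = -1;

	if (!f)
		return -1;
	if (fscanf(f, "%ld", &val) != 1)
		val = -1;
	fclose(f);
	return val;
}

int main(void)
{
	const char *dir = "/sys/kernel/mm/hugepages/hugepages-2048kB";
	char path[256];
	long nr, nr_free;

	snprintf(path, sizeof(path), "%s/nr_hugepages", dir);
	nr = read_counter(path);
	snprintf(path, sizeof(path), "%s/free_hugepages", dir);
	nr_free = read_counter(path);

	printf("nr_hugepages:   %ld\n", nr);
	printf("free_hugepages: %ld\n", nr_free);
	return 0;
}

Writing a value into nr_hugepages goes through nr_hugepages_store() and set_max_huge_pages() shown in the patch, which is also why stores to gigantic hstates (order >= MAX_ORDER) are rejected there: those pages can only be allocated from bootmem via the hugepages=/hugepagesz= boot parameters handled by hugetlb_nrpages_setup().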