X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fhugetlb.c;h=5d7601b0287487321314c1d969f17425b339f9b1;hb=d327cf7449e6fd5cbac784c641770e9366faa386;hp=6058b53dcb8905bc33bc25371083b47167de7596;hpb=7526674de0c921e7f1e9b6f71a1f9d832557b554;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6058b53..5d7601b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -220,6 +220,36 @@ static pgoff_t vma_hugecache_offset(struct hstate *h, } /* + * Return the size of the pages allocated when backing a VMA. In the majority + * cases this will be same size as used by the page table entries. + */ +unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) +{ + struct hstate *hstate; + + if (!is_vm_hugetlb_page(vma)) + return PAGE_SIZE; + + hstate = hstate_vma(vma); + + return 1UL << (hstate->order + PAGE_SHIFT); +} +EXPORT_SYMBOL_GPL(vma_kernel_pagesize); + +/* + * Return the page size being used by the MMU to back a VMA. In the majority + * of cases, the page size used by the kernel matches the MMU size. On + * architectures where it differs, an architecture-specific version of this + * function is required. + */ +#ifndef vma_mmu_pagesize +unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) +{ + return vma_kernel_pagesize(vma); +} +#endif + +/* * Flags for MAP_PRIVATE reservations. These are stored in the bottom * bits of the reservation map pointer, which are always clear due to * alignment. @@ -287,7 +317,7 @@ static void resv_map_release(struct kref *ref) static struct resv_map *vma_resv_map(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); return NULL; @@ -296,7 +326,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_SHARED); + VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); set_vma_private_data(vma, (get_vma_private_data(vma) & HPAGE_RESV_MASK) | (unsigned long)map); @@ -305,7 +335,7 @@ static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - VM_BUG_ON(vma->vm_flags & VM_SHARED); + VM_BUG_ON(vma->vm_flags & VM_MAYSHARE); set_vma_private_data(vma, get_vma_private_data(vma) | flags); } @@ -324,7 +354,7 @@ static void decrement_hugepage_resv_vma(struct hstate *h, if (vma->vm_flags & VM_NORESERVE) return; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { /* Shared mappings always use reserves */ h->resv_huge_pages--; } else if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) { @@ -340,14 +370,14 @@ static void decrement_hugepage_resv_vma(struct hstate *h, void reset_vma_resv_huge_pages(struct vm_area_struct *vma) { VM_BUG_ON(!is_vm_hugetlb_page(vma)); - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) vma->vm_private_data = (void *)0; } /* Returns true if the VMA has associated reserve pages */ static int vma_has_reserves(struct vm_area_struct *vma) { - if (vma->vm_flags & VM_SHARED) + if (vma->vm_flags & VM_MAYSHARE) return 1; if (is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return 1; @@ -371,8 +401,10 @@ static void clear_huge_page(struct page *page, { int i; - if (unlikely(sz > MAX_ORDER_NR_PAGES)) - return clear_gigantic_page(page, addr, sz); + if (unlikely(sz > MAX_ORDER_NR_PAGES)) { + clear_gigantic_page(page, addr, sz); + return; + } might_sleep(); for (i = 0; i < sz/PAGE_SIZE; i++) { @@ -404,8 +436,10 @@ static void copy_huge_page(struct page *dst, struct page *src, int i; struct hstate *h = hstate_vma(vma); - if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) - return copy_gigantic_page(dst, src, addr, vma); + if (unlikely(pages_per_huge_page(h) > MAX_ORDER_NR_PAGES)) { + copy_gigantic_page(dst, src, addr, vma); + return; + } might_sleep(); for (i = 0; i < pages_per_huge_page(h); i++) { @@ -422,24 +456,6 @@ static void enqueue_huge_page(struct hstate *h, struct page *page) h->free_huge_pages_node[nid]++; } -static struct page *dequeue_huge_page(struct hstate *h) -{ - int nid; - struct page *page = NULL; - - for (nid = 0; nid < MAX_NUMNODES; ++nid) { - if (!list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - break; - } - } - return page; -} - static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve) @@ -545,41 +561,6 @@ static void free_huge_page(struct page *page) hugetlb_put_quota(mapping, 1); } -/* - * Increment or decrement surplus_huge_pages. Keep node-specific counters - * balanced by operating on them in a round-robin fashion. - * Returns 1 if an adjustment was made. - */ -static int adjust_pool_surplus(struct hstate *h, int delta) -{ - static int prev_nid; - int nid = prev_nid; - int ret = 0; - - VM_BUG_ON(delta != -1 && delta != 1); - do { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - /* To shrink on this node, there must be a surplus page */ - if (delta < 0 && !h->surplus_huge_pages_node[nid]) - continue; - /* Surplus cannot exceed the total number of pages */ - if (delta > 0 && h->surplus_huge_pages_node[nid] >= - h->nr_huge_pages_node[nid]) - continue; - - h->surplus_huge_pages += delta; - h->surplus_huge_pages_node[nid] += delta; - ret = 1; - break; - } while (nid != prev_nid); - - prev_nid = nid; - return ret; -} - static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) { set_compound_page_dtor(page, free_huge_page); @@ -590,6 +571,34 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) put_page(page); /* free it into the hugepage allocator */ } +static void prep_compound_gigantic_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + struct page *p = page + 1; + + /* we rely on prep_new_huge_page to set the destructor */ + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __SetPageTail(p); + p->first_page = page; + } +} + +int PageHuge(struct page *page) +{ + compound_page_dtor *dtor; + + if (!PageCompound(page)) + return 0; + + page = compound_head(page); + dtor = get_compound_page_dtor(page); + + return dtor == free_huge_page; +} + static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) { struct page *page; @@ -597,7 +606,7 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) if (h->order >= MAX_ORDER) return NULL; - page = alloc_pages_node(nid, + page = alloc_pages_exact_node(nid, htlb_alloc_mask|__GFP_COMP|__GFP_THISNODE| __GFP_REPEAT|__GFP_NOWARN, huge_page_order(h)); @@ -614,22 +623,22 @@ static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) /* * Use a helper variable to find the next node and then - * copy it back to hugetlb_next_nid afterwards: + * copy it back to next_nid_to_alloc afterwards: * otherwise there's a window in which a racer might - * pass invalid nid MAX_NUMNODES to alloc_pages_node. + * pass invalid nid MAX_NUMNODES to alloc_pages_exact_node. * But we don't need to use a spin_lock here: it really * doesn't matter if occasionally a racer chooses the * same nid as we do. Move nid forward in the mask even * if we just successfully allocated a hugepage so that * the next caller gets hugepages on the next node. */ -static int hstate_next_node(struct hstate *h) +static int hstate_next_node_to_alloc(struct hstate *h) { int next_nid; - next_nid = next_node(h->hugetlb_next_nid, node_online_map); + next_nid = next_node(h->next_nid_to_alloc, node_online_map); if (next_nid == MAX_NUMNODES) next_nid = first_node(node_online_map); - h->hugetlb_next_nid = next_nid; + h->next_nid_to_alloc = next_nid; return next_nid; } @@ -640,14 +649,15 @@ static int alloc_fresh_huge_page(struct hstate *h) int next_nid; int ret = 0; - start_nid = h->hugetlb_next_nid; + start_nid = h->next_nid_to_alloc; + next_nid = start_nid; do { - page = alloc_fresh_huge_page_node(h, h->hugetlb_next_nid); + page = alloc_fresh_huge_page_node(h, next_nid); if (page) ret = 1; - next_nid = hstate_next_node(h); - } while (!page && h->hugetlb_next_nid != start_nid); + next_nid = hstate_next_node_to_alloc(h); + } while (!page && next_nid != start_nid); if (ret) count_vm_event(HTLB_BUDDY_PGALLOC); @@ -657,6 +667,61 @@ static int alloc_fresh_huge_page(struct hstate *h) return ret; } +/* + * helper for free_pool_huge_page() - find next node + * from which to free a huge page + */ +static int hstate_next_node_to_free(struct hstate *h) +{ + int next_nid; + next_nid = next_node(h->next_nid_to_free, node_online_map); + if (next_nid == MAX_NUMNODES) + next_nid = first_node(node_online_map); + h->next_nid_to_free = next_nid; + return next_nid; +} + +/* + * Free huge page from pool from next node to free. + * Attempt to keep persistent huge pages more or less + * balanced over allowed nodes. + * Called with hugetlb_lock locked. + */ +static int free_pool_huge_page(struct hstate *h, bool acct_surplus) +{ + int start_nid; + int next_nid; + int ret = 0; + + start_nid = h->next_nid_to_free; + next_nid = start_nid; + + do { + /* + * If we're returning unused surplus pages, only examine + * nodes with surplus pages. + */ + if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) && + !list_empty(&h->hugepage_freelists[next_nid])) { + struct page *page = + list_entry(h->hugepage_freelists[next_nid].next, + struct page, lru); + list_del(&page->lru); + h->free_huge_pages--; + h->free_huge_pages_node[next_nid]--; + if (acct_surplus) { + h->surplus_huge_pages--; + h->surplus_huge_pages_node[next_nid]--; + } + update_and_free_page(h, page); + ret = 1; + } + next_nid = hstate_next_node_to_free(h); + } while (!ret && next_nid != start_nid); + + return ret; +} + static struct page *alloc_buddy_huge_page(struct hstate *h, struct vm_area_struct *vma, unsigned long address) { @@ -828,22 +893,13 @@ free: * When releasing a hugetlb pool reservation, any surplus pages that were * allocated to satisfy the reservation must be explicitly freed if they were * never used. + * Called with hugetlb_lock held. */ static void return_unused_surplus_pages(struct hstate *h, unsigned long unused_resv_pages) { - static int nid = -1; - struct page *page; unsigned long nr_pages; - /* - * We want to release as many surplus pages as possible, spread - * evenly across all nodes. Iterate across all nodes until we - * can no longer free unreserved surplus pages. This occurs when - * the nodes with surplus pages have no free pages. - */ - unsigned long remaining_iterations = num_online_nodes(); - /* Uncommit the reservation */ h->resv_huge_pages -= unused_resv_pages; @@ -853,26 +909,17 @@ static void return_unused_surplus_pages(struct hstate *h, nr_pages = min(unused_resv_pages, h->surplus_huge_pages); - while (remaining_iterations-- && nr_pages) { - nid = next_node(nid, node_online_map); - if (nid == MAX_NUMNODES) - nid = first_node(node_online_map); - - if (!h->surplus_huge_pages_node[nid]) - continue; - - if (!list_empty(&h->hugepage_freelists[nid])) { - page = list_entry(h->hugepage_freelists[nid].next, - struct page, lru); - list_del(&page->lru); - update_and_free_page(h, page); - h->free_huge_pages--; - h->free_huge_pages_node[nid]--; - h->surplus_huge_pages--; - h->surplus_huge_pages_node[nid]--; - nr_pages--; - remaining_iterations = num_online_nodes(); - } + /* + * We want to release as many surplus pages as possible, spread + * evenly across all nodes. Iterate across all nodes until we + * can no longer free unreserved surplus pages. This occurs when + * the nodes with surplus pages have no free pages. + * free_pool_huge_page() will balance the the frees across the + * on-line nodes for us and will handle the hstate accounting. + */ + while (nr_pages--) { + if (!free_pool_huge_page(h, 1)) + break; } } @@ -885,13 +932,13 @@ static void return_unused_surplus_pages(struct hstate *h, * an instantiated the change should be committed via vma_commit_reservation. * No action is required on failure. */ -static int vma_needs_reservation(struct hstate *h, +static long vma_needs_reservation(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); return region_chg(&inode->i_mapping->private_list, idx, idx + 1); @@ -900,7 +947,7 @@ static int vma_needs_reservation(struct hstate *h, return 1; } else { - int err; + long err; pgoff_t idx = vma_hugecache_offset(h, vma, addr); struct resv_map *reservations = vma_resv_map(vma); @@ -916,7 +963,7 @@ static void vma_commit_reservation(struct hstate *h, struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { pgoff_t idx = vma_hugecache_offset(h, vma, addr); region_add(&inode->i_mapping->private_list, idx, idx + 1); @@ -936,7 +983,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, struct page *page; struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned int chg; + long chg; /* * Processes that did not create the mapping will have no reserves and @@ -972,7 +1019,7 @@ static struct page *alloc_huge_page(struct vm_area_struct *vma, return page; } -__attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) +int __weak alloc_bootmem_huge_page(struct hstate *h) { struct huge_bootmem_page *m; int nr_nodes = nodes_weight(node_online_map); @@ -981,9 +1028,10 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) void *addr; addr = __alloc_bootmem_node_nopanic( - NODE_DATA(h->hugetlb_next_nid), + NODE_DATA(h->next_nid_to_alloc), huge_page_size(h), huge_page_size(h), 0); + hstate_next_node_to_alloc(h); if (addr) { /* * Use the beginning of the huge page to store the @@ -991,10 +1039,8 @@ __attribute__((weak)) int alloc_bootmem_huge_page(struct hstate *h) * puts them into the mem_map). */ m = addr; - if (m) - goto found; + goto found; } - hstate_next_node(h); nr_nodes--; } return 0; @@ -1108,6 +1154,53 @@ static inline void try_to_free_low(struct hstate *h, unsigned long count) } #endif +/* + * Increment or decrement surplus_huge_pages. Keep node-specific counters + * balanced by operating on them in a round-robin fashion. + * Returns 1 if an adjustment was made. + */ +static int adjust_pool_surplus(struct hstate *h, int delta) +{ + int start_nid, next_nid; + int ret = 0; + + VM_BUG_ON(delta != -1 && delta != 1); + + if (delta < 0) + start_nid = h->next_nid_to_alloc; + else + start_nid = h->next_nid_to_free; + next_nid = start_nid; + + do { + int nid = next_nid; + if (delta < 0) { + next_nid = hstate_next_node_to_alloc(h); + /* + * To shrink on this node, there must be a surplus page + */ + if (!h->surplus_huge_pages_node[nid]) + continue; + } + if (delta > 0) { + next_nid = hstate_next_node_to_free(h); + /* + * Surplus cannot exceed the total number of pages + */ + if (h->surplus_huge_pages_node[nid] >= + h->nr_huge_pages_node[nid]) + continue; + } + + h->surplus_huge_pages += delta; + h->surplus_huge_pages_node[nid] += delta; + ret = 1; + break; + } while (next_nid != start_nid); + + return ret; +} + #define persistent_huge_pages(h) (h->nr_huge_pages - h->surplus_huge_pages) static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) { @@ -1166,10 +1259,8 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count) min_count = max(count, min_count); try_to_free_low(h, min_count); while (min_count < persistent_huge_pages(h)) { - struct page *page = dequeue_huge_page(h); - if (!page) + if (!free_pool_huge_page(h, 0)) break; - update_and_free_page(h, page); } while (count < persistent_huge_pages(h)) { if (!adjust_pool_surplus(h, 1)) @@ -1381,7 +1472,8 @@ void __init hugetlb_add_hstate(unsigned order) h->free_huge_pages = 0; for (i = 0; i < MAX_NUMNODES; ++i) INIT_LIST_HEAD(&h->hugepage_freelists[i]); - h->hugetlb_next_nid = first_node(node_online_map); + h->next_nid_to_alloc = first_node(node_online_map); + h->next_nid_to_free = first_node(node_online_map); snprintf(h->name, HSTATE_NAME_LEN, "hugepages-%lukB", huge_page_size(h)/1024); @@ -1445,7 +1537,7 @@ static unsigned int cpuset_mems_nr(unsigned int *array) #ifdef CONFIG_SYSCTL int hugetlb_sysctl_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1456,7 +1548,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) h->max_huge_pages = set_max_huge_pages(h, tmp); @@ -1465,10 +1557,10 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write, } int hugetlb_treat_movable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_dointvec(table, write, file, buffer, length, ppos); + proc_dointvec(table, write, buffer, length, ppos); if (hugepages_treat_as_movable) htlb_alloc_mask = GFP_HIGHUSER_MOVABLE; else @@ -1477,7 +1569,7 @@ int hugetlb_treat_movable_handler(struct ctl_table *table, int write, } int hugetlb_overcommit_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { struct hstate *h = &default_hstate; @@ -1488,7 +1580,7 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write, table->data = &tmp; table->maxlen = sizeof(unsigned long); - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write) { spin_lock(&hugetlb_lock); @@ -1629,7 +1721,7 @@ static int hugetlb_vm_op_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return 0; } -struct vm_operations_struct hugetlb_vm_ops = { +const struct vm_operations_struct hugetlb_vm_ops = { .fault = hugetlb_vm_op_fault, .open = hugetlb_vm_op_open, .close = hugetlb_vm_op_close, @@ -1861,7 +1953,7 @@ retry_avoidcopy: * at the time of fork() could consume its reserves on COW instead * of the full address range. */ - if (!(vma->vm_flags & VM_SHARED) && + if (!(vma->vm_flags & VM_MAYSHARE) && is_vma_resv_set(vma, HPAGE_RESV_OWNER) && old_page != pagecache_page) outside_reserve = 1; @@ -1924,8 +2016,28 @@ static struct page *hugetlbfs_pagecache_page(struct hstate *h, return find_lock_page(mapping, idx); } +/* + * Return whether there is a pagecache page to back given address within VMA. + * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page. + */ +static bool hugetlbfs_pagecache_present(struct hstate *h, + struct vm_area_struct *vma, unsigned long address) +{ + struct address_space *mapping; + pgoff_t idx; + struct page *page; + + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, address); + + page = find_get_page(mapping, idx); + if (page) + put_page(page); + return page != NULL; +} + static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, pte_t *ptep, int write_access) + unsigned long address, pte_t *ptep, unsigned int flags) { struct hstate *h = hstate_vma(vma); int ret = VM_FAULT_SIGBUS; @@ -1968,7 +2080,7 @@ retry: clear_huge_page(page, address, huge_page_size(h)); __SetPageUptodate(page); - if (vma->vm_flags & VM_SHARED) { + if (vma->vm_flags & VM_MAYSHARE) { int err; struct inode *inode = mapping->host; @@ -1993,7 +2105,7 @@ retry: * any allocations necessary to record that reservation occur outside * the spinlock. */ - if (write_access && !(vma->vm_flags & VM_SHARED)) + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; @@ -2012,7 +2124,7 @@ retry: && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); - if (write_access && !(vma->vm_flags & VM_SHARED)) { + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_cow(mm, vma, address, ptep, new_pte, page); } @@ -2031,7 +2143,7 @@ backout_unlocked: } int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, - unsigned long address, int write_access) + unsigned long address, unsigned int flags) { pte_t *ptep; pte_t entry; @@ -2052,7 +2164,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, mutex_lock(&hugetlb_instantiation_mutex); entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { - ret = hugetlb_no_page(mm, vma, address, ptep, write_access); + ret = hugetlb_no_page(mm, vma, address, ptep, flags); goto out_mutex; } @@ -2066,13 +2178,13 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * page now as it is used to determine if a reservation has been * consumed. */ - if (write_access && !pte_write(entry)) { + if ((flags & FAULT_FLAG_WRITE) && !pte_write(entry)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } - if (!(vma->vm_flags & VM_SHARED)) + if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, vma, address); } @@ -2083,7 +2195,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, goto out_page_table_lock; - if (write_access) { + if (flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page); @@ -2092,7 +2204,8 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) + if (huge_ptep_set_access_flags(vma, address, ptep, entry, + flags & FAULT_FLAG_WRITE)) update_mmu_cache(vma, address, entry); out_page_table_lock: @@ -2118,54 +2231,55 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return NULL; } -static int huge_zeropage_ok(pte_t *ptep, int write, int shared) -{ - if (!ptep || write || shared) - return 0; - else - return huge_pte_none(huge_ptep_get(ptep)); -} - int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i, - int write) + unsigned int flags) { unsigned long pfn_offset; unsigned long vaddr = *position; int remainder = *length; struct hstate *h = hstate_vma(vma); - int zeropage_ok = 0; - int shared = vma->vm_flags & VM_SHARED; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { pte_t *pte; + int absent; struct page *page; /* * Some archs (sparc64, sh*) have multiple pte_ts to - * each hugepage. We have to make * sure we get the + * each hugepage. We have to make sure we get the * first, for the page indexing below to work. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); - if (huge_zeropage_ok(pte, write, shared)) - zeropage_ok = 1; + absent = !pte || huge_pte_none(huge_ptep_get(pte)); + + /* + * When coredumping, it suits get_dump_page if we just return + * an error where there's an empty slot with no huge pagecache + * to back it. This way, we avoid allocating a hugepage, and + * the sparse dumpfile avoids allocating disk blocks, but its + * huge holes still show up with zeroes where they need to be. + */ + if (absent && (flags & FOLL_DUMP) && + !hugetlbfs_pagecache_present(h, vma, vaddr)) { + remainder = 0; + break; + } - if (!pte || - (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || - (write && !pte_write(huge_ptep_get(pte)))) { + if (absent || + ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) { int ret; spin_unlock(&mm->page_table_lock); - ret = hugetlb_fault(mm, vma, vaddr, write); + ret = hugetlb_fault(mm, vma, vaddr, + (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0); spin_lock(&mm->page_table_lock); if (!(ret & VM_FAULT_ERROR)) continue; remainder = 0; - if (!i) - i = -EFAULT; break; } @@ -2173,10 +2287,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { - if (zeropage_ok) - pages[i] = ZERO_PAGE(0); - else - pages[i] = mem_map_offset(page, pfn_offset); + pages[i] = mem_map_offset(page, pfn_offset); get_page(pages[i]); } @@ -2200,7 +2311,7 @@ same_page: *length = remainder; *position = vaddr; - return i; + return i ? i : -EFAULT; } void hugetlb_change_protection(struct vm_area_struct *vma, @@ -2237,12 +2348,18 @@ void hugetlb_change_protection(struct vm_area_struct *vma, int hugetlb_reserve_pages(struct inode *inode, long from, long to, - struct vm_area_struct *vma) + struct vm_area_struct *vma, + int acctflag) { long ret, chg; struct hstate *h = hstate_inode(inode); - if (vma && vma->vm_flags & VM_NORESERVE) + /* + * Only apply hugepage reservation if asked. At fault time, an + * attempt will be made for VM_NORESERVE to allocate a page + * and filesystem quota without using reserves + */ + if (acctflag & VM_NORESERVE) return 0; /* @@ -2251,7 +2368,7 @@ int hugetlb_reserve_pages(struct inode *inode, * to reserve the full area even if read-only as mprotect() may be * called to make the mapping read-write. Assume !vma is a shm mapping */ - if (!vma || vma->vm_flags & VM_SHARED) + if (!vma || vma->vm_flags & VM_MAYSHARE) chg = region_chg(&inode->i_mapping->private_list, from, to); else { struct resv_map *resv_map = resv_map_alloc(); @@ -2267,14 +2384,32 @@ int hugetlb_reserve_pages(struct inode *inode, if (chg < 0) return chg; + /* There must be enough filesystem quota for the mapping */ if (hugetlb_get_quota(inode->i_mapping, chg)) return -ENOSPC; + + /* + * Check enough hugepages are available for the reservation. + * Hand back the quota if there are not + */ ret = hugetlb_acct_memory(h, chg); if (ret < 0) { hugetlb_put_quota(inode->i_mapping, chg); return ret; } - if (!vma || vma->vm_flags & VM_SHARED) + + /* + * Account for the reservations made. Shared mappings record regions + * that have reservations as they are shared by multiple VMAs. + * When the last VMA disappears, the region map says how much + * the reservation was and the page cache tells how much of + * the reservation was consumed. Private mappings are per-VMA and + * only the consumed reservations are tracked. When the VMA + * disappears, the original reservation is the VMA size and the + * consumed reservations are stored in the map. Hence, nothing + * else has to be done for private mappings here + */ + if (!vma || vma->vm_flags & VM_MAYSHARE) region_add(&inode->i_mapping->private_list, from, to); return 0; } @@ -2285,7 +2420,7 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed) long chg = region_truncate(&inode->i_mapping->private_list, offset); spin_lock(&inode->i_lock); - inode->i_blocks -= blocks_per_huge_page(h); + inode->i_blocks -= (blocks_per_huge_page(h) * freed); spin_unlock(&inode->i_lock); hugetlb_put_quota(inode->i_mapping, (chg - freed));