* balanced over allowed nodes.
* Called with hugetlb_lock locked.
*/
-static int free_pool_huge_page(struct hstate *h)
+static int free_pool_huge_page(struct hstate *h, bool acct_surplus)
{
int start_nid;
int next_nid;
next_nid = start_nid;
do {
- if (!list_empty(&h->hugepage_freelists[next_nid])) {
+ /*
+ * If we're returning unused surplus pages, only examine
+ * nodes with surplus pages.
+ */
+ if ((!acct_surplus || h->surplus_huge_pages_node[next_nid]) &&
+ !list_empty(&h->hugepage_freelists[next_nid])) {
struct page *page =
list_entry(h->hugepage_freelists[next_nid].next,
struct page, lru);
list_del(&page->lru);
h->free_huge_pages--;
h->free_huge_pages_node[next_nid]--;
+ if (acct_surplus) {
+ h->surplus_huge_pages--;
+ h->surplus_huge_pages_node[next_nid]--;
+ }
update_and_free_page(h, page);
ret = 1;
}
* When releasing a hugetlb pool reservation, any surplus pages that were
* allocated to satisfy the reservation must be explicitly freed if they were
* never used.
+ * Called with hugetlb_lock held.
*/
static void return_unused_surplus_pages(struct hstate *h,
unsigned long unused_resv_pages)
{
- static int nid = -1;
- struct page *page;
unsigned long nr_pages;
- /*
- * We want to release as many surplus pages as possible, spread
- * evenly across all nodes. Iterate across all nodes until we
- * can no longer free unreserved surplus pages. This occurs when
- * the nodes with surplus pages have no free pages.
- */
- unsigned long remaining_iterations = nr_online_nodes;
-
/* Uncommit the reservation */
h->resv_huge_pages -= unused_resv_pages;
nr_pages = min(unused_resv_pages, h->surplus_huge_pages);
- while (remaining_iterations-- && nr_pages) {
- nid = next_node(nid, node_online_map);
- if (nid == MAX_NUMNODES)
- nid = first_node(node_online_map);
-
- if (!h->surplus_huge_pages_node[nid])
- continue;
-
- if (!list_empty(&h->hugepage_freelists[nid])) {
- page = list_entry(h->hugepage_freelists[nid].next,
- struct page, lru);
- list_del(&page->lru);
- update_and_free_page(h, page);
- h->free_huge_pages--;
- h->free_huge_pages_node[nid]--;
- h->surplus_huge_pages--;
- h->surplus_huge_pages_node[nid]--;
- nr_pages--;
- remaining_iterations = nr_online_nodes;
- }
+ /*
+ * We want to release as many surplus pages as possible, spread
+ * evenly across all nodes. Iterate across all nodes until we
+ * can no longer free unreserved surplus pages. This occurs when
+ * the nodes with surplus pages have no free pages.
+ * free_pool_huge_page() will balance the the frees across the
+ * on-line nodes for us and will handle the hstate accounting.
+ */
+ while (nr_pages--) {
+ if (!free_pool_huge_page(h, 1))
+ break;
}
}
NODE_DATA(h->next_nid_to_alloc),
huge_page_size(h), huge_page_size(h), 0);
+ hstate_next_node_to_alloc(h);
if (addr) {
/*
* Use the beginning of the huge page to store the
m = addr;
goto found;
}
- hstate_next_node_to_alloc(h);
nr_nodes--;
}
return 0;
min_count = max(count, min_count);
try_to_free_low(h, min_count);
while (min_count < persistent_huge_pages(h)) {
- if (!free_pool_huge_page(h))
+ if (!free_pool_huge_page(h, 0))
break;
}
while (count < persistent_huge_pages(h)) {
#ifdef CONFIG_SYSCTL
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
- struct file *file, void __user *buffer,
+ void __user *buffer,
size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+ proc_doulongvec_minmax(table, write, buffer, length, ppos);
if (write)
h->max_huge_pages = set_max_huge_pages(h, tmp);
}
int hugetlb_treat_movable_handler(struct ctl_table *table, int write,
- struct file *file, void __user *buffer,
+ void __user *buffer,
size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, file, buffer, length, ppos);
+ proc_dointvec(table, write, buffer, length, ppos);
if (hugepages_treat_as_movable)
htlb_alloc_mask = GFP_HIGHUSER_MOVABLE;
else
}
int hugetlb_overcommit_handler(struct ctl_table *table, int write,
- struct file *file, void __user *buffer,
+ void __user *buffer,
size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
table->data = &tmp;
table->maxlen = sizeof(unsigned long);
- proc_doulongvec_minmax(table, write, file, buffer, length, ppos);
+ proc_doulongvec_minmax(table, write, buffer, length, ppos);
if (write) {
spin_lock(&hugetlb_lock);
return find_lock_page(mapping, idx);
}
+/*
+ * Return whether there is a pagecache page to back given address within VMA.
+ * Caller follow_hugetlb_page() holds page_table_lock so we cannot lock_page.
+ */
+static bool hugetlbfs_pagecache_present(struct hstate *h,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct address_space *mapping;
+ pgoff_t idx;
+ struct page *page;
+
+ mapping = vma->vm_file->f_mapping;
+ idx = vma_hugecache_offset(h, vma, address);
+
+ page = find_get_page(mapping, idx);
+ if (page)
+ put_page(page);
+ return page != NULL;
+}
+
static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, unsigned int flags)
{
return NULL;
}
-static int huge_zeropage_ok(pte_t *ptep, int write, int shared)
-{
- if (!ptep || write || shared)
- return 0;
- else
- return huge_pte_none(huge_ptep_get(ptep));
-}
-
int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, int *length, int i,
- int write)
+ unsigned int flags)
{
unsigned long pfn_offset;
unsigned long vaddr = *position;
int remainder = *length;
struct hstate *h = hstate_vma(vma);
- int zeropage_ok = 0;
- int shared = vma->vm_flags & VM_SHARED;
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
+ int absent;
struct page *page;
/*
* Some archs (sparc64, sh*) have multiple pte_ts to
- * each hugepage. We have to make * sure we get the
+ * each hugepage. We have to make sure we get the
* first, for the page indexing below to work.
*/
pte = huge_pte_offset(mm, vaddr & huge_page_mask(h));
- if (huge_zeropage_ok(pte, write, shared))
- zeropage_ok = 1;
+ absent = !pte || huge_pte_none(huge_ptep_get(pte));
+
+ /*
+ * When coredumping, it suits get_dump_page if we just return
+ * an error where there's an empty slot with no huge pagecache
+ * to back it. This way, we avoid allocating a hugepage, and
+ * the sparse dumpfile avoids allocating disk blocks, but its
+ * huge holes still show up with zeroes where they need to be.
+ */
+ if (absent && (flags & FOLL_DUMP) &&
+ !hugetlbfs_pagecache_present(h, vma, vaddr)) {
+ remainder = 0;
+ break;
+ }
- if (!pte ||
- (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) ||
- (write && !pte_write(huge_ptep_get(pte)))) {
+ if (absent ||
+ ((flags & FOLL_WRITE) && !pte_write(huge_ptep_get(pte)))) {
int ret;
spin_unlock(&mm->page_table_lock);
- ret = hugetlb_fault(mm, vma, vaddr, write);
+ ret = hugetlb_fault(mm, vma, vaddr,
+ (flags & FOLL_WRITE) ? FAULT_FLAG_WRITE : 0);
spin_lock(&mm->page_table_lock);
if (!(ret & VM_FAULT_ERROR))
continue;
remainder = 0;
- if (!i)
- i = -EFAULT;
break;
}
page = pte_page(huge_ptep_get(pte));
same_page:
if (pages) {
- if (zeropage_ok)
- pages[i] = ZERO_PAGE(0);
- else
- pages[i] = mem_map_offset(page, pfn_offset);
+ pages[i] = mem_map_offset(page, pfn_offset);
get_page(pages[i]);
}
*length = remainder;
*position = vaddr;
- return i;
+ return i ? i : -EFAULT;
}
void hugetlb_change_protection(struct vm_area_struct *vma,