#include "internal.h"
const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
-static unsigned long nr_huge_pages, free_huge_pages;
+static unsigned long nr_huge_pages, free_huge_pages, reserved_huge_pages; /*
+ * reserved_huge_pages: how many of the free huge pages are set aside by
+ * hugetlb_extend_reservation() and have not yet been consumed by
+ * alloc_huge_page() or released by hugetlb_truncate_reservation().
+ * Those three functions modify it with hugetlb_lock held. */
unsigned long max_huge_pages;
static struct list_head hugepage_freelists[MAX_NUMNODES];
static unsigned int nr_huge_pages_node[MAX_NUMNODES];
return page;
}
+static void free_huge_page(struct page *page) /* Huge page release hook:
+ * alloc_fresh_huge_page() stores this function in page[1].lru.next as
+ * the "dtor".  It re-queues the page through enqueue_huge_page(). */
+{
+ BUG_ON(page_count(page)); /* a page with a non-zero count must never get here */
+
+ INIT_LIST_HEAD(&page->lru); /* start from an empty list linkage */
+
+ spin_lock(&hugetlb_lock); /* enqueue_huge_page() is called with hugetlb_lock held */
+ enqueue_huge_page(page);
+ spin_unlock(&hugetlb_lock);
+}
+
static int alloc_fresh_huge_page(void)
{
static int nid = 0;
struct page *page;
page = alloc_pages_node(nid, GFP_HIGHUSER|__GFP_COMP|__GFP_NOWARN,
HUGETLB_PAGE_ORDER);
- nid = (nid + 1) % num_online_nodes();
+ nid = next_node(nid, node_online_map);
+ if (nid == MAX_NUMNODES)
+ nid = first_node(node_online_map);
if (page) {
page[1].lru.next = (void *)free_huge_page; /* dtor */
spin_lock(&hugetlb_lock);
return 0;
}
-void free_huge_page(struct page *page)
+static struct page *alloc_huge_page(struct vm_area_struct *vma,
+ unsigned long addr) /* Hand out one huge page for 'addr' in 'vma'.
+ * A VM_MAYSHARE mapping whose file index lies below the inode's
+ * prereserved_hpages consumes one reserved page; any other request is
+ * refused unless free_huge_pages exceeds reserved_huge_pages.
+ * Returns the page after set_page_refcounted(), or NULL on failure. */
{
- BUG_ON(page_count(page));
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct page *page;
+ int use_reserve = 0; /* set when this page is drawn from the reserved pool */
+ unsigned long idx;
- INIT_LIST_HEAD(&page->lru);
+ spin_lock(&hugetlb_lock);
+
+ if (vma->vm_flags & VM_MAYSHARE) {
+
+ /* idx = radix tree index, i.e. offset into file in
+ * HPAGE_SIZE units */
+ idx = ((addr - vma->vm_start) >> HPAGE_SHIFT)
+ + (vma->vm_pgoff >> (HPAGE_SHIFT - PAGE_SHIFT));
+
+ /* The hugetlbfs specific inode info stores the number
+ * of "guaranteed available" (huge) pages. That is,
+ * the first 'prereserved_hpages' pages of the inode
+ * are either already instantiated, or have been
+ * pre-reserved (by hugetlb_reserve_for_inode()). Here
+ * we're in the process of instantiating the page, so
+ * we use this to determine whether to draw from the
+ * pre-reserved pool or the truly free pool. */
+ if (idx < HUGETLBFS_I(inode)->prereserved_hpages)
+ use_reserve = 1;
+ }
+
+ if (!use_reserve) {
+ if (free_huge_pages <= reserved_huge_pages)
+ goto fail; /* every free page is already promised to a reservation */
+ } else {
+ BUG_ON(reserved_huge_pages == 0);
+ reserved_huge_pages--; /* this allocation consumes one reservation */
+ }
+
+ page = dequeue_huge_page(vma, addr);
+ if (!page)
+ goto fail; /* NOTE(review): on the use_reserve path the decrement
+ * above is not undone before returning NULL; only the
+ * WARN_ON below reports it -- confirm this is intended */
+
+ spin_unlock(&hugetlb_lock);
+ set_page_refcounted(page);
+ return page;
+
+ fail:
+ WARN_ON(use_reserve); /* reserved allocations shouldn't fail */
+ spin_unlock(&hugetlb_lock);
+ return NULL;
+}
+
+/* hugetlb_extend_reservation()
+ *
+ * Ensure that at least 'atleast' hugepages are, and will remain,
+ * available to instantiate the first 'atleast' pages of the given
+ * inode. If the inode doesn't already have this many pages reserved
+ * or instantiated, set aside some hugepages in the reserved pool to
+ * satisfy later faults (or fail now if there aren't enough, rather
+ * than getting the SIGBUS later).
+ *
+ * 'info' is the hugetlbfs inode info whose prereserved_hpages is
+ * raised; 'atleast' is the wanted count, in huge pages.
+ *
+ * Returns 0 on success, including the case where prereserved_hpages
+ * is already >= atleast, or -ENOMEM if the extra pages would push
+ * reserved_huge_pages above free_huge_pages (nothing is changed in
+ * that case).
+ *
+ * Takes hugetlb_lock and, inside it, the mapping's tree_lock for
+ * reading with interrupts disabled.
+ */
+int hugetlb_extend_reservation(struct hugetlbfs_inode_info *info,
+ unsigned long atleast)
+{
+ struct inode *inode = &info->vfs_inode;
+ unsigned long change_in_reserve = 0;
+ int ret = 0;
spin_lock(&hugetlb_lock);
- enqueue_huge_page(page);
+ read_lock_irq(&inode->i_mapping->tree_lock); /* NOTE(review): the radix
+ * tree is not consulted in this function; confirm what
+ * tree_lock protects here */
+
+ if (info->prereserved_hpages >= atleast)
+ goto out; /* already covered: ret stays 0 */
+
+ /* Because we always call this on shared mappings, none of the
+ * pages beyond info->prereserved_hpages can have been
+ * instantiated, so we need to reserve all of them now. */
+ change_in_reserve = atleast - info->prereserved_hpages;
+
+ if ((reserved_huge_pages + change_in_reserve) > free_huge_pages) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ reserved_huge_pages += change_in_reserve; /* grow the global reserve ... */
+ info->prereserved_hpages = atleast; /* ... and record it on the inode */
+
+ out:
+ read_unlock_irq(&inode->i_mapping->tree_lock);
spin_unlock(&hugetlb_lock);
+
+ return ret;
}
-struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr)
+/* hugetlb_truncate_reservation()
+ *
+ * This returns pages reserved for the given inode to the general free
+ * hugepage pool. If the inode has any pages prereserved, but not
+ * instantiated, beyond offset (atmost << HPAGE_SHIFT), then release
+ * them.
+ *
+ * 'info' is the hugetlbfs inode info to trim; 'atmost' is the new
+ * upper bound for prereserved_hpages, in huge pages. Nothing is done
+ * if prereserved_hpages is already <= atmost.
+ *
+ * Takes hugetlb_lock and, inside it, the mapping's tree_lock for
+ * reading with interrupts disabled; the radix tree lookups below run
+ * under both.
+ */
+void hugetlb_truncate_reservation(struct hugetlbfs_inode_info *info,
+ unsigned long atmost)
{
+ struct inode *inode = &info->vfs_inode;
+ struct address_space *mapping = inode->i_mapping;
+ unsigned long idx;
+ unsigned long change_in_reserve = 0;
struct page *page;
spin_lock(&hugetlb_lock);
- page = dequeue_huge_page(vma, addr);
- if (!page) {
- spin_unlock(&hugetlb_lock);
- return NULL;
+ read_lock_irq(&inode->i_mapping->tree_lock);
+
+ if (info->prereserved_hpages <= atmost)
+ goto out; /* nothing is reserved beyond 'atmost' */
+
+ /* Count pages which were reserved, but not instantiated, and
+ * which we can now release. */
+ for (idx = atmost; idx < info->prereserved_hpages; idx++) {
+ page = radix_tree_lookup(&mapping->page_tree, idx);
+ if (!page)
+ /* Pages which are already instantiated can't
+ * be unreserved (and in fact have already
+ * been removed from the reserved pool) */
+ change_in_reserve++;
}
+
+ BUG_ON(reserved_huge_pages < change_in_reserve);
+ reserved_huge_pages -= change_in_reserve; /* give the unused reservations back */
+ info->prereserved_hpages = atmost;
+
+ out:
+ read_unlock_irq(&inode->i_mapping->tree_lock);
spin_unlock(&hugetlb_lock);
- set_page_refcounted(page);
- return page;
}
static int __init hugetlb_init(void)
return nr_huge_pages;
spin_lock(&hugetlb_lock);
+ count = max(count, reserved_huge_pages);
try_to_free_low(count);
while (count < nr_huge_pages) {
struct page *page = dequeue_huge_page(NULL, 0);
return sprintf(buf,
"HugePages_Total: %5lu\n"
"HugePages_Free: %5lu\n"
+ "HugePages_Rsvd: %5lu\n"
"Hugepagesize: %5lu kB\n",
nr_huge_pages,
free_huge_pages,
+ reserved_huge_pages,
HPAGE_SIZE/1024);
}
nid, free_huge_pages_node[nid]);
}
-int is_hugepage_mem_enough(size_t size)
-{
- return (size + ~HPAGE_MASK)/HPAGE_SIZE <= free_huge_pages;
-}
-
/* Return the number pages of memory we physically have, in PAGE_SIZE units. */
unsigned long hugetlb_total_pages(void)
{
struct page **pages, struct vm_area_struct **vmas,
unsigned long *position, int *length, int i)
{
- unsigned long vpfn, vaddr = *position;
+ unsigned long pfn_offset;
+ unsigned long vaddr = *position;
int remainder = *length;
- vpfn = vaddr/PAGE_SIZE;
spin_lock(&mm->page_table_lock);
while (vaddr < vma->vm_end && remainder) {
pte_t *pte;
break;
}
+ pfn_offset = (vaddr & ~HPAGE_MASK) >> PAGE_SHIFT;
+ page = pte_page(*pte);
+same_page:
if (pages) {
- page = &pte_page(*pte)[vpfn % (HPAGE_SIZE/PAGE_SIZE)];
get_page(page);
- pages[i] = page;
+ pages[i] = page + pfn_offset;
}
if (vmas)
vmas[i] = vma;
vaddr += PAGE_SIZE;
- ++vpfn;
+ ++pfn_offset;
--remainder;
++i;
+ if (vaddr < vma->vm_end && remainder &&
+ pfn_offset < HPAGE_SIZE/PAGE_SIZE) {
+ /*
+ * We use pfn_offset to avoid touching the pageframes
+ * of this compound page.
+ */
+ goto same_page;
+ }
}
spin_unlock(&mm->page_table_lock);
*length = remainder;