#include <linux/bootmem.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
+#include <linux/kmemcheck.h>
#include <linux/module.h>
#include <linux/suspend.h>
#include <linux/pagevec.h>
#include <linux/page_cgroup.h>
#include <linux/debugobjects.h>
#include <linux/kmemleak.h>
+#include <trace/events/kmem.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
unsigned long totalram_pages __read_mostly;
unsigned long totalreserve_pages __read_mostly;
-unsigned long highest_memmap_pfn __read_mostly;
int percpu_pagelist_fraction;
+gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
#ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE
int pageblock_order __read_mostly;
int min_free_kbytes = 1024;
-unsigned long __meminitdata nr_kernel_pages;
-unsigned long __meminitdata nr_all_pages;
+static unsigned long __meminitdata nr_kernel_pages;
+static unsigned long __meminitdata nr_all_pages;
static unsigned long __meminitdata dma_reserve;
#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
PB_migrate, PB_migrate_end);
}
+bool oom_killer_disabled __read_mostly;
+
#ifdef CONFIG_DEBUG_VM
static int page_outside_zone_boundaries(struct zone *zone, struct page *page)
{
static unsigned long nr_shown;
static unsigned long nr_unshown;
+ /* Don't complain about poisoned pages */
+ if (PageHWPoison(page)) {
+ __ClearPageBuddy(page);
+ return;
+ }
+
/*
* Allow a burst of 60 reports, then keep quiet for that minute;
* or allow a steady drip of one report per second.
*/
static inline void free_page_mlock(struct page *page)
{
- __ClearPageMlocked(page);
__dec_zone_page_state(page, NR_MLOCK);
__count_vm_event(UNEVICTABLE_MLOCKFREED);
}
}
/*
- * Frees a list of pages.
+ * Frees a number of pages from the PCP lists
+ * (pcp->lists[], indexed by migratetype below MIGRATE_PCPTYPES).
* Assumes all pages on list are in same zone, and of same order.
* count is the number of pages to free.
*
* And clear the zone's pages_scanned counter, to hold off the "all pages are
* pinned" detection logic.
*/
-static void free_pages_bulk(struct zone *zone, int count,
- struct list_head *list, int order)
+static void free_pcppages_bulk(struct zone *zone, int count,
+ struct per_cpu_pages *pcp)
{
+ int migratetype = 0; /* index of the pcp list being drained; advanced before first use */
+ int batch_free = 0; /* upper bound on pages taken from the current list in one pass */
+
spin_lock(&zone->lock);
zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE);
zone->pages_scanned = 0;
- __mod_zone_page_state(zone, NR_FREE_PAGES, count << order);
- while (count--) {
+ __mod_zone_page_state(zone, NR_FREE_PAGES, count); /* every page is freed at order 0, so count pages become free */
+ while (count) {
struct page *page;
+ struct list_head *list;
- VM_BUG_ON(list_empty(list));
- page = list_entry(list->prev, struct page, lru);
- /* have to delete it as __free_one_page list manipulates */
- list_del(&page->lru);
- __free_one_page(page, zone, order, page_private(page));
+ /*
+ * Remove pages from lists in a round-robin fashion. A
+ * batch_free count is maintained that is incremented when an
+ * empty list is encountered. This is so more pages are freed
+ * off fuller lists instead of spinning excessively around empty
+ * lists
+ *
+ * NOTE: this search only ends once a non-empty list is found,
+ * so count must not exceed the number of pages actually held
+ * on the pcp lists.
+ */
+ do {
+ batch_free++;
+ if (++migratetype == MIGRATE_PCPTYPES)
+ migratetype = 0;
+ list = &pcp->lists[migratetype];
+ } while (list_empty(list));
+
+ do {
+ page = list_entry(list->prev, struct page, lru); /* take the page at the tail of the list */
+ /* must delete as __free_one_page list manipulates */
+ list_del(&page->lru);
+ __free_one_page(page, zone, 0, migratetype);
+ trace_mm_page_pcpu_drain(page, 0, migratetype);
+ } while (--count && --batch_free && !list_empty(list)); /* stop when count, this list's quota, or the list itself runs out */
}
spin_unlock(&zone->lock);
}
unsigned long flags;
int i;
int bad = 0;
- int clearMlocked = PageMlocked(page);
+ int wasMlocked = __TestClearPageMlocked(page);
+
+ kmemcheck_free_shadow(page, order);
for (i = 0 ; i < (1 << order) ; ++i)
bad += free_pages_check(page + i);
kernel_map_pages(page, 1 << order, 0);
local_irq_save(flags);
- if (unlikely(clearMlocked))
+ if (unlikely(wasMlocked))
free_page_mlock(page);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, order,
/*
* This page is about to be returned from the page allocator
*/
-static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+static inline int check_new_page(struct page *page)
{
if (unlikely(page_mapcount(page) |
(page->mapping != NULL) |
bad_page(page);
return 1;
}
+ return 0;
+}
+
+static int prep_new_page(struct page *page, int order, gfp_t gfp_flags)
+{
+ int i;
+
+ for (i = 0; i < (1 << order); i++) {
+ struct page *p = page + i;
+ if (unlikely(check_new_page(p)))
+ return 1;
+ }
set_page_private(page, 0);
set_page_refcounted(page);
return move_freepages(zone, start_page, end_page, migratetype);
}
+static void change_pageblock_range(struct page *pageblock_page,
+ int start_order, int migratetype) /* set migratetype on each pageblock of an order-start_order range starting at pageblock_page */
+{
+ int nr_pageblocks = 1 << (start_order - pageblock_order); /* pageblocks in the range; the visible caller only calls this when the order is >= pageblock_order */
+
+ while (nr_pageblocks--) { /* one iteration per pageblock */
+ set_pageblock_migratetype(pageblock_page, migratetype);
+ pageblock_page += pageblock_nr_pages; /* step forward by one pageblock's worth of pages */
+ }
+}
+
/* Remove an element from the buddy allocator from the fallback list */
static inline struct page *
__rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
* aggressive about taking ownership of free pages
*/
if (unlikely(current_order >= (pageblock_order >> 1)) ||
- start_migratetype == MIGRATE_RECLAIMABLE) {
+ start_migratetype == MIGRATE_RECLAIMABLE ||
+ page_group_by_mobility_disabled) {
unsigned long pages;
pages = move_freepages_block(zone, page,
start_migratetype);
/* Claim the whole block if over half of it is free */
- if (pages >= (1 << (pageblock_order-1)))
+ if (pages >= (1 << (pageblock_order-1)) ||
+ page_group_by_mobility_disabled)
set_pageblock_migratetype(page,
start_migratetype);
list_del(&page->lru);
rmv_page_order(page);
- if (current_order == pageblock_order)
- set_pageblock_migratetype(page,
+ /* Take ownership for orders >= pageblock_order */
+ if (current_order >= pageblock_order)
+ change_pageblock_range(page, current_order,
start_migratetype);
expand(zone, page, order, current_order, area, migratetype);
+
+ trace_mm_page_alloc_extfrag(page, order, current_order,
+ start_migratetype, migratetype);
+
return page;
}
}
}
}
+ trace_mm_page_alloc_zone_locked(page, order, migratetype);
return page;
}
*/
static int rmqueue_bulk(struct zone *zone, unsigned int order,
unsigned long count, struct list_head *list,
- int migratetype)
+ int migratetype, int cold)
{
int i;
* merge IO requests if the physical pages are ordered
* properly.
*/
- list_add(&page->lru, list);
+ if (likely(cold == 0))
+ list_add(&page->lru, list);
+ else
+ list_add_tail(&page->lru, list);
set_page_private(page, migratetype);
list = &page->lru;
}
to_drain = pcp->batch;
else
to_drain = pcp->count;
- free_pages_bulk(zone, to_drain, &pcp->list, 0);
+ free_pcppages_bulk(zone, to_drain, pcp);
pcp->count -= to_drain;
local_irq_restore(flags);
}
pcp = &pset->pcp;
local_irq_save(flags);
- free_pages_bulk(zone, pcp->count, &pcp->list, 0);
+ free_pcppages_bulk(zone, pcp->count, pcp);
pcp->count = 0;
local_irq_restore(flags);
}
struct zone *zone = page_zone(page);
struct per_cpu_pages *pcp;
unsigned long flags;
- int clearMlocked = PageMlocked(page);
+ int migratetype;
+ int wasMlocked = __TestClearPageMlocked(page);
+
+ kmemcheck_free_shadow(page, 0);
if (PageAnon(page))
page->mapping = NULL;
kernel_map_pages(page, 1, 0);
pcp = &zone_pcp(zone, get_cpu())->pcp;
- set_page_private(page, get_pageblock_migratetype(page));
+ migratetype = get_pageblock_migratetype(page);
+ set_page_private(page, migratetype);
local_irq_save(flags);
- if (unlikely(clearMlocked))
+ if (unlikely(wasMlocked))
free_page_mlock(page);
__count_vm_event(PGFREE);
+ /*
+ * We only track unmovable, reclaimable and movable on pcp lists.
+ * Free ISOLATE pages back to the allocator because they are being
+ * offlined but treat RESERVE as movable pages so we can get those
+ * areas back if necessary. Otherwise, we may have to free
+ * excessively into the page allocator
+ */
+ if (migratetype >= MIGRATE_PCPTYPES) {
+ if (unlikely(migratetype == MIGRATE_ISOLATE)) {
+ free_one_page(zone, page, 0, migratetype);
+ goto out;
+ }
+ migratetype = MIGRATE_MOVABLE;
+ }
+
if (cold)
- list_add_tail(&page->lru, &pcp->list);
+ list_add_tail(&page->lru, &pcp->lists[migratetype]);
else
- list_add(&page->lru, &pcp->list);
+ list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
if (pcp->count >= pcp->high) {
- free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
+ free_pcppages_bulk(zone, pcp->batch, pcp);
pcp->count -= pcp->batch;
}
+
+out:
local_irq_restore(flags);
put_cpu();
}
void free_hot_page(struct page *page)
{
+ trace_mm_page_free_direct(page, 0);
free_hot_cold_page(page, 0);
}
-void free_cold_page(struct page *page)
-{
- free_hot_cold_page(page, 1);
-}
-
/*
* split_page takes a non-compound higher-order page, and splits it into
* n (1<<order) sub-pages: page[0..n]
VM_BUG_ON(PageCompound(page));
VM_BUG_ON(!page_count(page));
+
+#ifdef CONFIG_KMEMCHECK
+ /*
+ * Split shadow pages too, because free(page[0]) would
+ * otherwise free the whole shadow.
+ */
+ if (kmemcheck_page_is_tracked(page))
+ split_page(virt_to_page(page[0].shadow), order);
+#endif
+
for (i = 1; i < (1 << order); i++)
set_page_refcounted(page + i);
}
cpu = get_cpu();
if (likely(order == 0)) {
struct per_cpu_pages *pcp;
+ struct list_head *list;
pcp = &zone_pcp(zone, cpu)->pcp;
+ list = &pcp->lists[migratetype];
local_irq_save(flags);
- if (!pcp->count) {
- pcp->count = rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list, migratetype);
- if (unlikely(!pcp->count))
+ if (list_empty(list)) {
+ pcp->count += rmqueue_bulk(zone, 0,
+ pcp->batch, list,
+ migratetype, cold);
+ if (unlikely(list_empty(list)))
goto failed;
}
- /* Find a page of the appropriate migrate type */
- if (cold) {
- list_for_each_entry_reverse(page, &pcp->list, lru)
- if (page_private(page) == migratetype)
- break;
- } else {
- list_for_each_entry(page, &pcp->list, lru)
- if (page_private(page) == migratetype)
- break;
- }
-
- /* Allocate more to the pcp list if necessary */
- if (unlikely(&page->lru == &pcp->list)) {
- pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, &pcp->list, migratetype);
- page = list_entry(pcp->list.next, struct page, lru);
- }
+ if (cold)
+ page = list_entry(list->prev, struct page, lru);
+ else
+ page = list_entry(list->next, struct page, lru);
list_del(&page->lru);
pcp->count--;
* properly detect and handle allocation failures.
*
* We most definitely don't want callers attempting to
- * allocate greater than single-page units with
+ * allocate greater than order-1 page units with
* __GFP_NOFAIL.
*/
- WARN_ON_ONCE(order > 0);
+ WARN_ON_ONCE(order > 1);
}
spin_lock_irqsave(&zone->lock, flags);
page = __rmqueue(zone, order, migratetype);
BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK);
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
+ int ret;
+
mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK];
- if (!zone_watermark_ok(zone, order, mark,
- classzone_idx, alloc_flags)) {
- if (!zone_reclaim_mode ||
- !zone_reclaim(zone, gfp_mask, order))
+ if (zone_watermark_ok(zone, order, mark,
+ classzone_idx, alloc_flags))
+ goto try_this_zone;
+
+ if (zone_reclaim_mode == 0)
+ goto this_zone_full;
+
+ ret = zone_reclaim(zone, gfp_mask, order);
+ switch (ret) {
+ case ZONE_RECLAIM_NOSCAN:
+ /* did not scan */
+ goto try_next_zone;
+ case ZONE_RECLAIM_FULL:
+ /* scanned but unreclaimable */
+ goto this_zone_full;
+ default:
+ /* did we reclaim enough */
+ if (!zone_watermark_ok(zone, order, mark,
+ classzone_idx, alloc_flags))
goto this_zone_full;
}
}
+try_this_zone:
page = buffered_rmqueue(preferred_zone, zone, order,
gfp_mask, migratetype);
if (page)
goto out;
/* The OOM killer will not help higher order allocs */
- if (order > PAGE_ALLOC_COSTLY_ORDER)
+ if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL))
goto out;
/* Exhausted what can be done so it's blamo time */
/* We now go into synchronous reclaim */
cpuset_memory_pressure_bump();
-
- /*
- * The task's cpuset might have expanded its set of allowable nodes
- */
p->flags |= PF_MEMALLOC;
lockdep_set_current_reclaim_state(gfp_mask);
reclaim_state.reclaimed_slab = 0;
preferred_zone, migratetype);
if (!page && gfp_mask & __GFP_NOFAIL)
- congestion_wait(WRITE, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
} while (!page && (gfp_mask & __GFP_NOFAIL));
return page;
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
alloc_flags &= ~ALLOC_CPUSET;
- } else if (unlikely(rt_task(p)))
+ } else if (unlikely(rt_task(p)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
* be using allocators in order of preference for an area that is
* too large.
*/
- if (WARN_ON_ONCE(order >= MAX_ORDER))
+ if (order >= MAX_ORDER) {
+ WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN));
return NULL;
+ }
/*
* GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
goto nopage;
+restart:
wake_all_kswapd(order, zonelist, high_zoneidx);
/*
*/
alloc_flags = gfp_to_alloc_flags(gfp_mask);
-restart:
/* This is the last chance, in general, before the goto nopage. */
page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist,
high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS,
if (p->flags & PF_MEMALLOC)
goto nopage;
+ /* Avoid allocations with no watermarks from looping endlessly */
+ if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL))
+ goto nopage;
+
/* Try direct reclaim and then allocating */
page = __alloc_pages_direct_reclaim(gfp_mask, order,
zonelist, high_zoneidx,
*/
if (!did_some_progress) {
if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
+ if (oom_killer_disabled)
+ goto nopage;
page = __alloc_pages_may_oom(gfp_mask, order,
zonelist, high_zoneidx,
nodemask, preferred_zone,
goto got_pg;
/*
- * The OOM killer does not trigger for high-order allocations
- * but if no progress is being made, there are no other
- * options and retrying is unlikely to help
+ * The OOM killer does not trigger for high-order
+ * ~__GFP_NOFAIL allocations so if no progress is being
+ * made, there are no other options and retrying is
+ * unlikely to help.
*/
- if (order > PAGE_ALLOC_COSTLY_ORDER)
+ if (order > PAGE_ALLOC_COSTLY_ORDER &&
+ !(gfp_mask & __GFP_NOFAIL))
goto nopage;
goto restart;
pages_reclaimed += did_some_progress;
if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) {
/* Wait for some write requests to complete then retry */
- congestion_wait(WRITE, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, HZ/50);
goto rebalance;
}
dump_stack();
show_mem();
}
+ return page;
got_pg:
+ if (kmemcheck_enabled)
+ kmemcheck_pagealloc_alloc(page, order, gfp_mask);
return page;
}
struct page *page;
int migratetype = allocflags_to_migratetype(gfp_mask);
+ gfp_mask &= gfp_allowed_mask;
+
lockdep_trace_alloc(gfp_mask);
might_sleep_if(gfp_mask & __GFP_WAIT);
zonelist, high_zoneidx, nodemask,
preferred_zone, migratetype);
+ trace_mm_page_alloc(page, order, gfp_mask, migratetype);
return page;
}
EXPORT_SYMBOL(__alloc_pages_nodemask);
*/
unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
{
- struct page * page;
+ struct page *page;
+
+ /*
+ * __get_free_pages() returns a 32-bit address, which cannot represent
+ * a highmem page
+ */
+ VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
+
page = alloc_pages(gfp_mask, order);
if (!page)
return 0;
return (unsigned long) page_address(page);
}
-
EXPORT_SYMBOL(__get_free_pages);
unsigned long get_zeroed_page(gfp_t gfp_mask)
{
- struct page * page;
-
- /*
- * get_zeroed_page() returns a 32-bit address, which cannot represent
- * a highmem page
- */
- VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
-
- page = alloc_pages(gfp_mask | __GFP_ZERO, 0);
- if (page)
- return (unsigned long) page_address(page);
- return 0;
+ return __get_free_pages(gfp_mask | __GFP_ZERO, 0);
}
-
EXPORT_SYMBOL(get_zeroed_page);
void __pagevec_free(struct pagevec *pvec)
{
int i = pagevec_count(pvec);
- while (--i >= 0)
+ while (--i >= 0) {
+ trace_mm_pagevec_free(pvec->pages[i], pvec->cold);
free_hot_cold_page(pvec->pages[i], pvec->cold);
+ }
}
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page)) {
+ trace_mm_page_free_direct(page, order);
if (order == 0)
free_hot_page(page);
else
unsigned long alloc_end = addr + (PAGE_SIZE << order);
unsigned long used = addr + PAGE_ALIGN(size);
- split_page(virt_to_page(addr), order);
+ split_page(virt_to_page((void *)addr), order);
while (used < alloc_end) {
free_page(used);
used += PAGE_SIZE;
}
}
- printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n"
- " inactive_file:%lu"
-//TODO: check/adjust line lengths
-#ifdef CONFIG_UNEVICTABLE_LRU
+ printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
+ " active_file:%lu inactive_file:%lu isolated_file:%lu\n"
" unevictable:%lu"
-#endif
" dirty:%lu writeback:%lu unstable:%lu\n"
- " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n",
+ " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n"
+ " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n",
global_page_state(NR_ACTIVE_ANON),
- global_page_state(NR_ACTIVE_FILE),
global_page_state(NR_INACTIVE_ANON),
+ global_page_state(NR_ISOLATED_ANON),
+ global_page_state(NR_ACTIVE_FILE),
global_page_state(NR_INACTIVE_FILE),
-#ifdef CONFIG_UNEVICTABLE_LRU
+ global_page_state(NR_ISOLATED_FILE),
global_page_state(NR_UNEVICTABLE),
-#endif
global_page_state(NR_FILE_DIRTY),
global_page_state(NR_WRITEBACK),
global_page_state(NR_UNSTABLE_NFS),
global_page_state(NR_FREE_PAGES),
- global_page_state(NR_SLAB_RECLAIMABLE) +
- global_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_page_state(NR_SLAB_RECLAIMABLE),
+ global_page_state(NR_SLAB_UNRECLAIMABLE),
global_page_state(NR_FILE_MAPPED),
+ global_page_state(NR_SHMEM),
global_page_state(NR_PAGETABLE),
global_page_state(NR_BOUNCE));
" inactive_anon:%lukB"
" active_file:%lukB"
" inactive_file:%lukB"
-#ifdef CONFIG_UNEVICTABLE_LRU
" unevictable:%lukB"
-#endif
+ " isolated(anon):%lukB"
+ " isolated(file):%lukB"
" present:%lukB"
+ " mlocked:%lukB"
+ " dirty:%lukB"
+ " writeback:%lukB"
+ " mapped:%lukB"
+ " shmem:%lukB"
+ " slab_reclaimable:%lukB"
+ " slab_unreclaimable:%lukB"
+ " kernel_stack:%lukB"
+ " pagetables:%lukB"
+ " unstable:%lukB"
+ " bounce:%lukB"
+ " writeback_tmp:%lukB"
" pages_scanned:%lu"
" all_unreclaimable? %s"
"\n",
K(zone_page_state(zone, NR_INACTIVE_ANON)),
K(zone_page_state(zone, NR_ACTIVE_FILE)),
K(zone_page_state(zone, NR_INACTIVE_FILE)),
-#ifdef CONFIG_UNEVICTABLE_LRU
K(zone_page_state(zone, NR_UNEVICTABLE)),
-#endif
+ K(zone_page_state(zone, NR_ISOLATED_ANON)),
+ K(zone_page_state(zone, NR_ISOLATED_FILE)),
K(zone->present_pages),
+ K(zone_page_state(zone, NR_MLOCK)),
+ K(zone_page_state(zone, NR_FILE_DIRTY)),
+ K(zone_page_state(zone, NR_WRITEBACK)),
+ K(zone_page_state(zone, NR_FILE_MAPPED)),
+ K(zone_page_state(zone, NR_SHMEM)),
+ K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)),
+ K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)),
+ zone_page_state(zone, NR_KERNEL_STACK) *
+ THREAD_SIZE / 1024,
+ K(zone_page_state(zone, NR_PAGETABLE)),
+ K(zone_page_state(zone, NR_UNSTABLE_NFS)),
+ K(zone_page_state(zone, NR_BOUNCE)),
+ K(zone_page_state(zone, NR_WRITEBACK_TEMP)),
zone->pages_scanned,
(zone_is_all_unreclaimable(zone) ? "yes" : "no")
);
* sysctl handler for numa_zonelist_order
*/
int numa_zonelist_order_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length,
+ void __user *buffer, size_t *length,
loff_t *ppos)
{
char saved_string[NUMA_ZONELIST_ORDER_LEN];
if (write)
strncpy(saved_string, (char*)table->data,
NUMA_ZONELIST_ORDER_LEN);
- ret = proc_dostring(table, write, file, buffer, length, ppos);
+ ret = proc_dostring(table, write, buffer, length, ppos);
if (ret)
return ret;
if (write) {
prev_node = local_node;
nodes_clear(used_mask);
- memset(node_load, 0, sizeof(node_load));
memset(node_order, 0, sizeof(node_order));
j = 0;
{
int nid;
+#ifdef CONFIG_NUMA
+ memset(node_load, 0, sizeof(node_load));
+#endif
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
{
unsigned long start_pfn, pfn, end_pfn;
struct page *page;
- unsigned long reserve, block_migratetype;
+ unsigned long block_migratetype;
+ int reserve;
/* Get the start pfn, end pfn and the number of blocks to reserve */
start_pfn = zone->zone_start_pfn;
reserve = roundup(min_wmark_pages(zone), pageblock_nr_pages) >>
pageblock_order;
+ /*
+ * Reserve blocks are generally in place to help high-order atomic
+ * allocations that are short-lived. A min_free_kbytes value that
+ * would result in more than 2 reserve blocks for atomic allocations
+ * is assumed to be in place to help anti-fragmentation for the
+ * future allocation of hugepages at runtime.
+ */
+ reserve = min(2, reserve);
+
for (pfn = start_pfn; pfn < end_pfn; pfn += pageblock_nr_pages) {
if (!pfn_valid(pfn))
continue;
static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
{
struct per_cpu_pages *pcp;
+ int migratetype;
memset(p, 0, sizeof(*p));
pcp->count = 0;
pcp->high = 6 * batch;
pcp->batch = max(1UL, 1 * batch);
- INIT_LIST_HEAD(&pcp->list);
+ for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
+ INIT_LIST_HEAD(&pcp->lists[migratetype]);
}
/*
if (dzone == zone)
break;
kfree(zone_pcp(dzone, cpu));
- zone_pcp(dzone, cpu) = NULL;
+ zone_pcp(dzone, cpu) = &boot_pageset[cpu];
}
return -ENOMEM;
}
/* Free per_cpu_pageset if it is slab allocated */
if (pset != &boot_pageset[cpu])
kfree(pset);
- zone_pcp(zone, cpu) = NULL;
+ zone_pcp(zone, cpu) = &boot_pageset[cpu];
}
}
return 0;
}
+static int __zone_pcp_update(void *data) /* callback handed to stop_machine() by zone_pcp_update(); data is the struct zone to update */
+{
+ struct zone *zone = data;
+ int cpu;
+ unsigned long batch = zone_batchsize(zone), flags; /* batch: size recomputed for this zone; flags: saved IRQ state */
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) { /* NOTE(review): walks every slot up to NR_CPUS, so zone_pcp() must be valid for each index - confirm */
+ struct per_cpu_pageset *pset;
+ struct per_cpu_pages *pcp;
+
+ pset = zone_pcp(zone, cpu);
+ pcp = &pset->pcp;
+
+ local_irq_save(flags);
+ free_pcppages_bulk(zone, pcp->count, pcp); /* hand every page currently on this pageset's lists back to the zone */
+ setup_pageset(pset, batch); /* reset the now-empty pageset using the recomputed batch */
+ local_irq_restore(flags);
+ }
+ return 0; /* always returns 0 */
+}
+
+void zone_pcp_update(struct zone *zone) /* drain and re-initialise the per-cpu pagesets of zone by running __zone_pcp_update() under stop_machine() */
+{
+ stop_machine(__zone_pcp_update, zone, NULL); /* zone is passed through as the callback's data; NULL cpumask presumably means "any CPU" - confirm against stop_machine() */
+}
+
static __meminit void zone_pcp_init(struct zone *zone)
{
int cpu;
zone_pcp_init(zone);
for_each_lru(l) {
INIT_LIST_HEAD(&zone->lru[l].list);
- zone->lru[l].nr_saved_scan = 0;
+ zone->reclaim_stat.nr_saved_scan[l] = 0;
}
zone->reclaim_stat.recent_rotated[0] = 0;
zone->reclaim_stat.recent_rotated[1] = 0;
int i, nid;
unsigned long usable_startpfn;
unsigned long kernelcore_node, kernelcore_remaining;
+ /* save the state before borrowing the nodemask */
+ nodemask_t saved_node_state = node_states[N_HIGH_MEMORY];
unsigned long totalpages = early_calculate_totalpages();
int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]);
/* If kernelcore was not specified, there is no ZONE_MOVABLE */
if (!required_kernelcore)
- return;
+ goto out;
/* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */
find_usable_zone_for_movable();
for (nid = 0; nid < MAX_NUMNODES; nid++)
zone_movable_pfn[nid] =
roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES);
+
+out:
+ /* restore the node_state */
+ node_states[N_HIGH_MEMORY] = saved_node_state;
}
/* Any regular memory on that node ? */
}
/**
- * setup_per_zone_pages_min - called when min_free_kbytes changes.
+ * setup_per_zone_wmarks - called when min_free_kbytes changes
+ * or when memory is hot-{added|removed}
*
- * Ensures that the pages_{min,low,high} values for each zone are set correctly
- * with respect to min_free_kbytes.
+ * Ensures that the watermark[min,low,high] values for each zone are set
+ * correctly with respect to min_free_kbytes.
*/
-void setup_per_zone_pages_min(void)
+void setup_per_zone_wmarks(void)
{
unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
unsigned long lowmem_pages = 0;
calculate_totalreserve_pages();
}
-/**
+/*
* The inactive anon list should be small enough that the VM never has to
* do too much work, but large enough that each inactive page has a chance
* to be referenced again before it is swapped out.
* 1TB 101 10GB
* 10TB 320 32GB
*/
-static void __init setup_per_zone_inactive_ratio(void)
+void calculate_zone_inactive_ratio(struct zone *zone)
{
- struct zone *zone;
+ unsigned int gb, ratio;
- for_each_zone(zone) {
- unsigned int gb, ratio;
+ /* Zone size in gigabytes */
+ gb = zone->present_pages >> (30 - PAGE_SHIFT);
+ if (gb)
+ ratio = int_sqrt(10 * gb);
+ else
+ ratio = 1;
- /* Zone size in gigabytes */
- gb = zone->present_pages >> (30 - PAGE_SHIFT);
- if (gb)
- ratio = int_sqrt(10 * gb);
- else
- ratio = 1;
+ zone->inactive_ratio = ratio;
+}
- zone->inactive_ratio = ratio;
- }
+static void __init setup_per_zone_inactive_ratio(void)
+{
+ struct zone *zone;
+
+ for_each_zone(zone)
+ calculate_zone_inactive_ratio(zone);
}
/*
* 8192MB: 11584k
* 16384MB: 16384k
*/
-static int __init init_per_zone_pages_min(void)
+static int __init init_per_zone_wmark_min(void)
{
unsigned long lowmem_kbytes;
min_free_kbytes = 128;
if (min_free_kbytes > 65536)
min_free_kbytes = 65536;
- setup_per_zone_pages_min();
+ setup_per_zone_wmarks();
setup_per_zone_lowmem_reserve();
setup_per_zone_inactive_ratio();
return 0;
}
-module_init(init_per_zone_pages_min)
+module_init(init_per_zone_wmark_min)
/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
* changes.
*/
int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec(table, write, file, buffer, length, ppos);
+ proc_dointvec(table, write, buffer, length, ppos);
if (write)
- setup_per_zone_pages_min();
+ setup_per_zone_wmarks();
return 0;
}
#ifdef CONFIG_NUMA
int sysctl_min_unmapped_ratio_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int rc;
- rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
}
int sysctl_min_slab_ratio_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int rc;
- rc = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (rc)
return rc;
* if in function of the boot time zone sizes.
*/
int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
- proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ proc_dointvec_minmax(table, write, buffer, length, ppos);
setup_per_zone_lowmem_reserve();
return 0;
}
*/
int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
- struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
+ void __user *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
unsigned int cpu;
int ret;
- ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
+ ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
if (!write || (ret == -EINVAL))
return ret;
- for_each_zone(zone) {
+ for_each_populated_zone(zone) {
for_each_online_cpu(cpu) {
unsigned long high;
high = zone->present_pages / percpu_pagelist_fraction;
numentries <<= (PAGE_SHIFT - scale);
/* Make sure we've got at least a 0-order allocation.. */
- if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+ if (unlikely(flags & HASH_SMALL)) {
+ /* Makes no sense without HASH_EARLY */
+ WARN_ON(!(flags & HASH_EARLY));
+ if (!(numentries >> *_hash_shift)) {
+ numentries = 1UL << *_hash_shift;
+ BUG_ON(!numentries);
+ }
+ } else if (unlikely((numentries * bucketsize) < PAGE_SIZE))
numentries = PAGE_SIZE / bucketsize;
}
numentries = roundup_pow_of_two(numentries);
* some pages at the end of hash table which
* alloc_pages_exact() automatically does
*/
- if (get_order(size) < MAX_ORDER)
+ if (get_order(size) < MAX_ORDER) {
table = alloc_pages_exact(size, GFP_ATOMIC);
+ kmemleak_alloc(table, size, 1, GFP_ATOMIC);
+ }
}
} while (!table && size > PAGE_SIZE && --log2qty);
if (_hash_mask)
*_hash_mask = (1 << log2qty) - 1;
- /*
- * If hashdist is set, the table allocation is done with __vmalloc()
- * which invokes the kmemleak_alloc() callback. This function may also
- * be called before the slab and kmemleak are initialised when
- * kmemleak simply buffers the request to be executed later
- * (GFP_ATOMIC flag ignored in this case).
- */
- if (!hashdist)
- kmemleak_alloc(table, size, 1, GFP_ATOMIC);
-
return table;
}
struct zone *zone;
unsigned long flags;
int ret = -EBUSY;
+ int zone_idx;
zone = page_zone(page);
+ zone_idx = zone_idx(zone);
spin_lock_irqsave(&zone->lock, flags);
/*
* In future, more migrate types will be able to be isolation target.
*/
- if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE)
+ if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE &&
+ zone_idx != ZONE_MOVABLE)
goto out;
set_pageblock_migratetype(page, MIGRATE_ISOLATE);
move_freepages_block(zone, page, MIGRATE_ISOLATE);