#include <linux/stop_machine.h>
#include <linux/sort.h>
#include <linux/pfn.h>
+#include <linux/backing-dev.h>
+#include <linux/fault-inject.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
EXPORT_SYMBOL(totalram_pages);
-/*
- * Used by page_zone() to look up the address of the struct zone whose
- * id is encoded in the upper bits of page->flags
- */
-struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
-EXPORT_SYMBOL(zone_table);
-
-static char *zone_names[MAX_NR_ZONES] = {
+static char * const zone_names[MAX_NR_ZONES] = {
"DMA",
#ifdef CONFIG_ZONE_DMA32
"DMA32",
int __initdata nr_nodemap_entries;
unsigned long __initdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES];
unsigned long __initdata arch_zone_highest_possible_pfn[MAX_NR_ZONES];
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+ unsigned long __initdata node_boundary_start_pfn[MAX_NUMNODES];
+ unsigned long __initdata node_boundary_end_pfn[MAX_NUMNODES];
+#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
#ifdef CONFIG_DEBUG_VM
int i;
int nr_pages = 1 << order;
- page[1].lru.next = (void *)free_compound_page; /* set dtor */
+ set_compound_page_dtor(page, free_compound_page);
page[1].lru.prev = (void *)order;
for (i = 0; i < nr_pages; i++) {
struct page *p = page + i;
spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __free_one_page(page, zone ,order);
+ __free_one_page(page, zone, order);
spin_unlock(&zone->lock);
}
int i;
int reserved = 0;
- arch_free_page(page, order);
- if (!PageHighMem(page))
- debug_check_no_locks_freed(page_address(page),
- PAGE_SIZE<<order);
-
for (i = 0 ; i < (1 << order) ; ++i)
reserved += free_pages_check(page + i);
if (reserved)
return;
+ if (!PageHighMem(page))
+ debug_check_no_locks_freed(page_address(page),PAGE_SIZE<<order);
+ arch_free_page(page, order);
kernel_map_pages(page, 1 << order, 0);
+
local_irq_save(flags);
__count_vm_events(PGFREE, 1 << order);
free_one_page(page_zone(page), page, order);
1 << PG_checked | 1 << PG_mappedtodisk);
set_page_private(page, 0);
set_page_refcounted(page);
+
+ arch_alloc_page(page, order);
kernel_map_pages(page, 1 << order, 1);
if (gfp_flags & __GFP_ZERO)
pcp = &pset->pcp[i];
if (pcp->count) {
+ int to_drain;
+
local_irq_save(flags);
- free_pages_bulk(zone, pcp->count, &pcp->list, 0);
- pcp->count = 0;
+ if (pcp->count >= pcp->batch)
+ to_drain = pcp->batch;
+ else
+ to_drain = pcp->count;
+ free_pages_bulk(zone, to_drain, &pcp->list, 0);
+ pcp->count -= to_drain;
local_irq_restore(flags);
}
}
}
#endif
-#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
static void __drain_pages(unsigned int cpu)
{
unsigned long flags;
for_each_zone(zone) {
struct per_cpu_pageset *pset;
+ if (!populated_zone(zone))
+ continue;
+
pset = zone_pcp(zone, cpu);
for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
struct per_cpu_pages *pcp;
}
}
}
-#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
#ifdef CONFIG_PM
struct per_cpu_pages *pcp;
unsigned long flags;
- arch_free_page(page, 0);
-
if (PageAnon(page))
page->mapping = NULL;
if (free_pages_check(page))
return;
+ if (!PageHighMem(page))
+ debug_check_no_locks_freed(page_address(page), PAGE_SIZE);
+ arch_free_page(page, 0);
kernel_map_pages(page, 1, 0);
pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
pcp = &zone_pcp(zone, cpu)->pcp[cold];
local_irq_save(flags);
if (!pcp->count) {
- pcp->count += rmqueue_bulk(zone, 0,
+ pcp->count = rmqueue_bulk(zone, 0,
pcp->batch, &pcp->list);
if (unlikely(!pcp->count))
goto failed;
#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */
#define ALLOC_CPUSET 0x40 /* check for correct cpuset */
+#ifdef CONFIG_FAIL_PAGE_ALLOC
+
+static struct fail_page_alloc_attr {
+ struct fault_attr attr;
+
+ u32 ignore_gfp_highmem;
+ u32 ignore_gfp_wait;
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+ struct dentry *ignore_gfp_highmem_file;
+ struct dentry *ignore_gfp_wait_file;
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+} fail_page_alloc = {
+ .attr = FAULT_ATTR_INITIALIZER,
+ .ignore_gfp_wait = 1,
+ .ignore_gfp_highmem = 1,
+};
+
+static int __init setup_fail_page_alloc(char *str)
+{
+ return setup_fault_attr(&fail_page_alloc.attr, str);
+}
+__setup("fail_page_alloc=", setup_fail_page_alloc);
+
+static int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ if (gfp_mask & __GFP_NOFAIL)
+ return 0;
+ if (fail_page_alloc.ignore_gfp_highmem && (gfp_mask & __GFP_HIGHMEM))
+ return 0;
+ if (fail_page_alloc.ignore_gfp_wait && (gfp_mask & __GFP_WAIT))
+ return 0;
+
+ return should_fail(&fail_page_alloc.attr, 1 << order);
+}
+
+#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS
+
+static int __init fail_page_alloc_debugfs(void)
+{
+ mode_t mode = S_IFREG | S_IRUSR | S_IWUSR;
+ struct dentry *dir;
+ int err;
+
+ err = init_fault_attr_dentries(&fail_page_alloc.attr,
+ "fail_page_alloc");
+ if (err)
+ return err;
+ dir = fail_page_alloc.attr.dentries.dir;
+
+ fail_page_alloc.ignore_gfp_wait_file =
+ debugfs_create_bool("ignore-gfp-wait", mode, dir,
+ &fail_page_alloc.ignore_gfp_wait);
+
+ fail_page_alloc.ignore_gfp_highmem_file =
+ debugfs_create_bool("ignore-gfp-highmem", mode, dir,
+ &fail_page_alloc.ignore_gfp_highmem);
+
+ if (!fail_page_alloc.ignore_gfp_wait_file ||
+ !fail_page_alloc.ignore_gfp_highmem_file) {
+ err = -ENOMEM;
+ debugfs_remove(fail_page_alloc.ignore_gfp_wait_file);
+ debugfs_remove(fail_page_alloc.ignore_gfp_highmem_file);
+ cleanup_fault_attr_dentries(&fail_page_alloc.attr);
+ }
+
+ return err;
+}
+
+late_initcall(fail_page_alloc_debugfs);
+
+#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */
+
+#else /* CONFIG_FAIL_PAGE_ALLOC */
+
+static inline int should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
+{
+ return 0;
+}
+
+#endif /* CONFIG_FAIL_PAGE_ALLOC */
+
/*
* Return 1 if free pages are above 'mark'. This takes into account the order
* of the allocation.
return 1;
}
+#ifdef CONFIG_NUMA
+/*
+ * zlc_setup - Setup for "zonelist cache". Uses cached zone data to
+ * skip over zones that are not allowed by the cpuset, or that have
+ * been recently (in last second) found to be nearly full. See further
+ * comments in mmzone.h. Reduces cache footprint of zonelist scans
+ * that have to skip over alot of full or unallowed zones.
+ *
+ * If the zonelist cache is present in the passed in zonelist, then
+ * returns a pointer to the allowed node mask (either the current
+ * tasks mems_allowed, or node_online_map.)
+ *
+ * If the zonelist cache is not available for this zonelist, does
+ * nothing and returns NULL.
+ *
+ * If the fullzones BITMAP in the zonelist cache is stale (more than
+ * a second since last zap'd) then we zap it out (clear its bits.)
+ *
+ * We hold off even calling zlc_setup, until after we've checked the
+ * first zone in the zonelist, on the theory that most allocations will
+ * be satisfied from that first zone, so best to examine that zone as
+ * quickly as we can.
+ */
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ nodemask_t *allowednodes; /* zonelist_cache approximation */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return NULL;
+
+ if (jiffies - zlc->last_full_zap > 1 * HZ) {
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ zlc->last_full_zap = jiffies;
+ }
+
+ allowednodes = !in_interrupt() && (alloc_flags & ALLOC_CPUSET) ?
+ &cpuset_current_mems_allowed :
+ &node_online_map;
+ return allowednodes;
+}
+
+/*
+ * Given 'z' scanning a zonelist, run a couple of quick checks to see
+ * if it is worth looking at further for free memory:
+ * 1) Check that the zone isn't thought to be full (doesn't have its
+ * bit set in the zonelist_cache fullzones BITMAP).
+ * 2) Check that the zones node (obtained from the zonelist_cache
+ * z_to_n[] mapping) is allowed in the passed in allowednodes mask.
+ * Return true (non-zero) if zone is worth looking at further, or
+ * else return false (zero) if it is not.
+ *
+ * This check -ignores- the distinction between various watermarks,
+ * such as GFP_HIGH, GFP_ATOMIC, PF_MEMALLOC, ... If a zone is
+ * found to be full for any variation of these watermarks, it will
+ * be considered full for up to one second by all requests, unless
+ * we are so low on memory on all allowed nodes that we are forced
+ * into the second scan of the zonelist.
+ *
+ * In the second scan we ignore this zonelist cache and exactly
+ * apply the watermarks to all zones, even it is slower to do so.
+ * We are low on memory in the second scan, and should leave no stone
+ * unturned looking for a free page.
+ */
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+ nodemask_t *allowednodes)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ int i; /* index of *z in zonelist zones */
+ int n; /* node that zone *z is on */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return 1;
+
+ i = z - zonelist->zones;
+ n = zlc->z_to_n[i];
+
+ /* This zone is worth trying if it is allowed but not full */
+ return node_isset(n, *allowednodes) && !test_bit(i, zlc->fullzones);
+}
+
+/*
+ * Given 'z' scanning a zonelist, set the corresponding bit in
+ * zlc->fullzones, so that subsequent attempts to allocate a page
+ * from that zone don't waste time re-examining it.
+ */
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+ struct zonelist_cache *zlc; /* cached zonelist speedup info */
+ int i; /* index of *z in zonelist zones */
+
+ zlc = zonelist->zlcache_ptr;
+ if (!zlc)
+ return;
+
+ i = z - zonelist->zones;
+
+ set_bit(i, zlc->fullzones);
+}
+
+#else /* CONFIG_NUMA */
+
+static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags)
+{
+ return NULL;
+}
+
+static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zone **z,
+ nodemask_t *allowednodes)
+{
+ return 1;
+}
+
+static void zlc_mark_zone_full(struct zonelist *zonelist, struct zone **z)
+{
+}
+#endif /* CONFIG_NUMA */
+
/*
- * get_page_from_freeliest goes through the zonelist trying to allocate
+ * get_page_from_freelist goes through the zonelist trying to allocate
* a page.
*/
static struct page *
get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
struct zonelist *zonelist, int alloc_flags)
{
- struct zone **z = zonelist->zones;
+ struct zone **z;
struct page *page = NULL;
- int classzone_idx = zone_idx(*z);
+ int classzone_idx = zone_idx(zonelist->zones[0]);
struct zone *zone;
+ nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
+ int zlc_active = 0; /* set if using zonelist_cache */
+ int did_zlc_setup = 0; /* just call zlc_setup() one time */
+zonelist_scan:
/*
- * Go through the zonelist once, looking for a zone with enough free.
+ * Scan zonelist, looking for a zone with enough free.
* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
*/
+ z = zonelist->zones;
+
do {
+ if (NUMA_BUILD && zlc_active &&
+ !zlc_zone_worth_trying(zonelist, z, allowednodes))
+ continue;
zone = *z;
- if (unlikely((gfp_mask & __GFP_THISNODE) &&
+ if (unlikely(NUMA_BUILD && (gfp_mask & __GFP_THISNODE) &&
zone->zone_pgdat != zonelist->zones[0]->zone_pgdat))
break;
if ((alloc_flags & ALLOC_CPUSET) &&
- !cpuset_zone_allowed(zone, gfp_mask))
- continue;
+ !cpuset_zone_allowed_softwall(zone, gfp_mask))
+ goto try_next_zone;
if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
unsigned long mark;
mark = zone->pages_low;
else
mark = zone->pages_high;
- if (!zone_watermark_ok(zone , order, mark,
- classzone_idx, alloc_flags))
+ if (!zone_watermark_ok(zone, order, mark,
+ classzone_idx, alloc_flags)) {
if (!zone_reclaim_mode ||
!zone_reclaim(zone, gfp_mask, order))
- continue;
+ goto this_zone_full;
+ }
}
page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
- if (page) {
+ if (page)
break;
+this_zone_full:
+ if (NUMA_BUILD)
+ zlc_mark_zone_full(zonelist, z);
+try_next_zone:
+ if (NUMA_BUILD && !did_zlc_setup) {
+ /* we do zlc_setup after the first zone is tried */
+ allowednodes = zlc_setup(zonelist, alloc_flags);
+ zlc_active = 1;
+ did_zlc_setup = 1;
}
} while (*(++z) != NULL);
+
+ if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) {
+ /* Disable zlc cache for second zonelist scan */
+ zlc_active = 0;
+ goto zonelist_scan;
+ }
return page;
}
might_sleep_if(wait);
+ if (should_fail_alloc_page(gfp_mask, order))
+ return NULL;
+
restart:
z = zonelist->zones; /* the list of zones suitable for gfp_mask */
if (page)
goto got_pg;
- do {
+ /*
+ * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and
+ * __GFP_NOWARN set) should not cause reclaim since the subsystem
+ * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim
+ * using a larger set of nodes after it has established that the
+ * allowed per node queues are empty and that nodes are
+ * over allocated.
+ */
+ if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE)
+ goto nopage;
+
+ for (z = zonelist->zones; *z; z++)
wakeup_kswapd(*z, order);
- } while (*(++z));
/*
* OK, we're below the kswapd watermark and have kicked background
/* This allocation should allow future memory freeing. */
+rebalance:
if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
&& !in_interrupt()) {
if (!(gfp_mask & __GFP_NOMEMALLOC)) {
if (page)
goto got_pg;
if (gfp_mask & __GFP_NOFAIL) {
- blk_congestion_wait(WRITE, HZ/50);
+ congestion_wait(WRITE, HZ/50);
goto nofail_alloc;
}
}
if (!wait)
goto nopage;
-rebalance:
cond_resched();
/* We now go into synchronous reclaim */
do_retry = 1;
}
if (do_retry) {
- blk_congestion_wait(WRITE, HZ/50);
+ congestion_wait(WRITE, HZ/50);
goto rebalance;
}
{
return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
}
-#ifdef CONFIG_NUMA
-static void show_node(struct zone *zone)
+
+static inline void show_node(struct zone *zone)
{
- printk("Node %ld ", zone_to_nid(zone));
+ if (NUMA_BUILD)
+ printk("Node %d ", zone_to_nid(zone));
}
-#else
-#define show_node(zone) do { } while (0)
-#endif
void si_meminfo(struct sysinfo *val)
{
*/
void show_free_areas(void)
{
- int cpu, temperature;
+ int cpu;
unsigned long active;
unsigned long inactive;
unsigned long free;
struct zone *zone;
for_each_zone(zone) {
- show_node(zone);
- printk("%s per-cpu:", zone->name);
-
- if (!populated_zone(zone)) {
- printk(" empty\n");
+ if (!populated_zone(zone))
continue;
- } else
- printk("\n");
+
+ show_node(zone);
+ printk("%s per-cpu:\n", zone->name);
for_each_online_cpu(cpu) {
struct per_cpu_pageset *pageset;
pageset = zone_pcp(zone, cpu);
- for (temperature = 0; temperature < 2; temperature++)
- printk("cpu %d %s: high %d, batch %d used:%d\n",
- cpu,
- temperature ? "cold" : "hot",
- pageset->pcp[temperature].high,
- pageset->pcp[temperature].batch,
- pageset->pcp[temperature].count);
+ printk("CPU %4d: Hot: hi:%5d, btch:%4d usd:%4d "
+ "Cold: hi:%5d, btch:%4d usd:%4d\n",
+ cpu, pageset->pcp[0].high,
+ pageset->pcp[0].batch, pageset->pcp[0].count,
+ pageset->pcp[1].high, pageset->pcp[1].batch,
+ pageset->pcp[1].count);
}
}
for_each_zone(zone) {
int i;
+ if (!populated_zone(zone))
+ continue;
+
show_node(zone);
printk("%s"
" free:%lukB"
for_each_zone(zone) {
unsigned long nr[MAX_ORDER], flags, order, total = 0;
+ if (!populated_zone(zone))
+ continue;
+
show_node(zone);
printk("%s: ", zone->name);
- if (!populated_zone(zone)) {
- printk("empty\n");
- continue;
- }
spin_lock_irqsave(&zone->lock, flags);
for (order = 0; order < MAX_ORDER; order++) {
}
}
+/* Construct the zonelist performance cache - see further mmzone.h */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ struct zonelist *zonelist;
+ struct zonelist_cache *zlc;
+ struct zone **z;
+
+ zonelist = pgdat->node_zonelists + i;
+ zonelist->zlcache_ptr = zlc = &zonelist->zlcache;
+ bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST);
+ for (z = zonelist->zones; *z; z++)
+ zlc->z_to_n[z - zonelist->zones] = zone_to_nid(*z);
+ }
+}
+
#else /* CONFIG_NUMA */
static void __meminit build_zonelists(pg_data_t *pgdat)
}
}
+/* non-NUMA variant of zonelist performance cache - just NULL zlcache_ptr */
+static void __meminit build_zonelist_cache(pg_data_t *pgdat)
+{
+ int i;
+
+ for (i = 0; i < MAX_NR_ZONES; i++)
+ pgdat->node_zonelists[i].zlcache_ptr = NULL;
+}
+
#endif /* CONFIG_NUMA */
/* return values int ....just for stop_machine_run() */
static int __meminit __build_all_zonelists(void *dummy)
{
int nid;
- for_each_online_node(nid)
+
+ for_each_online_node(nid) {
build_zonelists(NODE_DATA(nid));
+ build_zonelist_cache(NODE_DATA(nid));
+ }
return 0;
}
void __meminit build_all_zonelists(void)
{
if (system_state == SYSTEM_BOOTING) {
- __build_all_zonelists(0);
+ __build_all_zonelists(NULL);
cpuset_init_current_mems_allowed();
} else {
/* we have to stop all cpus to guaranntee there is no user
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn)
+ unsigned long start_pfn, enum memmap_context context)
{
struct page *page;
unsigned long end_pfn = start_pfn + size;
unsigned long pfn;
for (pfn = start_pfn; pfn < end_pfn; pfn++) {
- if (!early_pfn_valid(pfn))
- continue;
+ /*
+ * There can be holes in boot-time mem_map[]s
+ * handed to this function. They do not
+ * exist on hotplugged memory.
+ */
+ if (context == MEMMAP_EARLY) {
+ if (!early_pfn_valid(pfn))
+ continue;
+ if (!early_pfn_in_nid(pfn, nid))
+ continue;
+ }
page = pfn_to_page(pfn);
set_page_links(page, zone, nid, pfn);
init_page_count(page);
}
}
-#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) | zone_nr)
-void zonetable_add(struct zone *zone, int nid, enum zone_type zid,
- unsigned long pfn, unsigned long size)
-{
- unsigned long snum = pfn_to_section_nr(pfn);
- unsigned long end = pfn_to_section_nr(pfn + size);
-
- if (FLAGS_HAS_NODE)
- zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
- else
- for (; snum <= end; snum++)
- zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
-}
-
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
- memmap_init_zone((size), (nid), (zone), (start_pfn))
+ memmap_init_zone((size), (nid), (zone), (start_pfn), MEMMAP_EARLY)
#endif
static int __cpuinit zone_batchsize(struct zone *zone)
for_each_zone(zone) {
+ if (!populated_zone(zone))
+ continue;
+
zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
GFP_KERNEL, cpu_to_node(cpu));
if (!zone_pcp(zone, cpu))
int ret = NOTIFY_OK;
switch (action) {
- case CPU_UP_PREPARE:
- if (process_zones(cpu))
- ret = NOTIFY_BAD;
- break;
- case CPU_UP_CANCELED:
- case CPU_DEAD:
- free_zone_pagesets(cpu);
- break;
- default:
- break;
+ case CPU_UP_PREPARE:
+ if (process_zones(cpu))
+ ret = NOTIFY_BAD;
+ break;
+ case CPU_UP_CANCELED:
+ case CPU_DEAD:
+ free_zone_pagesets(cpu);
+ break;
+ default:
+ break;
}
return ret;
}
__meminit int init_currently_empty_zone(struct zone *zone,
unsigned long zone_start_pfn,
- unsigned long size)
+ unsigned long size,
+ enum memmap_context context)
{
struct pglist_data *pgdat = zone->zone_pgdat;
int ret;
/**
* free_bootmem_with_active_regions - Call free_bootmem_node for each active range
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed
- * @max_low_pfn: The highest PFN that till be passed to free_bootmem_node
+ * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
+ * @max_low_pfn: The highest PFN that will be passed to free_bootmem_node
*
* If an architecture guarantees that all ranges registered with
* add_active_ranges() contain no holes and may be freed, this
/**
* sparse_memory_present_with_active_regions - Call memory_present for each active range
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used
+ * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
*
* If an architecture guarantees that all ranges registered with
* add_active_ranges() contain no holes and may be freed, this
- * this function may be used instead of calling memory_present() manually.
+ * function may be used instead of calling memory_present() manually.
*/
void __init sparse_memory_present_with_active_regions(int nid)
{
}
/**
+ * push_node_boundaries - Push node boundaries to at least the requested boundary
+ * @nid: The nid of the node to push the boundary for
+ * @start_pfn: The start pfn of the node
+ * @end_pfn: The end pfn of the node
+ *
+ * In reserve-based hot-add, mem_map is allocated that is unused until hotadd
+ * time. Specifically, on x86_64, SRAT will report ranges that can potentially
+ * be hotplugged even though no physical memory exists. This function allows
+ * an arch to push out the node boundaries so mem_map is allocated that can
+ * be used later.
+ */
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+void __init push_node_boundaries(unsigned int nid,
+ unsigned long start_pfn, unsigned long end_pfn)
+{
+ printk(KERN_DEBUG "Entering push_node_boundaries(%u, %lu, %lu)\n",
+ nid, start_pfn, end_pfn);
+
+ /* Initialise the boundary for this node if necessary */
+ if (node_boundary_end_pfn[nid] == 0)
+ node_boundary_start_pfn[nid] = -1UL;
+
+ /* Update the boundaries */
+ if (node_boundary_start_pfn[nid] > start_pfn)
+ node_boundary_start_pfn[nid] = start_pfn;
+ if (node_boundary_end_pfn[nid] < end_pfn)
+ node_boundary_end_pfn[nid] = end_pfn;
+}
+
+/* If necessary, push the node boundary out for reserve hotadd */
+static void __init account_node_boundary(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn)
+{
+ printk(KERN_DEBUG "Entering account_node_boundary(%u, %lu, %lu)\n",
+ nid, *start_pfn, *end_pfn);
+
+ /* Return if boundary information has not been provided */
+ if (node_boundary_end_pfn[nid] == 0)
+ return;
+
+ /* Check the boundaries and update if necessary */
+ if (node_boundary_start_pfn[nid] < *start_pfn)
+ *start_pfn = node_boundary_start_pfn[nid];
+ if (node_boundary_end_pfn[nid] > *end_pfn)
+ *end_pfn = node_boundary_end_pfn[nid];
+}
+#else
+void __init push_node_boundaries(unsigned int nid,
+ unsigned long start_pfn, unsigned long end_pfn) {}
+
+static void __init account_node_boundary(unsigned int nid,
+ unsigned long *start_pfn, unsigned long *end_pfn) {}
+#endif
+
+
+/**
* get_pfn_range_for_nid - Return the start and end page frames for a node
- * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned
- * @start_pfn: Passed by reference. On return, it will have the node start_pfn
- * @end_pfn: Passed by reference. On return, it will have the node end_pfn
+ * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
+ * @start_pfn: Passed by reference. On return, it will have the node start_pfn.
+ * @end_pfn: Passed by reference. On return, it will have the node end_pfn.
*
* It returns the start and end page frame of a node based on information
* provided by an arch calling add_active_range(). If called for a node
* with no available memory, a warning is printed and the start and end
- * PFNs will be 0
+ * PFNs will be 0.
*/
void __init get_pfn_range_for_nid(unsigned int nid,
unsigned long *start_pfn, unsigned long *end_pfn)
printk(KERN_WARNING "Node %u active with no memory\n", nid);
*start_pfn = 0;
}
+
+ /* Push the node boundaries out if requested */
+ account_node_boundary(nid, start_pfn, end_pfn);
}
/*
/*
* Return the number of holes in a range on a node. If nid is MAX_NUMNODES,
- * then all holes in the requested range will be accounted for
+ * then all holes in the requested range will be accounted for.
*/
unsigned long __init __absent_pages_in_range(int nid,
unsigned long range_start_pfn,
if (i == -1)
return 0;
+ /* Account for ranges before physical memory on this node */
+ if (early_node_map[i].start_pfn > range_start_pfn)
+ hole_pages = early_node_map[i].start_pfn - range_start_pfn;
+
prev_end_pfn = early_node_map[i].start_pfn;
/* Find all holes for the zone within the node */
prev_end_pfn = early_node_map[i].end_pfn;
}
+ /* Account for ranges past physical memory on this node */
+ if (range_end_pfn > prev_end_pfn)
+ hole_pages += range_end_pfn -
+ max(range_start_pfn, prev_end_pfn);
+
return hole_pages;
}
* @start_pfn: The start PFN to start searching for holes
* @end_pfn: The end PFN to stop searching for holes
*
- * It returns the number of pages frames in memory holes within a range
+ * It returns the number of pages frames in memory holes within a range.
*/
unsigned long __init absent_pages_in_range(unsigned long start_pfn,
unsigned long end_pfn)
unsigned long zone_type,
unsigned long *ignored)
{
- return __absent_pages_in_range(nid,
- arch_zone_lowest_possible_pfn[zone_type],
- arch_zone_highest_possible_pfn[zone_type]);
-}
-
-/* Return the zone index a PFN is in */
-int memmap_zone_idx(struct page *lmem_map)
-{
- int i;
- unsigned long phys_addr = virt_to_phys(lmem_map);
- unsigned long pfn = phys_addr >> PAGE_SHIFT;
+ unsigned long node_start_pfn, node_end_pfn;
+ unsigned long zone_start_pfn, zone_end_pfn;
- for (i = 0; i < MAX_NR_ZONES; i++)
- if (pfn < arch_zone_highest_possible_pfn[i])
- break;
+ get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+ zone_start_pfn = max(arch_zone_lowest_possible_pfn[zone_type],
+ node_start_pfn);
+ zone_end_pfn = min(arch_zone_highest_possible_pfn[zone_type],
+ node_end_pfn);
- return i;
+ return __absent_pages_in_range(nid, zone_start_pfn, zone_end_pfn);
}
+
#else
static inline unsigned long zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
return zholes_size[zone_type];
}
-static inline int memmap_zone_idx(struct page *lmem_map)
-{
- return MAX_NR_ZONES;
-}
#endif
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
zone->spanned_pages = size;
zone->present_pages = realsize;
#ifdef CONFIG_NUMA
+ zone->node = nid;
zone->min_unmapped_pages = (realsize*sysctl_min_unmapped_ratio)
/ 100;
zone->min_slab_pages = (realsize * sysctl_min_slab_ratio) / 100;
zone->zone_pgdat = pgdat;
zone->free_pages = 0;
- zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
+ zone->prev_priority = DEF_PRIORITY;
zone_pcp_init(zone);
INIT_LIST_HEAD(&zone->active_list);
if (!size)
continue;
- zonetable_add(zone, nid, j, zone_start_pfn, size);
- ret = init_currently_empty_zone(zone, zone_start_pfn, size);
+ ret = init_currently_empty_zone(zone, zone_start_pfn,
+ size, MEMMAP_EARLY);
BUG_ON(ret);
zone_start_pfn += size;
}
/**
* remove_all_active_ranges - Remove all currently registered regions
+ *
* During discovery, it may be found that a table like SRAT is invalid
* and an alternative discovery method must be used. This function removes
* all currently registered regions.
*/
-void __init remove_all_active_ranges()
+void __init remove_all_active_ranges(void)
{
memset(early_node_map, 0, sizeof(early_node_map));
nr_nodemap_entries = 0;
+#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE
+ memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn));
+ memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn));
+#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */
}
/* Compare two active node_active_regions */
{
int i;
+ /* Regions in the early_node_map can be in any order */
+ sort_node_map();
+
/* Assuming a sorted map, the first range found has the starting pfn */
for_each_active_range_index_in_nid(i, nid)
return early_node_map[i].start_pfn;
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
* It returns the minimum PFN based on information provided via
- * add_active_range()
+ * add_active_range().
*/
unsigned long __init find_min_pfn_with_active_regions(void)
{
* find_max_pfn_with_active_regions - Find the maximum PFN registered
*
* It returns the maximum PFN based on information provided via
- * add_active_range()
+ * add_active_range().
*/
unsigned long __init find_max_pfn_with_active_regions(void)
{
/**
* free_area_init_nodes - Initialise all pg_data_t and zone data
- * @arch_max_dma_pfn: The maximum PFN usable for ZONE_DMA
- * @arch_max_dma32_pfn: The maximum PFN usable for ZONE_DMA32
- * @arch_max_low_pfn: The maximum PFN usable for ZONE_NORMAL
- * @arch_max_high_pfn: The maximum PFN usable for ZONE_HIGHMEM
+ * @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
* Using the page ranges provided by add_active_range(), the size of each
max(max_zone_pfn[i], arch_zone_lowest_possible_pfn[i]);
}
- /* Regions in the early_node_map can be in any order */
- sort_node_map();
-
/* Print out the zone ranges */
printk("Zone PFN ranges:\n");
for (i = 0; i < MAX_NR_ZONES; i++)
#endif /* CONFIG_ARCH_POPULATES_NODE_MAP */
/**
- * set_dma_reserve - Account the specified number of pages reserved in ZONE_DMA
- * @new_dma_reserve - The number of pages to mark reserved
+ * set_dma_reserve - set the specified number of pages reserved in the first zone
+ * @new_dma_reserve: The number of pages to mark reserved
*
* The per-cpu batchsize and zone watermarks are determined by present_pages.
* In the DMA zone, a significant percentage may be consumed by kernel image
* and other unfreeable allocations which can skew the watermarks badly. This
- * function may optionally be used to account for unfreeable pages in
- * ZONE_DMA. The effect will be lower watermarks and smaller per-cpu batchsize
+ * function may optionally be used to account for unfreeable pages in the
+ * first zone (e.g., ZONE_DMA). The effect will be lower watermarks and
+ * smaller per-cpu batchsize.
*/
void __init set_dma_reserve(unsigned long new_dma_reserve)
{
__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
}
-#ifdef CONFIG_HOTPLUG_CPU
static int page_alloc_cpu_notify(struct notifier_block *self,
unsigned long action, void *hcpu)
{
}
return NOTIFY_OK;
}
-#endif /* CONFIG_HOTPLUG_CPU */
void __init page_alloc_init(void)
{
calculate_totalreserve_pages();
}
-/*
- * setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
- * that the pages_{min,low,high} values for each zone are set correctly
- * with respect to min_free_kbytes.
+/**
+ * setup_per_zone_pages_min - called when min_free_kbytes changes.
+ *
+ * Ensures that the pages_{min,low,high} values for each zone are set correctly
+ * with respect to min_free_kbytes.
*/
void setup_per_zone_pages_min(void)
{
/* allow the kernel cmdline to have a say */
if (!numentries) {
/* round applicable memory size up to nearest megabyte */
- numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
+ numentries = nr_kernel_pages;
numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
numentries >>= 20 - PAGE_SHIFT;
numentries <<= 20 - PAGE_SHIFT;
numentries >>= (scale - PAGE_SHIFT);
else
numentries <<= (PAGE_SHIFT - scale);
+
+ /* Make sure we've got at least a 0-order allocation.. */
+ if (unlikely((numentries * bucketsize) < PAGE_SIZE))
+ numentries = PAGE_SIZE / bucketsize;
}
numentries = roundup_pow_of_two(numentries);
if (numentries > max)
numentries = max;
- log2qty = long_log2(numentries);
+ log2qty = ilog2(numentries);
do {
size = bucketsize << log2qty;
printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
tablename,
(1U << log2qty),
- long_log2(size) - PAGE_SHIFT,
+ ilog2(size) - PAGE_SHIFT,
size);
if (_hash_shift)
EXPORT_SYMBOL(pfn_to_page);
EXPORT_SYMBOL(page_to_pfn);
#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */
+
+#if MAX_NUMNODES > 1
+/*
+ * Find the highest possible node id.
+ */
+int highest_possible_node_id(void)
+{
+ unsigned int node;
+ unsigned int highest = 0;
+
+ for_each_node_mask(node, node_possible_map)
+ highest = node;
+ return highest;
+}
+EXPORT_SYMBOL(highest_possible_node_id);
+#endif