X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=caa92689aac951e5d6c08c89993526898bc695d1;hb=c180604a87c5abb0a117998009d01a4499d58653;hp=abe26003124de3d3720a87684ee96daf7c720499;hpb=418589663d6011de9006425b6c5721e1544fb47a;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index abe2600..caa9268 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; +gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE int pageblock_order __read_mostly; @@ -161,7 +163,9 @@ static unsigned long __meminitdata dma_reserve; #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; +int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); +EXPORT_SYMBOL(nr_online_nodes); #endif int page_group_by_mobility_disabled __read_mostly; @@ -176,6 +180,8 @@ static void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } +bool oom_killer_disabled __read_mostly; + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -298,23 +304,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -#ifdef CONFIG_HUGETLBFS -void prep_compound_gigantic_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - struct page *p = page + 1; - - set_compound_page_dtor(page, free_compound_page); - set_compound_order(page, order); - __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __SetPageTail(p); - p->first_page = page; - } -} -#endif - static int destroy_compound_page(struct page *page, unsigned long order) { int i; @@ -456,7 +445,6 @@ static inline void __free_one_page(struct page *page, int migratetype) { unsigned long page_idx; - int order_size = 1 << order; if (unlikely(PageCompound(page))) if (unlikely(destroy_compound_page(page, order))) @@ -466,10 +454,9 @@ static inline void __free_one_page(struct page *page, page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); - VM_BUG_ON(page_idx & (order_size - 1)); + VM_BUG_ON(page_idx & ((1 << order) - 1)); VM_BUG_ON(bad_range(zone, page)); - __mod_zone_page_state(zone, NR_FREE_PAGES, order_size); while (order < MAX_ORDER-1) { unsigned long combined_idx; struct page *buddy; @@ -493,6 +480,21 @@ static inline void __free_one_page(struct page *page, zone->free_area[order].nr_free++; } +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT +/* + * free_page_mlock() -- clean up attempts to free and mlocked() page. + * Page should not be on lru, so no need to fix that up. + * free_pages_check() will verify... + */ +static inline void free_page_mlock(struct page *page) +{ + __dec_zone_page_state(page, NR_MLOCK); + __count_vm_event(UNEVICTABLE_MLOCKFREED); +} +#else +static void free_page_mlock(struct page *page) { } +#endif + static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | @@ -524,6 +526,8 @@ static void free_pages_bulk(struct zone *zone, int count, spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; + + __mod_zone_page_state(zone, NR_FREE_PAGES, count << order); while (count--) { struct page *page; @@ -542,6 +546,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order, spin_lock(&zone->lock); zone_clear_flag(zone, ZONE_ALL_UNRECLAIMABLE); zone->pages_scanned = 0; + + __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order); __free_one_page(page, zone, order, migratetype); spin_unlock(&zone->lock); } @@ -551,7 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) unsigned long flags; int i; int bad = 0; - int clearMlocked = PageMlocked(page); + int wasMlocked = TestClearPageMlocked(page); + + kmemcheck_free_shadow(page, order); for (i = 0 ; i < (1 << order) ; ++i) bad += free_pages_check(page + i); @@ -567,7 +575,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) kernel_map_pages(page, 1 << order, 0); local_irq_save(flags); - if (unlikely(clearMlocked)) + if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order, @@ -686,7 +694,6 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order, list_del(&page->lru); rmv_page_order(page); area->nr_free--; - __mod_zone_page_state(zone, NR_FREE_PAGES, - (1UL << order)); expand(zone, page, order, current_order, area, migratetype); return page; } @@ -826,8 +833,6 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) /* Remove the page from the freelists */ list_del(&page->lru); rmv_page_order(page); - __mod_zone_page_state(zone, NR_FREE_PAGES, - -(1UL << order)); if (current_order == pageblock_order) set_pageblock_migratetype(page, @@ -900,6 +905,7 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, set_page_private(page, migratetype); list = &page->lru; } + __mod_zone_page_state(zone, NR_FREE_PAGES, -(i << order)); spin_unlock(&zone->lock); return i; } @@ -1015,7 +1021,9 @@ static void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; - int clearMlocked = PageMlocked(page); + int wasMlocked = TestClearPageMlocked(page); + + kmemcheck_free_shadow(page, 0); if (PageAnon(page)) page->mapping = NULL; @@ -1030,8 +1038,9 @@ static void free_hot_cold_page(struct page *page, int cold) kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp; + set_page_private(page, get_pageblock_migratetype(page)); local_irq_save(flags); - if (unlikely(clearMlocked)) + if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_event(PGFREE); @@ -1039,7 +1048,6 @@ static void free_hot_cold_page(struct page *page, int cold) list_add_tail(&page->lru, &pcp->list); else list_add(&page->lru, &pcp->list); - set_page_private(page, get_pageblock_migratetype(page)); pcp->count++; if (pcp->count >= pcp->high) { free_pages_bulk(zone, pcp->batch, &pcp->list, 0); @@ -1073,6 +1081,16 @@ void split_page(struct page *page, unsigned int order) VM_BUG_ON(PageCompound(page)); VM_BUG_ON(!page_count(page)); + +#ifdef CONFIG_KMEMCHECK + /* + * Split shadow pages too, because free(page[0]) would + * otherwise free the whole shadow. + */ + if (kmemcheck_page_is_tracked(page)) + split_page(virt_to_page(page[0].shadow), order); +#endif + for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); } @@ -1127,8 +1145,22 @@ again: list_del(&page->lru); pcp->count--; } else { + if (unlikely(gfp_flags & __GFP_NOFAIL)) { + /* + * __GFP_NOFAIL is not to be used in new code. + * + * All __GFP_NOFAIL callers should be fixed so that they + * properly detect and handle allocation failures. + * + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with + * __GFP_NOFAIL. + */ + WARN_ON_ONCE(order > 1); + } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); + __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); spin_unlock(&zone->lock); if (!page) goto failed; @@ -1427,9 +1459,6 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - if (WARN_ON_ONCE(order >= MAX_ORDER)) - return NULL; - classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* @@ -1448,15 +1477,33 @@ zonelist_scan: BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; + int ret; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (!zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) { - if (!zone_reclaim_mode || - !zone_reclaim(zone, gfp_mask, order)) + if (zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) + goto try_this_zone; + + if (zone_reclaim_mode == 0) + goto this_zone_full; + + ret = zone_reclaim(zone, gfp_mask, order); + switch (ret) { + case ZONE_RECLAIM_NOSCAN: + /* did not scan */ + goto try_next_zone; + case ZONE_RECLAIM_FULL: + /* scanned but unreclaimable */ + goto this_zone_full; + default: + /* did we reclaim enough */ + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) goto this_zone_full; } } +try_this_zone: page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask, migratetype); if (page) @@ -1465,7 +1512,7 @@ this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); try_next_zone: - if (NUMA_BUILD && !did_zlc_setup && num_online_nodes() > 1) { + if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { /* * we do zlc_setup after the first zone is tried but only * if there are multiple nodes make it worthwhile @@ -1547,7 +1594,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) goto out; /* Exhausted what can be done so it's blamo time */ @@ -1619,7 +1666,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -1688,6 +1735,15 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct task_struct *p = current; /* + * In the slowpath, we sanity check order to avoid ever trying to + * reclaim >= MAX_ORDER areas which will never succeed. Callers may + * be using allocators in order of preference for an area that is + * too large. + */ + if (WARN_ON_ONCE(order >= MAX_ORDER)) + return NULL; + + /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and * __GFP_NOWARN set) should not cause reclaim since the subsystem * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim @@ -1748,6 +1804,8 @@ rebalance: */ if (!did_some_progress) { if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (oom_killer_disabled) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, @@ -1756,11 +1814,13 @@ rebalance: goto got_pg; /* - * The OOM killer does not trigger for high-order allocations - * but if no progress is being made, there are no other - * options and retrying is unlikely to help + * The OOM killer does not trigger for high-order + * ~__GFP_NOFAIL allocations so if no progress is being + * made, there are no other options and retrying is + * unlikely to help. */ - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (order > PAGE_ALLOC_COSTLY_ORDER && + !(gfp_mask & __GFP_NOFAIL)) goto nopage; goto restart; @@ -1771,7 +1831,7 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto rebalance; } @@ -1783,7 +1843,10 @@ nopage: dump_stack(); show_mem(); } + return page; got_pg: + if (kmemcheck_enabled) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); return page; } @@ -1800,6 +1863,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page; int migratetype = allocflags_to_migratetype(gfp_mask); + gfp_mask &= gfp_allowed_mask; + lockdep_trace_alloc(gfp_mask); might_sleep_if(gfp_mask & __GFP_WAIT); @@ -1918,7 +1983,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned long alloc_end = addr + (PAGE_SIZE << order); unsigned long used = addr + PAGE_ALIGN(size); - split_page(virt_to_page(addr), order); + split_page(virt_to_page((void *)addr), order); while (used < alloc_end) { free_page(used); used += PAGE_SIZE; @@ -2052,19 +2117,14 @@ void show_free_areas(void) printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" " inactive_file:%lu" -//TODO: check/adjust line lengths -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lu" -#endif " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), global_page_state(NR_INACTIVE_FILE), -#ifdef CONFIG_UNEVICTABLE_LRU global_page_state(NR_UNEVICTABLE), -#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -2088,9 +2148,7 @@ void show_free_areas(void) " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lukB" -#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -2104,9 +2162,7 @@ void show_free_areas(void) K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU K(zone_page_state(zone, NR_UNEVICTABLE)), -#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") @@ -2264,7 +2320,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, } -#define MAX_NODE_LOAD (num_online_nodes()) +#define MAX_NODE_LOAD (nr_online_nodes) static int node_load[MAX_NUMNODES]; /** @@ -2473,7 +2529,7 @@ static void build_zonelists(pg_data_t *pgdat) /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = num_online_nodes(); + load = nr_online_nodes; prev_node = local_node; nodes_clear(used_mask); @@ -2624,7 +2680,7 @@ void build_all_zonelists(void) printk("Built %i zonelists in %s order, mobility grouping %s. " "Total pages: %ld\n", - num_online_nodes(), + nr_online_nodes, zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); @@ -2970,7 +3026,7 @@ bad: if (dzone == zone) break; kfree(zone_pcp(dzone, cpu)); - zone_pcp(dzone, cpu) = NULL; + zone_pcp(dzone, cpu) = &boot_pageset[cpu]; } return -ENOMEM; } @@ -2985,7 +3041,7 @@ static inline void free_zone_pagesets(int cpu) /* Free per_cpu_pageset if it is slab allocated */ if (pset != &boot_pageset[cpu]) kfree(pset); - zone_pcp(zone, cpu) = NULL; + zone_pcp(zone, cpu) = &boot_pageset[cpu]; } } @@ -3649,7 +3705,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; @@ -3976,6 +4032,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) int i, nid; unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; + /* save the state before borrow the nodemask */ + nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); @@ -4003,7 +4061,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) /* If kernelcore was not specified, there is no ZONE_MOVABLE */ if (!required_kernelcore) - return; + goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ find_usable_zone_for_movable(); @@ -4102,6 +4160,10 @@ restart: for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + +out: + /* restore the node_state */ + node_states[N_HIGH_MEMORY] = saved_node_state; } /* Any regular memory on that node ? */ @@ -4371,12 +4433,13 @@ static void setup_per_zone_lowmem_reserve(void) } /** - * setup_per_zone_pages_min - called when min_free_kbytes changes. + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-{added|removed} * - * Ensures that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. */ -void setup_per_zone_pages_min(void) +void setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -4432,8 +4495,6 @@ void setup_per_zone_pages_min(void) } /** - * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. - * * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance * to be referenced again before it is swapped out. @@ -4454,21 +4515,26 @@ void setup_per_zone_pages_min(void) * 1TB 101 10GB * 10TB 320 32GB */ -static void setup_per_zone_inactive_ratio(void) +void calculate_zone_inactive_ratio(struct zone *zone) { - struct zone *zone; - - for_each_zone(zone) { - unsigned int gb, ratio; + unsigned int gb, ratio; - /* Zone size in gigabytes */ - gb = zone->present_pages >> (30 - PAGE_SHIFT); + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + if (gb) ratio = int_sqrt(10 * gb); - if (!ratio) - ratio = 1; + else + ratio = 1; - zone->inactive_ratio = ratio; - } + zone->inactive_ratio = ratio; +} + +static void __init setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) + calculate_zone_inactive_ratio(zone); } /* @@ -4495,7 +4561,7 @@ static void setup_per_zone_inactive_ratio(void) * 8192MB: 11584k * 16384MB: 16384k */ -static int __init init_per_zone_pages_min(void) +static int __init init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; @@ -4506,12 +4572,12 @@ static int __init init_per_zone_pages_min(void) min_free_kbytes = 128; if (min_free_kbytes > 65536) min_free_kbytes = 65536; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); setup_per_zone_lowmem_reserve(); setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_pages_min) +module_init(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so @@ -4523,7 +4589,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, { proc_dointvec(table, write, file, buffer, length, ppos); if (write) - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); return 0; } @@ -4594,7 +4660,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); if (!write || (ret == -EINVAL)) return ret; - for_each_zone(zone) { + for_each_populated_zone(zone) { for_each_online_cpu(cpu) { unsigned long high; high = zone->present_pages / percpu_pagelist_fraction; @@ -4674,25 +4740,14 @@ void *__init alloc_large_system_hash(const char *tablename, else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { - unsigned long order = get_order(size); - - if (order < MAX_ORDER) - table = (void *)__get_free_pages(GFP_ATOMIC, - order); /* * If bucketsize is not a power-of-two, we may free - * some pages at the end of hash table. + * some pages at the end of hash table which + * alloc_pages_exact() automatically does */ - if (table) { - unsigned long alloc_end = (unsigned long)table + - (PAGE_SIZE << order); - unsigned long used = (unsigned long)table + - PAGE_ALIGN(size); - split_page(virt_to_page(table), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } + if (get_order(size) < MAX_ORDER) { + table = alloc_pages_exact(size, GFP_ATOMIC); + kmemleak_alloc(table, size, 1, GFP_ATOMIC); } } } while (!table && size > PAGE_SIZE && --log2qty); @@ -4711,16 +4766,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (_hash_mask) *_hash_mask = (1 << log2qty) - 1; - /* - * If hashdist is set, the table allocation is done with __vmalloc() - * which invokes the kmemleak_alloc() callback. This function may also - * be called before the slab and kmemleak are initialised when - * kmemleak simply buffers the request to be executed later - * (GFP_ATOMIC flag ignored in this case). - */ - if (!hashdist) - kmemleak_alloc(table, size, 1, GFP_ATOMIC); - return table; }