X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=20759803a64ad151ef7d49f85308ff99b2b5c073;hb=2f66a68f3fac2e94da360c342ff78ab45553f86c;hp=d56e377ad0852d06cc8ad1de57f03d4636dd0c29;hpb=f2260e6b1f4eba0f5b5906795117791b5c660154;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d56e377..2075980 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -72,6 +73,7 @@ unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; +gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE int pageblock_order __read_mostly; @@ -161,7 +163,9 @@ static unsigned long __meminitdata dma_reserve; #if MAX_NUMNODES > 1 int nr_node_ids __read_mostly = MAX_NUMNODES; +int nr_online_nodes __read_mostly = 1; EXPORT_SYMBOL(nr_node_ids); +EXPORT_SYMBOL(nr_online_nodes); #endif int page_group_by_mobility_disabled __read_mostly; @@ -176,6 +180,8 @@ static void set_pageblock_migratetype(struct page *page, int migratetype) PB_migrate, PB_migrate_end); } +bool oom_killer_disabled __read_mostly; + #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { @@ -298,23 +304,6 @@ void prep_compound_page(struct page *page, unsigned long order) } } -#ifdef CONFIG_HUGETLBFS -void prep_compound_gigantic_page(struct page *page, unsigned long order) -{ - int i; - int nr_pages = 1 << order; - struct page *p = page + 1; - - set_compound_page_dtor(page, free_compound_page); - set_compound_order(page, order); - __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { - __SetPageTail(p); - p->first_page = page; - } -} -#endif - static int destroy_compound_page(struct page *page, unsigned long order) { int i; @@ -491,6 +480,21 @@ static inline void __free_one_page(struct page *page, zone->free_area[order].nr_free++; } +#ifdef CONFIG_HAVE_MLOCKED_PAGE_BIT +/* + * free_page_mlock() -- clean up attempts to free and mlocked() page. + * Page should not be on lru, so no need to fix that up. + * free_pages_check() will verify... + */ +static inline void free_page_mlock(struct page *page) +{ + __dec_zone_page_state(page, NR_MLOCK); + __count_vm_event(UNEVICTABLE_MLOCKFREED); +} +#else +static void free_page_mlock(struct page *page) { } +#endif + static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | @@ -553,7 +557,9 @@ static void __free_pages_ok(struct page *page, unsigned int order) unsigned long flags; int i; int bad = 0; - int clearMlocked = PageMlocked(page); + int wasMlocked = __TestClearPageMlocked(page); + + kmemcheck_free_shadow(page, order); for (i = 0 ; i < (1 << order) ; ++i) bad += free_pages_check(page + i); @@ -569,7 +575,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) kernel_map_pages(page, 1 << order, 0); local_irq_save(flags); - if (unlikely(clearMlocked)) + if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, order, @@ -777,6 +783,17 @@ static int move_freepages_block(struct zone *zone, struct page *page, return move_freepages(zone, start_page, end_page, migratetype); } +static void change_pageblock_range(struct page *pageblock_page, + int start_order, int migratetype) +{ + int nr_pageblocks = 1 << (start_order - pageblock_order); + + while (nr_pageblocks--) { + set_pageblock_migratetype(pageblock_page, migratetype); + pageblock_page += pageblock_nr_pages; + } +} + /* Remove an element from the buddy allocator from the fallback list */ static inline struct page * __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) @@ -811,13 +828,15 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * agressive about taking ownership of free pages */ if (unlikely(current_order >= (pageblock_order >> 1)) || - start_migratetype == MIGRATE_RECLAIMABLE) { + start_migratetype == MIGRATE_RECLAIMABLE || + page_group_by_mobility_disabled) { unsigned long pages; pages = move_freepages_block(zone, page, start_migratetype); /* Claim the whole block if over half of it is free */ - if (pages >= (1 << (pageblock_order-1))) + if (pages >= (1 << (pageblock_order-1)) || + page_group_by_mobility_disabled) set_pageblock_migratetype(page, start_migratetype); @@ -828,8 +847,9 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) list_del(&page->lru); rmv_page_order(page); - if (current_order == pageblock_order) - set_pageblock_migratetype(page, + /* Take ownership for orders >= pageblock_order */ + if (current_order >= pageblock_order) + change_pageblock_range(page, current_order, start_migratetype); expand(zone, page, order, current_order, area, migratetype); @@ -876,7 +896,7 @@ retry_reserve: */ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list, - int migratetype) + int migratetype, int cold) { int i; @@ -895,7 +915,10 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, * merge IO requests if the physical pages are ordered * properly. */ - list_add(&page->lru, list); + if (likely(cold == 0)) + list_add(&page->lru, list); + else + list_add_tail(&page->lru, list); set_page_private(page, migratetype); list = &page->lru; } @@ -1015,7 +1038,9 @@ static void free_hot_cold_page(struct page *page, int cold) struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; unsigned long flags; - int clearMlocked = PageMlocked(page); + int wasMlocked = __TestClearPageMlocked(page); + + kmemcheck_free_shadow(page, 0); if (PageAnon(page)) page->mapping = NULL; @@ -1030,8 +1055,9 @@ static void free_hot_cold_page(struct page *page, int cold) kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp; + set_page_private(page, get_pageblock_migratetype(page)); local_irq_save(flags); - if (unlikely(clearMlocked)) + if (unlikely(wasMlocked)) free_page_mlock(page); __count_vm_event(PGFREE); @@ -1039,7 +1065,6 @@ static void free_hot_cold_page(struct page *page, int cold) list_add_tail(&page->lru, &pcp->list); else list_add(&page->lru, &pcp->list); - set_page_private(page, get_pageblock_migratetype(page)); pcp->count++; if (pcp->count >= pcp->high) { free_pages_bulk(zone, pcp->batch, &pcp->list, 0); @@ -1073,6 +1098,16 @@ void split_page(struct page *page, unsigned int order) VM_BUG_ON(PageCompound(page)); VM_BUG_ON(!page_count(page)); + +#ifdef CONFIG_KMEMCHECK + /* + * Split shadow pages too, because free(page[0]) would + * otherwise free the whole shadow. + */ + if (kmemcheck_page_is_tracked(page)) + split_page(virt_to_page(page[0].shadow), order); +#endif + for (i = 1; i < (1 << order); i++) set_page_refcounted(page + i); } @@ -1101,7 +1136,8 @@ again: local_irq_save(flags); if (!pcp->count) { pcp->count = rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); + pcp->batch, &pcp->list, + migratetype, cold); if (unlikely(!pcp->count)) goto failed; } @@ -1119,14 +1155,38 @@ again: /* Allocate more to the pcp list if necessary */ if (unlikely(&page->lru == &pcp->list)) { + int get_one_page = 0; + pcp->count += rmqueue_bulk(zone, 0, - pcp->batch, &pcp->list, migratetype); - page = list_entry(pcp->list.next, struct page, lru); + pcp->batch, &pcp->list, + migratetype, cold); + list_for_each_entry(page, &pcp->list, lru) { + if (get_pageblock_migratetype(page) != + MIGRATE_ISOLATE) { + get_one_page = 1; + break; + } + } + if (!get_one_page) + goto failed; } list_del(&page->lru); pcp->count--; } else { + if (unlikely(gfp_flags & __GFP_NOFAIL)) { + /* + * __GFP_NOFAIL is not to be used in new code. + * + * All __GFP_NOFAIL callers should be fixed so that they + * properly detect and handle allocation failures. + * + * We most definitely don't want callers attempting to + * allocate greater than order-1 page units with + * __GFP_NOFAIL. + */ + WARN_ON_ONCE(order > 1); + } spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order, migratetype); __mod_zone_page_state(zone, NR_FREE_PAGES, -(1 << order)); @@ -1428,9 +1488,6 @@ get_page_from_freelist(gfp_t gfp_mask, nodemask_t *nodemask, unsigned int order, int zlc_active = 0; /* set if using zonelist_cache */ int did_zlc_setup = 0; /* just call zlc_setup() one time */ - if (WARN_ON_ONCE(order >= MAX_ORDER)) - return NULL; - classzone_idx = zone_idx(preferred_zone); zonelist_scan: /* @@ -1449,15 +1506,33 @@ zonelist_scan: BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { unsigned long mark; + int ret; + mark = zone->watermark[alloc_flags & ALLOC_WMARK_MASK]; - if (!zone_watermark_ok(zone, order, mark, - classzone_idx, alloc_flags)) { - if (!zone_reclaim_mode || - !zone_reclaim(zone, gfp_mask, order)) + if (zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) + goto try_this_zone; + + if (zone_reclaim_mode == 0) + goto this_zone_full; + + ret = zone_reclaim(zone, gfp_mask, order); + switch (ret) { + case ZONE_RECLAIM_NOSCAN: + /* did not scan */ + goto try_next_zone; + case ZONE_RECLAIM_FULL: + /* scanned but unreclaimable */ + goto this_zone_full; + default: + /* did we reclaim enough */ + if (!zone_watermark_ok(zone, order, mark, + classzone_idx, alloc_flags)) goto this_zone_full; } } +try_this_zone: page = buffered_rmqueue(preferred_zone, zone, order, gfp_mask, migratetype); if (page) @@ -1466,7 +1541,7 @@ this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); try_next_zone: - if (NUMA_BUILD && !did_zlc_setup && num_online_nodes() > 1) { + if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { /* * we do zlc_setup after the first zone is tried but only * if there are multiple nodes make it worthwhile @@ -1548,7 +1623,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, goto out; /* The OOM killer will not help higher order allocs */ - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (order > PAGE_ALLOC_COSTLY_ORDER && !(gfp_mask & __GFP_NOFAIL)) goto out; /* Exhausted what can be done so it's blamo time */ @@ -1574,10 +1649,6 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - - /* - * The task's cpuset might have expanded its set of allowable nodes - */ p->flags |= PF_MEMALLOC; lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; @@ -1620,7 +1691,7 @@ __alloc_pages_high_priority(gfp_t gfp_mask, unsigned int order, preferred_zone, migratetype); if (!page && gfp_mask & __GFP_NOFAIL) - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); } while (!page && (gfp_mask & __GFP_NOFAIL)); return page; @@ -1689,6 +1760,17 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, struct task_struct *p = current; /* + * In the slowpath, we sanity check order to avoid ever trying to + * reclaim >= MAX_ORDER areas which will never succeed. Callers may + * be using allocators in order of preference for an area that is + * too large. + */ + if (order >= MAX_ORDER) { + WARN_ON_ONCE(!(gfp_mask & __GFP_NOWARN)); + return NULL; + } + + /* * GFP_THISNODE (meaning __GFP_THISNODE, __GFP_NORETRY and * __GFP_NOWARN set) should not cause reclaim since the subsystem * (f.e. slab) using GFP_THISNODE may choose to trigger reclaim @@ -1701,6 +1783,7 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, wake_all_kswapd(order, zonelist, high_zoneidx); +restart: /* * OK, we're below the kswapd watermark and have kicked background * reclaim. Now things get more complex, so set up alloc_flags according @@ -1708,7 +1791,6 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, */ alloc_flags = gfp_to_alloc_flags(gfp_mask); -restart: /* This is the last chance, in general, before the goto nopage. */ page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, alloc_flags & ~ALLOC_NO_WATERMARKS, @@ -1734,6 +1816,10 @@ rebalance: if (p->flags & PF_MEMALLOC) goto nopage; + /* Avoid allocations with no watermarks from looping endlessly */ + if (test_thread_flag(TIF_MEMDIE) && !(gfp_mask & __GFP_NOFAIL)) + goto nopage; + /* Try direct reclaim and then allocating */ page = __alloc_pages_direct_reclaim(gfp_mask, order, zonelist, high_zoneidx, @@ -1749,6 +1835,8 @@ rebalance: */ if (!did_some_progress) { if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { + if (oom_killer_disabled) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, @@ -1757,11 +1845,13 @@ rebalance: goto got_pg; /* - * The OOM killer does not trigger for high-order allocations - * but if no progress is being made, there are no other - * options and retrying is unlikely to help + * The OOM killer does not trigger for high-order + * ~__GFP_NOFAIL allocations so if no progress is being + * made, there are no other options and retrying is + * unlikely to help. */ - if (order > PAGE_ALLOC_COSTLY_ORDER) + if (order > PAGE_ALLOC_COSTLY_ORDER && + !(gfp_mask & __GFP_NOFAIL)) goto nopage; goto restart; @@ -1772,7 +1862,7 @@ rebalance: pages_reclaimed += did_some_progress; if (should_alloc_retry(gfp_mask, order, pages_reclaimed)) { /* Wait for some write requests to complete then retry */ - congestion_wait(WRITE, HZ/50); + congestion_wait(BLK_RW_ASYNC, HZ/50); goto rebalance; } @@ -1784,7 +1874,10 @@ nopage: dump_stack(); show_mem(); } + return page; got_pg: + if (kmemcheck_enabled) + kmemcheck_pagealloc_alloc(page, order, gfp_mask); return page; } @@ -1801,6 +1894,8 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, struct page *page; int migratetype = allocflags_to_migratetype(gfp_mask); + gfp_mask &= gfp_allowed_mask; + lockdep_trace_alloc(gfp_mask); might_sleep_if(gfp_mask & __GFP_WAIT); @@ -1839,31 +1934,25 @@ EXPORT_SYMBOL(__alloc_pages_nodemask); */ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) { - struct page * page; + struct page *page; + + /* + * __get_free_pages() returns a 32-bit address, which cannot represent + * a highmem page + */ + VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); + page = alloc_pages(gfp_mask, order); if (!page) return 0; return (unsigned long) page_address(page); } - EXPORT_SYMBOL(__get_free_pages); unsigned long get_zeroed_page(gfp_t gfp_mask) { - struct page * page; - - /* - * get_zeroed_page() returns a 32-bit address, which cannot represent - * a highmem page - */ - VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); - - page = alloc_pages(gfp_mask | __GFP_ZERO, 0); - if (page) - return (unsigned long) page_address(page); - return 0; + return __get_free_pages(gfp_mask | __GFP_ZERO, 0); } - EXPORT_SYMBOL(get_zeroed_page); void __pagevec_free(struct pagevec *pvec) @@ -1919,7 +2008,7 @@ void *alloc_pages_exact(size_t size, gfp_t gfp_mask) unsigned long alloc_end = addr + (PAGE_SIZE << order); unsigned long used = addr + PAGE_ALIGN(size); - split_page(virt_to_page(addr), order); + split_page(virt_to_page((void *)addr), order); while (used < alloc_end) { free_page(used); used += PAGE_SIZE; @@ -2051,28 +2140,28 @@ void show_free_areas(void) } } - printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" - " inactive_file:%lu" -//TODO: check/adjust line lengths -#ifdef CONFIG_UNEVICTABLE_LRU + printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" + " active_file:%lu inactive_file:%lu isolated_file:%lu\n" " unevictable:%lu" -#endif - " dirty:%lu writeback:%lu unstable:%lu\n" - " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", + " dirty:%lu writeback:%lu unstable:%lu buffer:%lu\n" + " free:%lu slab_reclaimable:%lu slab_unreclaimable:%lu\n" + " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n", global_page_state(NR_ACTIVE_ANON), - global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_ANON), + global_page_state(NR_ISOLATED_ANON), + global_page_state(NR_ACTIVE_FILE), global_page_state(NR_INACTIVE_FILE), -#ifdef CONFIG_UNEVICTABLE_LRU + global_page_state(NR_ISOLATED_FILE), global_page_state(NR_UNEVICTABLE), -#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), + nr_blockdev_pages(), global_page_state(NR_FREE_PAGES), - global_page_state(NR_SLAB_RECLAIMABLE) + - global_page_state(NR_SLAB_UNRECLAIMABLE), + global_page_state(NR_SLAB_RECLAIMABLE), + global_page_state(NR_SLAB_UNRECLAIMABLE), global_page_state(NR_FILE_MAPPED), + global_page_state(NR_SHMEM), global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); @@ -2089,10 +2178,22 @@ void show_free_areas(void) " inactive_anon:%lukB" " active_file:%lukB" " inactive_file:%lukB" -#ifdef CONFIG_UNEVICTABLE_LRU " unevictable:%lukB" -#endif + " isolated(anon):%lukB" + " isolated(file):%lukB" " present:%lukB" + " mlocked:%lukB" + " dirty:%lukB" + " writeback:%lukB" + " mapped:%lukB" + " shmem:%lukB" + " slab_reclaimable:%lukB" + " slab_unreclaimable:%lukB" + " kernel_stack:%lukB" + " pagetables:%lukB" + " unstable:%lukB" + " bounce:%lukB" + " writeback_tmp:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" "\n", @@ -2105,10 +2206,23 @@ void show_free_areas(void) K(zone_page_state(zone, NR_INACTIVE_ANON)), K(zone_page_state(zone, NR_ACTIVE_FILE)), K(zone_page_state(zone, NR_INACTIVE_FILE)), -#ifdef CONFIG_UNEVICTABLE_LRU K(zone_page_state(zone, NR_UNEVICTABLE)), -#endif + K(zone_page_state(zone, NR_ISOLATED_ANON)), + K(zone_page_state(zone, NR_ISOLATED_FILE)), K(zone->present_pages), + K(zone_page_state(zone, NR_MLOCK)), + K(zone_page_state(zone, NR_FILE_DIRTY)), + K(zone_page_state(zone, NR_WRITEBACK)), + K(zone_page_state(zone, NR_FILE_MAPPED)), + K(zone_page_state(zone, NR_SHMEM)), + K(zone_page_state(zone, NR_SLAB_RECLAIMABLE)), + K(zone_page_state(zone, NR_SLAB_UNRECLAIMABLE)), + zone_page_state(zone, NR_KERNEL_STACK) * + THREAD_SIZE / 1024, + K(zone_page_state(zone, NR_PAGETABLE)), + K(zone_page_state(zone, NR_UNSTABLE_NFS)), + K(zone_page_state(zone, NR_BOUNCE)), + K(zone_page_state(zone, NR_WRITEBACK_TEMP)), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") ); @@ -2265,7 +2379,7 @@ int numa_zonelist_order_handler(ctl_table *table, int write, } -#define MAX_NODE_LOAD (num_online_nodes()) +#define MAX_NODE_LOAD (nr_online_nodes) static int node_load[MAX_NUMNODES]; /** @@ -2474,11 +2588,10 @@ static void build_zonelists(pg_data_t *pgdat) /* NUMA-aware ordering of nodes */ local_node = pgdat->node_id; - load = num_online_nodes(); + load = nr_online_nodes; prev_node = local_node; nodes_clear(used_mask); - memset(node_load, 0, sizeof(node_load)); memset(node_order, 0, sizeof(node_order)); j = 0; @@ -2587,6 +2700,9 @@ static int __build_all_zonelists(void *dummy) { int nid; +#ifdef CONFIG_NUMA + memset(node_load, 0, sizeof(node_load)); +#endif for_each_online_node(nid) { pg_data_t *pgdat = NODE_DATA(nid); @@ -2625,7 +2741,7 @@ void build_all_zonelists(void) printk("Built %i zonelists in %s order, mobility grouping %s. " "Total pages: %ld\n", - num_online_nodes(), + nr_online_nodes, zonelist_order_name[current_zonelist_order], page_group_by_mobility_disabled ? "off" : "on", vm_total_pages); @@ -2971,7 +3087,7 @@ bad: if (dzone == zone) break; kfree(zone_pcp(dzone, cpu)); - zone_pcp(dzone, cpu) = NULL; + zone_pcp(dzone, cpu) = &boot_pageset[cpu]; } return -ENOMEM; } @@ -2986,7 +3102,7 @@ static inline void free_zone_pagesets(int cpu) /* Free per_cpu_pageset if it is slab allocated */ if (pset != &boot_pageset[cpu]) kfree(pset); - zone_pcp(zone, cpu) = NULL; + zone_pcp(zone, cpu) = &boot_pageset[cpu]; } } @@ -3076,6 +3192,32 @@ int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) return 0; } +static int __zone_pcp_update(void *data) +{ + struct zone *zone = data; + int cpu; + unsigned long batch = zone_batchsize(zone), flags; + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + struct per_cpu_pageset *pset; + struct per_cpu_pages *pcp; + + pset = zone_pcp(zone, cpu); + pcp = &pset->pcp; + + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + setup_pageset(pset, batch); + local_irq_restore(flags); + } + return 0; +} + +void zone_pcp_update(struct zone *zone) +{ + stop_machine(__zone_pcp_update, zone, NULL); +} + static __meminit void zone_pcp_init(struct zone *zone) { int cpu; @@ -3650,7 +3792,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone_pcp_init(zone); for_each_lru(l) { INIT_LIST_HEAD(&zone->lru[l].list); - zone->lru[l].nr_scan = 0; + zone->lru[l].nr_saved_scan = 0; } zone->reclaim_stat.recent_rotated[0] = 0; zone->reclaim_stat.recent_rotated[1] = 0; @@ -3977,6 +4119,8 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) int i, nid; unsigned long usable_startpfn; unsigned long kernelcore_node, kernelcore_remaining; + /* save the state before borrow the nodemask */ + nodemask_t saved_node_state = node_states[N_HIGH_MEMORY]; unsigned long totalpages = early_calculate_totalpages(); int usable_nodes = nodes_weight(node_states[N_HIGH_MEMORY]); @@ -4004,7 +4148,7 @@ static void __init find_zone_movable_pfns_for_nodes(unsigned long *movable_pfn) /* If kernelcore was not specified, there is no ZONE_MOVABLE */ if (!required_kernelcore) - return; + goto out; /* usable_startpfn is the lowest possible pfn ZONE_MOVABLE can be at */ find_usable_zone_for_movable(); @@ -4103,6 +4247,10 @@ restart: for (nid = 0; nid < MAX_NUMNODES; nid++) zone_movable_pfn[nid] = roundup(zone_movable_pfn[nid], MAX_ORDER_NR_PAGES); + +out: + /* restore the node_state */ + node_states[N_HIGH_MEMORY] = saved_node_state; } /* Any regular memory on that node ? */ @@ -4372,12 +4520,13 @@ static void setup_per_zone_lowmem_reserve(void) } /** - * setup_per_zone_pages_min - called when min_free_kbytes changes. + * setup_per_zone_wmarks - called when min_free_kbytes changes + * or when memory is hot-{added|removed} * - * Ensures that the pages_{min,low,high} values for each zone are set correctly - * with respect to min_free_kbytes. + * Ensures that the watermark[min,low,high] values for each zone are set + * correctly with respect to min_free_kbytes. */ -void setup_per_zone_pages_min(void) +void setup_per_zone_wmarks(void) { unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10); unsigned long lowmem_pages = 0; @@ -4432,9 +4581,7 @@ void setup_per_zone_pages_min(void) calculate_totalreserve_pages(); } -/** - * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. - * +/* * The inactive anon list should be small enough that the VM never has to * do too much work, but large enough that each inactive page has a chance * to be referenced again before it is swapped out. @@ -4455,21 +4602,26 @@ void setup_per_zone_pages_min(void) * 1TB 101 10GB * 10TB 320 32GB */ -static void setup_per_zone_inactive_ratio(void) +void calculate_zone_inactive_ratio(struct zone *zone) { - struct zone *zone; + unsigned int gb, ratio; - for_each_zone(zone) { - unsigned int gb, ratio; - - /* Zone size in gigabytes */ - gb = zone->present_pages >> (30 - PAGE_SHIFT); + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + if (gb) ratio = int_sqrt(10 * gb); - if (!ratio) - ratio = 1; + else + ratio = 1; - zone->inactive_ratio = ratio; - } + zone->inactive_ratio = ratio; +} + +static void __init setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) + calculate_zone_inactive_ratio(zone); } /* @@ -4496,7 +4648,7 @@ static void setup_per_zone_inactive_ratio(void) * 8192MB: 11584k * 16384MB: 16384k */ -static int __init init_per_zone_pages_min(void) +static int __init init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; @@ -4507,12 +4659,12 @@ static int __init init_per_zone_pages_min(void) min_free_kbytes = 128; if (min_free_kbytes > 65536) min_free_kbytes = 65536; - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); setup_per_zone_lowmem_reserve(); setup_per_zone_inactive_ratio(); return 0; } -module_init(init_per_zone_pages_min) +module_init(init_per_zone_wmark_min) /* * min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so @@ -4524,7 +4676,7 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, { proc_dointvec(table, write, file, buffer, length, ppos); if (write) - setup_per_zone_pages_min(); + setup_per_zone_wmarks(); return 0; } @@ -4595,7 +4747,7 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); if (!write || (ret == -EINVAL)) return ret; - for_each_zone(zone) { + for_each_populated_zone(zone) { for_each_online_cpu(cpu) { unsigned long high; high = zone->present_pages / percpu_pagelist_fraction; @@ -4675,25 +4827,14 @@ void *__init alloc_large_system_hash(const char *tablename, else if (hashdist) table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL); else { - unsigned long order = get_order(size); - - if (order < MAX_ORDER) - table = (void *)__get_free_pages(GFP_ATOMIC, - order); /* * If bucketsize is not a power-of-two, we may free - * some pages at the end of hash table. + * some pages at the end of hash table which + * alloc_pages_exact() automatically does */ - if (table) { - unsigned long alloc_end = (unsigned long)table + - (PAGE_SIZE << order); - unsigned long used = (unsigned long)table + - PAGE_ALIGN(size); - split_page(virt_to_page(table), order); - while (used < alloc_end) { - free_page(used); - used += PAGE_SIZE; - } + if (get_order(size) < MAX_ORDER) { + table = alloc_pages_exact(size, GFP_ATOMIC); + kmemleak_alloc(table, size, 1, GFP_ATOMIC); } } } while (!table && size > PAGE_SIZE && --log2qty); @@ -4712,16 +4853,6 @@ void *__init alloc_large_system_hash(const char *tablename, if (_hash_mask) *_hash_mask = (1 << log2qty) - 1; - /* - * If hashdist is set, the table allocation is done with __vmalloc() - * which invokes the kmemleak_alloc() callback. This function may also - * be called before the slab and kmemleak are initialised when - * kmemleak simply buffers the request to be executed later - * (GFP_ATOMIC flag ignored in this case). - */ - if (!hashdist) - kmemleak_alloc(table, size, 1, GFP_ATOMIC); - return table; } @@ -4815,13 +4946,16 @@ int set_migratetype_isolate(struct page *page) struct zone *zone; unsigned long flags; int ret = -EBUSY; + int zone_idx; zone = page_zone(page); + zone_idx = zone_idx(zone); spin_lock_irqsave(&zone->lock, flags); /* * In future, more migrate types will be able to be isolation target. */ - if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE) + if (get_pageblock_migratetype(page) != MIGRATE_MOVABLE && + zone_idx != ZONE_MOVABLE) goto out; set_pageblock_migratetype(page, MIGRATE_ISOLATE); move_freepages_block(zone, page, MIGRATE_ISOLATE);