X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=fe753ecf2aa5fd234dedb6916c333055bcfe8b36;hb=3a6be87fd1e5cdbbc3b6a14d02a3efa9ecba1d3f;hp=5886586fde6ca7c4971bec788b688128e0206c66;hpb=b291f000393f5a0b679012b39d79fbc85c018233;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5886586..fe753ec 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -44,7 +44,7 @@ #include #include #include -#include +#include #include #include @@ -69,7 +69,7 @@ EXPORT_SYMBOL(node_states); unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; -long nr_swap_pages; +unsigned long highest_memmap_pfn __read_mostly; int percpu_pagelist_fraction; #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE @@ -223,24 +223,41 @@ static inline int bad_range(struct zone *zone, struct page *page) static void bad_page(struct page *page) { - void *pc = page_get_page_cgroup(page); + static unsigned long resume; + static unsigned long nr_shown; + static unsigned long nr_unshown; - printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG - "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - current->comm, page, (int)(2*sizeof(unsigned long)), - (unsigned long)page->flags, page->mapping, - page_mapcount(page), page_count(page)); - if (pc) { - printk(KERN_EMERG "cgroup:%p\n", pc); - page_reset_bad_cgroup(page); + /* + * Allow a burst of 60 reports, then keep quiet for that minute; + * or allow a steady drip of one report per second. + */ + if (nr_shown == 60) { + if (time_before(jiffies, resume)) { + nr_unshown++; + goto out; + } + if (nr_unshown) { + printk(KERN_ALERT + "BUG: Bad page state: %lu messages suppressed\n", + nr_unshown); + nr_unshown = 0; + } + nr_shown = 0; } - printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" - KERN_EMERG "Backtrace:\n"); + if (nr_shown++ == 0) + resume = jiffies + 60 * HZ; + + printk(KERN_ALERT "BUG: Bad page state in process %s pfn:%05lx\n", + current->comm, page_to_pfn(page)); + printk(KERN_ALERT + "page:%p flags:%p count:%d mapcount:%d mapping:%p index:%lx\n", + page, (void *)page->flags, page_count(page), + page_mapcount(page), page->mapping, page->index); + dump_stack(); - page->flags &= ~PAGE_FLAGS_CLEAR_WHEN_BAD; - set_page_count(page, 0); - reset_page_mapcount(page); - page->mapping = NULL; +out: + /* Leave bad fields for debug, except PageBuddy could make trouble */ + __ClearPageBuddy(page); add_taint(TAINT_BAD_PAGE); } @@ -268,40 +285,60 @@ void prep_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; - struct page *p = page + 1; set_compound_page_dtor(page, free_compound_page); set_compound_order(page, order); __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p++) { - if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) - p = pfn_to_page(page_to_pfn(page) + i); + for (i = 1; i < nr_pages; i++) { + struct page *p = page + i; + __SetPageTail(p); p->first_page = page; } } -static void destroy_compound_page(struct page *page, unsigned long order) +#ifdef CONFIG_HUGETLBFS +void prep_compound_gigantic_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; struct page *p = page + 1; - if (unlikely(compound_order(page) != order)) + set_compound_page_dtor(page, free_compound_page); + set_compound_order(page, order); + __SetPageHead(page); + for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + __SetPageTail(p); + p->first_page = page; + } +} +#endif + +static int destroy_compound_page(struct page *page, unsigned long order) +{ + int i; + int nr_pages = 1 << order; + int bad = 0; + + if (unlikely(compound_order(page) != order) || + unlikely(!PageHead(page))) { bad_page(page); + bad++; + } - if (unlikely(!PageHead(page))) - bad_page(page); __ClearPageHead(page); - for (i = 1; i < nr_pages; i++, p++) { - if (unlikely((i & (MAX_ORDER_NR_PAGES - 1)) == 0)) - p = pfn_to_page(page_to_pfn(page) + i); - if (unlikely(!PageTail(p) | - (p->first_page != page))) + for (i = 1; i < nr_pages; i++) { + struct page *p = page + i; + + if (unlikely(!PageTail(p) || (p->first_page != page))) { bad_page(page); + bad++; + } __ClearPageTail(p); } + + return bad; } static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) @@ -421,7 +458,8 @@ static inline void __free_one_page(struct page *page, int migratetype = get_pageblock_migratetype(page); if (unlikely(PageCompound(page))) - destroy_compound_page(page, order); + if (unlikely(destroy_compound_page(page, order))) + return; page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); @@ -454,22 +492,17 @@ static inline void __free_one_page(struct page *page, static inline int free_pages_check(struct page *page) { + free_page_mlock(page); if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) + (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) { bad_page(page); - if (PageDirty(page)) - __ClearPageDirty(page); - if (PageSwapBacked(page)) - __ClearPageSwapBacked(page); - /* - * For now, we report if PG_reserved was found set, but do not - * clear it, and do not free the page. But we shall soon need - * to do more, for when the ZERO_PAGE count wraps negative. - */ - return PageReserved(page); + return 1; + } + if (page->flags & PAGE_FLAGS_CHECK_AT_PREP) + page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; + return 0; } /* @@ -514,11 +547,11 @@ static void __free_pages_ok(struct page *page, unsigned int order) { unsigned long flags; int i; - int reserved = 0; + int bad = 0; for (i = 0 ; i < (1 << order) ; ++i) - reserved += free_pages_check(page + i); - if (reserved) + bad += free_pages_check(page + i); + if (bad) return; if (!PageHighMem(page)) { @@ -602,25 +635,12 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | - (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) + (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) { bad_page(page); - - /* - * For now, we report if PG_reserved was found set, but do not - * clear it, and do not allocate the page: as a safety net. - */ - if (PageReserved(page)) return 1; + } - page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | - 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk -#ifdef CONFIG_UNEVICTABLE_LRU - | 1 << PG_mlocked -#endif - ); set_page_private(page, 0); set_page_refcounted(page); @@ -902,13 +922,10 @@ static void drain_pages(unsigned int cpu) unsigned long flags; struct zone *zone; - for_each_zone(zone) { + for_each_populated_zone(zone) { struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; - if (!populated_zone(zone)) - continue; - pset = zone_pcp(zone, cpu); pcp = &pset->pcp; @@ -1459,6 +1476,8 @@ __alloc_pages_internal(gfp_t gfp_mask, unsigned int order, unsigned long did_some_progress; unsigned long pages_reclaimed = 0; + lockdep_trace_alloc(gfp_mask); + might_sleep_if(wait); if (should_fail_alloc_page(gfp_mask, order)) @@ -1553,13 +1572,21 @@ nofail_alloc: /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); + /* + * The task's cpuset might have expanded its set of allowable nodes + */ + cpuset_update_task_memory_state(); p->flags |= PF_MEMALLOC; + + lockdep_set_current_reclaim_state(gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - did_some_progress = try_to_free_pages(zonelist, order, gfp_mask); + did_some_progress = try_to_free_pages(zonelist, order, + gfp_mask, nodemask); p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); p->flags &= ~PF_MEMALLOC; cond_resched(); @@ -1850,10 +1877,7 @@ void show_free_areas(void) int cpu; struct zone *zone; - for_each_zone(zone) { - if (!populated_zone(zone)) - continue; - + for_each_populated_zone(zone) { show_node(zone); printk("%s per-cpu:\n", zone->name); @@ -1893,12 +1917,9 @@ void show_free_areas(void) global_page_state(NR_PAGETABLE), global_page_state(NR_BOUNCE)); - for_each_zone(zone) { + for_each_populated_zone(zone) { int i; - if (!populated_zone(zone)) - continue; - show_node(zone); printk("%s" " free:%lukB" @@ -1938,12 +1959,9 @@ void show_free_areas(void) printk("\n"); } - for_each_zone(zone) { + for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; - if (!populated_zone(zone)) - continue; - show_node(zone); printk("%s: ", zone->name); @@ -2110,7 +2128,7 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) int n, val; int min_val = INT_MAX; int best_node = -1; - node_to_cpumask_ptr(tmp, 0); + const struct cpumask *tmp = cpumask_of_node(0); /* Use the local node if we haven't already */ if (!node_isset(node, *used_node_mask)) { @@ -2131,8 +2149,8 @@ static int find_next_best_node(int node, nodemask_t *used_node_mask) val += (n < node); /* Give preference to headless and unused nodes */ - node_to_cpumask_ptr_next(tmp, n); - if (!cpus_empty(*tmp)) + tmp = cpumask_of_node(n); + if (!cpumask_empty(tmp)) val += PENALTY_FOR_NODE_WITH_CPUS; /* Slight preference for less loaded node */ @@ -2597,6 +2615,9 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long pfn; struct zone *z; + if (highest_memmap_pfn < end_pfn - 1) + highest_memmap_pfn = end_pfn - 1; + z = &NODE_DATA(nid)->node_zones[zone]; for (pfn = start_pfn; pfn < end_pfn; pfn++) { /* @@ -2660,6 +2681,7 @@ static void __meminit zone_init_free_lists(struct zone *zone) static int zone_batchsize(struct zone *zone) { +#ifdef CONFIG_MMU int batch; /* @@ -2685,9 +2707,26 @@ static int zone_batchsize(struct zone *zone) * of pages of one half of the possible page colors * and the other with pages of the other colors. */ - batch = (1 << (fls(batch + batch/2)-1)) - 1; + batch = rounddown_pow_of_two(batch + batch/2) - 1; return batch; + +#else + /* The deferral and batching of frees should be suppressed under NOMMU + * conditions. + * + * The problem is that NOMMU needs to be able to allocate large chunks + * of contiguous memory as there's no hardware page translation to + * assemble apparent contiguous memory from discontiguous pages. + * + * Queueing large contiguous runs of pages for batching, however, + * causes the pages to actually be freed in smaller chunks. As there + * can be a significant delay between the individual batches being + * recycled, this leads to the once large chunks of space being + * fragmented and becoming unavailable for high-order allocations. + */ + return 0; +#endif } static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) @@ -2752,11 +2791,7 @@ static int __cpuinit process_zones(int cpu) node_set_state(node, N_CPU); /* this node has a cpu */ - for_each_zone(zone) { - - if (!populated_zone(zone)) - continue; - + for_each_populated_zone(zone) { zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, node); if (!zone_pcp(zone, cpu)) @@ -2962,7 +2997,7 @@ static int __meminit next_active_region_index_in_nid(int index, int nid) * was used and there are no special requirements, this is a convenient * alternative */ -int __meminit early_pfn_to_nid(unsigned long pfn) +int __meminit __early_pfn_to_nid(unsigned long pfn) { int i; @@ -2973,10 +3008,33 @@ int __meminit early_pfn_to_nid(unsigned long pfn) if (start_pfn <= pfn && pfn < end_pfn) return early_node_map[i].nid; } + /* This is a memory hole */ + return -1; +} +#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ + +int __meminit early_pfn_to_nid(unsigned long pfn) +{ + int nid; + nid = __early_pfn_to_nid(pfn); + if (nid >= 0) + return nid; + /* just returns 0 */ return 0; } -#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */ + +#ifdef CONFIG_NODES_SPAN_OTHER_NODES +bool __meminit early_pfn_in_nid(unsigned long pfn, int node) +{ + int nid; + + nid = __early_pfn_to_nid(pfn); + if (nid >= 0 && nid != node) + return false; + return true; +} +#endif /* Basic iterator support to walk early_node_map[] */ #define for_each_active_range_index_in_nid(i, nid) \ @@ -3369,10 +3427,8 @@ static void __init setup_usemap(struct pglist_data *pgdat, { unsigned long usemapsize = usemap_size(zonesize); zone->pageblock_flags = NULL; - if (usemapsize) { + if (usemapsize) zone->pageblock_flags = alloc_bootmem_node(pgdat, usemapsize); - memset(zone->pageblock_flags, 0, usemapsize); - } } #else static void inline setup_usemap(struct pglist_data *pgdat, @@ -3437,6 +3493,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; + pgdat_page_cgroup_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; @@ -3456,9 +3513,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; if (realsize >= memmap_pages) { realsize -= memmap_pages; - mminit_dprintk(MMINIT_TRACE, "memmap_init", - "%s zone: %lu pages used for memmap\n", - zone_names[j], memmap_pages); + if (memmap_pages) + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", + zone_names[j], memmap_pages); } else printk(KERN_WARNING " %s zone: %lu pages exceeds realsize %lu\n", @@ -3467,8 +3525,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, /* Account for reserved pages */ if (j == 0 && realsize > dma_reserve) { realsize -= dma_reserve; - mminit_dprintk(MMINIT_TRACE, "memmap_init", - "%s zone: %lu pages reserved\n", + printk(KERN_DEBUG " %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); } @@ -3497,10 +3554,10 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, INIT_LIST_HEAD(&zone->lru[l].list); zone->lru[l].nr_scan = 0; } - zone->recent_rotated[0] = 0; - zone->recent_rotated[1] = 0; - zone->recent_scanned[0] = 0; - zone->recent_scanned[1] = 0; + zone->reclaim_stat.recent_rotated[0] = 0; + zone->reclaim_stat.recent_rotated[1] = 0; + zone->reclaim_stat.recent_scanned[0] = 0; + zone->reclaim_stat.recent_scanned[1] = 0; zap_zone_vm_stats(zone); zone->flags = 0; if (!size) @@ -4242,7 +4299,7 @@ void setup_per_zone_pages_min(void) for_each_zone(zone) { u64 tmp; - spin_lock_irqsave(&zone->lru_lock, flags); + spin_lock_irqsave(&zone->lock, flags); tmp = (u64)pages_min * zone->present_pages; do_div(tmp, lowmem_pages); if (is_highmem(zone)) { @@ -4274,7 +4331,7 @@ void setup_per_zone_pages_min(void) zone->pages_low = zone->pages_min + (tmp >> 2); zone->pages_high = zone->pages_min + (tmp >> 1); setup_zone_migrate_reserve(zone); - spin_unlock_irqrestore(&zone->lru_lock, flags); + spin_unlock_irqrestore(&zone->lock, flags); } /* update totalreserve_pages */ @@ -4304,7 +4361,7 @@ void setup_per_zone_pages_min(void) * 1TB 101 10GB * 10TB 320 32GB */ -void setup_per_zone_inactive_ratio(void) +static void setup_per_zone_inactive_ratio(void) { struct zone *zone; @@ -4561,19 +4618,6 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } -#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE -struct page *pfn_to_page(unsigned long pfn) -{ - return __pfn_to_page(pfn); -} -unsigned long page_to_pfn(struct page *page) -{ - return __page_to_pfn(page); -} -EXPORT_SYMBOL(pfn_to_page); -EXPORT_SYMBOL(page_to_pfn); -#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */ - /* Return a pointer to the bitmap storing bits affecting a block of pages */ static inline unsigned long *get_pageblock_bitmap(struct zone *zone, unsigned long pfn)