X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=df54e2fc8ee09760c67d2dd4e6bd496e07286124;hb=f8d65713332cf6306889a3036142a17e01e3447e;hp=23b84c4e1a57beaf3a3b2cd578318b3da592071c;hpb=664beed0190fae687ac51295694004902ddeb18e;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23b84c4..df54e2f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include "internal.h" @@ -52,6 +53,9 @@ struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; long nr_swap_pages; +int percpu_pagelist_fraction; + +static void fastcall free_hot_cold_page(struct page *page, int cold); /* * results with 256, 32 in the lowmem_reserve sysctl: @@ -81,6 +85,7 @@ int min_free_kbytes = 1024; unsigned long __initdata nr_kernel_pages; unsigned long __initdata nr_all_pages; +#ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) { int ret = 0; @@ -122,16 +127,23 @@ static int bad_range(struct zone *zone, struct page *page) return 0; } -static void bad_page(const char *function, struct page *page) +#else +static inline int bad_range(struct zone *zone, struct page *page) +{ + return 0; +} +#endif + +static void bad_page(struct page *page) { - printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", - function, current->comm, page); - printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, - page->mapping, page_mapcount(page), page_count(page)); - printk(KERN_EMERG "Backtrace:\n"); + printk(KERN_EMERG "Bad page state in process '%s'\n" + KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" + KERN_EMERG "Trying to fix it up, but a reboot is needed\n" + KERN_EMERG "Backtrace:\n", + current->comm, page, (int)(2*sizeof(unsigned long)), + (unsigned long)page->flags, page->mapping, + page_mapcount(page), page_count(page)); dump_stack(); - printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); page->flags &= ~(1 << PG_lru | 1 << PG_private | 1 << PG_locked | @@ -140,8 +152,7 @@ static void bad_page(const char *function, struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved ); + 1 << PG_writeback ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -185,19 +196,15 @@ static void destroy_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - if (!PageCompound(page)) - return; - - if (page[1].index != order) - bad_page(__FUNCTION__, page); + if (unlikely(page[1].index != order)) + bad_page(page); for (i = 0; i < nr_pages; i++) { struct page *p = page + i; - if (!PageCompound(p)) - bad_page(__FUNCTION__, page); - if (page_private(p) != (unsigned long)page) - bad_page(__FUNCTION__, page); + if (unlikely(!PageCompound(p) | + (page_private(p) != (unsigned long)page))) + bad_page(page); ClearPageCompound(p); } } @@ -256,14 +263,20 @@ __find_combined_index(unsigned long page_idx, unsigned int order) /* * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if - * (a) the buddy is free && - * (b) the buddy is on the buddy system && - * (c) a page and its buddy have the same order. + * (a) the buddy is not in a hole && + * (b) the buddy is free && + * (c) the buddy is on the buddy system && + * (d) a page and its buddy have the same order. * for recording page's order, we use page_private(page) and PG_private. * */ static inline int page_is_buddy(struct page *page, int order) { +#ifdef CONFIG_HOLES_IN_ZONE + if (!pfn_valid(page_to_pfn(page))) + return 0; +#endif + if (PagePrivate(page) && (page_order(page) == order) && page_count(page) == 0) @@ -295,13 +308,13 @@ static inline int page_is_buddy(struct page *page, int order) * -- wli */ -static inline void __free_pages_bulk (struct page *page, +static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order) { unsigned long page_idx; int order_size = 1 << order; - if (unlikely(order)) + if (unlikely(PageCompound(page))) destroy_compound_page(page, order); page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); @@ -315,17 +328,15 @@ static inline void __free_pages_bulk (struct page *page, struct free_area *area; struct page *buddy; - combined_idx = __find_combined_index(page_idx, order); buddy = __page_find_buddy(page, page_idx, order); - - if (bad_range(zone, buddy)) - break; if (!page_is_buddy(buddy, order)) break; /* Move the buddy up one level. */ + list_del(&buddy->lru); area = zone->free_area + order; area->nr_free--; rmv_page_order(buddy); + combined_idx = __find_combined_index(page_idx, order); page = page + (combined_idx - page_idx); page_idx = combined_idx; order++; @@ -335,11 +346,11 @@ static inline void __free_pages_bulk (struct page *page, zone->free_area[order].nr_free++; } -static inline void free_pages_check(const char *function, struct page *page) +static inline int free_pages_check(struct page *page) { - if ( page_mapcount(page) || - page->mapping != NULL || - page_count(page) != 0 || + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | 1 << PG_private | @@ -349,10 +360,16 @@ static inline void free_pages_check(const char *function, struct page *page) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved ))) - bad_page(function, page); + 1 << PG_reserved )))) + bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not free the page. But we shall soon need + * to do more, for when the ZERO_PAGE count wraps negative. + */ + return PageReserved(page); } /* @@ -366,48 +383,90 @@ static inline void free_pages_check(const char *function, struct page *page) * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) +static void free_pages_bulk(struct zone *zone, int count, + struct list_head *list, int order) { - unsigned long flags; - struct page *page = NULL; - int ret = 0; - - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - while (!list_empty(list) && count--) { + while (count--) { + struct page *page; + + BUG_ON(list_empty(list)); page = list_entry(list->prev, struct page, lru); - /* have to delete it as __free_pages_bulk list manipulates */ + /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); - __free_pages_bulk(page, zone, order); - ret++; + __free_one_page(page, zone, order); } - spin_unlock_irqrestore(&zone->lock, flags); - return ret; + spin_unlock(&zone->lock); } -void __free_pages_ok(struct page *page, unsigned int order) +static void free_one_page(struct zone *zone, struct page *page, int order) { LIST_HEAD(list); + list_add(&page->lru, &list); + free_pages_bulk(zone, 1, &list, order); +} + +static void __free_pages_ok(struct page *page, unsigned int order) +{ + unsigned long flags; int i; + int reserved = 0; arch_free_page(page, order); - - mod_page_state(pgfree, 1 << order); + if (!PageHighMem(page)) + mutex_debug_check_no_locks_freed(page_address(page), + PAGE_SIZE< 0) - for (i = 1 ; i < (1 << order) ; ++i) - __put_page(page + i); + for (i = 1 ; i < (1 << order) ; ++i) + __put_page(page + i); #endif for (i = 0 ; i < (1 << order) ; ++i) - free_pages_check(__FUNCTION__, page + i); - list_add(&page->lru, &list); - kernel_map_pages(page, 1<lru, &list); + kernel_map_pages(page, 1 << order, 0); + free_pages_bulk(page_zone(page), 1, &list, order); + } } @@ -425,8 +484,7 @@ void __free_pages_ok(struct page *page, unsigned int order) * * -- wli */ -static inline struct page * -expand(struct zone *zone, struct page *page, +static inline void expand(struct zone *zone, struct page *page, int low, int high, struct free_area *area) { unsigned long size = 1 << high; @@ -440,34 +498,16 @@ expand(struct zone *zone, struct page *page, area->nr_free++; set_page_order(&page[size], high); } - return page; -} - -void set_page_refs(struct page *page, int order) -{ -#ifdef CONFIG_MMU - set_page_count(page, 1); -#else - int i; - - /* - * We need to reference all the pages for this order, otherwise if - * anyone accesses one of the pages with (get/put) it will be freed. - * - eg: access_process_vm() - */ - for (i = 0; i < (1 << order); i++) - set_page_count(page + i, 1); -#endif /* CONFIG_MMU */ } /* * This page is about to be returned from the page allocator */ -static void prep_new_page(struct page *page, int order) +static int prep_new_page(struct page *page, int order) { - if ( page_mapcount(page) || - page->mapping != NULL || - page_count(page) != 0 || + if (unlikely(page_mapcount(page) | + (page->mapping != NULL) | + (page_count(page) != 0) | (page->flags & ( 1 << PG_lru | 1 << PG_private | @@ -478,8 +518,15 @@ static void prep_new_page(struct page *page, int order) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved ))) - bad_page(__FUNCTION__, page); + 1 << PG_reserved )))) + bad_page(page); + + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not allocate the page: as a safety net. + */ + if (PageReserved(page)) + return 1; page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | @@ -487,6 +534,7 @@ static void prep_new_page(struct page *page, int order) set_page_private(page, 0); set_page_refs(page, order); kernel_map_pages(page, 1 << order, 1); + return 0; } /* @@ -509,7 +557,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) rmv_page_order(page); area->nr_free--; zone->free_pages -= 1UL << order; - return expand(zone, page, order, current_order, area); + expand(zone, page, order, current_order, area); + return page; } return NULL; @@ -523,21 +572,17 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list) { - unsigned long flags; int i; - int allocated = 0; - struct page *page; - spin_lock_irqsave(&zone->lock, flags); + spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + struct page *page = __rmqueue(zone, order); + if (unlikely(page == NULL)) break; - allocated++; list_add_tail(&page->lru, list); } - spin_unlock_irqrestore(&zone->lock, flags); - return allocated; + spin_unlock(&zone->lock); + return i; } #ifdef CONFIG_NUMA @@ -556,14 +601,13 @@ void drain_remote_pages(void) if (zone->zone_pgdat->node_id == numa_node_id()) continue; - pset = zone->pageset[smp_processor_id()]; + pset = zone_pcp(zone, smp_processor_id()); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; - if (pcp->count) - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; } } local_irq_restore(flags); @@ -573,6 +617,7 @@ void drain_remote_pages(void) #if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU) static void __drain_pages(unsigned int cpu) { + unsigned long flags; struct zone *zone; int i; @@ -584,8 +629,10 @@ static void __drain_pages(unsigned int cpu) struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + local_irq_restore(flags); } } } @@ -631,18 +678,14 @@ void drain_local_pages(void) } #endif /* CONFIG_PM */ -static void zone_statistics(struct zonelist *zonelist, struct zone *z) +static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) { #ifdef CONFIG_NUMA - unsigned long flags; - int cpu; pg_data_t *pg = z->zone_pgdat; pg_data_t *orig = zonelist->zones[0]->zone_pgdat; struct per_cpu_pageset *p; - local_irq_save(flags); - cpu = smp_processor_id(); - p = zone_pcp(z,cpu); + p = zone_pcp(z, cpu); if (pg == orig) { p->numa_hit++; } else { @@ -653,14 +696,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) p->local_node++; else p->other_node++; - local_irq_restore(flags); #endif } /* * Free a 0-order page */ -static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); static void fastcall free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); @@ -669,17 +710,22 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) arch_free_page(page, 0); - kernel_map_pages(page, 1, 0); - inc_page_state(pgfree); if (PageAnon(page)) page->mapping = NULL; - free_pages_check(__FUNCTION__, page); + if (free_pages_check(page)) + return; + + kernel_map_pages(page, 1, 0); + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); + __inc_page_state(pgfree); list_add(&page->lru, &pcp->list); pcp->count++; - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + if (pcp->count >= pcp->high) { + free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + pcp->count -= pcp->batch; + } local_irq_restore(flags); put_cpu(); } @@ -708,52 +754,67 @@ static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) * we cheat by calling it from here, in the order > 0 path. Saves a branch * or two. */ -static struct page * -buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) +static struct page *buffered_rmqueue(struct zonelist *zonelist, + struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; - struct page *page = NULL; + struct page *page; int cold = !!(gfp_flags & __GFP_COLD); + int cpu; - if (order == 0) { +again: + cpu = get_cpu(); + if (likely(order == 0)) { struct per_cpu_pages *pcp; - pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; + pcp = &zone_pcp(zone, cpu)->pcp[cold]; local_irq_save(flags); - if (pcp->count <= pcp->low) + if (!pcp->count) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); - if (pcp->count) { - page = list_entry(pcp->list.next, struct page, lru); - list_del(&page->lru); - pcp->count--; + if (unlikely(!pcp->count)) + goto failed; } - local_irq_restore(flags); - put_cpu(); + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); + pcp->count--; } else { spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); + if (!page) + goto failed; } - if (page != NULL) { - BUG_ON(bad_range(zone, page)); - mod_page_state_zone(zone, pgalloc, 1 << order); - prep_new_page(page, order); + __mod_page_state_zone(zone, pgalloc, 1 << order); + zone_statistics(zonelist, zone, cpu); + local_irq_restore(flags); + put_cpu(); + + BUG_ON(bad_range(zone, page)); + if (prep_new_page(page, order)) + goto again; - if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); - if (order && (gfp_flags & __GFP_COMP)) - prep_compound_page(page, order); - } + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); return page; + +failed: + local_irq_restore(flags); + put_cpu(); + return NULL; } #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ -#define ALLOC_HARDER 0x02 /* try to alloc harder */ -#define ALLOC_HIGH 0x04 /* __GFP_HIGH set */ -#define ALLOC_CPUSET 0x08 /* check for correct cpuset */ +#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ +#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ +#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ /* * Return 1 if free pages are above 'mark'. This takes into account the order @@ -808,14 +869,22 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, continue; if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { - if (!zone_watermark_ok(*z, order, (*z)->pages_low, + unsigned long mark; + if (alloc_flags & ALLOC_WMARK_MIN) + mark = (*z)->pages_min; + else if (alloc_flags & ALLOC_WMARK_LOW) + mark = (*z)->pages_low; + else + mark = (*z)->pages_high; + if (!zone_watermark_ok(*z, order, mark, classzone_idx, alloc_flags)) - continue; + if (!zone_reclaim_mode || + !zone_reclaim(*z, gfp_mask, order)) + continue; } - page = buffered_rmqueue(*z, order, gfp_mask); + page = buffered_rmqueue(zonelist, *z, order, gfp_mask); if (page) { - zone_statistics(zonelist, *z); break; } } while (*(++z) != NULL); @@ -849,7 +918,7 @@ restart: } page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_CPUSET); + zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) goto got_pg; @@ -864,15 +933,15 @@ restart: * * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling - * policy. + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ - alloc_flags = 0; + alloc_flags = ALLOC_WMARK_MIN; if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) alloc_flags |= ALLOC_HIGH; - if (wait) - alloc_flags |= ALLOC_CPUSET; + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations @@ -894,7 +963,7 @@ restart: nofail_alloc: /* go through the zonelist yet again, ignoring mins */ page = get_page_from_freelist(gfp_mask, order, - zonelist, ALLOC_NO_WATERMARKS|ALLOC_CPUSET); + zonelist, ALLOC_NO_WATERMARKS); if (page) goto got_pg; if (gfp_mask & __GFP_NOFAIL) { @@ -913,6 +982,7 @@ rebalance: cond_resched(); /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); p->flags |= PF_MEMALLOC; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -937,7 +1007,7 @@ rebalance: * under heavy pressure. */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_CPUSET); + zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); if (page) goto got_pg; @@ -1139,7 +1209,7 @@ EXPORT_SYMBOL(nr_pagecache); DEFINE_PER_CPU(long, nr_pagecache_local) = 0; #endif -void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) +static void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) { int cpu = 0; @@ -1192,7 +1262,7 @@ void get_full_page_state(struct page_state *ret) __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); } -unsigned long __read_page_state(unsigned long offset) +unsigned long read_page_state_offset(unsigned long offset) { unsigned long ret = 0; int cpu; @@ -1206,18 +1276,26 @@ unsigned long __read_page_state(unsigned long offset) return ret; } -void __mod_page_state(unsigned long offset, unsigned long delta) +void __mod_page_state_offset(unsigned long offset, unsigned long delta) +{ + void *ptr; + + ptr = &__get_cpu_var(page_states); + *(unsigned long *)(ptr + offset) += delta; +} +EXPORT_SYMBOL(__mod_page_state_offset); + +void mod_page_state_offset(unsigned long offset, unsigned long delta) { unsigned long flags; - void* ptr; + void *ptr; local_irq_save(flags); ptr = &__get_cpu_var(page_states); - *(unsigned long*)(ptr + offset) += delta; + *(unsigned long *)(ptr + offset) += delta; local_irq_restore(flags); } - -EXPORT_SYMBOL(__mod_page_state); +EXPORT_SYMBOL(mod_page_state_offset); void __get_zone_counts(unsigned long *active, unsigned long *inactive, unsigned long *free, struct pglist_data *pgdat) @@ -1303,7 +1381,7 @@ void show_free_areas(void) show_node(zone); printk("%s per-cpu:", zone->name); - if (!zone->present_pages) { + if (!populated_zone(zone)) { printk(" empty\n"); continue; } else @@ -1315,10 +1393,9 @@ void show_free_areas(void) pageset = zone_pcp(zone, cpu); for (temperature = 0; temperature < 2; temperature++) - printk("cpu %d %s: low %d, high %d, batch %d used:%d\n", + printk("cpu %d %s: high %d, batch %d used:%d\n", cpu, temperature ? "cold" : "hot", - pageset->pcp[temperature].low, pageset->pcp[temperature].high, pageset->pcp[temperature].batch, pageset->pcp[temperature].count); @@ -1381,7 +1458,7 @@ void show_free_areas(void) show_node(zone); printk("%s: ", zone->name); - if (!zone->present_pages) { + if (!populated_zone(zone)) { printk("empty\n"); continue; } @@ -1401,36 +1478,29 @@ void show_free_areas(void) /* * Builds allocation fallback zone lists. + * + * Add all populated zones of a node to the zonelist. */ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) -{ - switch (k) { - struct zone *zone; - default: - BUG(); - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { +static int __init build_zonelists_node(pg_data_t *pgdat, + struct zonelist *zonelist, int nr_zones, int zone_type) +{ + struct zone *zone; + + BUG_ON(zone_type > ZONE_HIGHMEM); + + do { + zone = pgdat->node_zones + zone_type; + if (populated_zone(zone)) { #ifndef CONFIG_HIGHMEM - BUG(); + BUG_ON(zone_type > ZONE_NORMAL); #endif - zonelist->zones[j++] = zone; + zonelist->zones[nr_zones++] = zone; + check_highest_zone(zone_type); } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA32: - zone = pgdat->node_zones + ZONE_DMA32; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->present_pages) - zonelist->zones[j++] = zone; - } + zone_type--; - return j; + } while (zone_type >= 0); + return nr_zones; } static inline int highest_zone(int zone_bits) @@ -1527,13 +1597,22 @@ static void __init build_zonelists(pg_data_t *pgdat) prev_node = local_node; nodes_clear(used_mask); while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { + int distance = node_distance(local_node, node); + + /* + * If another node is sufficiently far away then it is better + * to reclaim pages in a zone before going off node. + */ + if (distance > RECLAIM_DISTANCE) + zone_reclaim_mode = 1; + /* * We don't want to pressure a particular node. * So adding penalty to the first node in same * distance group to make it round-robin. */ - if (node_distance(local_node, node) != - node_distance(local_node, prev_node)) + + if (distance != node_distance(local_node, prev_node)) node_load[node] += load; prev_node = node; load--; @@ -1667,18 +1746,16 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ -void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { struct page *page; unsigned long end_pfn = start_pfn + size; unsigned long pfn; - for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { + for (pfn = start_pfn; pfn < end_pfn; pfn++) { if (!early_pfn_valid(pfn)) continue; - if (!early_pfn_in_nid(pfn, nid)) - continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); set_page_count(page, 1); @@ -1722,7 +1799,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, memmap_init_zone((size), (nid), (zone), (start_pfn)) #endif -static int __devinit zone_batchsize(struct zone *zone) +static int __meminit zone_batchsize(struct zone *zone) { int batch; @@ -1740,16 +1817,16 @@ static int __devinit zone_batchsize(struct zone *zone) batch = 1; /* - * We will be trying to allcoate bigger chunks of contiguous - * memory of the order of fls(batch). This should result in - * better cache coloring. + * Clamp the batch to a 2^n - 1 value. Having a power + * of 2 value was found to be more likely to have + * suboptimal cache aliasing properties in some cases. * - * A sanity check also to ensure that batch is still in limits. + * For example if 2 tasks are alternately allocating + * batches of pages, one task can end up with a lot + * of pages of one half of the possible page colors + * and the other with pages of the other colors. */ - batch = (1 << fls(batch + batch/2)); - - if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2)) - batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2); + batch = (1 << (fls(batch + batch/2)-1)) - 1; return batch; } @@ -1762,19 +1839,35 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp[0]; /* hot */ pcp->count = 0; - pcp->low = 0; pcp->high = 6 * batch; pcp->batch = max(1UL, 1 * batch); INIT_LIST_HEAD(&pcp->list); pcp = &p->pcp[1]; /* cold*/ pcp->count = 0; - pcp->low = 0; pcp->high = 2 * batch; pcp->batch = max(1UL, batch/2); INIT_LIST_HEAD(&pcp->list); } +/* + * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * to the value high for the pageset p. + */ + +static void setup_pagelist_highmark(struct per_cpu_pageset *p, + unsigned long high) +{ + struct per_cpu_pages *pcp; + + pcp = &p->pcp[0]; /* hot list */ + pcp->high = high; + pcp->batch = max(1UL, high/4); + if ((high/4) > (PAGE_SHIFT * 8)) + pcp->batch = PAGE_SHIFT * 8; +} + + #ifdef CONFIG_NUMA /* * Boot pageset table. One per cpu which is going to be used for all @@ -1800,18 +1893,22 @@ static struct per_cpu_pageset * Dynamically allocate memory for the * per cpu pageset array in struct zone. */ -static int __devinit process_zones(int cpu) +static int __meminit process_zones(int cpu) { struct zone *zone, *dzone; for_each_zone(zone) { - zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), + zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, cpu_to_node(cpu)); - if (!zone->pageset[cpu]) + if (!zone_pcp(zone, cpu)) goto bad; - setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); + setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); + + if (percpu_pagelist_fraction) + setup_pagelist_highmark(zone_pcp(zone, cpu), + (zone->present_pages / percpu_pagelist_fraction)); } return 0; @@ -1819,15 +1916,14 @@ bad: for_each_zone(dzone) { if (dzone == zone) break; - kfree(dzone->pageset[cpu]); - dzone->pageset[cpu] = NULL; + kfree(zone_pcp(dzone, cpu)); + zone_pcp(dzone, cpu) = NULL; } return -ENOMEM; } static inline void free_zone_pagesets(int cpu) { -#ifdef CONFIG_NUMA struct zone *zone; for_each_zone(zone) { @@ -1836,10 +1932,9 @@ static inline void free_zone_pagesets(int cpu) zone_pcp(zone, cpu) = NULL; kfree(pset); } -#endif } -static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, +static int __meminit pageset_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1864,7 +1959,7 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, static struct notifier_block pageset_notifier = { &pageset_cpuup_callback, NULL, 0 }; -void __init setup_per_cpu_pageset() +void __init setup_per_cpu_pageset(void) { int err; @@ -1879,7 +1974,7 @@ void __init setup_per_cpu_pageset() #endif -static __devinit +static __meminit void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; @@ -1899,7 +1994,7 @@ void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) init_waitqueue_head(zone->wait_table + i); } -static __devinit void zone_pcp_init(struct zone *zone) +static __meminit void zone_pcp_init(struct zone *zone) { int cpu; unsigned long batch = zone_batchsize(zone); @@ -1907,7 +2002,7 @@ static __devinit void zone_pcp_init(struct zone *zone) for (cpu = 0; cpu < NR_CPUS; cpu++) { #ifdef CONFIG_NUMA /* Early boot. Slab allocator not functional yet */ - zone->pageset[cpu] = &boot_pageset[cpu]; + zone_pcp(zone, cpu) = &boot_pageset[cpu]; setup_pageset(&boot_pageset[cpu],0); #else setup_pageset(zone_pcp(zone,cpu), batch); @@ -1917,7 +2012,7 @@ static __devinit void zone_pcp_init(struct zone *zone) zone->name, zone->present_pages, batch); } -static __devinit void init_currently_empty_zone(struct zone *zone, +static __meminit void init_currently_empty_zone(struct zone *zone, unsigned long zone_start_pfn, unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; @@ -2084,7 +2179,7 @@ static int frag_show(struct seq_file *m, void *arg) int order; for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!zone->present_pages) + if (!populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -2117,7 +2212,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { int i; - if (!zone->present_pages) + if (!populated_zone(zone)) continue; spin_lock_irqsave(&zone->lock, flags); @@ -2150,7 +2245,7 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, ")" "\n pagesets"); - for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { + for_each_online_cpu(i) { struct per_cpu_pageset *pageset; int j; @@ -2165,12 +2260,10 @@ static int zoneinfo_show(struct seq_file *m, void *arg) seq_printf(m, "\n cpu: %i pcp: %i" "\n count: %i" - "\n low: %i" "\n high: %i" "\n batch: %i", i, j, pageset->pcp[j].count, - pageset->pcp[j].low, pageset->pcp[j].high, pageset->pcp[j].batch); } @@ -2225,32 +2318,40 @@ static char *vmstat_text[] = { "pgpgout", "pswpin", "pswpout", - "pgalloc_high", + "pgalloc_high", "pgalloc_normal", + "pgalloc_dma32", "pgalloc_dma", + "pgfree", "pgactivate", "pgdeactivate", "pgfault", "pgmajfault", + "pgrefill_high", "pgrefill_normal", + "pgrefill_dma32", "pgrefill_dma", "pgsteal_high", "pgsteal_normal", + "pgsteal_dma32", "pgsteal_dma", + "pgscan_kswapd_high", "pgscan_kswapd_normal", - + "pgscan_kswapd_dma32", "pgscan_kswapd_dma", + "pgscan_direct_high", "pgscan_direct_normal", + "pgscan_direct_dma32", "pgscan_direct_dma", - "pginodesteal", + "pginodesteal", "slabs_scanned", "kswapd_steal", "kswapd_inodesteal", @@ -2507,6 +2608,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, return 0; } +/* + * percpu_pagelist_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist + * can have before it gets flushed back to buddy allocator. + */ + +int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + unsigned int cpu; + int ret; + + ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (!write || (ret == -EINVAL)) + return ret; + for_each_zone(zone) { + for_each_online_cpu(cpu) { + unsigned long high; + high = zone->present_pages / percpu_pagelist_fraction; + setup_pagelist_highmark(zone_pcp(zone, cpu), high); + } + } + return 0; +} + __initdata int hashdist = HASHDIST_DEFAULT; #ifdef CONFIG_NUMA