X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fpage_alloc.c;h=87dc1297fe39691dfc9dbb16d84a07f75ba3c889;hb=f6ac2354d791195ca40822b84d73d48a4e8b7f2b;hp=7cff958e78136b8971825ceab4c21d5510533e65;hpb=2d92c5c9150a2a9ca3dc25da58d5042e17a96b6a;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7cff958..87dc129 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -36,8 +36,11 @@ #include #include #include +#include +#include #include +#include #include "internal.h" /* @@ -48,10 +51,13 @@ nodemask_t node_online_map __read_mostly = { { [0] = 1UL } }; EXPORT_SYMBOL(node_online_map); nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL; EXPORT_SYMBOL(node_possible_map); -struct pglist_data *pgdat_list __read_mostly; unsigned long totalram_pages __read_mostly; unsigned long totalhigh_pages __read_mostly; +unsigned long totalreserve_pages __read_mostly; long nr_swap_pages; +int percpu_pagelist_fraction; + +static void __free_pages_ok(struct page *page, unsigned int order); /* * results with 256, 32 in the lowmem_reserve sysctl: @@ -78,8 +84,8 @@ EXPORT_SYMBOL(zone_table); static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" }; int min_free_kbytes = 1024; -unsigned long __initdata nr_kernel_pages; -unsigned long __initdata nr_all_pages; +unsigned long __meminitdata nr_kernel_pages; +unsigned long __meminitdata nr_all_pages; #ifdef CONFIG_DEBUG_VM static int page_outside_zone_boundaries(struct zone *zone, struct page *page) @@ -130,16 +136,16 @@ static inline int bad_range(struct zone *zone, struct page *page) } #endif -static void bad_page(const char *function, struct page *page) +static void bad_page(struct page *page) { - printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n", - function, current->comm, page); - printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", - (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, - page->mapping, page_mapcount(page), page_count(page)); - printk(KERN_EMERG "Backtrace:\n"); + printk(KERN_EMERG "Bad page state in process '%s'\n" + KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n" + KERN_EMERG "Trying to fix it up, but a reboot is needed\n" + KERN_EMERG "Backtrace:\n", + current->comm, page, (int)(2*sizeof(unsigned long)), + (unsigned long)page->flags, page->mapping, + page_mapcount(page), page_count(page)); dump_stack(); - printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n"); page->flags &= ~(1 << PG_lru | 1 << PG_private | 1 << PG_locked | @@ -148,7 +154,8 @@ static void bad_page(const char *function, struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback ); + 1 << PG_writeback | + 1 << PG_buddy ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; @@ -165,24 +172,27 @@ static void bad_page(const char *function, struct page *page) * All pages have PG_compound set. All pages have their ->private pointing at * the head page (even the head page has this). * - * The first tail page's ->mapping, if non-zero, holds the address of the - * compound page's put_page() function. - * - * The order of the allocation is stored in the first tail page's ->index - * This is only for debug at present. This usage means that zero-order pages - * may not be compound. + * The first tail page's ->lru.next holds the address of the compound page's + * put_page() function. Its ->lru.prev holds the order of allocation. + * This usage means that zero-order pages may not be compound. */ + +static void free_compound_page(struct page *page) +{ + __free_pages_ok(page, (unsigned long)page[1].lru.prev); +} + static void prep_compound_page(struct page *page, unsigned long order) { int i; int nr_pages = 1 << order; - page[1].mapping = NULL; - page[1].index = order; + page[1].lru.next = (void *)free_compound_page; /* set dtor */ + page[1].lru.prev = (void *)order; for (i = 0; i < nr_pages; i++) { struct page *p = page + i; - SetPageCompound(p); + __SetPageCompound(p); set_page_private(p, (unsigned long)page); } } @@ -192,40 +202,52 @@ static void destroy_compound_page(struct page *page, unsigned long order) int i; int nr_pages = 1 << order; - if (!PageCompound(page)) - return; - - if (page[1].index != order) - bad_page(__FUNCTION__, page); + if (unlikely((unsigned long)page[1].lru.prev != order)) + bad_page(page); for (i = 0; i < nr_pages; i++) { struct page *p = page + i; - if (!PageCompound(p)) - bad_page(__FUNCTION__, page); - if (page_private(p) != (unsigned long)page) - bad_page(__FUNCTION__, page); - ClearPageCompound(p); + if (unlikely(!PageCompound(p) | + (page_private(p) != (unsigned long)page))) + bad_page(page); + __ClearPageCompound(p); } } +static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +{ + int i; + + BUG_ON((gfp_flags & (__GFP_WAIT | __GFP_HIGHMEM)) == __GFP_HIGHMEM); + /* + * clear_highpage() will use KM_USER0, so it's a bug to use __GFP_ZERO + * and __GFP_HIGHMEM from hard or soft interrupt context. + */ + BUG_ON((gfp_flags & __GFP_HIGHMEM) && in_interrupt()); + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +} + /* * function for dealing with page's order in buddy system. * zone->lock is already acquired when we use these. * So, we don't need atomic page->flags operations here. */ -static inline unsigned long page_order(struct page *page) { +static inline unsigned long page_order(struct page *page) +{ return page_private(page); } -static inline void set_page_order(struct page *page, int order) { +static inline void set_page_order(struct page *page, int order) +{ set_page_private(page, order); - __SetPagePrivate(page); + __SetPageBuddy(page); } static inline void rmv_page_order(struct page *page) { - __ClearPagePrivate(page); + __ClearPageBuddy(page); set_page_private(page, 0); } @@ -244,7 +266,7 @@ static inline void rmv_page_order(struct page *page) * satisfies the following equation: * P = B & ~(1 << O) * - * Assumption: *_mem_map is contigious at least up to MAX_ORDER + * Assumption: *_mem_map is contiguous at least up to MAX_ORDER */ static inline struct page * __page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order) @@ -264,24 +286,31 @@ __find_combined_index(unsigned long page_idx, unsigned int order) * This function checks whether a page is free && is the buddy * we can do coalesce a page and its buddy if * (a) the buddy is not in a hole && - * (b) the buddy is free && - * (c) the buddy is on the buddy system && - * (d) a page and its buddy have the same order. - * for recording page's order, we use page_private(page) and PG_private. + * (b) the buddy is in the buddy system && + * (c) a page and its buddy have the same order && + * (d) a page and its buddy are in the same zone. + * + * For recording whether a page is in the buddy system, we use PG_buddy. + * Setting, clearing, and testing PG_buddy is serialized by zone->lock. * + * For recording page's order, we use page_private(page). */ -static inline int page_is_buddy(struct page *page, int order) +static inline int page_is_buddy(struct page *page, struct page *buddy, + int order) { #ifdef CONFIG_HOLES_IN_ZONE - if (!pfn_valid(page_to_pfn(page))) + if (!pfn_valid(page_to_pfn(buddy))) return 0; #endif - if (PagePrivate(page) && - (page_order(page) == order) && - page_count(page) == 0) - return 1; - return 0; + if (page_zone_id(page) != page_zone_id(buddy)) + return 0; + + if (PageBuddy(buddy) && page_order(buddy) == order) { + BUG_ON(page_count(buddy) != 0); + return 1; + } + return 0; } /* @@ -297,7 +326,7 @@ static inline int page_is_buddy(struct page *page, int order) * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. * At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with PG_Private.Page's + * free pages of length of (1 << order) and marked with PG_buddy. Page's * order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were @@ -308,13 +337,13 @@ static inline int page_is_buddy(struct page *page, int order) * -- wli */ -static inline void __free_pages_bulk (struct page *page, +static inline void __free_one_page(struct page *page, struct zone *zone, unsigned int order) { unsigned long page_idx; int order_size = 1 << order; - if (unlikely(order)) + if (unlikely(PageCompound(page))) destroy_compound_page(page, order); page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1); @@ -329,7 +358,7 @@ static inline void __free_pages_bulk (struct page *page, struct page *buddy; buddy = __page_find_buddy(page, page_idx, order); - if (!page_is_buddy(buddy, order)) + if (!page_is_buddy(page, buddy, order)) break; /* Move the buddy up one level. */ list_del(&buddy->lru); @@ -346,7 +375,7 @@ static inline void __free_pages_bulk (struct page *page, zone->free_area[order].nr_free++; } -static inline int free_pages_check(const char *function, struct page *page) +static inline int free_pages_check(struct page *page) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -360,8 +389,9 @@ static inline int free_pages_check(const char *function, struct page *page) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved )))) - bad_page(function, page); + 1 << PG_reserved | + 1 << PG_buddy )))) + bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); /* @@ -383,55 +413,82 @@ static inline int free_pages_check(const char *function, struct page *page) * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static int -free_pages_bulk(struct zone *zone, int count, - struct list_head *list, unsigned int order) +static void free_pages_bulk(struct zone *zone, int count, + struct list_head *list, int order) { - struct page *page = NULL; - int ret = 0; - spin_lock(&zone->lock); zone->all_unreclaimable = 0; zone->pages_scanned = 0; - while (!list_empty(list) && count--) { + while (count--) { + struct page *page; + + BUG_ON(list_empty(list)); page = list_entry(list->prev, struct page, lru); - /* have to delete it as __free_pages_bulk list manipulates */ + /* have to delete it as __free_one_page list manipulates */ list_del(&page->lru); - __free_pages_bulk(page, zone, order); - ret++; + __free_one_page(page, zone, order); } spin_unlock(&zone->lock); - return ret; } -void __free_pages_ok(struct page *page, unsigned int order) +static void free_one_page(struct zone *zone, struct page *page, int order) { - unsigned long flags; LIST_HEAD(list); + list_add(&page->lru, &list); + free_pages_bulk(zone, 1, &list, order); +} + +static void __free_pages_ok(struct page *page, unsigned int order) +{ + unsigned long flags; int i; int reserved = 0; arch_free_page(page, order); - -#ifndef CONFIG_MMU - if (order > 0) - for (i = 1 ; i < (1 << order) ; ++i) - __put_page(page + i); -#endif + if (!PageHighMem(page)) + debug_check_no_locks_freed(page_address(page), + PAGE_SIZE<lru, &list); - mod_page_state(pgfree, 1 << order); - kernel_map_pages(page, 1<nr_free++; set_page_order(&page[size], high); } - return page; } /* * This page is about to be returned from the page allocator */ -static int prep_new_page(struct page *page, int order) +static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | @@ -483,8 +538,9 @@ static int prep_new_page(struct page *page, int order) 1 << PG_slab | 1 << PG_swapcache | 1 << PG_writeback | - 1 << PG_reserved )))) - bad_page(__FUNCTION__, page); + 1 << PG_reserved | + 1 << PG_buddy )))) + bad_page(page); /* * For now, we report if PG_reserved was found set, but do not @@ -497,8 +553,15 @@ static int prep_new_page(struct page *page, int order) 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); - set_page_refs(page, order); + set_page_refcounted(page); kernel_map_pages(page, 1 << order, 1); + + if (gfp_flags & __GFP_ZERO) + prep_zero_page(page, order, gfp_flags); + + if (order && (gfp_flags & __GFP_COMP)) + prep_compound_page(page, order); + return 0; } @@ -522,7 +585,8 @@ static struct page *__rmqueue(struct zone *zone, unsigned int order) rmv_page_order(page); area->nr_free--; zone->free_pages -= 1UL << order; - return expand(zone, page, order, current_order, area); + expand(zone, page, order, current_order, area); + return page; } return NULL; @@ -537,48 +601,47 @@ static int rmqueue_bulk(struct zone *zone, unsigned int order, unsigned long count, struct list_head *list) { int i; - int allocated = 0; - struct page *page; spin_lock(&zone->lock); for (i = 0; i < count; ++i) { - page = __rmqueue(zone, order); - if (page == NULL) + struct page *page = __rmqueue(zone, order); + if (unlikely(page == NULL)) break; - allocated++; list_add_tail(&page->lru, list); } spin_unlock(&zone->lock); - return allocated; + return i; } #ifdef CONFIG_NUMA -/* Called from the slab reaper to drain remote pagesets */ -void drain_remote_pages(void) +/* + * Called from the slab reaper to drain pagesets on a particular node that + * belong to the currently executing processor. + * Note that this function must be called with the thread pinned to + * a single processor. + */ +void drain_node_pages(int nodeid) { - struct zone *zone; - int i; + int i, z; unsigned long flags; - local_irq_save(flags); - for_each_zone(zone) { + for (z = 0; z < MAX_NR_ZONES; z++) { + struct zone *zone = NODE_DATA(nodeid)->node_zones + z; struct per_cpu_pageset *pset; - /* Do not drain local pagesets */ - if (zone->zone_pgdat->node_id == numa_node_id()) - continue; - - pset = zone->pageset[smp_processor_id()]; + pset = zone_pcp(zone, smp_processor_id()); for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) { struct per_cpu_pages *pcp; pcp = &pset->pcp[i]; - if (pcp->count) - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + if (pcp->count) { + local_irq_save(flags); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; + local_irq_restore(flags); + } } } - local_irq_restore(flags); } #endif @@ -598,8 +661,8 @@ static void __drain_pages(unsigned int cpu) pcp = &pset->pcp[i]; local_irq_save(flags); - pcp->count -= free_pages_bulk(zone, pcp->count, - &pcp->list, 0); + free_pages_bulk(zone, pcp->count, &pcp->list, 0); + pcp->count = 0; local_irq_restore(flags); } } @@ -646,18 +709,14 @@ void drain_local_pages(void) } #endif /* CONFIG_PM */ -static void zone_statistics(struct zonelist *zonelist, struct zone *z) +static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu) { #ifdef CONFIG_NUMA - unsigned long flags; - int cpu; pg_data_t *pg = z->zone_pgdat; pg_data_t *orig = zonelist->zones[0]->zone_pgdat; struct per_cpu_pageset *p; - local_irq_save(flags); - cpu = smp_processor_id(); - p = zone_pcp(z,cpu); + p = zone_pcp(z, cpu); if (pg == orig) { p->numa_hit++; } else { @@ -668,14 +727,12 @@ static void zone_statistics(struct zonelist *zonelist, struct zone *z) p->local_node++; else p->other_node++; - local_irq_restore(flags); #endif } /* * Free a 0-order page */ -static void FASTCALL(free_hot_cold_page(struct page *page, int cold)); static void fastcall free_hot_cold_page(struct page *page, int cold) { struct zone *zone = page_zone(page); @@ -686,18 +743,20 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) if (PageAnon(page)) page->mapping = NULL; - if (free_pages_check(__FUNCTION__, page)) + if (free_pages_check(page)) return; - inc_page_state(pgfree); kernel_map_pages(page, 1, 0); pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); + __inc_page_state(pgfree); list_add(&page->lru, &pcp->list); pcp->count++; - if (pcp->count >= pcp->high) - pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + if (pcp->count >= pcp->high) { + free_pages_bulk(zone, pcp->batch, &pcp->list, 0); + pcp->count -= pcp->batch; + } local_irq_restore(flags); put_cpu(); } @@ -712,13 +771,22 @@ void fastcall free_cold_page(struct page *page) free_hot_cold_page(page, 1); } -static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags) +/* + * split_page takes a non-compound higher-order page, and splits it into + * n (1< 0 path. Saves a branch * or two. */ -static struct page * -buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) +static struct page *buffered_rmqueue(struct zonelist *zonelist, + struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; struct page *page; int cold = !!(gfp_flags & __GFP_COLD); + int cpu; again: - if (order == 0) { + cpu = get_cpu(); + if (likely(order == 0)) { struct per_cpu_pages *pcp; - page = NULL; - pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; + pcp = &zone_pcp(zone, cpu)->pcp[cold]; local_irq_save(flags); - if (!pcp->count) + if (!pcp->count) { pcp->count += rmqueue_bulk(zone, 0, pcp->batch, &pcp->list); - if (likely(pcp->count)) { - page = list_entry(pcp->list.next, struct page, lru); - list_del(&page->lru); - pcp->count--; + if (unlikely(!pcp->count)) + goto failed; } - local_irq_restore(flags); - put_cpu(); + page = list_entry(pcp->list.next, struct page, lru); + list_del(&page->lru); + pcp->count--; } else { spin_lock_irqsave(&zone->lock, flags); page = __rmqueue(zone, order); - spin_unlock_irqrestore(&zone->lock, flags); + spin_unlock(&zone->lock); + if (!page) + goto failed; } - if (page != NULL) { - BUG_ON(bad_range(zone, page)); - mod_page_state_zone(zone, pgalloc, 1 << order); - if (prep_new_page(page, order)) - goto again; - - if (gfp_flags & __GFP_ZERO) - prep_zero_page(page, order, gfp_flags); + __mod_page_state_zone(zone, pgalloc, 1 << order); + zone_statistics(zonelist, zone, cpu); + local_irq_restore(flags); + put_cpu(); - if (order && (gfp_flags & __GFP_COMP)) - prep_compound_page(page, order); - } + BUG_ON(bad_range(zone, page)); + if (prep_new_page(page, order, gfp_flags)) + goto again; return page; + +failed: + local_irq_restore(flags); + put_cpu(); + return NULL; } #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ @@ -841,12 +912,13 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, mark = (*z)->pages_high; if (!zone_watermark_ok(*z, order, mark, classzone_idx, alloc_flags)) - continue; + if (!zone_reclaim_mode || + !zone_reclaim(*z, gfp_mask, order)) + continue; } - page = buffered_rmqueue(*z, order, gfp_mask); + page = buffered_rmqueue(zonelist, *z, order, gfp_mask); if (page) { - zone_statistics(zonelist, *z); break; } } while (*(++z) != NULL); @@ -895,14 +967,16 @@ restart: * * The caller may dip into page reserves a bit more if the caller * cannot run direct reclaim, or if the caller has realtime scheduling - * policy. + * policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will + * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH). */ alloc_flags = ALLOC_WMARK_MIN; if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) alloc_flags |= ALLOC_HIGH; - alloc_flags |= ALLOC_CPUSET; + if (wait) + alloc_flags |= ALLOC_CPUSET; /* * Go through the zonelist again. Let __GFP_HIGH and allocations @@ -943,6 +1017,7 @@ rebalance: cond_resched(); /* We now go into synchronous reclaim */ + cpuset_memory_pressure_bump(); p->flags |= PF_MEMALLOC; reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -971,7 +1046,7 @@ rebalance: if (page) goto got_pg; - out_of_memory(gfp_mask, order); + out_of_memory(zonelist, gfp_mask, order); goto restart; } @@ -1140,7 +1215,7 @@ unsigned int nr_free_highpages (void) pg_data_t *pgdat; unsigned int pages = 0; - for_each_pgdat(pgdat) + for_each_online_pgdat(pgdat) pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages; return pages; @@ -1156,132 +1231,6 @@ static void show_node(struct zone *zone) #define show_node(zone) do { } while (0) #endif -/* - * Accumulate the page_state information across all CPUs. - * The result is unavoidably approximate - it can change - * during and after execution of this function. - */ -static DEFINE_PER_CPU(struct page_state, page_states) = {0}; - -atomic_t nr_pagecache = ATOMIC_INIT(0); -EXPORT_SYMBOL(nr_pagecache); -#ifdef CONFIG_SMP -DEFINE_PER_CPU(long, nr_pagecache_local) = 0; -#endif - -void __get_page_state(struct page_state *ret, int nr, cpumask_t *cpumask) -{ - int cpu = 0; - - memset(ret, 0, sizeof(*ret)); - cpus_and(*cpumask, *cpumask, cpu_online_map); - - cpu = first_cpu(*cpumask); - while (cpu < NR_CPUS) { - unsigned long *in, *out, off; - - in = (unsigned long *)&per_cpu(page_states, cpu); - - cpu = next_cpu(cpu, *cpumask); - - if (cpu < NR_CPUS) - prefetch(&per_cpu(page_states, cpu)); - - out = (unsigned long *)ret; - for (off = 0; off < nr; off++) - *out++ += *in++; - } -} - -void get_page_state_node(struct page_state *ret, int node) -{ - int nr; - cpumask_t mask = node_to_cpumask(node); - - nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); - nr /= sizeof(unsigned long); - - __get_page_state(ret, nr+1, &mask); -} - -void get_page_state(struct page_state *ret) -{ - int nr; - cpumask_t mask = CPU_MASK_ALL; - - nr = offsetof(struct page_state, GET_PAGE_STATE_LAST); - nr /= sizeof(unsigned long); - - __get_page_state(ret, nr + 1, &mask); -} - -void get_full_page_state(struct page_state *ret) -{ - cpumask_t mask = CPU_MASK_ALL; - - __get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask); -} - -unsigned long __read_page_state(unsigned long offset) -{ - unsigned long ret = 0; - int cpu; - - for_each_online_cpu(cpu) { - unsigned long in; - - in = (unsigned long)&per_cpu(page_states, cpu) + offset; - ret += *((unsigned long *)in); - } - return ret; -} - -void __mod_page_state(unsigned long offset, unsigned long delta) -{ - unsigned long flags; - void* ptr; - - local_irq_save(flags); - ptr = &__get_cpu_var(page_states); - *(unsigned long*)(ptr + offset) += delta; - local_irq_restore(flags); -} - -EXPORT_SYMBOL(__mod_page_state); - -void __get_zone_counts(unsigned long *active, unsigned long *inactive, - unsigned long *free, struct pglist_data *pgdat) -{ - struct zone *zones = pgdat->node_zones; - int i; - - *active = 0; - *inactive = 0; - *free = 0; - for (i = 0; i < MAX_NR_ZONES; i++) { - *active += zones[i].nr_active; - *inactive += zones[i].nr_inactive; - *free += zones[i].free_pages; - } -} - -void get_zone_counts(unsigned long *active, - unsigned long *inactive, unsigned long *free) -{ - struct pglist_data *pgdat; - - *active = 0; - *inactive = 0; - *free = 0; - for_each_pgdat(pgdat) { - unsigned long l, m, n; - __get_zone_counts(&l, &m, &n, pgdat); - *active += l; - *inactive += m; - *free += n; - } -} - void si_meminfo(struct sysinfo *val) { val->totalram = totalram_pages; @@ -1333,7 +1282,7 @@ void show_free_areas(void) show_node(zone); printk("%s per-cpu:", zone->name); - if (!zone->present_pages) { + if (!populated_zone(zone)) { printk(" empty\n"); continue; } else @@ -1406,22 +1355,23 @@ void show_free_areas(void) } for_each_zone(zone) { - unsigned long nr, flags, order, total = 0; + unsigned long nr[MAX_ORDER], flags, order, total = 0; show_node(zone); printk("%s: ", zone->name); - if (!zone->present_pages) { + if (!populated_zone(zone)) { printk("empty\n"); continue; } spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { - nr = zone->free_area[order].nr_free; - total += nr << order; - printk("%lu*%lukB ", nr, K(1UL) << order); + nr[order] = zone->free_area[order].nr_free; + total += nr[order] << order; } spin_unlock_irqrestore(&zone->lock, flags); + for (order = 0; order < MAX_ORDER; order++) + printk("%lu*%lukB ", nr[order], K(1UL) << order); printk("= %lukB\n", K(total)); } @@ -1430,36 +1380,29 @@ void show_free_areas(void) /* * Builds allocation fallback zone lists. + * + * Add all populated zones of a node to the zonelist. */ -static int __init build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, int j, int k) -{ - switch (k) { - struct zone *zone; - default: - BUG(); - case ZONE_HIGHMEM: - zone = pgdat->node_zones + ZONE_HIGHMEM; - if (zone->present_pages) { +static int __meminit build_zonelists_node(pg_data_t *pgdat, + struct zonelist *zonelist, int nr_zones, int zone_type) +{ + struct zone *zone; + + BUG_ON(zone_type > ZONE_HIGHMEM); + + do { + zone = pgdat->node_zones + zone_type; + if (populated_zone(zone)) { #ifndef CONFIG_HIGHMEM - BUG(); + BUG_ON(zone_type > ZONE_NORMAL); #endif - zonelist->zones[j++] = zone; + zonelist->zones[nr_zones++] = zone; + check_highest_zone(zone_type); } - case ZONE_NORMAL: - zone = pgdat->node_zones + ZONE_NORMAL; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA32: - zone = pgdat->node_zones + ZONE_DMA32; - if (zone->present_pages) - zonelist->zones[j++] = zone; - case ZONE_DMA: - zone = pgdat->node_zones + ZONE_DMA; - if (zone->present_pages) - zonelist->zones[j++] = zone; - } + zone_type--; - return j; + } while (zone_type >= 0); + return nr_zones; } static inline int highest_zone(int zone_bits) @@ -1476,7 +1419,7 @@ static inline int highest_zone(int zone_bits) #ifdef CONFIG_NUMA #define MAX_NODE_LOAD (num_online_nodes()) -static int __initdata node_load[MAX_NUMNODES]; +static int __meminitdata node_load[MAX_NUMNODES]; /** * find_next_best_node - find the next node that should appear in a given node's fallback list * @node: node whose fallback list we're appending @@ -1491,31 +1434,31 @@ static int __initdata node_load[MAX_NUMNODES]; * on them otherwise. * It returns -1 if no node is found. */ -static int __init find_next_best_node(int node, nodemask_t *used_node_mask) +static int __meminit find_next_best_node(int node, nodemask_t *used_node_mask) { - int i, n, val; + int n, val; int min_val = INT_MAX; int best_node = -1; - for_each_online_node(i) { - cpumask_t tmp; + /* Use the local node if we haven't already */ + if (!node_isset(node, *used_node_mask)) { + node_set(node, *used_node_mask); + return node; + } - /* Start from local node */ - n = (node+i) % num_online_nodes(); + for_each_online_node(n) { + cpumask_t tmp; /* Don't want a node to appear more than once */ if (node_isset(n, *used_node_mask)) continue; - /* Use the local node if we haven't already */ - if (!node_isset(node, *used_node_mask)) { - best_node = node; - break; - } - /* Use the distance array to find the distance */ val = node_distance(node, n); + /* Penalize nodes under us ("prefer the next node") */ + val += (n < node); + /* Give preference to headless and unused nodes */ tmp = node_to_cpumask(n); if (!cpus_empty(tmp)) @@ -1537,7 +1480,7 @@ static int __init find_next_best_node(int node, nodemask_t *used_node_mask) return best_node; } -static void __init build_zonelists(pg_data_t *pgdat) +static void __meminit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; int prev_node, load; @@ -1556,13 +1499,22 @@ static void __init build_zonelists(pg_data_t *pgdat) prev_node = local_node; nodes_clear(used_mask); while ((node = find_next_best_node(local_node, &used_mask)) >= 0) { + int distance = node_distance(local_node, node); + + /* + * If another node is sufficiently far away then it is better + * to reclaim pages in a zone before going off node. + */ + if (distance > RECLAIM_DISTANCE) + zone_reclaim_mode = 1; + /* * We don't want to pressure a particular node. * So adding penalty to the first node in same * distance group to make it round-robin. */ - if (node_distance(local_node, node) != - node_distance(local_node, prev_node)) + + if (distance != node_distance(local_node, prev_node)) node_load[node] += load; prev_node = node; load--; @@ -1580,7 +1532,7 @@ static void __init build_zonelists(pg_data_t *pgdat) #else /* CONFIG_NUMA */ -static void __init build_zonelists(pg_data_t *pgdat) +static void __meminit build_zonelists(pg_data_t *pgdat) { int i, j, k, node, local_node; @@ -1618,14 +1570,29 @@ static void __init build_zonelists(pg_data_t *pgdat) #endif /* CONFIG_NUMA */ -void __init build_all_zonelists(void) +/* return values int ....just for stop_machine_run() */ +static int __meminit __build_all_zonelists(void *dummy) { - int i; + int nid; + for_each_online_node(nid) + build_zonelists(NODE_DATA(nid)); + return 0; +} - for_each_online_node(i) - build_zonelists(NODE_DATA(i)); - printk("Built %i zonelists\n", num_online_nodes()); - cpuset_init_current_mems_allowed(); +void __meminit build_all_zonelists(void) +{ + if (system_state == SYSTEM_BOOTING) { + __build_all_zonelists(0); + cpuset_init_current_mems_allowed(); + } else { + /* we have to stop all cpus to guaranntee there is no user + of zonelist */ + stop_machine_run(__build_all_zonelists, NULL, NR_CPUS); + /* cpuset refresh routine should be here */ + } + vm_total_pages = nr_free_pagecache_pages(); + printk("Built %i zonelists. Total pages: %ld\n", + num_online_nodes(), vm_total_pages); } /* @@ -1641,7 +1608,8 @@ void __init build_all_zonelists(void) */ #define PAGES_PER_WAITQUEUE 256 -static inline unsigned long wait_table_size(unsigned long pages) +#ifndef CONFIG_MEMORY_HOTPLUG +static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) { unsigned long size = 1; @@ -1659,6 +1627,29 @@ static inline unsigned long wait_table_size(unsigned long pages) return max(size, 4UL); } +#else +/* + * A zone's size might be changed by hot-add, so it is not possible to determine + * a suitable size for its wait_table. So we use the maximum size now. + * + * The max wait table size = 4096 x sizeof(wait_queue_head_t). ie: + * + * i386 (preemption config) : 4096 x 16 = 64Kbyte. + * ia64, x86-64 (no preemption): 4096 x 20 = 80Kbyte. + * ia64, x86-64 (preemption) : 4096 x 24 = 96Kbyte. + * + * The maximum entries are prepared when a zone's memory is (512K + 256) pages + * or more by the traditional way. (See above). It equals: + * + * i386, x86-64, powerpc(4K page size) : = ( 2G + 1M)byte. + * ia64(16K page size) : = ( 8G + 4M)byte. + * powerpc (64K page size) : = (32G +16M)byte. + */ +static inline unsigned long wait_table_hash_nr_entries(unsigned long pages) +{ + return 4096UL; +} +#endif /* * This is an integer logarithm so that shifts can be used later @@ -1696,19 +1687,19 @@ static void __init calculate_zone_totalpages(struct pglist_data *pgdat, * up by free_all_bootmem() once the early boot process is * done. Non-atomic initialization, single-pass. */ -void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone, +void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone, unsigned long start_pfn) { struct page *page; unsigned long end_pfn = start_pfn + size; unsigned long pfn; - for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) { + for (pfn = start_pfn; pfn < end_pfn; pfn++) { if (!early_pfn_valid(pfn)) continue; page = pfn_to_page(pfn); set_page_links(page, zone, nid, pfn); - set_page_count(page, 1); + init_page_count(page); reset_page_mapcount(page); SetPageReserved(page); INIT_LIST_HEAD(&page->lru); @@ -1749,7 +1740,7 @@ void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn, memmap_init_zone((size), (nid), (zone), (start_pfn)) #endif -static int __devinit zone_batchsize(struct zone *zone) +static int __cpuinit zone_batchsize(struct zone *zone) { int batch; @@ -1800,6 +1791,24 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) INIT_LIST_HEAD(&pcp->list); } +/* + * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * to the value high for the pageset p. + */ + +static void setup_pagelist_highmark(struct per_cpu_pageset *p, + unsigned long high) +{ + struct per_cpu_pages *pcp; + + pcp = &p->pcp[0]; /* hot list */ + pcp->high = high; + pcp->batch = max(1UL, high/4); + if ((high/4) > (PAGE_SHIFT * 8)) + pcp->batch = PAGE_SHIFT * 8; +} + + #ifdef CONFIG_NUMA /* * Boot pageset table. One per cpu which is going to be used for all @@ -1818,25 +1827,28 @@ inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) * not check if the processor is online before following the pageset pointer. * Other parts of the kernel may not check if the zone is available. */ -static struct per_cpu_pageset - boot_pageset[NR_CPUS]; +static struct per_cpu_pageset boot_pageset[NR_CPUS]; /* * Dynamically allocate memory for the * per cpu pageset array in struct zone. */ -static int __devinit process_zones(int cpu) +static int __cpuinit process_zones(int cpu) { struct zone *zone, *dzone; for_each_zone(zone) { - zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset), + zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset), GFP_KERNEL, cpu_to_node(cpu)); - if (!zone->pageset[cpu]) + if (!zone_pcp(zone, cpu)) goto bad; - setup_pageset(zone->pageset[cpu], zone_batchsize(zone)); + setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone)); + + if (percpu_pagelist_fraction) + setup_pagelist_highmark(zone_pcp(zone, cpu), + (zone->present_pages / percpu_pagelist_fraction)); } return 0; @@ -1844,15 +1856,14 @@ bad: for_each_zone(dzone) { if (dzone == zone) break; - kfree(dzone->pageset[cpu]); - dzone->pageset[cpu] = NULL; + kfree(zone_pcp(dzone, cpu)); + zone_pcp(dzone, cpu) = NULL; } return -ENOMEM; } static inline void free_zone_pagesets(int cpu) { -#ifdef CONFIG_NUMA struct zone *zone; for_each_zone(zone) { @@ -1861,10 +1872,9 @@ static inline void free_zone_pagesets(int cpu) zone_pcp(zone, cpu) = NULL; kfree(pset); } -#endif } -static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, +static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1886,7 +1896,7 @@ static int __devinit pageset_cpuup_callback(struct notifier_block *nfb, return ret; } -static struct notifier_block pageset_notifier = +static struct notifier_block __cpuinitdata pageset_notifier = { &pageset_cpuup_callback, NULL, 0 }; void __init setup_per_cpu_pageset(void) @@ -1904,27 +1914,50 @@ void __init setup_per_cpu_pageset(void) #endif -static __devinit -void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) +static __meminit +int zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages) { int i; struct pglist_data *pgdat = zone->zone_pgdat; + size_t alloc_size; /* * The per-page waitqueue mechanism uses hashed waitqueues * per zone. */ - zone->wait_table_size = wait_table_size(zone_size_pages); - zone->wait_table_bits = wait_table_bits(zone->wait_table_size); - zone->wait_table = (wait_queue_head_t *) - alloc_bootmem_node(pgdat, zone->wait_table_size - * sizeof(wait_queue_head_t)); + zone->wait_table_hash_nr_entries = + wait_table_hash_nr_entries(zone_size_pages); + zone->wait_table_bits = + wait_table_bits(zone->wait_table_hash_nr_entries); + alloc_size = zone->wait_table_hash_nr_entries + * sizeof(wait_queue_head_t); + + if (system_state == SYSTEM_BOOTING) { + zone->wait_table = (wait_queue_head_t *) + alloc_bootmem_node(pgdat, alloc_size); + } else { + /* + * This case means that a zone whose size was 0 gets new memory + * via memory hot-add. + * But it may be the case that a new node was hot-added. In + * this case vmalloc() will not be able to use this new node's + * memory - this wait_table must be initialized to use this new + * node itself as well. + * To use this new node's memory, further consideration will be + * necessary. + */ + zone->wait_table = (wait_queue_head_t *)vmalloc(alloc_size); + } + if (!zone->wait_table) + return -ENOMEM; - for(i = 0; i < zone->wait_table_size; ++i) + for(i = 0; i < zone->wait_table_hash_nr_entries; ++i) init_waitqueue_head(zone->wait_table + i); + + return 0; } -static __devinit void zone_pcp_init(struct zone *zone) +static __meminit void zone_pcp_init(struct zone *zone) { int cpu; unsigned long batch = zone_batchsize(zone); @@ -1932,30 +1965,35 @@ static __devinit void zone_pcp_init(struct zone *zone) for (cpu = 0; cpu < NR_CPUS; cpu++) { #ifdef CONFIG_NUMA /* Early boot. Slab allocator not functional yet */ - zone->pageset[cpu] = &boot_pageset[cpu]; + zone_pcp(zone, cpu) = &boot_pageset[cpu]; setup_pageset(&boot_pageset[cpu],0); #else setup_pageset(zone_pcp(zone,cpu), batch); #endif } - printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", - zone->name, zone->present_pages, batch); + if (zone->present_pages) + printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n", + zone->name, zone->present_pages, batch); } -static __devinit void init_currently_empty_zone(struct zone *zone, - unsigned long zone_start_pfn, unsigned long size) +__meminit int init_currently_empty_zone(struct zone *zone, + unsigned long zone_start_pfn, + unsigned long size) { struct pglist_data *pgdat = zone->zone_pgdat; - - zone_wait_table_init(zone, size); + int ret; + ret = zone_wait_table_init(zone, size); + if (ret) + return ret; pgdat->nr_zones = zone_idx(zone) + 1; - zone->zone_mem_map = pfn_to_page(zone_start_pfn); zone->zone_start_pfn = zone_start_pfn; memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn); zone_init_free_lists(pgdat, zone, zone->spanned_pages); + + return 0; } /* @@ -1964,12 +2002,13 @@ static __devinit void init_currently_empty_zone(struct zone *zone, * - mark all memory queues empty * - clear the memory bitmaps */ -static void __init free_area_init_core(struct pglist_data *pgdat, +static void __meminit free_area_init_core(struct pglist_data *pgdat, unsigned long *zones_size, unsigned long *zholes_size) { unsigned long j; int nid = pgdat->node_id; unsigned long zone_start_pfn = pgdat->node_start_pfn; + int ret; pgdat_resize_init(pgdat); pgdat->nr_zones = 0; @@ -2011,7 +2050,8 @@ static void __init free_area_init_core(struct pglist_data *pgdat, continue; zonetable_add(zone, nid, j, zone_start_pfn, size); - init_currently_empty_zone(zone, zone_start_pfn, size); + ret = init_currently_empty_zone(zone, zone_start_pfn, size); + BUG_ON(ret); zone_start_pfn += size; } } @@ -2025,14 +2065,22 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) #ifdef CONFIG_FLAT_NODE_MEM_MAP /* ia64 gets its own node_mem_map, before this, without bootmem */ if (!pgdat->node_mem_map) { - unsigned long size; + unsigned long size, start, end; struct page *map; - size = (pgdat->node_spanned_pages + 1) * sizeof(struct page); + /* + * The zone's endpoints aren't required to be MAX_ORDER + * aligned but the node_mem_map endpoints must be in order + * for the buddy allocator to function correctly. + */ + start = pgdat->node_start_pfn & ~(MAX_ORDER_NR_PAGES - 1); + end = pgdat->node_start_pfn + pgdat->node_spanned_pages; + end = ALIGN(end, MAX_ORDER_NR_PAGES); + size = (end - start) * sizeof(struct page); map = alloc_remap(pgdat->node_id, size); if (!map) map = alloc_bootmem_node(pgdat, size); - pgdat->node_mem_map = map; + pgdat->node_mem_map = map + (pgdat->node_start_pfn - start); } #ifdef CONFIG_FLATMEM /* @@ -2044,7 +2092,7 @@ static void __init alloc_node_mem_map(struct pglist_data *pgdat) #endif /* CONFIG_FLAT_NODE_MEM_MAP */ } -void __init free_area_init_node(int nid, struct pglist_data *pgdat, +void __meminit free_area_init_node(int nid, struct pglist_data *pgdat, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { @@ -2070,269 +2118,6 @@ void __init free_area_init(unsigned long *zones_size) __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL); } -#ifdef CONFIG_PROC_FS - -#include - -static void *frag_start(struct seq_file *m, loff_t *pos) -{ - pg_data_t *pgdat; - loff_t node = *pos; - - for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next) - --node; - - return pgdat; -} - -static void *frag_next(struct seq_file *m, void *arg, loff_t *pos) -{ - pg_data_t *pgdat = (pg_data_t *)arg; - - (*pos)++; - return pgdat->pgdat_next; -} - -static void frag_stop(struct seq_file *m, void *arg) -{ -} - -/* - * This walks the free areas for each zone. - */ -static int frag_show(struct seq_file *m, void *arg) -{ - pg_data_t *pgdat = (pg_data_t *)arg; - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - unsigned long flags; - int order; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { - if (!zone->present_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name); - for (order = 0; order < MAX_ORDER; ++order) - seq_printf(m, "%6lu ", zone->free_area[order].nr_free); - spin_unlock_irqrestore(&zone->lock, flags); - seq_putc(m, '\n'); - } - return 0; -} - -struct seq_operations fragmentation_op = { - .start = frag_start, - .next = frag_next, - .stop = frag_stop, - .show = frag_show, -}; - -/* - * Output information about zones in @pgdat. - */ -static int zoneinfo_show(struct seq_file *m, void *arg) -{ - pg_data_t *pgdat = arg; - struct zone *zone; - struct zone *node_zones = pgdat->node_zones; - unsigned long flags; - - for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) { - int i; - - if (!zone->present_pages) - continue; - - spin_lock_irqsave(&zone->lock, flags); - seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name); - seq_printf(m, - "\n pages free %lu" - "\n min %lu" - "\n low %lu" - "\n high %lu" - "\n active %lu" - "\n inactive %lu" - "\n scanned %lu (a: %lu i: %lu)" - "\n spanned %lu" - "\n present %lu", - zone->free_pages, - zone->pages_min, - zone->pages_low, - zone->pages_high, - zone->nr_active, - zone->nr_inactive, - zone->pages_scanned, - zone->nr_scan_active, zone->nr_scan_inactive, - zone->spanned_pages, - zone->present_pages); - seq_printf(m, - "\n protection: (%lu", - zone->lowmem_reserve[0]); - for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++) - seq_printf(m, ", %lu", zone->lowmem_reserve[i]); - seq_printf(m, - ")" - "\n pagesets"); - for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) { - struct per_cpu_pageset *pageset; - int j; - - pageset = zone_pcp(zone, i); - for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { - if (pageset->pcp[j].count) - break; - } - if (j == ARRAY_SIZE(pageset->pcp)) - continue; - for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) { - seq_printf(m, - "\n cpu: %i pcp: %i" - "\n count: %i" - "\n high: %i" - "\n batch: %i", - i, j, - pageset->pcp[j].count, - pageset->pcp[j].high, - pageset->pcp[j].batch); - } -#ifdef CONFIG_NUMA - seq_printf(m, - "\n numa_hit: %lu" - "\n numa_miss: %lu" - "\n numa_foreign: %lu" - "\n interleave_hit: %lu" - "\n local_node: %lu" - "\n other_node: %lu", - pageset->numa_hit, - pageset->numa_miss, - pageset->numa_foreign, - pageset->interleave_hit, - pageset->local_node, - pageset->other_node); -#endif - } - seq_printf(m, - "\n all_unreclaimable: %u" - "\n prev_priority: %i" - "\n temp_priority: %i" - "\n start_pfn: %lu", - zone->all_unreclaimable, - zone->prev_priority, - zone->temp_priority, - zone->zone_start_pfn); - spin_unlock_irqrestore(&zone->lock, flags); - seq_putc(m, '\n'); - } - return 0; -} - -struct seq_operations zoneinfo_op = { - .start = frag_start, /* iterate over all zones. The same as in - * fragmentation. */ - .next = frag_next, - .stop = frag_stop, - .show = zoneinfo_show, -}; - -static char *vmstat_text[] = { - "nr_dirty", - "nr_writeback", - "nr_unstable", - "nr_page_table_pages", - "nr_mapped", - "nr_slab", - - "pgpgin", - "pgpgout", - "pswpin", - "pswpout", - "pgalloc_high", - - "pgalloc_normal", - "pgalloc_dma", - "pgfree", - "pgactivate", - "pgdeactivate", - - "pgfault", - "pgmajfault", - "pgrefill_high", - "pgrefill_normal", - "pgrefill_dma", - - "pgsteal_high", - "pgsteal_normal", - "pgsteal_dma", - "pgscan_kswapd_high", - "pgscan_kswapd_normal", - - "pgscan_kswapd_dma", - "pgscan_direct_high", - "pgscan_direct_normal", - "pgscan_direct_dma", - "pginodesteal", - - "slabs_scanned", - "kswapd_steal", - "kswapd_inodesteal", - "pageoutrun", - "allocstall", - - "pgrotated", - "nr_bounce", -}; - -static void *vmstat_start(struct seq_file *m, loff_t *pos) -{ - struct page_state *ps; - - if (*pos >= ARRAY_SIZE(vmstat_text)) - return NULL; - - ps = kmalloc(sizeof(*ps), GFP_KERNEL); - m->private = ps; - if (!ps) - return ERR_PTR(-ENOMEM); - get_full_page_state(ps); - ps->pgpgin /= 2; /* sectors -> kbytes */ - ps->pgpgout /= 2; - return (unsigned long *)ps + *pos; -} - -static void *vmstat_next(struct seq_file *m, void *arg, loff_t *pos) -{ - (*pos)++; - if (*pos >= ARRAY_SIZE(vmstat_text)) - return NULL; - return (unsigned long *)m->private + *pos; -} - -static int vmstat_show(struct seq_file *m, void *arg) -{ - unsigned long *l = arg; - unsigned long off = l - (unsigned long *)m->private; - - seq_printf(m, "%s %lu\n", vmstat_text[off], *l); - return 0; -} - -static void vmstat_stop(struct seq_file *m, void *arg) -{ - kfree(m->private); - m->private = NULL; -} - -struct seq_operations vmstat_op = { - .start = vmstat_start, - .next = vmstat_next, - .stop = vmstat_stop, - .show = vmstat_show, -}; - -#endif /* CONFIG_PROC_FS */ - #ifdef CONFIG_HOTPLUG_CPU static int page_alloc_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) @@ -2373,6 +2158,38 @@ void __init page_alloc_init(void) } /* + * calculate_totalreserve_pages - called when sysctl_lower_zone_reserve_ratio + * or min_free_kbytes changes. + */ +static void calculate_totalreserve_pages(void) +{ + struct pglist_data *pgdat; + unsigned long reserve_pages = 0; + int i, j; + + for_each_online_pgdat(pgdat) { + for (i = 0; i < MAX_NR_ZONES; i++) { + struct zone *zone = pgdat->node_zones + i; + unsigned long max = 0; + + /* Find valid and maximum lowmem_reserve in the zone */ + for (j = i; j < MAX_NR_ZONES; j++) { + if (zone->lowmem_reserve[j] > max) + max = zone->lowmem_reserve[j]; + } + + /* we treat pages_high as reserved pages. */ + max += zone->pages_high; + + if (max > zone->present_pages) + max = zone->present_pages; + reserve_pages += max; + } + } + totalreserve_pages = reserve_pages; +} + +/* * setup_per_zone_lowmem_reserve - called whenever * sysctl_lower_zone_reserve_ratio changes. Ensures that each zone * has a correct pages reserved value, so an adequate number of @@ -2383,7 +2200,7 @@ static void setup_per_zone_lowmem_reserve(void) struct pglist_data *pgdat; int j, idx; - for_each_pgdat(pgdat) { + for_each_online_pgdat(pgdat) { for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long present_pages = zone->present_pages; @@ -2403,6 +2220,9 @@ static void setup_per_zone_lowmem_reserve(void) } } } + + /* update totalreserve_pages */ + calculate_totalreserve_pages(); } /* @@ -2424,9 +2244,11 @@ void setup_per_zone_pages_min(void) } for_each_zone(zone) { - unsigned long tmp; + u64 tmp; + spin_lock_irqsave(&zone->lru_lock, flags); - tmp = (pages_min * zone->present_pages) / lowmem_pages; + tmp = (u64)pages_min * zone->present_pages; + do_div(tmp, lowmem_pages); if (is_highmem(zone)) { /* * __GFP_HIGH and PF_MEMALLOC allocations usually don't @@ -2453,10 +2275,13 @@ void setup_per_zone_pages_min(void) zone->pages_min = tmp; } - zone->pages_low = zone->pages_min + tmp / 4; - zone->pages_high = zone->pages_min + tmp / 2; + zone->pages_low = zone->pages_min + (tmp >> 2); + zone->pages_high = zone->pages_min + (tmp >> 1); spin_unlock_irqrestore(&zone->lru_lock, flags); } + + /* update totalreserve_pages */ + calculate_totalreserve_pages(); } /* @@ -2530,6 +2355,32 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, return 0; } +/* + * percpu_pagelist_fraction - changes the pcp->high for each zone on each + * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist + * can have before it gets flushed back to buddy allocator. + */ + +int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, + struct file *file, void __user *buffer, size_t *length, loff_t *ppos) +{ + struct zone *zone; + unsigned int cpu; + int ret; + + ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos); + if (!write || (ret == -EINVAL)) + return ret; + for_each_zone(zone) { + for_each_online_cpu(cpu) { + unsigned long high; + high = zone->present_pages / percpu_pagelist_fraction; + setup_pagelist_highmark(zone_pcp(zone, cpu), high); + } + } + return 0; +} + __initdata int hashdist = HASHDIST_DEFAULT; #ifdef CONFIG_NUMA @@ -2576,8 +2427,7 @@ void *__init alloc_large_system_hash(const char *tablename, else numentries <<= (PAGE_SHIFT - scale); } - /* rounded up to nearest power of 2 in size */ - numentries = 1UL << (long_log2(numentries) + 1); + numentries = roundup_pow_of_two(numentries); /* limit allocation size to 1/16 total memory by default */ if (max == 0) { @@ -2620,3 +2470,16 @@ void *__init alloc_large_system_hash(const char *tablename, return table; } + +#ifdef CONFIG_OUT_OF_LINE_PFN_TO_PAGE +struct page *pfn_to_page(unsigned long pfn) +{ + return __pfn_to_page(pfn); +} +unsigned long page_to_pfn(struct page *page) +{ + return __page_to_pfn(page); +} +EXPORT_SYMBOL(pfn_to_page); +EXPORT_SYMBOL(page_to_pfn); +#endif /* CONFIG_OUT_OF_LINE_PFN_TO_PAGE */