X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fvmscan.c;h=2ef59d5b16a6b16b809219a17fe9ab0292b6eb57;hb=4035c07a895974d0ac06a56fe870ad293fc451a7;hp=1219ceb8a9b2d992da20bb9a10942e7cef2d98b1;hpb=db16826367fefcb0ddb93d76b66adc52eb4e6339;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/vmscan.c b/mm/vmscan.c index 1219ceb..2ef59d5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -55,6 +55,11 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + /* How many pages shrink_list() should reclaim */ + unsigned long nr_to_reclaim; + + unsigned long hibernation_mode; + /* This context's GFP mask */ gfp_t gfp_mask; @@ -66,12 +71,6 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ int may_swap; - /* This context's SWAP_CLUSTER_MAX. If freeing memory for - * suspend, we effectively ignore SWAP_CLUSTER_MAX. - * In this context, it doesn't matter that we scan the - * whole list at once. */ - int swap_cluster_max; - int swappiness; int all_unreclaimable; @@ -358,7 +357,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in generic_file_write() against + * If this process is currently in __generic_file_aio_write() against * this page's queue, we can perform writeback even if that * will block. * @@ -544,6 +543,16 @@ redo: */ lru = LRU_UNEVICTABLE; add_page_to_unevictable_list(page); + /* + * When racing with an mlock clearing (page is + * unlocked), make sure that if the other thread does + * not observe our setting of PG_lru and fails + * isolation, we see PG_mlocked cleared below and move + * the page back to the evictable list. + * + * The other side is TestClearPageMlocked(). + */ + smp_mb(); } /* @@ -1088,7 +1097,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, int lumpy_reclaim = 0; while (unlikely(too_many_isolated(zone, file, sc))) { - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* We are about to die and free our memory. Return now. */ if (fatal_signal_pending(current)) @@ -1122,7 +1131,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_anon; unsigned long nr_file; - nr_taken = sc->isolate_pages(sc->swap_cluster_max, + nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, &page_list, &nr_scan, sc->order, mode, zone, sc->mem_cgroup, 0, file); @@ -1356,7 +1365,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * IO, plus JVM can create lots of anon VM_EXEC pages, * so we ignore them here. */ - if ((vm_flags & VM_EXEC) && !PageAnon(page)) { + if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { list_add(&page->lru, &l_active); continue; } @@ -1557,15 +1566,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, * until we collected @swap_cluster_max pages to scan. */ static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan, - unsigned long swap_cluster_max) + unsigned long *nr_saved_scan) { unsigned long nr; *nr_saved_scan += nr_to_scan; nr = *nr_saved_scan; - if (nr >= swap_cluster_max) + if (nr >= SWAP_CLUSTER_MAX) *nr_saved_scan = 0; else nr = 0; @@ -1584,7 +1592,7 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; - unsigned long swap_cluster_max = sc->swap_cluster_max; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); int noswap = 0; @@ -1606,15 +1614,15 @@ static void shrink_zone(int priority, struct zone *zone, scan = (scan * percent[file]) / 100; } nr[l] = nr_scan_try_batch(scan, - &reclaim_stat->nr_saved_scan[l], - swap_cluster_max); + &reclaim_stat->nr_saved_scan[l]); } while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { if (nr[l]) { - nr_to_scan = min(nr[l], swap_cluster_max); + nr_to_scan = min_t(unsigned long, + nr[l], SWAP_CLUSTER_MAX); nr[l] -= nr_to_scan; nr_reclaimed += shrink_list(l, nr_to_scan, @@ -1629,8 +1637,7 @@ static void shrink_zone(int priority, struct zone *zone, * with multiple processes reclaiming pages, the total * freeing target can get unreasonably large. */ - if (nr_reclaimed > swap_cluster_max && - priority < DEF_PRIORITY && !current_is_kswapd()) + if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } @@ -1709,10 +1716,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist, * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this - * caller can't do much about. We kick pdflush and take explicit naps in the - * hope that some of these pages can be written. But if the allocating task - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed @@ -1728,6 +1735,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + unsigned long writeback_threshold; delayacct_freepages_start(); @@ -1763,7 +1771,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } } total_scanned += sc->nr_scanned; - if (sc->nr_reclaimed >= sc->swap_cluster_max) { + if (sc->nr_reclaimed >= sc->nr_to_reclaim) { ret = sc->nr_reclaimed; goto out; } @@ -1775,14 +1783,15 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc->swap_cluster_max + - sc->swap_cluster_max / 2) { + writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; + if (total_scanned > writeback_threshold) { wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } /* Take a nap, wait for some writeback to complete */ - if (sc->nr_scanned && priority < DEF_PRIORITY - 2) + if (!sc->hibernation_mode && sc->nr_scanned && + priority < DEF_PRIORITY - 2) congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? don't OOM, then */ @@ -1821,7 +1830,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, - .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, .swappiness = vm_swappiness, @@ -1845,7 +1854,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, .mem_cgroup = mem, @@ -1879,7 +1887,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, @@ -1894,6 +1902,30 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +/* is kswapd sleeping prematurely? */ +static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +{ + int i; + + /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ + if (remaining) + return 1; + + /* If after HZ/10, a zone is below the high mark, it's premature */ + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), + 0, 0)) + return 1; + } + + return 0; +} + /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). @@ -1926,7 +1958,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .gfp_mask = GFP_KERNEL, .may_unmap = 1, .may_swap = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, + /* + * kswapd doesn't want to be bailed out while reclaim. because + * we want to put equal scanning pressure on each zone. + */ + .nr_to_reclaim = ULONG_MAX, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, @@ -1951,6 +1987,7 @@ loop_again: for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; + int has_under_min_watermark_zone = 0; /* The swap token gets in the way of swapout... */ if (!priority) @@ -2057,6 +2094,15 @@ loop_again: if (total_scanned > SWAP_CLUSTER_MAX * 2 && total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + + /* + * We are still under min water mark. it mean we have + * GFP_ATOMIC allocation failure risk. Hurry up! + */ + if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), + end_zone, 0)) + has_under_min_watermark_zone = 1; + } if (all_zones_ok) break; /* kswapd: all done */ @@ -2064,8 +2110,12 @@ loop_again: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ/10); + if (total_scanned && (priority < DEF_PRIORITY - 2)) { + if (has_under_min_watermark_zone) + count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); + else + congestion_wait(BLK_RW_ASYNC, HZ/10); + } /* * We do this so kswapd doesn't build up large priorities for @@ -2163,6 +2213,7 @@ static int kswapd(void *p) order = 0; for ( ; ; ) { unsigned long new_order; + int ret; prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; @@ -2174,19 +2225,45 @@ static int kswapd(void *p) */ order = new_order; } else { - if (!freezing(current)) - schedule(); + if (!freezing(current) && !kthread_should_stop()) { + long remaining = 0; + + /* Try to sleep for a short interval */ + if (!sleeping_prematurely(pgdat, order, remaining)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a + * premature sleep. If not, then go fully + * to sleep until explicitly woken up + */ + if (!sleeping_prematurely(pgdat, order, remaining)) + schedule(); + else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + } order = pgdat->kswapd_max_order; } finish_wait(&pgdat->kswapd_wait, &wait); - if (!try_to_freeze()) { - /* We can speed up thawing tasks if we don't call - * balance_pgdat after returning from the refrigerator - */ + ret = try_to_freeze(); + if (kthread_should_stop()) + break; + + /* + * We can speed up thawing tasks if we don't call balance_pgdat + * after returning from the refrigerator + */ + if (!ret) balance_pgdat(pgdat, order); - } } return 0; } @@ -2250,148 +2327,43 @@ unsigned long zone_reclaimable_pages(struct zone *zone) #ifdef CONFIG_HIBERNATION /* - * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages - * from LRU lists system-wide, for given pass and priority. - * - * For pass > 3 we also try to shrink the LRU lists that contain a few pages - */ -static void shrink_all_zones(unsigned long nr_pages, int prio, - int pass, struct scan_control *sc) -{ - struct zone *zone; - unsigned long nr_reclaimed = 0; - struct zone_reclaim_stat *reclaim_stat; - - for_each_populated_zone(zone) { - enum lru_list l; - - if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) - continue; - - for_each_evictable_lru(l) { - enum zone_stat_item ls = NR_LRU_BASE + l; - unsigned long lru_pages = zone_page_state(zone, ls); - - /* For pass = 0, we don't shrink the active list */ - if (pass == 0 && (l == LRU_ACTIVE_ANON || - l == LRU_ACTIVE_FILE)) - continue; - - reclaim_stat = get_reclaim_stat(zone, sc); - reclaim_stat->nr_saved_scan[l] += - (lru_pages >> prio) + 1; - if (reclaim_stat->nr_saved_scan[l] - >= nr_pages || pass > 3) { - unsigned long nr_to_scan; - - reclaim_stat->nr_saved_scan[l] = 0; - nr_to_scan = min(nr_pages, lru_pages); - nr_reclaimed += shrink_list(l, nr_to_scan, zone, - sc, prio); - if (nr_reclaimed >= nr_pages) { - sc->nr_reclaimed += nr_reclaimed; - return; - } - } - } - } - sc->nr_reclaimed += nr_reclaimed; -} - -/* - * Try to free `nr_pages' of memory, system-wide, and return the number of + * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of * freed pages. * * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ -unsigned long shrink_all_memory(unsigned long nr_pages) +unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { - unsigned long lru_pages, nr_slab; - int pass; struct reclaim_state reclaim_state; struct scan_control sc = { - .gfp_mask = GFP_KERNEL, - .may_unmap = 0, + .gfp_mask = GFP_HIGHUSER_MOVABLE, + .may_swap = 1, + .may_unmap = 1, .may_writepage = 1, + .nr_to_reclaim = nr_to_reclaim, + .hibernation_mode = 1, + .swappiness = vm_swappiness, + .order = 0, .isolate_pages = isolate_pages_global, - .nr_reclaimed = 0, }; + struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + struct task_struct *p = current; + unsigned long nr_reclaimed; - current->reclaim_state = &reclaim_state; - - lru_pages = global_reclaimable_pages(); - nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); - /* If slab caches are huge, it's better to hit them first */ - while (nr_slab >= lru_pages) { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, lru_pages); - if (!reclaim_state.reclaimed_slab) - break; - - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - if (sc.nr_reclaimed >= nr_pages) - goto out; - - nr_slab -= reclaim_state.reclaimed_slab; - } - - /* - * We try to shrink LRUs in 5 passes: - * 0 = Reclaim from inactive_list only - * 1 = Reclaim from active list but don't reclaim mapped - * 2 = 2nd pass of type 1 - * 3 = Reclaim mapped (normal reclaim) - * 4 = 2nd pass of type 3 - */ - for (pass = 0; pass < 5; pass++) { - int prio; - - /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) - sc.may_unmap = 1; - - for (prio = DEF_PRIORITY; prio >= 0; prio--) { - unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; - - sc.nr_scanned = 0; - sc.swap_cluster_max = nr_to_scan; - shrink_all_zones(nr_to_scan, prio, pass, &sc); - if (sc.nr_reclaimed >= nr_pages) - goto out; - - reclaim_state.reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_reclaimable_pages()); - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - if (sc.nr_reclaimed >= nr_pages) - goto out; - - if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(BLK_RW_ASYNC, HZ / 10); - } - } - - /* - * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be - * something in slab caches - */ - if (!sc.nr_reclaimed) { - do { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, - global_reclaimable_pages()); - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - } while (sc.nr_reclaimed < nr_pages && - reclaim_state.reclaimed_slab > 0); - } + p->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(sc.gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); -out: - current->reclaim_state = NULL; + p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; - return sc.nr_reclaimed; + return nr_reclaimed; } #endif /* CONFIG_HIBERNATION */ @@ -2441,6 +2413,17 @@ int kswapd_run(int nid) return ret; } +/* + * Called by memory hotplug when all memory in a node is offlined. + */ +void kswapd_stop(int nid) +{ + struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + + if (kswapd) + kthread_stop(kswapd); +} + static int __init kswapd_init(void) { int nid; @@ -2543,8 +2526,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, - .swap_cluster_max = max_t(unsigned long, nr_pages, - SWAP_CLUSTER_MAX), + .nr_to_reclaim = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, .order = order,