X-Git-Url: http://ftp.safe.ca/?p=safe%2Fjmp%2Flinux-2.6;a=blobdiff_plain;f=mm%2Fvmscan.c;h=c26986c85ce01b1d15a8914065a829890054c210;hp=2c4b945b011f3b3f8dba2913d118aa0789fccd9d;hb=e071041be037eca208b62b84469a06bdfc692bea;hpb=6837765963f1723e80ca97b1fae660f3a60d77df diff --git a/mm/vmscan.c b/mm/vmscan.c index 2c4b945..c26986c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -55,6 +55,11 @@ struct scan_control { /* Number of pages freed so far during a call to shrink_zones() */ unsigned long nr_reclaimed; + /* How many pages shrink_list() should reclaim */ + unsigned long nr_to_reclaim; + + unsigned long hibernation_mode; + /* This context's GFP mask */ gfp_t gfp_mask; @@ -66,12 +71,6 @@ struct scan_control { /* Can pages be swapped as part of reclaim? */ int may_swap; - /* This context's SWAP_CLUSTER_MAX. If freeing memory for - * suspend, we effectively ignore SWAP_CLUSTER_MAX. - * In this context, it doesn't matter that we scan the - * whole list at once. */ - int swap_cluster_max; - int swappiness; int all_unreclaimable; @@ -148,8 +147,8 @@ static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, return &zone->reclaim_stat; } -static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc, - enum lru_list lru) +static unsigned long zone_nr_lru_pages(struct zone *zone, + struct scan_control *sc, enum lru_list lru) { if (!scanning_global_lru(sc)) return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); @@ -286,7 +285,12 @@ static inline int page_mapping_inuse(struct page *page) static inline int is_page_cache_freeable(struct page *page) { - return page_count(page) - !!page_has_private(page) == 2; + /* + * A freeable page cache page is referenced only by the caller + * that isolated the page, the page cache radix tree and + * optional buffer heads at page->private. + */ + return page_count(page) - page_has_private(page) == 2; } static int may_write_to_queue(struct backing_dev_info *bdi) @@ -353,7 +357,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * stalls if we need to run get_block(). We could test * PagePrivate for that. * - * If this process is currently in generic_file_write() against + * If this process is currently in __generic_file_aio_write() against * this page's queue, we can perform writeback even if that * will block. * @@ -361,7 +365,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. - * See swapfile.c:page_queue_congested(). */ if (!is_page_cache_freeable(page)) return PAGE_KEEP; @@ -470,8 +473,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - mem_cgroup_uncharge_swapcache(page, swap); - swap_free(swap); + swapcache_free(swap, page); } else { __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); @@ -532,7 +534,7 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = active + page_is_file_cache(page); + lru = active + page_lru_base_type(page); lru_cache_add_lru(page, lru); } else { /* @@ -541,6 +543,16 @@ redo: */ lru = LRU_UNEVICTABLE; add_page_to_unevictable_list(page); + /* + * When racing with an mlock clearing (page is + * unlocked), make sure that if the other thread does + * not observe our setting of PG_lru and fails + * isolation, we see PG_mlocked cleared below and move + * the page back to the evictable list. + * + * The other side is TestClearPageMlocked(). + */ + smp_mb(); } /* @@ -578,6 +590,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pagevec freed_pvec; int pgactivate = 0; unsigned long nr_reclaimed = 0; + unsigned long vm_flags; cond_resched(); @@ -628,10 +641,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } - referenced = page_referenced(page, 1, sc->mem_cgroup); - /* In active use or really unfreeable? Activate it. */ + referenced = page_referenced(page, 1, + sc->mem_cgroup, &vm_flags); + /* + * In active use or really unfreeable? Activate it. + * If page which have PG_mlocked lost isoltation race, + * try_to_unmap moves it to unevictable list + */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && - referenced && page_mapping_inuse(page)) + referenced && page_mapping_inuse(page) + && !(vm_flags & VM_LOCKED)) goto activate_locked; /* @@ -653,7 +672,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, 0)) { + switch (try_to_unmap(page, TTU_UNMAP)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -815,7 +834,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) return ret; - if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) + if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) return ret; /* @@ -836,7 +855,6 @@ int __isolate_lru_page(struct page *page, int mode, int file) */ ClearPageLRU(page); ret = 0; - mem_cgroup_del_lru(page); } return ret; @@ -884,12 +902,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode, file)) { case 0: list_move(&page->lru, dst); + mem_cgroup_del_lru(page); nr_taken++; break; case -EBUSY: /* else it is being freed elsewhere */ list_move(&page->lru, src); + mem_cgroup_rotate_lru_list(page, page_lru(page)); continue; default: @@ -928,18 +948,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; - switch (__isolate_lru_page(cursor_page, mode, file)) { - case 0: + + /* + * If we don't have enough swap space, reclaiming of + * anon page which don't already have a swap slot is + * pointless. + */ + if (nr_swap_pages <= 0 && PageAnon(cursor_page) && + !PageSwapCache(cursor_page)) + continue; + + if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); + mem_cgroup_del_lru(cursor_page); nr_taken++; scan++; - break; - - case -EBUSY: - /* else it is being freed elsewhere */ - list_move(&cursor_page->lru, src); - default: - break; /* ! on LRU or wrong list */ } } } @@ -961,7 +984,7 @@ static unsigned long isolate_pages_global(unsigned long nr, if (file) lru += LRU_FILE; return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, - mode, !!file); + mode, file); } /* @@ -976,7 +999,7 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { - lru = page_is_file_cache(page); + lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); @@ -1034,6 +1057,31 @@ int isolate_lru_page(struct page *page) } /* + * Are there way too many processes in the direct reclaim path already? + */ +static int too_many_isolated(struct zone *zone, int file, + struct scan_control *sc) +{ + unsigned long inactive, isolated; + + if (current_is_kswapd()) + return 0; + + if (!scanning_global_lru(sc)) + return 0; + + if (file) { + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + isolated = zone_page_state(zone, NR_ISOLATED_FILE); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_ANON); + } + + return isolated > inactive; +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -1048,6 +1096,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); int lumpy_reclaim = 0; + while (unlikely(too_many_isolated(zone, file, sc))) { + congestion_wait(BLK_RW_ASYNC, HZ/10); + + /* We are about to die and free our memory. Return now. */ + if (fatal_signal_pending(current)) + return SWAP_CLUSTER_MAX; + } + /* * If we need a large contiguous chunk of memory, or have * trouble getting a small set of contiguous pages, we @@ -1072,10 +1128,26 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_active; unsigned int count[NR_LRU_LISTS] = { 0, }; int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; + unsigned long nr_anon; + unsigned long nr_file; - nr_taken = sc->isolate_pages(sc->swap_cluster_max, + nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX, &page_list, &nr_scan, sc->order, mode, zone, sc->mem_cgroup, 0, file); + + if (scanning_global_lru(sc)) { + zone->pages_scanned += nr_scan; + if (current_is_kswapd()) + __count_zone_vm_events(PGSCAN_KSWAPD, zone, + nr_scan); + else + __count_zone_vm_events(PGSCAN_DIRECT, zone, + nr_scan); + } + + if (nr_taken == 0) + goto done; + nr_active = clear_active_flags(&page_list, count); __count_vm_events(PGDEACTIVATE, nr_active); @@ -1088,13 +1160,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __mod_zone_page_state(zone, NR_INACTIVE_ANON, -count[LRU_INACTIVE_ANON]); - if (scanning_global_lru(sc)) - zone->pages_scanned += nr_scan; + nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); - reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; - reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; - reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; - reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + reclaim_stat->recent_scanned[0] += nr_anon; + reclaim_stat->recent_scanned[1] += nr_file; spin_unlock_irq(&zone->lru_lock); @@ -1109,7 +1181,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, */ if (nr_freed < nr_taken && !current_is_kswapd() && lumpy_reclaim) { - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The attempt at page out may have made some @@ -1123,18 +1195,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, } nr_reclaimed += nr_freed; + local_irq_disable(); - if (current_is_kswapd()) { - __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); + if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_freed); - } else if (scanning_global_lru(sc)) - __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); - __count_zone_vm_events(PGSTEAL, zone, nr_freed); - if (nr_taken == 0) - goto done; - spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. @@ -1153,8 +1219,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, SetPageLRU(page); lru = page_lru(page); add_page_to_lru_list(zone, page, lru); - if (PageActive(page)) { - int file = !!page_is_file_cache(page); + if (is_active_lru(lru)) { + int file = is_file_lru(lru); reclaim_stat->recent_rotated[file]++; } if (!pagevec_add(&pvec, page)) { @@ -1163,10 +1229,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, spin_lock_irq(&zone->lru_lock); } } + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + } while (nr_scanned < max_scan); - spin_unlock(&zone->lru_lock); + done: - local_irq_enable(); + spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); return nr_reclaimed; } @@ -1203,22 +1272,55 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) * But we had to alter page->flags anyway. */ +static void move_active_pages_to_lru(struct zone *zone, + struct list_head *list, + enum lru_list lru) +{ + unsigned long pgmoved = 0; + struct pagevec pvec; + struct page *page; + + pagevec_init(&pvec, 1); + + while (!list_empty(list)) { + page = lru_to_page(list); + + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_add_lru_list(page, lru); + pgmoved++; + + if (!pagevec_add(&pvec, page) || list_empty(list)) { + spin_unlock_irq(&zone->lru_lock); + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) + __count_vm_events(PGDEACTIVATE, pgmoved); +} static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) { - unsigned long pgmoved; + unsigned long nr_taken; unsigned long pgscanned; + unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct pagevec pvec; - enum lru_list lru; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + unsigned long nr_rotated = 0; lru_add_drain(); spin_lock_irq(&zone->lru_lock); - pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, + nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE, zone, sc->mem_cgroup, 1, file); /* @@ -1228,15 +1330,16 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, if (scanning_global_lru(sc)) { zone->pages_scanned += pgscanned; } - reclaim_stat->recent_scanned[!!file] += pgmoved; + reclaim_stat->recent_scanned[file] += nr_taken; + __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) - __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); + __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); else - __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); spin_unlock_irq(&zone->lru_lock); - pgmoved = 0; /* count referenced (mapping) mapped pages */ while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); @@ -1249,54 +1352,45 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup)) - pgmoved++; + page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { + nr_rotated++; + /* + * Identify referenced, file-backed active pages and + * give them one more trip around the active list. So + * that executable code get better chances to stay in + * memory under moderate memory pressure. Anon pages + * are not likely to be evicted by use-once streaming + * IO, plus JVM can create lots of anon VM_EXEC pages, + * so we ignore them here. + */ + if ((vm_flags & VM_EXEC) && page_is_file_cache(page)) { + list_add(&page->lru, &l_active); + continue; + } + } + ClearPageActive(page); /* we are de-activating */ list_add(&page->lru, &l_inactive); } /* - * Move the pages to the [file or anon] inactive list. + * Move pages back to the lru list. */ - pagevec_init(&pvec, 1); - lru = LRU_BASE + file * LRU_FILE; - spin_lock_irq(&zone->lru_lock); /* - * Count referenced pages from currently used mappings as - * rotated, even though they are moved to the inactive list. - * This helps balance scan pressure between file and anonymous - * pages in get_scan_ratio. + * Count referenced pages from currently used mappings as rotated, + * even though only some of them are actually re-activated. This + * helps balance scan pressure between file and anonymous pages in + * get_scan_ratio. */ - reclaim_stat->recent_rotated[!!file] += pgmoved; + reclaim_stat->recent_rotated[file] += nr_rotated; - pgmoved = 0; /* count pages moved to inactive list */ - while (!list_empty(&l_inactive)) { - page = lru_to_page(&l_inactive); - prefetchw_prev_lru_page(page, &l_inactive, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - ClearPageActive(page); - - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_add_lru_list(page, lru); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - __count_zone_vm_events(PGREFILL, zone, pgscanned); - __count_vm_events(PGDEACTIVATE, pgmoved); + move_active_pages_to_lru(zone, &l_active, + LRU_ACTIVE + file * LRU_FILE); + move_active_pages_to_lru(zone, &l_inactive, + LRU_BASE + file * LRU_FILE); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - pagevec_release(&pvec); } static int inactive_anon_is_low_global(struct zone *zone) @@ -1367,20 +1461,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) return low; } +static int inactive_list_is_low(struct zone *zone, struct scan_control *sc, + int file) +{ + if (file) + return inactive_file_is_low(zone, sc); + else + return inactive_anon_is_low(zone, sc); +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { - shrink_active_list(nr_to_scan, zone, sc, priority, file); + if (is_active_lru(lru)) { + if (inactive_list_is_low(zone, sc, file)) + shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } - if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { - shrink_active_list(nr_to_scan, zone, sc, priority, file); - return 0; - } return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); } @@ -1401,17 +1501,10 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long ap, fp; struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); - /* If we have no swap space, do not bother scanning anon pages. */ - if (!sc->may_swap || (nr_swap_pages <= 0)) { - percent[0] = 0; - percent[1] = 100; - return; - } - - anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + - zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); - file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + - zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); if (scanning_global_lru(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); @@ -1477,15 +1570,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, * until we collected @swap_cluster_max pages to scan. */ static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, - unsigned long *nr_saved_scan, - unsigned long swap_cluster_max) + unsigned long *nr_saved_scan) { unsigned long nr; *nr_saved_scan += nr_to_scan; nr = *nr_saved_scan; - if (nr >= swap_cluster_max) + if (nr >= SWAP_CLUSTER_MAX) *nr_saved_scan = 0; else nr = 0; @@ -1504,32 +1596,37 @@ static void shrink_zone(int priority, struct zone *zone, unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; - unsigned long swap_cluster_max = sc->swap_cluster_max; + unsigned long nr_to_reclaim = sc->nr_to_reclaim; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + int noswap = 0; - get_scan_ratio(zone, sc, percent); + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (nr_swap_pages <= 0)) { + noswap = 1; + percent[0] = 0; + percent[1] = 100; + } else + get_scan_ratio(zone, sc, percent); for_each_evictable_lru(l) { int file = is_file_lru(l); unsigned long scan; - scan = zone_nr_pages(zone, sc, l); - if (priority) { + scan = zone_nr_lru_pages(zone, sc, l); + if (priority || noswap) { scan >>= priority; scan = (scan * percent[file]) / 100; } - if (scanning_global_lru(sc)) - nr[l] = nr_scan_try_batch(scan, - &zone->lru[l].nr_saved_scan, - swap_cluster_max); - else - nr[l] = scan; + nr[l] = nr_scan_try_batch(scan, + &reclaim_stat->nr_saved_scan[l]); } while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { if (nr[l]) { - nr_to_scan = min(nr[l], swap_cluster_max); + nr_to_scan = min_t(unsigned long, + nr[l], SWAP_CLUSTER_MAX); nr[l] -= nr_to_scan; nr_reclaimed += shrink_list(l, nr_to_scan, @@ -1544,8 +1641,7 @@ static void shrink_zone(int priority, struct zone *zone, * with multiple processes reclaiming pages, the total * freeing target can get unreasonably large. */ - if (nr_reclaimed > swap_cluster_max && - priority < DEF_PRIORITY && !current_is_kswapd()) + if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY) break; } @@ -1624,10 +1720,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist, * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this - * caller can't do much about. We kick pdflush and take explicit naps in the - * hope that some of these pages can be written. But if the allocating task - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed @@ -1643,6 +1739,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, struct zoneref *z; struct zone *zone; enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); + unsigned long writeback_threshold; delayacct_freepages_start(); @@ -1657,7 +1754,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - lru_pages += zone_lru_pages(zone); + lru_pages += zone_reclaimable_pages(zone); } } @@ -1678,7 +1775,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, } } total_scanned += sc->nr_scanned; - if (sc->nr_reclaimed >= sc->swap_cluster_max) { + if (sc->nr_reclaimed >= sc->nr_to_reclaim) { ret = sc->nr_reclaimed; goto out; } @@ -1690,15 +1787,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * that's undesirable in laptop mode, where we *want* lumpy * writeout. So in laptop mode, write out the whole world. */ - if (total_scanned > sc->swap_cluster_max + - sc->swap_cluster_max / 2) { - wakeup_pdflush(laptop_mode ? 0 : total_scanned); + writeback_threshold = sc->nr_to_reclaim + sc->nr_to_reclaim / 2; + if (total_scanned > writeback_threshold) { + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } /* Take a nap, wait for some writeback to complete */ - if (sc->nr_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + if (!sc->hibernation_mode && sc->nr_scanned && + priority < DEF_PRIORITY - 2) + congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? don't OOM, then */ if (!sc->all_unreclaimable && scanning_global_lru(sc)) @@ -1736,7 +1834,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, - .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .may_unmap = 1, .may_swap = 1, .swappiness = vm_swappiness, @@ -1751,23 +1849,55 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, + gfp_t gfp_mask, bool noswap, + unsigned int swappiness, + struct zone *zone, int nid) +{ + struct scan_control sc = { + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, + .swappiness = swappiness, + .order = 0, + .mem_cgroup = mem, + .isolate_pages = mem_cgroup_isolate_pages, + }; + nodemask_t nm = nodemask_of_node(nid); + + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + sc.nodemask = &nm; + sc.nr_reclaimed = 0; + sc.nr_scanned = 0; + /* + * NOTE: Although we can get the priority field, using it + * here is not a good idea, since it limits the pages we can scan. + * if we don't reclaim here, the shrink_zone from balance_pgdat + * will pick up pages from other mem cgroup's as well. We hack + * the priority and make it zero. + */ + shrink_zone(0, zone, &sc); + return sc.nr_reclaimed; +} + unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, gfp_t gfp_mask, bool noswap, unsigned int swappiness) { + struct zonelist *zonelist; struct scan_control sc = { .may_writepage = !laptop_mode, .may_unmap = 1, .may_swap = !noswap, - .swap_cluster_max = SWAP_CLUSTER_MAX, + .nr_to_reclaim = SWAP_CLUSTER_MAX, .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, .nodemask = NULL, /* we don't care the placement */ }; - struct zonelist *zonelist; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -1776,6 +1906,33 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, } #endif +/* is kswapd sleeping prematurely? */ +static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining) +{ + int i; + + /* If a direct reclaimer woke kswapd within HZ/10, it's premature */ + if (remaining) + return 1; + + /* If after HZ/10, a zone is below the high mark, it's premature */ + for (i = 0; i < pgdat->nr_zones; i++) { + struct zone *zone = pgdat->node_zones + i; + + if (!populated_zone(zone)) + continue; + + if (zone_is_all_unreclaimable(zone)) + continue; + + if (!zone_watermark_ok(zone, order, high_wmark_pages(zone), + 0, 0)) + return 1; + } + + return 0; +} + /* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). @@ -1808,7 +1965,11 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) .gfp_mask = GFP_KERNEL, .may_unmap = 1, .may_swap = 1, - .swap_cluster_max = SWAP_CLUSTER_MAX, + /* + * kswapd doesn't want to be bailed out while reclaim. because + * we want to put equal scanning pressure on each zone. + */ + .nr_to_reclaim = ULONG_MAX, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, @@ -1833,6 +1994,7 @@ loop_again: for (priority = DEF_PRIORITY; priority >= 0; priority--) { int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; + int has_under_min_watermark_zone = 0; /* The swap token gets in the way of swapout... */ if (!priority) @@ -1874,7 +2036,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_lru_pages(zone); + lru_pages += zone_reclaimable_pages(zone); } /* @@ -1889,6 +2051,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; + int nid, zid; if (!populated_zone(zone)) continue; @@ -1903,6 +2066,15 @@ loop_again: temp_priority[i] = priority; sc.nr_scanned = 0; note_zone_scanning_priority(zone, priority); + + nid = pgdat->node_id; + zid = zone_idx(zone); + /* + * Call soft limit reclaim before calling shrink_zone. + * For now we ignore the return value + */ + mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, + nid, zid); /* * We put equal pressure on every zone, unless one * zone has way too many pages free already. @@ -1918,7 +2090,7 @@ loop_again: if (zone_is_all_unreclaimable(zone)) continue; if (nr_slab == 0 && zone->pages_scanned >= - (zone_lru_pages(zone) * 6)) + (zone_reclaimable_pages(zone) * 6)) zone_set_flag(zone, ZONE_ALL_UNRECLAIMABLE); /* @@ -1929,6 +2101,15 @@ loop_again: if (total_scanned > SWAP_CLUSTER_MAX * 2 && total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; + + /* + * We are still under min water mark. it mean we have + * GFP_ATOMIC allocation failure risk. Hurry up! + */ + if (!zone_watermark_ok(zone, order, min_wmark_pages(zone), + end_zone, 0)) + has_under_min_watermark_zone = 1; + } if (all_zones_ok) break; /* kswapd: all done */ @@ -1936,8 +2117,12 @@ loop_again: * OK, kswapd is getting into trouble. Take a nap, then take * another pass across the zones. */ - if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + if (total_scanned && (priority < DEF_PRIORITY - 2)) { + if (has_under_min_watermark_zone) + count_vm_event(KSWAPD_SKIP_CONGESTION_WAIT); + else + congestion_wait(BLK_RW_ASYNC, HZ/10); + } /* * We do this so kswapd doesn't build up large priorities for @@ -2035,6 +2220,7 @@ static int kswapd(void *p) order = 0; for ( ; ; ) { unsigned long new_order; + int ret; prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); new_order = pgdat->kswapd_max_order; @@ -2046,19 +2232,45 @@ static int kswapd(void *p) */ order = new_order; } else { - if (!freezing(current)) - schedule(); + if (!freezing(current) && !kthread_should_stop()) { + long remaining = 0; + + /* Try to sleep for a short interval */ + if (!sleeping_prematurely(pgdat, order, remaining)) { + remaining = schedule_timeout(HZ/10); + finish_wait(&pgdat->kswapd_wait, &wait); + prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); + } + + /* + * After a short sleep, check if it was a + * premature sleep. If not, then go fully + * to sleep until explicitly woken up + */ + if (!sleeping_prematurely(pgdat, order, remaining)) + schedule(); + else { + if (remaining) + count_vm_event(KSWAPD_LOW_WMARK_HIT_QUICKLY); + else + count_vm_event(KSWAPD_HIGH_WMARK_HIT_QUICKLY); + } + } order = pgdat->kswapd_max_order; } finish_wait(&pgdat->kswapd_wait, &wait); - if (!try_to_freeze()) { - /* We can speed up thawing tasks if we don't call - * balance_pgdat after returning from the refrigerator - */ + ret = try_to_freeze(); + if (kthread_should_stop()) + break; + + /* + * We can speed up thawing tasks if we don't call balance_pgdat + * after returning from the refrigerator + */ + if (!ret) balance_pgdat(pgdat, order); - } } return 0; } @@ -2085,153 +2297,80 @@ void wakeup_kswapd(struct zone *zone, int order) wake_up_interruptible(&pgdat->kswapd_wait); } -unsigned long global_lru_pages(void) -{ - return global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_INACTIVE_FILE); -} - -#ifdef CONFIG_HIBERNATION /* - * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages - * from LRU lists system-wide, for given pass and priority. - * - * For pass > 3 we also try to shrink the LRU lists that contain a few pages + * The reclaimable count would be mostly accurate. + * The less reclaimable pages may be + * - mlocked pages, which will be moved to unevictable list when encountered + * - mapped pages, which may require several travels to be reclaimed + * - dirty pages, which is not "instantly" reclaimable */ -static void shrink_all_zones(unsigned long nr_pages, int prio, - int pass, struct scan_control *sc) +unsigned long global_reclaimable_pages(void) { - struct zone *zone; - unsigned long nr_reclaimed = 0; + int nr; - for_each_populated_zone(zone) { - enum lru_list l; + nr = global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_FILE); - if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) - continue; + if (nr_swap_pages > 0) + nr += global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_INACTIVE_ANON); - for_each_evictable_lru(l) { - enum zone_stat_item ls = NR_LRU_BASE + l; - unsigned long lru_pages = zone_page_state(zone, ls); + return nr; +} - /* For pass = 0, we don't shrink the active list */ - if (pass == 0 && (l == LRU_ACTIVE_ANON || - l == LRU_ACTIVE_FILE)) - continue; +unsigned long zone_reclaimable_pages(struct zone *zone) +{ + int nr; - zone->lru[l].nr_saved_scan += (lru_pages >> prio) + 1; - if (zone->lru[l].nr_saved_scan >= nr_pages || pass > 3) { - unsigned long nr_to_scan; - - zone->lru[l].nr_saved_scan = 0; - nr_to_scan = min(nr_pages, lru_pages); - nr_reclaimed += shrink_list(l, nr_to_scan, zone, - sc, prio); - if (nr_reclaimed >= nr_pages) { - sc->nr_reclaimed += nr_reclaimed; - return; - } - } - } - } - sc->nr_reclaimed += nr_reclaimed; + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (nr_swap_pages > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; } +#ifdef CONFIG_HIBERNATION /* - * Try to free `nr_pages' of memory, system-wide, and return the number of + * Try to free `nr_to_reclaim' of memory, system-wide, and return the number of * freed pages. * * Rather than trying to age LRUs the aim is to preserve the overall * LRU order by reclaiming preferentially * inactive > active > active referenced > active mapped */ -unsigned long shrink_all_memory(unsigned long nr_pages) +unsigned long shrink_all_memory(unsigned long nr_to_reclaim) { - unsigned long lru_pages, nr_slab; - int pass; struct reclaim_state reclaim_state; struct scan_control sc = { - .gfp_mask = GFP_KERNEL, - .may_unmap = 0, + .gfp_mask = GFP_HIGHUSER_MOVABLE, + .may_swap = 1, + .may_unmap = 1, .may_writepage = 1, + .nr_to_reclaim = nr_to_reclaim, + .hibernation_mode = 1, + .swappiness = vm_swappiness, + .order = 0, .isolate_pages = isolate_pages_global, - .nr_reclaimed = 0, }; + struct zonelist * zonelist = node_zonelist(numa_node_id(), sc.gfp_mask); + struct task_struct *p = current; + unsigned long nr_reclaimed; - current->reclaim_state = &reclaim_state; - - lru_pages = global_lru_pages(); - nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); - /* If slab caches are huge, it's better to hit them first */ - while (nr_slab >= lru_pages) { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, lru_pages); - if (!reclaim_state.reclaimed_slab) - break; - - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - if (sc.nr_reclaimed >= nr_pages) - goto out; - - nr_slab -= reclaim_state.reclaimed_slab; - } - - /* - * We try to shrink LRUs in 5 passes: - * 0 = Reclaim from inactive_list only - * 1 = Reclaim from active list but don't reclaim mapped - * 2 = 2nd pass of type 1 - * 3 = Reclaim mapped (normal reclaim) - * 4 = 2nd pass of type 3 - */ - for (pass = 0; pass < 5; pass++) { - int prio; - - /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) - sc.may_unmap = 1; - - for (prio = DEF_PRIORITY; prio >= 0; prio--) { - unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; - - sc.nr_scanned = 0; - sc.swap_cluster_max = nr_to_scan; - shrink_all_zones(nr_to_scan, prio, pass, &sc); - if (sc.nr_reclaimed >= nr_pages) - goto out; - - reclaim_state.reclaimed_slab = 0; - shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_lru_pages()); - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - if (sc.nr_reclaimed >= nr_pages) - goto out; - - if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ / 10); - } - } - - /* - * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be - * something in slab caches - */ - if (!sc.nr_reclaimed) { - do { - reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); - sc.nr_reclaimed += reclaim_state.reclaimed_slab; - } while (sc.nr_reclaimed < nr_pages && - reclaim_state.reclaimed_slab > 0); - } + p->flags |= PF_MEMALLOC; + lockdep_set_current_reclaim_state(sc.gfp_mask); + reclaim_state.reclaimed_slab = 0; + p->reclaim_state = &reclaim_state; + nr_reclaimed = do_try_to_free_pages(zonelist, &sc); -out: - current->reclaim_state = NULL; + p->reclaim_state = NULL; + lockdep_clear_current_reclaim_state(); + p->flags &= ~PF_MEMALLOC; - return sc.nr_reclaimed; + return nr_reclaimed; } #endif /* CONFIG_HIBERNATION */ @@ -2281,6 +2420,17 @@ int kswapd_run(int nid) return ret; } +/* + * Called by memory hotplug when all memory in a node is offlined. + */ +void kswapd_stop(int nid) +{ + struct task_struct *kswapd = NODE_DATA(nid)->kswapd; + + if (kswapd) + kthread_stop(kswapd); +} + static int __init kswapd_init(void) { int nid; @@ -2327,6 +2477,48 @@ int sysctl_min_unmapped_ratio = 1; */ int sysctl_min_slab_ratio = 5; +static inline unsigned long zone_unmapped_file_pages(struct zone *zone) +{ + unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); + unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_FILE); + + /* + * It's possible for there to be more file mapped pages than + * accounted for by the pages on the file LRU lists because + * tmpfs pages accounted for as ANON can also be FILE_MAPPED + */ + return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; +} + +/* Work out how many page cache pages we can reclaim in this reclaim_mode */ +static long zone_pagecache_reclaimable(struct zone *zone) +{ + long nr_pagecache_reclaimable; + long delta = 0; + + /* + * If RECLAIM_SWAP is set, then all file pages are considered + * potentially reclaimable. Otherwise, we have to worry about + * pages like swapcache and zone_unmapped_file_pages() provides + * a better estimate + */ + if (zone_reclaim_mode & RECLAIM_SWAP) + nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); + else + nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); + + /* If we can't clean pages, remove dirty pages from consideration */ + if (!(zone_reclaim_mode & RECLAIM_WRITE)) + delta += zone_page_state(zone, NR_FILE_DIRTY); + + /* Watch for any possible underflows due to delta */ + if (unlikely(delta > nr_pagecache_reclaimable)) + delta = nr_pagecache_reclaimable; + + return nr_pagecache_reclaimable - delta; +} + /* * Try to free up some pages from this zone through reclaim. */ @@ -2341,8 +2533,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .may_swap = 1, - .swap_cluster_max = max_t(unsigned long, nr_pages, - SWAP_CLUSTER_MAX), + .nr_to_reclaim = max_t(unsigned long, nr_pages, + SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, .order = order, @@ -2361,9 +2553,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) > - zone->min_unmapped_pages) { + if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. @@ -2421,20 +2611,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * if less than a specified percentage of the zone is used by * unmapped file backed pages. */ - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages - && zone_page_state(zone, NR_SLAB_RECLAIMABLE) - <= zone->min_slab_pages) - return 0; + if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && + zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) + return ZONE_RECLAIM_FULL; if (zone_is_all_unreclaimable(zone)) - return 0; + return ZONE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) - return 0; + return ZONE_RECLAIM_NOSCAN; /* * Only run zone reclaim on the local zone or on zones that do not @@ -2444,13 +2632,17 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ node_id = zone_to_nid(zone); if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return 0; + return ZONE_RECLAIM_NOSCAN; if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) - return 0; + return ZONE_RECLAIM_NOSCAN; + ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + if (!ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); + return ret; } #endif @@ -2499,7 +2691,7 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone) retry: ClearPageUnevictable(page); if (page_evictable(page, NULL)) { - enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); + enum lru_list l = page_lru_base_type(page); __dec_zone_state(zone, NR_UNEVICTABLE); list_move(&page->lru, &zone->lru[l].list); @@ -2642,10 +2834,10 @@ static void scan_all_zones_unevictable_pages(void) unsigned long scan_unevictable_pages; int scan_unevictable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write && *(unsigned long *)table->data) scan_all_zones_unevictable_pages();