X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=mm%2Fvmscan.c;h=64e438898832371277e8baeb89fd5ebb1f72a14a;hb=a3bafbbbb5ac49355aa35e6a722eac6ef1dff19a;hp=5daf606e0a35cb6111b668a584d52b4de8cd940c;hpb=b555749aac87d7c2637f153e44bd77c7fdf4c65b;p=safe%2Fjmp%2Flinux-2.6 diff --git a/mm/vmscan.c b/mm/vmscan.c index 5daf606..64e43889 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -60,6 +60,9 @@ struct scan_control { int may_writepage; + /* Can mapped pages be reclaimed? */ + int may_unmap; + /* Can pages be swapped as part of reclaim? */ int may_swap; @@ -78,6 +81,12 @@ struct scan_control { /* Which cgroup do we reclaim from */ struct mem_cgroup *mem_cgroup; + /* + * Nodemask of nodes allowed by the caller. If NULL, all nodes + * are scanned. + */ + nodemask_t *nodemask; + /* Pluggable isolate pages callback */ unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, @@ -125,11 +134,30 @@ static LIST_HEAD(shrinker_list); static DECLARE_RWSEM(shrinker_rwsem); #ifdef CONFIG_CGROUP_MEM_RES_CTLR -#define scan_global_lru(sc) (!(sc)->mem_cgroup) +#define scanning_global_lru(sc) (!(sc)->mem_cgroup) #else -#define scan_global_lru(sc) (1) +#define scanning_global_lru(sc) (1) #endif +static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone, + struct scan_control *sc) +{ + if (!scanning_global_lru(sc)) + return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone); + + return &zone->reclaim_stat; +} + +static unsigned long zone_nr_lru_pages(struct zone *zone, + struct scan_control *sc, enum lru_list lru) +{ + if (!scanning_global_lru(sc)) + return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru); + + return zone_page_state(zone, NR_LRU_BASE + lru); +} + + /* * Add a shrinker callback to be called from the vm */ @@ -195,8 +223,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask, do_div(delta, lru_pages + 1); shrinker->nr += delta; if (shrinker->nr < 0) { - printk(KERN_ERR "%s: nr=%ld\n", - __func__, shrinker->nr); + printk(KERN_ERR "shrink_slab: %pF negative objects to " + "delete nr=%ld\n", + shrinker->shrink, shrinker->nr); shrinker->nr = max_pass; } @@ -257,7 +286,12 @@ static inline int page_mapping_inuse(struct page *page) static inline int is_page_cache_freeable(struct page *page) { - return page_count(page) - !!PagePrivate(page) == 2; + /* + * A freeable page cache page is referenced only by the caller + * that isolated the page, the page cache radix tree and + * optional buffer heads at page->private. + */ + return page_count(page) - page_has_private(page) == 2; } static int may_write_to_queue(struct backing_dev_info *bdi) @@ -332,7 +366,6 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * block, for some throttling. This happens by accident, because * swap_backing_dev_info is bust: it doesn't reflect the * congestion state of the swapdevs. Easy to fix, if needed. - * See swapfile.c:page_queue_congested(). */ if (!is_page_cache_freeable(page)) return PAGE_KEEP; @@ -341,7 +374,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping, * Some data journaling orphaned pages can have * page->mapping == NULL while being dirty with clean buffers. */ - if (PagePrivate(page)) { + if (page_has_private(page)) { if (try_to_free_buffers(page)) { ClearPageDirty(page); printk("%s: orphaned page\n", __func__); @@ -441,10 +474,11 @@ static int __remove_mapping(struct address_space *mapping, struct page *page) swp_entry_t swap = { .val = page_private(page) }; __delete_from_swap_cache(page); spin_unlock_irq(&mapping->tree_lock); - swap_free(swap); + swapcache_free(swap, page); } else { __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); + mem_cgroup_uncharge_cache_page(page); } return 1; @@ -483,7 +517,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) * * lru_lock must not be held, interrupts must be enabled. */ -#ifdef CONFIG_UNEVICTABLE_LRU void putback_lru_page(struct page *page) { int lru; @@ -502,7 +535,7 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = active + page_is_file_cache(page); + lru = active + page_lru_base_type(page); lru_cache_add_lru(page, lru); } else { /* @@ -512,7 +545,6 @@ redo: lru = LRU_UNEVICTABLE; add_page_to_unevictable_list(page); } - mem_cgroup_move_lists(page, lru); /* * page's status can change while we move it among lru. If an evictable @@ -538,21 +570,6 @@ redo: put_page(page); /* drop ref from isolate */ } -#else /* CONFIG_UNEVICTABLE_LRU */ - -void putback_lru_page(struct page *page) -{ - int lru; - VM_BUG_ON(PageLRU(page)); - - lru = !!TestClearPageActive(page) + page_is_file_cache(page); - lru_cache_add_lru(page, lru); - mem_cgroup_move_lists(page, lru); - put_page(page); -} -#endif /* CONFIG_UNEVICTABLE_LRU */ - - /* * shrink_page_list() returns the number of reclaimed pages */ @@ -564,6 +581,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct pagevec freed_pvec; int pgactivate = 0; unsigned long nr_reclaimed = 0; + unsigned long vm_flags; cond_resched(); @@ -589,7 +607,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (unlikely(!page_evictable(page, NULL))) goto cull_mlocked; - if (!sc->may_swap && page_mapped(page)) + if (!sc->may_unmap && page_mapped(page)) goto keep_locked; /* Double the slab pressure for mapped and swapcache pages */ @@ -614,10 +632,16 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto keep_locked; } - referenced = page_referenced(page, 1, sc->mem_cgroup); - /* In active use or really unfreeable? Activate it. */ + referenced = page_referenced(page, 1, + sc->mem_cgroup, &vm_flags); + /* + * In active use or really unfreeable? Activate it. + * If page which have PG_mlocked lost isoltation race, + * try_to_unmap moves it to unevictable list + */ if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && - referenced && page_mapping_inuse(page)) + referenced && page_mapping_inuse(page) + && !(vm_flags & VM_LOCKED)) goto activate_locked; /* @@ -639,7 +663,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * processes. Try to unmap it here. */ if (page_mapped(page) && mapping) { - switch (try_to_unmap(page, 0)) { + switch (try_to_unmap(page, TTU_UNMAP)) { case SWAP_FAIL: goto activate_locked; case SWAP_AGAIN: @@ -703,7 +727,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * process address space (page_count == 1) it can be freed. * Otherwise, leave the page on the LRU so it is swappable. */ - if (PagePrivate(page)) { + if (page_has_private(page)) { if (!try_to_release_page(page, sc->gfp_mask)) goto activate_locked; if (!mapping && page_count(page) == 1) { @@ -801,7 +825,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) return ret; - if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) + if (mode != ISOLATE_BOTH && page_is_file_cache(page) != file) return ret; /* @@ -813,6 +837,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) return ret; ret = -EBUSY; + if (likely(get_page_unless_zero(page))) { /* * Be careful not to clear PageLRU until after we're @@ -868,12 +893,14 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, switch (__isolate_lru_page(page, mode, file)) { case 0: list_move(&page->lru, dst); + mem_cgroup_del_lru(page); nr_taken++; break; case -EBUSY: /* else it is being freed elsewhere */ list_move(&page->lru, src); + mem_cgroup_rotate_lru_list(page, page_lru(page)); continue; default: @@ -912,18 +939,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; - switch (__isolate_lru_page(cursor_page, mode, file)) { - case 0: + + /* + * If we don't have enough swap space, reclaiming of + * anon page which don't already have a swap slot is + * pointless. + */ + if (nr_swap_pages <= 0 && PageAnon(cursor_page) && + !PageSwapCache(cursor_page)) + continue; + + if (__isolate_lru_page(cursor_page, mode, file) == 0) { list_move(&cursor_page->lru, dst); + mem_cgroup_del_lru(cursor_page); nr_taken++; scan++; - break; - - case -EBUSY: - /* else it is being freed elsewhere */ - list_move(&cursor_page->lru, src); - default: - break; /* ! on LRU or wrong list */ } } } @@ -945,7 +975,7 @@ static unsigned long isolate_pages_global(unsigned long nr, if (file) lru += LRU_FILE; return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, - mode, !!file); + mode, file); } /* @@ -960,7 +990,7 @@ static unsigned long clear_active_flags(struct list_head *page_list, struct page *page; list_for_each_entry(page, page_list, lru) { - lru = page_is_file_cache(page); + lru = page_lru_base_type(page); if (PageActive(page)) { lru += LRU_ACTIVE; ClearPageActive(page); @@ -1018,6 +1048,31 @@ int isolate_lru_page(struct page *page) } /* + * Are there way too many processes in the direct reclaim path already? + */ +static int too_many_isolated(struct zone *zone, int file, + struct scan_control *sc) +{ + unsigned long inactive, isolated; + + if (current_is_kswapd()) + return 0; + + if (!scanning_global_lru(sc)) + return 0; + + if (file) { + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + isolated = zone_page_state(zone, NR_ISOLATED_FILE); + } else { + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + isolated = zone_page_state(zone, NR_ISOLATED_ANON); + } + + return isolated > inactive; +} + +/* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ @@ -1029,6 +1084,28 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, struct pagevec pvec; unsigned long nr_scanned = 0; unsigned long nr_reclaimed = 0; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + int lumpy_reclaim = 0; + + while (unlikely(too_many_isolated(zone, file, sc))) { + congestion_wait(WRITE, HZ/10); + + /* We are about to die and free our memory. Return now. */ + if (fatal_signal_pending(current)) + return SWAP_CLUSTER_MAX; + } + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + * + * We use the same threshold as pageout congestion_wait below. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + lumpy_reclaim = 1; + else if (sc->order && priority < DEF_PRIORITY - 2) + lumpy_reclaim = 1; pagevec_init(&pvec, 1); @@ -1041,23 +1118,27 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_freed; unsigned long nr_active; unsigned int count[NR_LRU_LISTS] = { 0, }; - int mode = ISOLATE_INACTIVE; - - /* - * If we need a large contiguous chunk of memory, or have - * trouble getting a small set of contiguous pages, we - * will reclaim both active and inactive pages. - * - * We use the same threshold as pageout congestion_wait below. - */ - if (sc->order > PAGE_ALLOC_COSTLY_ORDER) - mode = ISOLATE_BOTH; - else if (sc->order && priority < DEF_PRIORITY - 2) - mode = ISOLATE_BOTH; + int mode = lumpy_reclaim ? ISOLATE_BOTH : ISOLATE_INACTIVE; + unsigned long nr_anon; + unsigned long nr_file; nr_taken = sc->isolate_pages(sc->swap_cluster_max, &page_list, &nr_scan, sc->order, mode, zone, sc->mem_cgroup, 0, file); + + if (scanning_global_lru(sc)) { + zone->pages_scanned += nr_scan; + if (current_is_kswapd()) + __count_zone_vm_events(PGSCAN_KSWAPD, zone, + nr_scan); + else + __count_zone_vm_events(PGSCAN_DIRECT, zone, + nr_scan); + } + + if (nr_taken == 0) + goto done; + nr_active = clear_active_flags(&page_list, count); __count_vm_events(PGDEACTIVATE, nr_active); @@ -1070,13 +1151,16 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __mod_zone_page_state(zone, NR_INACTIVE_ANON, -count[LRU_INACTIVE_ANON]); - if (scan_global_lru(sc)) { - zone->pages_scanned += nr_scan; - zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; - zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; - zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; - zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; - } + nr_anon = count[LRU_ACTIVE_ANON] + count[LRU_INACTIVE_ANON]; + nr_file = count[LRU_ACTIVE_FILE] + count[LRU_INACTIVE_FILE]; + __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file); + + reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; + reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; + reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; + reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + spin_unlock_irq(&zone->lru_lock); nr_scanned += nr_scan; @@ -1089,8 +1173,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * but that should be acceptable to the caller */ if (nr_freed < nr_taken && !current_is_kswapd() && - sc->order > PAGE_ALLOC_COSTLY_ORDER) { - congestion_wait(WRITE, HZ/10); + lumpy_reclaim) { + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * The attempt at page out may have made some @@ -1104,18 +1188,12 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, } nr_reclaimed += nr_freed; + local_irq_disable(); - if (current_is_kswapd()) { - __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); + if (current_is_kswapd()) __count_vm_events(KSWAPD_STEAL, nr_freed); - } else if (scan_global_lru(sc)) - __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); - __count_zone_vm_events(PGSTEAL, zone, nr_freed); - if (nr_taken == 0) - goto done; - spin_lock(&zone->lru_lock); /* * Put back any unfreeable pages. @@ -1134,10 +1212,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, SetPageLRU(page); lru = page_lru(page); add_page_to_lru_list(zone, page, lru); - mem_cgroup_move_lists(page, lru); - if (PageActive(page) && scan_global_lru(sc)) { - int file = !!page_is_file_cache(page); - zone->recent_rotated[file]++; + if (is_active_lru(lru)) { + int file = is_file_lru(lru); + reclaim_stat->recent_rotated[file]++; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1145,10 +1222,13 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, spin_lock_irq(&zone->lru_lock); } } + __mod_zone_page_state(zone, NR_ISOLATED_ANON, -nr_anon); + __mod_zone_page_state(zone, NR_ISOLATED_FILE, -nr_file); + } while (nr_scanned < max_scan); - spin_unlock(&zone->lru_lock); + done: - local_irq_enable(); + spin_unlock_irq(&zone->lru_lock); pagevec_release(&pvec); return nr_reclaimed; } @@ -1185,40 +1265,74 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) * But we had to alter page->flags anyway. */ +static void move_active_pages_to_lru(struct zone *zone, + struct list_head *list, + enum lru_list lru) +{ + unsigned long pgmoved = 0; + struct pagevec pvec; + struct page *page; + + pagevec_init(&pvec, 1); + + while (!list_empty(list)) { + page = lru_to_page(list); + + VM_BUG_ON(PageLRU(page)); + SetPageLRU(page); + + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_add_lru_list(page, lru); + pgmoved++; + + if (!pagevec_add(&pvec, page) || list_empty(list)) { + spin_unlock_irq(&zone->lru_lock); + if (buffer_heads_over_limit) + pagevec_strip(&pvec); + __pagevec_release(&pvec); + spin_lock_irq(&zone->lru_lock); + } + } + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); + if (!is_active_lru(lru)) + __count_vm_events(PGDEACTIVATE, pgmoved); +} static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct scan_control *sc, int priority, int file) { - unsigned long pgmoved; - int pgdeactivate = 0; + unsigned long nr_taken; unsigned long pgscanned; + unsigned long vm_flags; LIST_HEAD(l_hold); /* The pages which were snipped off */ + LIST_HEAD(l_active); LIST_HEAD(l_inactive); struct page *page; - struct pagevec pvec; - enum lru_list lru; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + unsigned long nr_rotated = 0; lru_add_drain(); spin_lock_irq(&zone->lru_lock); - pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, + nr_taken = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE, zone, sc->mem_cgroup, 1, file); /* * zone->pages_scanned is used for detect zone's oom * mem_cgroup remembers nr_scan by itself. */ - if (scan_global_lru(sc)) { + if (scanning_global_lru(sc)) { zone->pages_scanned += pgscanned; - zone->recent_scanned[!!file] += pgmoved; } + reclaim_stat->recent_scanned[file] += nr_taken; + __count_zone_vm_events(PGREFILL, zone, pgscanned); if (file) - __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); + __mod_zone_page_state(zone, NR_ACTIVE_FILE, -nr_taken); else - __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, -nr_taken); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, nr_taken); spin_unlock_irq(&zone->lru_lock); - pgmoved = 0; while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); @@ -1231,65 +1345,113 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, /* page_referenced clears PageReferenced */ if (page_mapping_inuse(page) && - page_referenced(page, 0, sc->mem_cgroup)) - pgmoved++; + page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) { + nr_rotated++; + /* + * Identify referenced, file-backed active pages and + * give them one more trip around the active list. So + * that executable code get better chances to stay in + * memory under moderate memory pressure. Anon pages + * are not likely to be evicted by use-once streaming + * IO, plus JVM can create lots of anon VM_EXEC pages, + * so we ignore them here. + */ + if ((vm_flags & VM_EXEC) && !PageAnon(page)) { + list_add(&page->lru, &l_active); + continue; + } + } + ClearPageActive(page); /* we are de-activating */ list_add(&page->lru, &l_inactive); } /* - * Move the pages to the [file or anon] inactive list. + * Move pages back to the lru list. */ - pagevec_init(&pvec, 1); - pgmoved = 0; - lru = LRU_BASE + file * LRU_FILE; - spin_lock_irq(&zone->lru_lock); /* - * Count referenced pages from currently used mappings as - * rotated, even though they are moved to the inactive list. - * This helps balance scan pressure between file and anonymous - * pages in get_scan_ratio. + * Count referenced pages from currently used mappings as rotated, + * even though only some of them are actually re-activated. This + * helps balance scan pressure between file and anonymous pages in + * get_scan_ratio. */ - if (scan_global_lru(sc)) - zone->recent_rotated[!!file] += pgmoved; + reclaim_stat->recent_rotated[file] += nr_rotated; - while (!list_empty(&l_inactive)) { - page = lru_to_page(&l_inactive); - prefetchw_prev_lru_page(page, &l_inactive, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - ClearPageActive(page); - - list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_move_lists(page, lru); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - spin_unlock_irq(&zone->lru_lock); - pgdeactivate += pgmoved; - pgmoved = 0; - if (buffer_heads_over_limit) - pagevec_strip(&pvec); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); - pgdeactivate += pgmoved; - if (buffer_heads_over_limit) { - spin_unlock_irq(&zone->lru_lock); - pagevec_strip(&pvec); - spin_lock_irq(&zone->lru_lock); - } - __count_zone_vm_events(PGREFILL, zone, pgscanned); - __count_vm_events(PGDEACTIVATE, pgdeactivate); + move_active_pages_to_lru(zone, &l_active, + LRU_ACTIVE + file * LRU_FILE); + move_active_pages_to_lru(zone, &l_inactive, + LRU_BASE + file * LRU_FILE); + __mod_zone_page_state(zone, NR_ISOLATED_ANON + file, -nr_taken); spin_unlock_irq(&zone->lru_lock); - if (vm_swap_full()) - pagevec_swap_free(&pvec); +} - pagevec_release(&pvec); +static int inactive_anon_is_low_global(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_ANON); + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + + if (inactive * zone->inactive_ratio < active) + return 1; + + return 0; +} + +/** + * inactive_anon_is_low - check if anonymous pages need to be deactivated + * @zone: zone to check + * @sc: scan control of this context + * + * Returns true if the zone does not have enough inactive anon pages, + * meaning some active anon pages need to be deactivated. + */ +static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) +{ + int low; + + if (scanning_global_lru(sc)) + low = inactive_anon_is_low_global(zone); + else + low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); + return low; +} + +static int inactive_file_is_low_global(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_FILE); + inactive = zone_page_state(zone, NR_INACTIVE_FILE); + + return (active > inactive); +} + +/** + * inactive_file_is_low - check if file pages need to be deactivated + * @zone: zone to check + * @sc: scan control of this context + * + * When the system is doing streaming IO, memory pressure here + * ensures that active file pages get deactivated, until more + * than half of the file pages are on the inactive list. + * + * Once we get to that situation, protect the system's working + * set from being evicted by disabling active file page aging. + * + * This uses a different ratio than the anonymous pages, because + * the page cache uses a use-once replacement algorithm. + */ +static int inactive_file_is_low(struct zone *zone, struct scan_control *sc) +{ + int low; + + if (scanning_global_lru(sc)) + low = inactive_file_is_low_global(zone); + else + low = mem_cgroup_inactive_file_is_low(sc->mem_cgroup); + return low; } static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, @@ -1297,13 +1459,12 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, { int file = is_file_lru(lru); - if (lru == LRU_ACTIVE_FILE) { + if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) { shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } - if (lru == LRU_ACTIVE_ANON && - (!scan_global_lru(sc) || inactive_anon_is_low(zone))) { + if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } @@ -1325,25 +1486,22 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long anon, file, free; unsigned long anon_prio, file_prio; unsigned long ap, fp; - - /* If we have no swap space, do not bother scanning anon pages. */ - if (nr_swap_pages <= 0) { - percent[0] = 0; - percent[1] = 100; - return; - } - - anon = zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - file = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - free = zone_page_state(zone, NR_FREE_PAGES); - - /* If we have very few page cache pages, force-scan anon pages. */ - if (unlikely(file + free <= zone->pages_high)) { - percent[0] = 100; - percent[1] = 0; - return; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + + anon = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_lru_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_lru_pages(zone, sc, LRU_INACTIVE_FILE); + + if (scanning_global_lru(sc)) { + free = zone_page_state(zone, NR_FREE_PAGES); + /* If we have very few page cache pages, + force-scan anon pages. */ + if (unlikely(file + free <= high_wmark_pages(zone))) { + percent[0] = 100; + percent[1] = 0; + return; + } } /* @@ -1357,17 +1515,17 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, * * anon in [0], file in [1] */ - if (unlikely(zone->recent_scanned[0] > anon / 4)) { + if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { spin_lock_irq(&zone->lru_lock); - zone->recent_scanned[0] /= 2; - zone->recent_rotated[0] /= 2; + reclaim_stat->recent_scanned[0] /= 2; + reclaim_stat->recent_rotated[0] /= 2; spin_unlock_irq(&zone->lru_lock); } - if (unlikely(zone->recent_scanned[1] > file / 4)) { + if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { spin_lock_irq(&zone->lru_lock); - zone->recent_scanned[1] /= 2; - zone->recent_rotated[1] /= 2; + reclaim_stat->recent_scanned[1] /= 2; + reclaim_stat->recent_rotated[1] /= 2; spin_unlock_irq(&zone->lru_lock); } @@ -1383,17 +1541,37 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, * proportional to the fraction of recently scanned pages on * each list that were recently referenced and in active use. */ - ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); - ap /= zone->recent_rotated[0] + 1; + ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); + ap /= reclaim_stat->recent_rotated[0] + 1; - fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); - fp /= zone->recent_rotated[1] + 1; + fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); + fp /= reclaim_stat->recent_rotated[1] + 1; /* Normalize to percentages */ percent[0] = 100 * ap / (ap + fp + 1); percent[1] = 100 - percent[0]; } +/* + * Smallish @nr_to_scan's are deposited in @nr_saved_scan, + * until we collected @swap_cluster_max pages to scan. + */ +static unsigned long nr_scan_try_batch(unsigned long nr_to_scan, + unsigned long *nr_saved_scan, + unsigned long swap_cluster_max) +{ + unsigned long nr; + + *nr_saved_scan += nr_to_scan; + nr = *nr_saved_scan; + + if (nr >= swap_cluster_max) + *nr_saved_scan = 0; + else + nr = 0; + + return nr; +} /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. @@ -1407,34 +1585,29 @@ static void shrink_zone(int priority, struct zone *zone, enum lru_list l; unsigned long nr_reclaimed = sc->nr_reclaimed; unsigned long swap_cluster_max = sc->swap_cluster_max; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); + int noswap = 0; - get_scan_ratio(zone, sc, percent); + /* If we have no swap space, do not bother scanning anon pages. */ + if (!sc->may_swap || (nr_swap_pages <= 0)) { + noswap = 1; + percent[0] = 0; + percent[1] = 100; + } else + get_scan_ratio(zone, sc, percent); for_each_evictable_lru(l) { - if (scan_global_lru(sc)) { - int file = is_file_lru(l); - int scan; - - scan = zone_page_state(zone, NR_LRU_BASE + l); - if (priority) { - scan >>= priority; - scan = (scan * percent[file]) / 100; - } - zone->lru[l].nr_scan += scan; - nr[l] = zone->lru[l].nr_scan; - if (nr[l] >= swap_cluster_max) - zone->lru[l].nr_scan = 0; - else - nr[l] = 0; - } else { - /* - * This reclaim occurs not because zone memory shortage - * but because memory controller hits its limit. - * Don't modify zone reclaim related data. - */ - nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, - priority, l); + int file = is_file_lru(l); + unsigned long scan; + + scan = zone_nr_lru_pages(zone, sc, l); + if (priority || noswap) { + scan >>= priority; + scan = (scan * percent[file]) / 100; } + nr[l] = nr_scan_try_batch(scan, + &reclaim_stat->nr_saved_scan[l], + swap_cluster_max); } while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || @@ -1467,9 +1640,7 @@ static void shrink_zone(int priority, struct zone *zone, * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); - else if (!scan_global_lru(sc)) + if (inactive_anon_is_low(zone, sc) && nr_swap_pages > 0) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); @@ -1480,11 +1651,13 @@ static void shrink_zone(int priority, struct zone *zone, * try to reclaim pages from zones which will satisfy the caller's allocation * request. * - * We reclaim from a zone even if that zone is over pages_high. Because: + * We reclaim from a zone even if that zone is over high_wmark_pages(zone). + * Because: * a) The caller may be trying to free *extra* pages to satisfy a higher-order * allocation or - * b) The zones may be over pages_high but they must go *over* pages_high to - * satisfy the `incremental min' zone defense algorithm. + * b) The target zone may be at high_wmark_pages(zone) but the lower zones + * must go *over* high_wmark_pages(zone) to satisfy the `incremental min' + * zone defense algorithm. * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. @@ -1497,14 +1670,15 @@ static void shrink_zones(int priority, struct zonelist *zonelist, struct zone *zone; sc->all_unreclaimable = 1; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, + sc->nodemask) { if (!populated_zone(zone)) continue; /* * Take care memory controller reclaiming has small influence * to global LRU. */ - if (scan_global_lru(sc)) { + if (scanning_global_lru(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; note_zone_scanning_priority(zone, priority); @@ -1535,10 +1709,10 @@ static void shrink_zones(int priority, struct zonelist *zonelist, * * If the caller is !__GFP_FS then the probability of a failure is reasonably * high - the zone may be full of dirty or under-writeback pages, which this - * caller can't do much about. We kick pdflush and take explicit naps in the - * hope that some of these pages can be written. But if the allocating task - * holds filesystem locks which prevent writeout this might not work, and the - * allocation attempt will fail. + * caller can't do much about. We kick the writeback threads and take explicit + * naps in the hope that some of these pages can be written. But if the + * allocating task holds filesystem locks which prevent writeout this might not + * work, and the allocation attempt will fail. * * returns: 0, if no pages reclaimed * else, the number of pages reclaimed @@ -1557,18 +1731,18 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, delayacct_freepages_start(); - if (scan_global_lru(sc)) + if (scanning_global_lru(sc)) count_vm_event(ALLOCSTALL); /* * mem_cgroup will not do shrink_slab. */ - if (scan_global_lru(sc)) { + if (scanning_global_lru(sc)) { for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - lru_pages += zone_lru_pages(zone); + lru_pages += zone_reclaimable_pages(zone); } } @@ -1581,7 +1755,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, * Don't shrink slabs when reclaiming memory from * over limit cgroups */ - if (scan_global_lru(sc)) { + if (scanning_global_lru(sc)) { shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; @@ -1603,16 +1777,16 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, */ if (total_scanned > sc->swap_cluster_max + sc->swap_cluster_max / 2) { - wakeup_pdflush(laptop_mode ? 0 : total_scanned); + wakeup_flusher_threads(laptop_mode ? 0 : total_scanned); sc->may_writepage = 1; } /* Take a nap, wait for some writeback to complete */ if (sc->nr_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); } /* top priority shrink_zones still had more to do? don't OOM, then */ - if (!sc->all_unreclaimable && scan_global_lru(sc)) + if (!sc->all_unreclaimable && scanning_global_lru(sc)) ret = sc->nr_reclaimed; out: /* @@ -1625,7 +1799,7 @@ out: if (priority < 0) priority = 0; - if (scan_global_lru(sc)) { + if (scanning_global_lru(sc)) { for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) @@ -1642,17 +1816,19 @@ out: } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, nodemask_t *nodemask) { struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, + .may_unmap = 1, .may_swap = 1, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, .isolate_pages = isolate_pages_global, + .nodemask = nodemask, }; return do_try_to_free_pages(zonelist, &sc); @@ -1660,19 +1836,56 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR +unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem, + gfp_t gfp_mask, bool noswap, + unsigned int swappiness, + struct zone *zone, int nid) +{ + struct scan_control sc = { + .may_writepage = !laptop_mode, + .may_unmap = 1, + .may_swap = !noswap, + .swap_cluster_max = SWAP_CLUSTER_MAX, + .swappiness = swappiness, + .order = 0, + .mem_cgroup = mem, + .isolate_pages = mem_cgroup_isolate_pages, + }; + nodemask_t nm = nodemask_of_node(nid); + + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | + (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); + sc.nodemask = &nm; + sc.nr_reclaimed = 0; + sc.nr_scanned = 0; + /* + * NOTE: Although we can get the priority field, using it + * here is not a good idea, since it limits the pages we can scan. + * if we don't reclaim here, the shrink_zone from balance_pgdat + * will pick up pages from other mem cgroup's as well. We hack + * the priority and make it zero. + */ + shrink_zone(0, zone, &sc); + return sc.nr_reclaimed; +} + unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, - gfp_t gfp_mask) + gfp_t gfp_mask, + bool noswap, + unsigned int swappiness) { + struct zonelist *zonelist; struct scan_control sc = { .may_writepage = !laptop_mode, - .may_swap = 1, + .may_unmap = 1, + .may_swap = !noswap, .swap_cluster_max = SWAP_CLUSTER_MAX, - .swappiness = vm_swappiness, + .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, + .nodemask = NULL, /* we don't care the placement */ }; - struct zonelist *zonelist; sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -1683,7 +1896,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, /* * For kswapd, balance_pgdat() will work across all this node's zones until - * they are all at pages_high. + * they are all at high_wmark_pages(zone). * * Returns the number of pages which were actually freed. * @@ -1696,11 +1909,11 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, * the zone for when the problem goes away. * * kswapd scans the zones in the highmem->normal->dma direction. It skips - * zones which have free_pages > pages_high, but once a zone is found to have - * free_pages <= pages_high, we scan that zone and the lower zones regardless - * of the number of free pages in the lower zones. This interoperates with - * the page allocator fallback scheme to ensure that aging of pages is balanced - * across the zones. + * zones which have free_pages > high_wmark_pages(zone), but once a zone is + * found to have free_pages <= high_wmark_pages(zone), we scan that zone and the + * lower zones regardless of the number of free pages in the lower zones. This + * interoperates with the page allocator fallback scheme to ensure that aging + * of pages is balanced across the zones. */ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) { @@ -1711,6 +1924,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .may_unmap = 1, .may_swap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, @@ -1720,7 +1934,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) }; /* * temp_priority is used to remember the scanning priority at which - * this zone was successfully refilled to free_pages == pages_high. + * this zone was successfully refilled to + * free_pages == high_wmark_pages(zone). */ int temp_priority[MAX_NR_ZONES]; @@ -1761,12 +1976,12 @@ loop_again: * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. */ - if (inactive_anon_is_low(zone)) + if (inactive_anon_is_low(zone, &sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); - if (!zone_watermark_ok(zone, order, zone->pages_high, - 0, 0)) { + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), 0, 0)) { end_zone = i; break; } @@ -1777,7 +1992,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_lru_pages(zone); + lru_pages += zone_reclaimable_pages(zone); } /* @@ -1792,6 +2007,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; int nr_slab; + int nid, zid; if (!populated_zone(zone)) continue; @@ -1800,18 +2016,27 @@ loop_again: priority != DEF_PRIORITY) continue; - if (!zone_watermark_ok(zone, order, zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + high_wmark_pages(zone), end_zone, 0)) all_zones_ok = 0; temp_priority[i] = priority; sc.nr_scanned = 0; note_zone_scanning_priority(zone, priority); + + nid = pgdat->node_id; + zid = zone_idx(zone); + /* + * Call soft limit reclaim before calling shrink_zone. + * For now we ignore the return value + */ + mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask, + nid, zid); /* * We put equal pressure on every zone, unless one * zone has way too many pages free already. */ - if (!zone_watermark_ok(zone, order, 8*zone->pages_high, - end_zone, 0)) + if (!zone_watermark_ok(zone, order, + 8*high_wmark_pages(zone), end_zone, 0)) shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, @@ -1821,7 +2046,7 @@ loop_again: if (zone_is_all_unreclaimable(zone)) continue; if (nr_slab == 0 && zone->pages_scanned >= - (zone_lru_pages(zone) * 6)) + (zone_reclaimable_pages(zone) * 6)) zone_set_flag(zone, ZONE_ALL_UNRECLAIMABLE); /* @@ -1840,7 +2065,7 @@ loop_again: * another pass across the zones. */ if (total_scanned && priority < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ/10); + congestion_wait(BLK_RW_ASYNC, HZ/10); /* * We do this so kswapd doesn't build up large priorities for @@ -1867,6 +2092,23 @@ out: try_to_freeze(); + /* + * Fragmentation may mean that the system cannot be + * rebalanced for high-order allocations in all zones. + * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, + * it means the zones have been fully scanned and are still + * not balanced. For high-order allocations, there is + * little point trying all over again as kswapd may + * infinite loop. + * + * Instead, recheck all watermarks at order-0 as they + * are the most important. If watermarks are ok, kswapd will go + * back to sleep. High-order users can still perform direct + * reclaim if they wish. + */ + if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) + order = sc.order = 0; + goto loop_again; } @@ -1895,7 +2137,9 @@ static int kswapd(void *p) struct reclaim_state reclaim_state = { .reclaimed_slab = 0, }; - node_to_cpumask_ptr(cpumask, pgdat->node_id); + const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id); + + lockdep_set_current_reclaim_state(GFP_KERNEL); if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); @@ -1958,7 +2202,7 @@ void wakeup_kswapd(struct zone *zone, int order) return; pgdat = zone->zone_pgdat; - if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0)) + if (zone_watermark_ok(zone, order, low_wmark_pages(zone), 0, 0)) return; if (pgdat->kswapd_max_order < order) pgdat->kswapd_max_order = order; @@ -1969,60 +2213,89 @@ void wakeup_kswapd(struct zone *zone, int order) wake_up_interruptible(&pgdat->kswapd_wait); } -unsigned long global_lru_pages(void) +/* + * The reclaimable count would be mostly accurate. + * The less reclaimable pages may be + * - mlocked pages, which will be moved to unevictable list when encountered + * - mapped pages, which may require several travels to be reclaimed + * - dirty pages, which is not "instantly" reclaimable + */ +unsigned long global_reclaimable_pages(void) +{ + int nr; + + nr = global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_FILE); + + if (nr_swap_pages > 0) + nr += global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_INACTIVE_ANON); + + return nr; +} + +unsigned long zone_reclaimable_pages(struct zone *zone) { - return global_page_state(NR_ACTIVE_ANON) - + global_page_state(NR_ACTIVE_FILE) - + global_page_state(NR_INACTIVE_ANON) - + global_page_state(NR_INACTIVE_FILE); + int nr; + + nr = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + + if (nr_swap_pages > 0) + nr += zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + + return nr; } -#ifdef CONFIG_PM +#ifdef CONFIG_HIBERNATION /* * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages - * from LRU lists system-wide, for given pass and priority, and returns the - * number of reclaimed pages + * from LRU lists system-wide, for given pass and priority. * * For pass > 3 we also try to shrink the LRU lists that contain a few pages */ -static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, +static void shrink_all_zones(unsigned long nr_pages, int prio, int pass, struct scan_control *sc) { struct zone *zone; - unsigned long nr_to_scan, ret = 0; - enum lru_list l; - - for_each_zone(zone) { + unsigned long nr_reclaimed = 0; + struct zone_reclaim_stat *reclaim_stat; - if (!populated_zone(zone)) - continue; + for_each_populated_zone(zone) { + enum lru_list l; if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; for_each_evictable_lru(l) { + enum zone_stat_item ls = NR_LRU_BASE + l; + unsigned long lru_pages = zone_page_state(zone, ls); + /* For pass = 0, we don't shrink the active list */ - if (pass == 0 && - (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE)) + if (pass == 0 && (l == LRU_ACTIVE_ANON || + l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_scan += - (zone_page_state(zone, NR_LRU_BASE + l) - >> prio) + 1; - if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { - zone->lru[l].nr_scan = 0; - nr_to_scan = min(nr_pages, - zone_page_state(zone, - NR_LRU_BASE + l)); - ret += shrink_list(l, nr_to_scan, zone, + reclaim_stat = get_reclaim_stat(zone, sc); + reclaim_stat->nr_saved_scan[l] += + (lru_pages >> prio) + 1; + if (reclaim_stat->nr_saved_scan[l] + >= nr_pages || pass > 3) { + unsigned long nr_to_scan; + + reclaim_stat->nr_saved_scan[l] = 0; + nr_to_scan = min(nr_pages, lru_pages); + nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); - if (ret >= nr_pages) - return ret; + if (nr_reclaimed >= nr_pages) { + sc->nr_reclaimed += nr_reclaimed; + return; + } } } } - - return ret; + sc->nr_reclaimed += nr_reclaimed; } /* @@ -2036,21 +2309,19 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, unsigned long shrink_all_memory(unsigned long nr_pages) { unsigned long lru_pages, nr_slab; - unsigned long ret = 0; int pass; struct reclaim_state reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .may_swap = 0, - .swap_cluster_max = nr_pages, + .may_unmap = 0, .may_writepage = 1, - .swappiness = vm_swappiness, .isolate_pages = isolate_pages_global, + .nr_reclaimed = 0, }; current->reclaim_state = &reclaim_state; - lru_pages = global_lru_pages(); + lru_pages = global_reclaimable_pages(); nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { @@ -2059,8 +2330,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!reclaim_state.reclaimed_slab) break; - ret += reclaim_state.reclaimed_slab; - if (ret >= nr_pages) + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + if (sc.nr_reclaimed >= nr_pages) goto out; nr_slab -= reclaim_state.reclaimed_slab; @@ -2078,49 +2349,51 @@ unsigned long shrink_all_memory(unsigned long nr_pages) int prio; /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) { - sc.may_swap = 1; - sc.swappiness = 100; - } + if (pass > 2) + sc.may_unmap = 1; for (prio = DEF_PRIORITY; prio >= 0; prio--) { - unsigned long nr_to_scan = nr_pages - ret; + unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; sc.nr_scanned = 0; - ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); - if (ret >= nr_pages) + sc.swap_cluster_max = nr_to_scan; + shrink_all_zones(nr_to_scan, prio, pass, &sc); + if (sc.nr_reclaimed >= nr_pages) goto out; reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, - global_lru_pages()); - ret += reclaim_state.reclaimed_slab; - if (ret >= nr_pages) + global_reclaimable_pages()); + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + if (sc.nr_reclaimed >= nr_pages) goto out; if (sc.nr_scanned && prio < DEF_PRIORITY - 2) - congestion_wait(WRITE, HZ / 10); + congestion_wait(BLK_RW_ASYNC, HZ / 10); } } /* - * If ret = 0, we could not shrink LRUs, but there may be something - * in slab caches + * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be + * something in slab caches */ - if (!ret) { + if (!sc.nr_reclaimed) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); - ret += reclaim_state.reclaimed_slab; - } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); + shrink_slab(nr_pages, sc.gfp_mask, + global_reclaimable_pages()); + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + } while (sc.nr_reclaimed < nr_pages && + reclaim_state.reclaimed_slab > 0); } + out: current->reclaim_state = NULL; - return ret; + return sc.nr_reclaimed; } -#endif +#endif /* CONFIG_HIBERNATION */ /* It's optimal to keep kswapds on the same CPUs as their memory, but not required for correctness. So if the last cpu in a node goes @@ -2134,7 +2407,9 @@ static int __devinit cpu_callback(struct notifier_block *nfb, if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) { for_each_node_state(nid, N_HIGH_MEMORY) { pg_data_t *pgdat = NODE_DATA(nid); - node_to_cpumask_ptr(mask, pgdat->node_id); + const struct cpumask *mask; + + mask = cpumask_of_node(pgdat->node_id); if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) /* One of our CPUs online: restore mask */ @@ -2212,6 +2487,48 @@ int sysctl_min_unmapped_ratio = 1; */ int sysctl_min_slab_ratio = 5; +static inline unsigned long zone_unmapped_file_pages(struct zone *zone) +{ + unsigned long file_mapped = zone_page_state(zone, NR_FILE_MAPPED); + unsigned long file_lru = zone_page_state(zone, NR_INACTIVE_FILE) + + zone_page_state(zone, NR_ACTIVE_FILE); + + /* + * It's possible for there to be more file mapped pages than + * accounted for by the pages on the file LRU lists because + * tmpfs pages accounted for as ANON can also be FILE_MAPPED + */ + return (file_lru > file_mapped) ? (file_lru - file_mapped) : 0; +} + +/* Work out how many page cache pages we can reclaim in this reclaim_mode */ +static long zone_pagecache_reclaimable(struct zone *zone) +{ + long nr_pagecache_reclaimable; + long delta = 0; + + /* + * If RECLAIM_SWAP is set, then all file pages are considered + * potentially reclaimable. Otherwise, we have to worry about + * pages like swapcache and zone_unmapped_file_pages() provides + * a better estimate + */ + if (zone_reclaim_mode & RECLAIM_SWAP) + nr_pagecache_reclaimable = zone_page_state(zone, NR_FILE_PAGES); + else + nr_pagecache_reclaimable = zone_unmapped_file_pages(zone); + + /* If we can't clean pages, remove dirty pages from consideration */ + if (!(zone_reclaim_mode & RECLAIM_WRITE)) + delta += zone_page_state(zone, NR_FILE_DIRTY); + + /* Watch for any possible underflows due to delta */ + if (unlikely(delta > nr_pagecache_reclaimable)) + delta = nr_pagecache_reclaimable; + + return nr_pagecache_reclaimable - delta; +} + /* * Try to free up some pages from this zone through reclaim. */ @@ -2224,11 +2541,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) int priority; struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_swap = 1, .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, + .order = order, .isolate_pages = isolate_pages_global, }; unsigned long slab_reclaimable; @@ -2244,9 +2563,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) > - zone->min_unmapped_pages) { + if (zone_pagecache_reclaimable(zone) > zone->min_unmapped_pages) { /* * Free memory by calling shrink zone with increasing * priorities until we have enough memory freed. @@ -2304,20 +2621,18 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * if less than a specified percentage of the zone is used by * unmapped file backed pages. */ - if (zone_page_state(zone, NR_FILE_PAGES) - - zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages - && zone_page_state(zone, NR_SLAB_RECLAIMABLE) - <= zone->min_slab_pages) - return 0; + if (zone_pagecache_reclaimable(zone) <= zone->min_unmapped_pages && + zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages) + return ZONE_RECLAIM_FULL; if (zone_is_all_unreclaimable(zone)) - return 0; + return ZONE_RECLAIM_FULL; /* * Do not scan if the allocation should not be delayed. */ if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC)) - return 0; + return ZONE_RECLAIM_NOSCAN; /* * Only run zone reclaim on the local zone or on zones that do not @@ -2327,18 +2642,21 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) */ node_id = zone_to_nid(zone); if (node_state(node_id, N_CPU) && node_id != numa_node_id()) - return 0; + return ZONE_RECLAIM_NOSCAN; if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED)) - return 0; + return ZONE_RECLAIM_NOSCAN; + ret = __zone_reclaim(zone, gfp_mask, order); zone_clear_flag(zone, ZONE_RECLAIM_LOCKED); + if (!ret) + count_vm_event(PGSCAN_ZONE_RECLAIM_FAILED); + return ret; } #endif -#ifdef CONFIG_UNEVICTABLE_LRU /* * page_evictable - test whether a page is evictable * @page: the page to test @@ -2383,10 +2701,11 @@ static void check_move_unevictable_page(struct page *page, struct zone *zone) retry: ClearPageUnevictable(page); if (page_evictable(page, NULL)) { - enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); + enum lru_list l = page_lru_base_type(page); __dec_zone_state(zone, NR_UNEVICTABLE); list_move(&page->lru, &zone->lru[l].list); + mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); __inc_zone_state(zone, NR_INACTIVE_ANON + l); __count_vm_event(UNEVICTABLE_PGRESCUED); } else { @@ -2395,6 +2714,7 @@ retry: */ SetPageUnevictable(page); list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); + mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); if (page_evictable(page, NULL)) goto retry; } @@ -2524,10 +2844,10 @@ static void scan_all_zones_unevictable_pages(void) unsigned long scan_unevictable_pages; int scan_unevictable_handler(struct ctl_table *table, int write, - struct file *file, void __user *buffer, + void __user *buffer, size_t *length, loff_t *ppos) { - proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + proc_doulongvec_minmax(table, write, buffer, length, ppos); if (write && *(unsigned long *)table->data) scan_all_zones_unevictable_pages(); @@ -2583,4 +2903,3 @@ void scan_unevictable_unregister_node(struct node *node) sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); } -#endif