diff --git a/mm/vmscan.c b/mm/vmscan.c
index ca64e3e..06e7269 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -52,13 +52,16 @@ struct scan_control {
 	/* Incremented by the number of inactive pages that were scanned */
 	unsigned long nr_scanned;
 
+	/* Number of pages freed so far during a call to shrink_zones() */
+	unsigned long nr_reclaimed;
+
 	/* This context's GFP mask */
 	gfp_t gfp_mask;
 
 	int may_writepage;
 
-	/* Can pages be swapped as part of reclaim? */
-	int may_swap;
+	/* Can mapped pages be reclaimed? */
+	int may_unmap;
 
 	/* This context's SWAP_CLUSTER_MAX. If freeing memory for
 	 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
@@ -75,6 +78,12 @@ struct scan_control {
 	/* Which cgroup do we reclaim from */
 	struct mem_cgroup *mem_cgroup;
 
+	/*
+	 * Nodemask of nodes allowed by the caller. If NULL, all nodes
+	 * are scanned.
+	 */
+	nodemask_t *nodemask;
+
 	/* Pluggable isolate pages callback */
 	unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
 			unsigned long *scanned, int order, int mode,
@@ -122,11 +131,30 @@ static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
 #ifdef CONFIG_CGROUP_MEM_RES_CTLR
-#define scan_global_lru(sc)	(!(sc)->mem_cgroup)
+#define scanning_global_lru(sc)	(!(sc)->mem_cgroup)
 #else
-#define scan_global_lru(sc)	(1)
+#define scanning_global_lru(sc)	(1)
 #endif
 
+static struct zone_reclaim_stat *get_reclaim_stat(struct zone *zone,
+						  struct scan_control *sc)
+{
+	if (!scanning_global_lru(sc))
+		return mem_cgroup_get_reclaim_stat(sc->mem_cgroup, zone);
+
+	return &zone->reclaim_stat;
+}
+
+static unsigned long zone_nr_pages(struct zone *zone, struct scan_control *sc,
+				   enum lru_list lru)
+{
+	if (!scanning_global_lru(sc))
+		return mem_cgroup_zone_nr_pages(sc->mem_cgroup, zone, lru);
+
+	return zone_page_state(zone, NR_LRU_BASE + lru);
+}
+
+
 /*
  * Add a shrinker callback to be called from the vm
  */
@@ -192,8 +220,9 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
 		do_div(delta, lru_pages + 1);
 		shrinker->nr += delta;
 		if (shrinker->nr < 0) {
-			printk(KERN_ERR "%s: nr=%ld\n",
-					__func__, shrinker->nr);
+			printk(KERN_ERR "shrink_slab: %pF negative objects to "
+			       "delete nr=%ld\n",
+			       shrinker->shrink, shrinker->nr);
 			shrinker->nr = max_pass;
 		}
 
@@ -509,7 +538,6 @@ redo:
 		lru = LRU_UNEVICTABLE;
 		add_page_to_unevictable_list(page);
 	}
-	mem_cgroup_move_lists(page, lru);
 
 	/*
 	 * page's status can change while we move it among lru. If an evictable
@@ -544,7 +572,6 @@ void putback_lru_page(struct page *page)
 
 	lru = !!TestClearPageActive(page) + page_is_file_cache(page);
 	lru_cache_add_lru(page, lru);
-	mem_cgroup_move_lists(page, lru);
 	put_page(page);
 }
 #endif /* CONFIG_UNEVICTABLE_LRU */
@@ -586,7 +613,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 		if (unlikely(!page_evictable(page, NULL)))
 			goto cull_mlocked;
 
-		if (!sc->may_swap && page_mapped(page))
+		if (!sc->may_unmap && page_mapped(page))
 			goto keep_locked;
 
 		/* Double the slab pressure for mapped and swapcache pages */
@@ -617,25 +644,17 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 					referenced && page_mapping_inuse(page))
 			goto activate_locked;
 
-#ifdef CONFIG_SWAP
 		/*
 		 * Anonymous process memory has backing store?
 		 * Try to allocate it some swap space here.
*/ if (PageAnon(page) && !PageSwapCache(page)) { - switch (try_to_munlock(page)) { - case SWAP_FAIL: /* shouldn't happen */ - case SWAP_AGAIN: + if (!(sc->gfp_mask & __GFP_IO)) goto keep_locked; - case SWAP_MLOCK: - goto cull_mlocked; - case SWAP_SUCCESS: - ; /* fall thru'; add to swap cache */ - } - if (!add_to_swap(page, GFP_ATOMIC)) + if (!add_to_swap(page)) goto activate_locked; + may_enter_fs = 1; } -#endif /* CONFIG_SWAP */ mapping = page_mapping(page); @@ -732,7 +751,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!mapping || !__remove_mapping(mapping, page)) goto keep_locked; - unlock_page(page); + /* + * At this point, we have no other references and there is + * no way to pick any more up (removed from LRU, removed + * from pagecache). Can use non-atomic bitops now (and + * we obviously don't have to worry about waking up a process + * waiting on the page lock, because there are no references. + */ + __clear_page_locked(page); free_it: nr_reclaimed++; if (!pagevec_add(&freed_pvec, page)) { @@ -742,6 +768,8 @@ free_it: continue; cull_mlocked: + if (PageSwapCache(page)) + try_to_free_swap(page); unlock_page(page); putback_lru_page(page); continue; @@ -749,7 +777,7 @@ cull_mlocked: activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (PageSwapCache(page) && vm_swap_full()) - remove_exclusive_swap_page_ref(page); + try_to_free_swap(page); VM_BUG_ON(PageActive(page)); SetPageActive(page); pgactivate++; @@ -809,6 +837,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) return ret; ret = -EBUSY; + if (likely(get_page_unless_zero(page))) { /* * Be careful not to clear PageLRU until after we're @@ -817,6 +846,7 @@ int __isolate_lru_page(struct page *page, int mode, int file) */ ClearPageLRU(page); ret = 0; + mem_cgroup_del_lru(page); } return ret; @@ -1025,6 +1055,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, struct pagevec pvec; unsigned long nr_scanned = 0; unsigned long nr_reclaimed = 0; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); pagevec_init(&pvec, 1); @@ -1066,13 +1097,14 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, __mod_zone_page_state(zone, NR_INACTIVE_ANON, -count[LRU_INACTIVE_ANON]); - if (scan_global_lru(sc)) { + if (scanning_global_lru(sc)) zone->pages_scanned += nr_scan; - zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; - zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; - zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; - zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; - } + + reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON]; + reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON]; + reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE]; + reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + spin_unlock_irq(&zone->lru_lock); nr_scanned += nr_scan; @@ -1104,7 +1136,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, if (current_is_kswapd()) { __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan); __count_vm_events(KSWAPD_STEAL, nr_freed); - } else if (scan_global_lru(sc)) + } else if (scanning_global_lru(sc)) __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan); __count_zone_vm_events(PGSTEAL, zone, nr_freed); @@ -1130,10 +1162,9 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, SetPageLRU(page); lru = page_lru(page); add_page_to_lru_list(zone, page, lru); - mem_cgroup_move_lists(page, lru); - if (PageActive(page) && scan_global_lru(sc)) { + if 
(PageActive(page)) { int file = !!page_is_file_cache(page); - zone->recent_rotated[file]++; + reclaim_stat->recent_rotated[file]++; } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); @@ -1163,11 +1194,6 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) zone->prev_priority = priority; } -static inline int zone_is_near_oom(struct zone *zone) -{ - return zone->pages_scanned >= (zone_lru_pages(zone) * 3); -} - /* * This moves pages from the active list to the inactive list. * @@ -1198,6 +1224,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, struct page *page; struct pagevec pvec; enum lru_list lru; + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); lru_add_drain(); spin_lock_irq(&zone->lru_lock); @@ -1208,10 +1235,10 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, * zone->pages_scanned is used for detect zone's oom * mem_cgroup remembers nr_scan by itself. */ - if (scan_global_lru(sc)) { + if (scanning_global_lru(sc)) { zone->pages_scanned += pgscanned; - zone->recent_scanned[!!file] += pgmoved; } + reclaim_stat->recent_scanned[!!file] += pgmoved; if (file) __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); @@ -1239,21 +1266,21 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } /* + * Move the pages to the [file or anon] inactive list. + */ + pagevec_init(&pvec, 1); + lru = LRU_BASE + file * LRU_FILE; + + spin_lock_irq(&zone->lru_lock); + /* * Count referenced pages from currently used mappings as * rotated, even though they are moved to the inactive list. * This helps balance scan pressure between file and anonymous * pages in get_scan_ratio. */ - zone->recent_rotated[!!file] += pgmoved; - - /* - * Move the pages to the [file or anon] inactive list. 
- */ - pagevec_init(&pvec, 1); + reclaim_stat->recent_rotated[!!file] += pgmoved; pgmoved = 0; - lru = LRU_BASE + file * LRU_FILE; - spin_lock_irq(&zone->lru_lock); while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); prefetchw_prev_lru_page(page, &l_inactive, flags); @@ -1263,7 +1290,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, ClearPageActive(page); list_move(&page->lru, &zone->lru[lru].list); - mem_cgroup_move_lists(page, lru); + mem_cgroup_add_lru_list(page, lru); pgmoved++; if (!pagevec_add(&pvec, page)) { __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); @@ -1278,20 +1305,46 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, } __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); pgdeactivate += pgmoved; - if (buffer_heads_over_limit) { - spin_unlock_irq(&zone->lru_lock); - pagevec_strip(&pvec); - spin_lock_irq(&zone->lru_lock); - } __count_zone_vm_events(PGREFILL, zone, pgscanned); __count_vm_events(PGDEACTIVATE, pgdeactivate); spin_unlock_irq(&zone->lru_lock); - if (vm_swap_full()) - pagevec_swap_free(&pvec); - + if (buffer_heads_over_limit) + pagevec_strip(&pvec); pagevec_release(&pvec); } +static int inactive_anon_is_low_global(struct zone *zone) +{ + unsigned long active, inactive; + + active = zone_page_state(zone, NR_ACTIVE_ANON); + inactive = zone_page_state(zone, NR_INACTIVE_ANON); + + if (inactive * zone->inactive_ratio < active) + return 1; + + return 0; +} + +/** + * inactive_anon_is_low - check if anonymous pages need to be deactivated + * @zone: zone to check + * @sc: scan control of this context + * + * Returns true if the zone does not have enough inactive anon pages, + * meaning some active anon pages need to be deactivated. + */ +static int inactive_anon_is_low(struct zone *zone, struct scan_control *sc) +{ + int low; + + if (scanning_global_lru(sc)) + low = inactive_anon_is_low_global(zone); + else + low = mem_cgroup_inactive_anon_is_low(sc->mem_cgroup); + return low; +} + static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, struct zone *zone, struct scan_control *sc, int priority) { @@ -1302,8 +1355,7 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, return 0; } - if (lru == LRU_ACTIVE_ANON && - (!scan_global_lru(sc) || inactive_anon_is_low(zone))) { + if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) { shrink_active_list(nr_to_scan, zone, sc, priority, file); return 0; } @@ -1325,12 +1377,7 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, unsigned long anon, file, free; unsigned long anon_prio, file_prio; unsigned long ap, fp; - - anon = zone_page_state(zone, NR_ACTIVE_ANON) + - zone_page_state(zone, NR_INACTIVE_ANON); - file = zone_page_state(zone, NR_ACTIVE_FILE) + - zone_page_state(zone, NR_INACTIVE_FILE); - free = zone_page_state(zone, NR_FREE_PAGES); + struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc); /* If we have no swap space, do not bother scanning anon pages. */ if (nr_swap_pages <= 0) { @@ -1339,11 +1386,20 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, return; } - /* If we have very few page cache pages, force-scan anon pages. 
*/ - if (unlikely(file + free <= zone->pages_high)) { - percent[0] = 100; - percent[1] = 0; - return; + anon = zone_nr_pages(zone, sc, LRU_ACTIVE_ANON) + + zone_nr_pages(zone, sc, LRU_INACTIVE_ANON); + file = zone_nr_pages(zone, sc, LRU_ACTIVE_FILE) + + zone_nr_pages(zone, sc, LRU_INACTIVE_FILE); + + if (scanning_global_lru(sc)) { + free = zone_page_state(zone, NR_FREE_PAGES); + /* If we have very few page cache pages, + force-scan anon pages. */ + if (unlikely(file + free <= zone->pages_high)) { + percent[0] = 100; + percent[1] = 0; + return; + } } /* @@ -1357,17 +1413,17 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, * * anon in [0], file in [1] */ - if (unlikely(zone->recent_scanned[0] > anon / 4)) { + if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) { spin_lock_irq(&zone->lru_lock); - zone->recent_scanned[0] /= 2; - zone->recent_rotated[0] /= 2; + reclaim_stat->recent_scanned[0] /= 2; + reclaim_stat->recent_rotated[0] /= 2; spin_unlock_irq(&zone->lru_lock); } - if (unlikely(zone->recent_scanned[1] > file / 4)) { + if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) { spin_lock_irq(&zone->lru_lock); - zone->recent_scanned[1] /= 2; - zone->recent_rotated[1] /= 2; + reclaim_stat->recent_scanned[1] /= 2; + reclaim_stat->recent_rotated[1] /= 2; spin_unlock_irq(&zone->lru_lock); } @@ -1379,15 +1435,15 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, file_prio = 200 - sc->swappiness; /* - * anon recent_rotated[0] - * %anon = 100 * ----------- / ----------------- * IO cost - * anon + file rotate_sum + * The amount of pressure on anon vs file pages is inversely + * proportional to the fraction of recently scanned pages on + * each list that were recently referenced and in active use. */ - ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); - ap /= zone->recent_rotated[0] + 1; + ap = (anon_prio + 1) * (reclaim_stat->recent_scanned[0] + 1); + ap /= reclaim_stat->recent_rotated[0] + 1; - fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); - fp /= zone->recent_rotated[1] + 1; + fp = (file_prio + 1) * (reclaim_stat->recent_scanned[1] + 1); + fp /= reclaim_stat->recent_rotated[1] + 1; /* Normalize to percentages */ percent[0] = 100 * ap / (ap + fp + 1); @@ -1398,72 +1454,72 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc, /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ -static unsigned long shrink_zone(int priority, struct zone *zone, +static void shrink_zone(int priority, struct zone *zone, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; - unsigned long nr_reclaimed = 0; unsigned long percent[2]; /* anon @ 0; file @ 1 */ enum lru_list l; + unsigned long nr_reclaimed = sc->nr_reclaimed; + unsigned long swap_cluster_max = sc->swap_cluster_max; get_scan_ratio(zone, sc, percent); for_each_evictable_lru(l) { - if (scan_global_lru(sc)) { - int file = is_file_lru(l); - int scan; - /* - * Add one to nr_to_scan just to make sure that the - * kernel will slowly sift through each list. 
- */ - scan = zone_page_state(zone, NR_LRU_BASE + l); - if (priority) { - scan >>= priority; - scan = (scan * percent[file]) / 100; - } - zone->lru[l].nr_scan += scan + 1; + int file = is_file_lru(l); + int scan; + + scan = zone_nr_pages(zone, sc, l); + if (priority) { + scan >>= priority; + scan = (scan * percent[file]) / 100; + } + if (scanning_global_lru(sc)) { + zone->lru[l].nr_scan += scan; nr[l] = zone->lru[l].nr_scan; - if (nr[l] >= sc->swap_cluster_max) + if (nr[l] >= swap_cluster_max) zone->lru[l].nr_scan = 0; else nr[l] = 0; - } else { - /* - * This reclaim occurs not because zone memory shortage - * but because memory controller hits its limit. - * Don't modify zone reclaim related data. - */ - nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, - priority, l); - } + } else + nr[l] = scan; } while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { for_each_evictable_lru(l) { if (nr[l]) { - nr_to_scan = min(nr[l], - (unsigned long)sc->swap_cluster_max); + nr_to_scan = min(nr[l], swap_cluster_max); nr[l] -= nr_to_scan; nr_reclaimed += shrink_list(l, nr_to_scan, - zone, sc, priority); + zone, sc, priority); } } + /* + * On large memory systems, scan >> priority can become + * really large. This is fine for the starting priority; + * we want to put equal scanning pressure on each zone. + * However, if the VM has a harder time of freeing pages, + * with multiple processes reclaiming pages, the total + * freeing target can get unreasonably large. + */ + if (nr_reclaimed > swap_cluster_max && + priority < DEF_PRIORITY && !current_is_kswapd()) + break; } + sc->nr_reclaimed = nr_reclaimed; + /* * Even if we did not try to evict anon pages at all, we want to * rebalance the anon lru active/inactive ratio. */ - if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) - shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); - else if (!scan_global_lru(sc)) + if (inactive_anon_is_low(zone, sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); throttle_vm_writeout(sc->gfp_mask); - return nr_reclaimed; } /* @@ -1477,28 +1533,26 @@ static unsigned long shrink_zone(int priority, struct zone *zone, * b) The zones may be over pages_high but they must go *over* pages_high to * satisfy the `incremental min' zone defense algorithm. * - * Returns the number of reclaimed pages. - * * If a zone is deemed to be full of pinned pages then just give it a light * scan then give up on it. */ -static unsigned long shrink_zones(int priority, struct zonelist *zonelist, +static void shrink_zones(int priority, struct zonelist *zonelist, struct scan_control *sc) { enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask); - unsigned long nr_reclaimed = 0; struct zoneref *z; struct zone *zone; sc->all_unreclaimable = 1; - for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, + sc->nodemask) { if (!populated_zone(zone)) continue; /* * Take care memory controller reclaiming has small influence * to global LRU. 
*/ - if (scan_global_lru(sc)) { + if (scanning_global_lru(sc)) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; note_zone_scanning_priority(zone, priority); @@ -1517,10 +1571,8 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, priority); } - nr_reclaimed += shrink_zone(priority, zone, sc); + shrink_zone(priority, zone, sc); } - - return nr_reclaimed; } /* @@ -1545,7 +1597,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, int priority; unsigned long ret = 0; unsigned long total_scanned = 0; - unsigned long nr_reclaimed = 0; struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long lru_pages = 0; struct zoneref *z; @@ -1554,12 +1605,12 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, delayacct_freepages_start(); - if (scan_global_lru(sc)) + if (scanning_global_lru(sc)) count_vm_event(ALLOCSTALL); /* * mem_cgroup will not do shrink_slab. */ - if (scan_global_lru(sc)) { + if (scanning_global_lru(sc)) { for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) @@ -1573,21 +1624,21 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, sc->nr_scanned = 0; if (!priority) disable_swap_token(); - nr_reclaimed += shrink_zones(priority, zonelist, sc); + shrink_zones(priority, zonelist, sc); /* * Don't shrink slabs when reclaiming memory from * over limit cgroups */ - if (scan_global_lru(sc)) { + if (scanning_global_lru(sc)) { shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages); if (reclaim_state) { - nr_reclaimed += reclaim_state->reclaimed_slab; + sc->nr_reclaimed += reclaim_state->reclaimed_slab; reclaim_state->reclaimed_slab = 0; } } total_scanned += sc->nr_scanned; - if (nr_reclaimed >= sc->swap_cluster_max) { - ret = nr_reclaimed; + if (sc->nr_reclaimed >= sc->swap_cluster_max) { + ret = sc->nr_reclaimed; goto out; } @@ -1609,8 +1660,8 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, congestion_wait(WRITE, HZ/10); } /* top priority shrink_zones still had more to do? 
don't OOM, then */ - if (!sc->all_unreclaimable && scan_global_lru(sc)) - ret = nr_reclaimed; + if (!sc->all_unreclaimable && scanning_global_lru(sc)) + ret = sc->nr_reclaimed; out: /* * Now that we've scanned all the zones at this priority level, note @@ -1622,7 +1673,7 @@ out: if (priority < 0) priority = 0; - if (scan_global_lru(sc)) { + if (scanning_global_lru(sc)) { for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) { if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) @@ -1639,17 +1690,18 @@ out: } unsigned long try_to_free_pages(struct zonelist *zonelist, int order, - gfp_t gfp_mask) + gfp_t gfp_mask, nodemask_t *nodemask) { struct scan_control sc = { .gfp_mask = gfp_mask, .may_writepage = !laptop_mode, .swap_cluster_max = SWAP_CLUSTER_MAX, - .may_swap = 1, + .may_unmap = 1, .swappiness = vm_swappiness, .order = order, .mem_cgroup = NULL, .isolate_pages = isolate_pages_global, + .nodemask = nodemask, }; return do_try_to_free_pages(zonelist, &sc); @@ -1658,19 +1710,25 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_CGROUP_MEM_RES_CTLR unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont, - gfp_t gfp_mask) + gfp_t gfp_mask, + bool noswap, + unsigned int swappiness) { struct scan_control sc = { .may_writepage = !laptop_mode, - .may_swap = 1, + .may_unmap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, - .swappiness = vm_swappiness, + .swappiness = swappiness, .order = 0, .mem_cgroup = mem_cont, .isolate_pages = mem_cgroup_isolate_pages, + .nodemask = NULL, /* we don't care the placement */ }; struct zonelist *zonelist; + if (noswap) + sc.may_unmap = 0; + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); zonelist = NODE_DATA(numa_node_id())->node_zonelists; @@ -1705,11 +1763,10 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) int priority; int i; unsigned long total_scanned; - unsigned long nr_reclaimed; struct reclaim_state *reclaim_state = current->reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .may_swap = 1, + .may_unmap = 1, .swap_cluster_max = SWAP_CLUSTER_MAX, .swappiness = vm_swappiness, .order = order, @@ -1724,7 +1781,7 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order) loop_again: total_scanned = 0; - nr_reclaimed = 0; + sc.nr_reclaimed = 0; sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); @@ -1759,7 +1816,7 @@ loop_again: * Do some background aging of the anon list, to give * pages a chance to be referenced before reclaiming. */ - if (inactive_anon_is_low(zone)) + if (inactive_anon_is_low(zone, &sc)) shrink_active_list(SWAP_CLUSTER_MAX, zone, &sc, priority, 0); @@ -1810,11 +1867,11 @@ loop_again: */ if (!zone_watermark_ok(zone, order, 8*zone->pages_high, end_zone, 0)) - nr_reclaimed += shrink_zone(priority, zone, &sc); + shrink_zone(priority, zone, &sc); reclaim_state->reclaimed_slab = 0; nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL, lru_pages); - nr_reclaimed += reclaim_state->reclaimed_slab; + sc.nr_reclaimed += reclaim_state->reclaimed_slab; total_scanned += sc.nr_scanned; if (zone_is_all_unreclaimable(zone)) continue; @@ -1828,7 +1885,7 @@ loop_again: * even in laptop mode */ if (total_scanned > SWAP_CLUSTER_MAX * 2 && - total_scanned > nr_reclaimed + nr_reclaimed / 2) + total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2) sc.may_writepage = 1; } if (all_zones_ok) @@ -1846,7 +1903,7 @@ loop_again: * matches the direct reclaim path behaviour in terms of impact * on zone->*_priority. 
*/ - if (nr_reclaimed >= SWAP_CLUSTER_MAX) + if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) break; } out: @@ -1865,10 +1922,27 @@ out: try_to_freeze(); + /* + * Fragmentation may mean that the system cannot be + * rebalanced for high-order allocations in all zones. + * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, + * it means the zones have been fully scanned and are still + * not balanced. For high-order allocations, there is + * little point trying all over again as kswapd may + * infinite loop. + * + * Instead, recheck all watermarks at order-0 as they + * are the most important. If watermarks are ok, kswapd will go + * back to sleep. High-order users can still perform direct + * reclaim if they wish. + */ + if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) + order = sc.order = 0; + goto loop_again; } - return nr_reclaimed; + return sc.nr_reclaimed; } /* @@ -1895,7 +1969,9 @@ static int kswapd(void *p) }; node_to_cpumask_ptr(cpumask, pgdat->node_id); - if (!cpus_empty(*cpumask)) + lockdep_set_current_reclaim_state(GFP_KERNEL); + + if (!cpumask_empty(cpumask)) set_cpus_allowed_ptr(tsk, cpumask); current->reclaim_state = &reclaim_state; @@ -1978,49 +2054,47 @@ unsigned long global_lru_pages(void) #ifdef CONFIG_PM /* * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages - * from LRU lists system-wide, for given pass and priority, and returns the - * number of reclaimed pages + * from LRU lists system-wide, for given pass and priority. * * For pass > 3 we also try to shrink the LRU lists that contain a few pages */ -static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, +static void shrink_all_zones(unsigned long nr_pages, int prio, int pass, struct scan_control *sc) { struct zone *zone; - unsigned long nr_to_scan, ret = 0; - enum lru_list l; - - for_each_zone(zone) { + unsigned long nr_reclaimed = 0; - if (!populated_zone(zone)) - continue; + for_each_populated_zone(zone) { + enum lru_list l; if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; for_each_evictable_lru(l) { + enum zone_stat_item ls = NR_LRU_BASE + l; + unsigned long lru_pages = zone_page_state(zone, ls); + /* For pass = 0, we don't shrink the active list */ - if (pass == 0 && - (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE)) + if (pass == 0 && (l == LRU_ACTIVE_ANON || + l == LRU_ACTIVE_FILE)) continue; - zone->lru[l].nr_scan += - (zone_page_state(zone, NR_LRU_BASE + l) - >> prio) + 1; + zone->lru[l].nr_scan += (lru_pages >> prio) + 1; if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { + unsigned long nr_to_scan; + zone->lru[l].nr_scan = 0; - nr_to_scan = min(nr_pages, - zone_page_state(zone, - NR_LRU_BASE + l)); - ret += shrink_list(l, nr_to_scan, zone, + nr_to_scan = min(nr_pages, lru_pages); + nr_reclaimed += shrink_list(l, nr_to_scan, zone, sc, prio); - if (ret >= nr_pages) - return ret; + if (nr_reclaimed >= nr_pages) { + sc->nr_reclaimed = nr_reclaimed; + return; + } } } } - - return ret; + sc->nr_reclaimed = nr_reclaimed; } /* @@ -2034,15 +2108,12 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, unsigned long shrink_all_memory(unsigned long nr_pages) { unsigned long lru_pages, nr_slab; - unsigned long ret = 0; int pass; struct reclaim_state reclaim_state; struct scan_control sc = { .gfp_mask = GFP_KERNEL, - .may_swap = 0, - .swap_cluster_max = nr_pages, + .may_unmap = 0, .may_writepage = 1, - .swappiness = vm_swappiness, .isolate_pages = isolate_pages_global, }; @@ -2057,8 +2128,8 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if 
(!reclaim_state.reclaimed_slab) break; - ret += reclaim_state.reclaimed_slab; - if (ret >= nr_pages) + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + if (sc.nr_reclaimed >= nr_pages) goto out; nr_slab -= reclaim_state.reclaimed_slab; @@ -2076,24 +2147,23 @@ unsigned long shrink_all_memory(unsigned long nr_pages) int prio; /* Force reclaiming mapped pages in the passes #3 and #4 */ - if (pass > 2) { - sc.may_swap = 1; - sc.swappiness = 100; - } + if (pass > 2) + sc.may_unmap = 1; for (prio = DEF_PRIORITY; prio >= 0; prio--) { - unsigned long nr_to_scan = nr_pages - ret; + unsigned long nr_to_scan = nr_pages - sc.nr_reclaimed; sc.nr_scanned = 0; - ret += shrink_all_zones(nr_to_scan, prio, pass, &sc); - if (ret >= nr_pages) + sc.swap_cluster_max = nr_to_scan; + shrink_all_zones(nr_to_scan, prio, pass, &sc); + if (sc.nr_reclaimed >= nr_pages) goto out; reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, global_lru_pages()); - ret += reclaim_state.reclaimed_slab; - if (ret >= nr_pages) + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + if (sc.nr_reclaimed >= nr_pages) goto out; if (sc.nr_scanned && prio < DEF_PRIORITY - 2) @@ -2102,21 +2172,23 @@ unsigned long shrink_all_memory(unsigned long nr_pages) } /* - * If ret = 0, we could not shrink LRUs, but there may be something - * in slab caches + * If sc.nr_reclaimed = 0, we could not shrink LRUs, but there may be + * something in slab caches */ - if (!ret) { + if (!sc.nr_reclaimed) { do { reclaim_state.reclaimed_slab = 0; shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); - ret += reclaim_state.reclaimed_slab; - } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); + sc.nr_reclaimed += reclaim_state.reclaimed_slab; + } while (sc.nr_reclaimed < nr_pages && + reclaim_state.reclaimed_slab > 0); } + out: current->reclaim_state = NULL; - return ret; + return sc.nr_reclaimed; } #endif @@ -2134,7 +2206,7 @@ static int __devinit cpu_callback(struct notifier_block *nfb, pg_data_t *pgdat = NODE_DATA(nid); node_to_cpumask_ptr(mask, pgdat->node_id); - if (any_online_cpu(*mask) < nr_cpu_ids) + if (cpumask_any_and(cpu_online_mask, mask) < nr_cpu_ids) /* One of our CPUs online: restore mask */ set_cpus_allowed_ptr(pgdat->kswapd, mask); } @@ -2220,14 +2292,14 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) struct task_struct *p = current; struct reclaim_state reclaim_state; int priority; - unsigned long nr_reclaimed = 0; struct scan_control sc = { .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE), - .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP), + .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP), .swap_cluster_max = max_t(unsigned long, nr_pages, SWAP_CLUSTER_MAX), .gfp_mask = gfp_mask, .swappiness = vm_swappiness, + .order = order, .isolate_pages = isolate_pages_global, }; unsigned long slab_reclaimable; @@ -2253,9 +2325,9 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) priority = ZONE_RECLAIM_PRIORITY; do { note_zone_scanning_priority(zone, priority); - nr_reclaimed += shrink_zone(priority, zone, &sc); + shrink_zone(priority, zone, &sc); priority--; - } while (priority >= 0 && nr_reclaimed < nr_pages); + } while (priority >= 0 && sc.nr_reclaimed < nr_pages); } slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE); @@ -2279,13 +2351,13 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) * Update nr_reclaimed by the number of slab pages we * reclaimed from this zone. 
*/ - nr_reclaimed += slab_reclaimable - + sc.nr_reclaimed += slab_reclaimable - zone_page_state(zone, NR_SLAB_RECLAIMABLE); } p->reclaim_state = NULL; current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE); - return nr_reclaimed >= nr_pages; + return sc.nr_reclaimed >= nr_pages; } int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) @@ -2364,39 +2436,6 @@ int page_evictable(struct page *page, struct vm_area_struct *vma) return 1; } -static void show_page_path(struct page *page) -{ - char buf[256]; - if (page_is_file_cache(page)) { - struct address_space *mapping = page->mapping; - struct dentry *dentry; - pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); - - spin_lock(&mapping->i_mmap_lock); - dentry = d_find_alias(mapping->host); - printk(KERN_INFO "rescued: %s %lu\n", - dentry_path(dentry, buf, 256), pgoff); - spin_unlock(&mapping->i_mmap_lock); - } else { -#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU) - struct anon_vma *anon_vma; - struct vm_area_struct *vma; - - anon_vma = page_lock_anon_vma(page); - if (!anon_vma) - return; - - list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - printk(KERN_INFO "rescued: anon %s\n", - vma->vm_mm->owner->comm); - break; - } - page_unlock_anon_vma(anon_vma); -#endif - } -} - - /** * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list * @page: page to check evictability and move to appropriate lru list @@ -2417,10 +2456,9 @@ retry: if (page_evictable(page, NULL)) { enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); - show_page_path(page); - __dec_zone_state(zone, NR_UNEVICTABLE); list_move(&page->lru, &zone->lru[l].list); + mem_cgroup_move_lists(page, LRU_UNEVICTABLE, l); __inc_zone_state(zone, NR_INACTIVE_ANON + l); __count_vm_event(UNEVICTABLE_PGRESCUED); } else { @@ -2429,6 +2467,7 @@ retry: */ SetPageUnevictable(page); list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); + mem_cgroup_rotate_lru_list(page, LRU_UNEVICTABLE); if (page_evictable(page, NULL)) goto retry; } @@ -2500,7 +2539,7 @@ void scan_mapping_unevictable_pages(struct address_space *mapping) * back onto @zone's unevictable list. */ #define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ -void scan_zone_unevictable_pages(struct zone *zone) +static void scan_zone_unevictable_pages(struct zone *zone) { struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; unsigned long scan; @@ -2542,7 +2581,7 @@ void scan_zone_unevictable_pages(struct zone *zone) * that has possibly/probably made some previously unevictable pages * evictable. */ -void scan_all_zones_unevictable_pages(void) +static void scan_all_zones_unevictable_pages(void) { struct zone *zone;