#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
+#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/cpuset.h>
#include <linux/notifier.h>
#include <linux/rwsem.h>
+#include <linux/delay.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
#include <linux/swapops.h>
-/* possible outcome of pageout() */
-typedef enum {
- /* failed to write page out, page is locked */
- PAGE_KEEP,
- /* move page to the active list, page is locked */
- PAGE_ACTIVATE,
- /* page has been sent to the disk successfully, page is unlocked */
- PAGE_SUCCESS,
- /* page is clean and locked */
- PAGE_CLEAN,
-} pageout_t;
+#include "internal.h"
struct scan_control {
- /* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
- unsigned long nr_to_scan;
-
/* Incremented by the number of inactive pages that were scanned */
unsigned long nr_scanned;
- /* Incremented by the number of pages reclaimed */
- unsigned long nr_reclaimed;
-
- unsigned long nr_mapped; /* From page_state */
-
- /* How many pages shrink_cache() should reclaim */
- int nr_to_reclaim;
-
- /* Ask shrink_caches, or shrink_zone to scan at this priority */
- unsigned int priority;
-
/* This context's GFP mask */
gfp_t gfp_mask;
* In this context, it doesn't matter that we scan the
* whole list at once. */
int swap_cluster_max;
+
+ int swappiness;
+
+ int all_unreclaimable;
};
/*
* From 0 .. 100. Higher means more swappy.
*/
int vm_swappiness = 60;
-static long total_memory;
+long vm_total_pages; /* The total number of pages which the VM controls */
static LIST_HEAD(shrinker_list);
static DECLARE_RWSEM(shrinker_rwsem);
*
* Returns the number of slab objects which we shrunk.
*/
-static int shrink_slab(unsigned long scanned, gfp_t gfp_mask,
+unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
unsigned long lru_pages)
{
struct shrinker *shrinker;
- int ret = 0;
+ unsigned long ret = 0;
if (scanned == 0)
scanned = SWAP_CLUSTER_MAX;
list_for_each_entry(shrinker, &shrinker_list, list) {
unsigned long long delta;
unsigned long total_scan;
+ unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask);
delta = (4 * scanned) / shrinker->seeks;
- delta *= (*shrinker->shrinker)(0, gfp_mask);
+ delta *= max_pass;
do_div(delta, lru_pages + 1);
shrinker->nr += delta;
- if (shrinker->nr < 0)
- shrinker->nr = LONG_MAX; /* It wrapped! */
+ if (shrinker->nr < 0) {
+ printk(KERN_ERR "%s: nr=%ld\n",
+ __FUNCTION__, shrinker->nr);
+ shrinker->nr = max_pass;
+ }
+
+ /*
+ * Avoid risking looping forever due to too large nr value:
+ * never try to free more than twice the estimate number of
+ * freeable entries.
+ */
+ if (shrinker->nr > max_pass * 2)
+ shrinker->nr = max_pass * 2;
total_scan = shrinker->nr;
shrinker->nr = 0;
break;
if (shrink_ret < nr_before)
ret += nr_before - shrink_ret;
- mod_page_state(slabs_scanned, this_scan);
+ count_vm_events(SLABS_SCANNED, this_scan);
total_scan -= this_scan;
cond_resched();
static int may_write_to_queue(struct backing_dev_info *bdi)
{
- if (current_is_kswapd())
- return 1;
- if (current_is_pdflush()) /* This is unlikely, but why not... */
+ if (current->flags & PF_SWAPWRITE)
return 1;
if (!bdi_write_congested(bdi))
return 1;
struct page *page, int error)
{
lock_page(page);
- if (page_mapping(page) == mapping) {
- if (error == -ENOSPC)
- set_bit(AS_ENOSPC, &mapping->flags);
- else
- set_bit(AS_EIO, &mapping->flags);
- }
+ if (page_mapping(page) == mapping)
+ mapping_set_error(mapping, error);
unlock_page(page);
}
+/* possible outcome of pageout() */
+typedef enum {
+ /* failed to write page out, page is locked */
+ PAGE_KEEP,
+ /* move page to the active list, page is locked */
+ PAGE_ACTIVATE,
+ /* page has been sent to the disk successfully, page is unlocked */
+ PAGE_SUCCESS,
+ /* page is clean and locked */
+ PAGE_CLEAN,
+} pageout_t;
+
/*
- * pageout is called by shrink_list() for each dirty page. Calls ->writepage().
+ * pageout is called by shrink_page_list() for each dirty page.
+ * Calls ->writepage().
*/
static pageout_t pageout(struct page *page, struct address_space *mapping)
{
struct writeback_control wbc = {
.sync_mode = WB_SYNC_NONE,
.nr_to_write = SWAP_CLUSTER_MAX,
+ .range_start = 0,
+ .range_end = LLONG_MAX,
.nonblocking = 1,
.for_reclaim = 1,
};
res = mapping->a_ops->writepage(page, &wbc);
if (res < 0)
handle_write_error(mapping, page, res);
- if (res == WRITEPAGE_ACTIVATE) {
+ if (res == AOP_WRITEPAGE_ACTIVATE) {
ClearPageReclaim(page);
return PAGE_ACTIVATE;
}
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
-
+ inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
}
/*
- * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
+ * Attempt to detach a locked page from its ->mapping. If it is dirty or if
+ * someone else has a ref on the page, abort and return 0. If it was
+ * successfully detached, return 1. Assumes the caller has a single ref on
+ * this page.
+ */
+int remove_mapping(struct address_space *mapping, struct page *page)
+{
+ BUG_ON(!PageLocked(page));
+ BUG_ON(mapping != page_mapping(page));
+
+ write_lock_irq(&mapping->tree_lock);
+ /*
+ * The non racy check for a busy page.
+ *
+ * Must be careful with the order of the tests. When someone has
+ * a ref to the page, it may be possible that they dirty it then
+ * drop the reference. So if PageDirty is tested before page_count
+ * here, then the following race may occur:
+ *
+ * get_user_pages(&page);
+ * [user mapping goes away]
+ * write_to(page);
+ * !PageDirty(page) [good]
+ * SetPageDirty(page);
+ * put_page(page);
+ * !page_count(page) [good, discard it]
+ *
+ * [oops, our write_to data is lost]
+ *
+ * Reversing the order of the tests ensures such a situation cannot
+ * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+ * load is not satisfied before that of page->_count.
+ *
+ * Note that if SetPageDirty is always performed via set_page_dirty,
+ * and thus under tree_lock, then this ordering is not required.
+ */
+ if (unlikely(page_count(page) != 2))
+ goto cannot_free;
+ smp_rmb();
+ if (unlikely(PageDirty(page)))
+ goto cannot_free;
+
+ if (PageSwapCache(page)) {
+ swp_entry_t swap = { .val = page_private(page) };
+ __delete_from_swap_cache(page);
+ write_unlock_irq(&mapping->tree_lock);
+ swap_free(swap);
+ __put_page(page); /* The pagecache ref */
+ return 1;
+ }
+
+ __remove_from_page_cache(page);
+ write_unlock_irq(&mapping->tree_lock);
+ __put_page(page);
+ return 1;
+
+cannot_free:
+ write_unlock_irq(&mapping->tree_lock);
+ return 0;
+}
+
+/*
+ * shrink_page_list() returns the number of reclaimed pages
*/
-static int shrink_list(struct list_head *page_list, struct scan_control *sc)
+static unsigned long shrink_page_list(struct list_head *page_list,
+ struct scan_control *sc)
{
LIST_HEAD(ret_pages);
struct pagevec freed_pvec;
int pgactivate = 0;
- int reclaimed = 0;
+ unsigned long nr_reclaimed = 0;
cond_resched();
if (TestSetPageLocked(page))
goto keep;
- BUG_ON(PageActive(page));
+ VM_BUG_ON(PageActive(page));
sc->nr_scanned++;
+
+ if (!sc->may_swap && page_mapped(page))
+ goto keep_locked;
+
/* Double the slab pressure for mapped and swapcache pages */
if (page_mapped(page) || PageSwapCache(page))
sc->nr_scanned++;
* Anonymous process memory has backing store?
* Try to allocate it some swap space here.
*/
- if (PageAnon(page) && !PageSwapCache(page)) {
- if (!sc->may_swap)
- goto keep_locked;
- if (!add_to_swap(page))
+ if (PageAnon(page) && !PageSwapCache(page))
+ if (!add_to_swap(page, GFP_ATOMIC))
goto activate_locked;
- }
#endif /* CONFIG_SWAP */
mapping = page_mapping(page);
* processes. Try to unmap it here.
*/
if (page_mapped(page) && mapping) {
- switch (try_to_unmap(page)) {
+ switch (try_to_unmap(page, 0)) {
case SWAP_FAIL:
goto activate_locked;
case SWAP_AGAIN:
goto keep_locked;
if (!may_enter_fs)
goto keep_locked;
- if (laptop_mode && !sc->may_writepage)
+ if (!sc->may_writepage)
goto keep_locked;
/* Page is dirty, try to write it out here */
goto free_it;
}
- if (!mapping)
- goto keep_locked; /* truncate got there first */
-
- write_lock_irq(&mapping->tree_lock);
-
- /*
- * The non-racy check for busy page. It is critical to check
- * PageDirty _after_ making sure that the page is freeable and
- * not in use by anybody. (pagecache + us == 2)
- */
- if (unlikely(page_count(page) != 2))
- goto cannot_free;
- smp_rmb();
- if (unlikely(PageDirty(page)))
- goto cannot_free;
-
-#ifdef CONFIG_SWAP
- if (PageSwapCache(page)) {
- swp_entry_t swap = { .val = page_private(page) };
- __delete_from_swap_cache(page);
- write_unlock_irq(&mapping->tree_lock);
- swap_free(swap);
- __put_page(page); /* The pagecache ref */
- goto free_it;
- }
-#endif /* CONFIG_SWAP */
-
- __remove_from_page_cache(page);
- write_unlock_irq(&mapping->tree_lock);
- __put_page(page);
+ if (!mapping || !remove_mapping(mapping, page))
+ goto keep_locked;
free_it:
unlock_page(page);
- reclaimed++;
+ nr_reclaimed++;
if (!pagevec_add(&freed_pvec, page))
__pagevec_release_nonlru(&freed_pvec);
continue;
-cannot_free:
- write_unlock_irq(&mapping->tree_lock);
- goto keep_locked;
-
activate_locked:
SetPageActive(page);
pgactivate++;
unlock_page(page);
keep:
list_add(&page->lru, &ret_pages);
- BUG_ON(PageLRU(page));
+ VM_BUG_ON(PageLRU(page));
}
list_splice(&ret_pages, page_list);
if (pagevec_count(&freed_pvec))
__pagevec_release_nonlru(&freed_pvec);
- mod_page_state(pgactivate, pgactivate);
- sc->nr_reclaimed += reclaimed;
- return reclaimed;
+ count_vm_events(PGACTIVATE, pgactivate);
+ return nr_reclaimed;
}
/*
*
* returns how many pages were moved onto *@dst.
*/
-static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
- struct list_head *dst, int *scanned)
+static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
+ struct list_head *src, struct list_head *dst,
+ unsigned long *scanned)
{
- int nr_taken = 0;
+ unsigned long nr_taken = 0;
struct page *page;
- int scan = 0;
+ unsigned long scan;
- while (scan++ < nr_to_scan && !list_empty(src)) {
+ for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
+ struct list_head *target;
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
- if (!TestClearPageLRU(page))
- BUG();
+ VM_BUG_ON(!PageLRU(page));
+
list_del(&page->lru);
- if (get_page_testone(page)) {
+ target = src;
+ if (likely(get_page_unless_zero(page))) {
/*
- * It is being freed elsewhere
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
*/
- __put_page(page);
- SetPageLRU(page);
- list_add(&page->lru, src);
- continue;
- } else {
- list_add(&page->lru, dst);
+ ClearPageLRU(page);
+ target = dst;
nr_taken++;
- }
+ } /* else it is being freed elsewhere */
+
+ list_add(&page->lru, target);
}
*scanned = scan;
}
/*
- * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
+ * shrink_inactive_list() is a helper for shrink_zone(). It returns the number
+ * of reclaimed pages
*/
-static void shrink_cache(struct zone *zone, struct scan_control *sc)
+static unsigned long shrink_inactive_list(unsigned long max_scan,
+ struct zone *zone, struct scan_control *sc)
{
LIST_HEAD(page_list);
struct pagevec pvec;
- int max_scan = sc->nr_to_scan;
+ unsigned long nr_scanned = 0;
+ unsigned long nr_reclaimed = 0;
pagevec_init(&pvec, 1);
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
- while (max_scan > 0) {
+ do {
struct page *page;
- int nr_taken;
- int nr_scan;
- int nr_freed;
+ unsigned long nr_taken;
+ unsigned long nr_scan;
+ unsigned long nr_freed;
nr_taken = isolate_lru_pages(sc->swap_cluster_max,
&zone->inactive_list,
&page_list, &nr_scan);
- zone->nr_inactive -= nr_taken;
+ __mod_zone_page_state(zone, NR_INACTIVE, -nr_taken);
zone->pages_scanned += nr_scan;
spin_unlock_irq(&zone->lru_lock);
+ nr_scanned += nr_scan;
+ nr_freed = shrink_page_list(&page_list, sc);
+ nr_reclaimed += nr_freed;
+ local_irq_disable();
+ if (current_is_kswapd()) {
+ __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
+ __count_vm_events(KSWAPD_STEAL, nr_freed);
+ } else
+ __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+ __count_zone_vm_events(PGSTEAL, zone, nr_freed);
+
if (nr_taken == 0)
goto done;
- max_scan -= nr_scan;
- if (current_is_kswapd())
- mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
- else
- mod_page_state_zone(zone, pgscan_direct, nr_scan);
- nr_freed = shrink_list(&page_list, sc);
- if (current_is_kswapd())
- mod_page_state(kswapd_steal, nr_freed);
- mod_page_state_zone(zone, pgsteal, nr_freed);
- sc->nr_to_reclaim -= nr_freed;
-
- spin_lock_irq(&zone->lru_lock);
+ spin_lock(&zone->lru_lock);
/*
* Put back any unfreeable pages.
*/
while (!list_empty(&page_list)) {
page = lru_to_page(&page_list);
- if (TestSetPageLRU(page))
- BUG();
+ VM_BUG_ON(PageLRU(page));
+ SetPageLRU(page);
list_del(&page->lru);
if (PageActive(page))
add_page_to_active_list(zone, page);
spin_lock_irq(&zone->lru_lock);
}
}
- }
- spin_unlock_irq(&zone->lru_lock);
+ } while (nr_scanned < max_scan);
+ spin_unlock(&zone->lru_lock);
done:
+ local_irq_enable();
pagevec_release(&pvec);
+ return nr_reclaimed;
+}
+
+/*
+ * We are about to scan this zone at a certain priority level. If that priority
+ * level is smaller (ie: more urgent) than the previous priority, then note
+ * that priority level within the zone. This is done so that when the next
+ * process comes in to scan this zone, it will immediately start out at this
+ * priority level rather than having to build up its own scanning priority.
+ * Here, this priority affects only the reclaim-mapped threshold.
+ */
+static inline void note_zone_scanning_priority(struct zone *zone, int priority)
+{
+ if (priority < zone->prev_priority)
+ zone->prev_priority = priority;
+}
+
+static inline int zone_is_near_oom(struct zone *zone)
+{
+ return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE)
+ + zone_page_state(zone, NR_INACTIVE))*3;
}
/*
* The downside is that we have to touch page->_count against each page.
* But we had to alter page->flags anyway.
*/
-static void
-refill_inactive_zone(struct zone *zone, struct scan_control *sc)
+static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
+ struct scan_control *sc, int priority)
{
- int pgmoved;
+ unsigned long pgmoved;
int pgdeactivate = 0;
- int pgscanned;
- int nr_pages = sc->nr_to_scan;
+ unsigned long pgscanned;
LIST_HEAD(l_hold); /* The pages which were snipped off */
LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
LIST_HEAD(l_active); /* Pages to go onto the active_list */
struct page *page;
struct pagevec pvec;
int reclaim_mapped = 0;
- long mapped_ratio;
- long distress;
- long swap_tendency;
+
+ if (sc->may_swap) {
+ long mapped_ratio;
+ long distress;
+ long swap_tendency;
+
+ if (zone_is_near_oom(zone))
+ goto force_reclaim_mapped;
+
+ /*
+ * `distress' is a measure of how much trouble we're having
+ * reclaiming pages. 0 -> no problems. 100 -> great trouble.
+ */
+ distress = 100 >> min(zone->prev_priority, priority);
+
+ /*
+ * The point of this algorithm is to decide when to start
+ * reclaiming mapped memory instead of just pagecache. Work out
+ * how much memory
+ * is mapped.
+ */
+ mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
+ global_page_state(NR_ANON_PAGES)) * 100) /
+ vm_total_pages;
+
+ /*
+ * Now decide how much we really want to unmap some pages. The
+ * mapped ratio is downgraded - just because there's a lot of
+ * mapped memory doesn't necessarily mean that page reclaim
+ * isn't succeeding.
+ *
+ * The distress ratio is important - we don't want to start
+ * going oom.
+ *
+ * A 100% value of vm_swappiness overrides this algorithm
+ * altogether.
+ */
+ swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
+
+ /*
+ * Now use this metric to decide whether to start moving mapped
+ * memory onto the inactive list.
+ */
+ if (swap_tendency >= 100)
+force_reclaim_mapped:
+ reclaim_mapped = 1;
+ }
lru_add_drain();
spin_lock_irq(&zone->lru_lock);
pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
&l_hold, &pgscanned);
zone->pages_scanned += pgscanned;
- zone->nr_active -= pgmoved;
+ __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
spin_unlock_irq(&zone->lru_lock);
- /*
- * `distress' is a measure of how much trouble we're having reclaiming
- * pages. 0 -> no problems. 100 -> great trouble.
- */
- distress = 100 >> zone->prev_priority;
-
- /*
- * The point of this algorithm is to decide when to start reclaiming
- * mapped memory instead of just pagecache. Work out how much memory
- * is mapped.
- */
- mapped_ratio = (sc->nr_mapped * 100) / total_memory;
-
- /*
- * Now decide how much we really want to unmap some pages. The mapped
- * ratio is downgraded - just because there's a lot of mapped memory
- * doesn't necessarily mean that page reclaim isn't succeeding.
- *
- * The distress ratio is important - we don't want to start going oom.
- *
- * A 100% value of vm_swappiness overrides this algorithm altogether.
- */
- swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
-
- /*
- * Now use this metric to decide whether to start moving mapped memory
- * onto the inactive list.
- */
- if (swap_tendency >= 100)
- reclaim_mapped = 1;
-
while (!list_empty(&l_hold)) {
cond_resched();
page = lru_to_page(&l_hold);
while (!list_empty(&l_inactive)) {
page = lru_to_page(&l_inactive);
prefetchw_prev_lru_page(page, &l_inactive, flags);
- if (TestSetPageLRU(page))
- BUG();
- if (!TestClearPageActive(page))
- BUG();
+ VM_BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ VM_BUG_ON(!PageActive(page));
+ ClearPageActive(page);
+
list_move(&page->lru, &zone->inactive_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
- zone->nr_inactive += pgmoved;
+ __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
spin_unlock_irq(&zone->lru_lock);
pgdeactivate += pgmoved;
pgmoved = 0;
spin_lock_irq(&zone->lru_lock);
}
}
- zone->nr_inactive += pgmoved;
+ __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
pgdeactivate += pgmoved;
if (buffer_heads_over_limit) {
spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_active)) {
page = lru_to_page(&l_active);
prefetchw_prev_lru_page(page, &l_active, flags);
- if (TestSetPageLRU(page))
- BUG();
- BUG_ON(!PageActive(page));
+ VM_BUG_ON(PageLRU(page));
+ SetPageLRU(page);
+ VM_BUG_ON(!PageActive(page));
list_move(&page->lru, &zone->active_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
- zone->nr_active += pgmoved;
+ __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
pgmoved = 0;
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
- zone->nr_active += pgmoved;
+ __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
+
+ __count_zone_vm_events(PGREFILL, zone, pgscanned);
+ __count_vm_events(PGDEACTIVATE, pgdeactivate);
spin_unlock_irq(&zone->lru_lock);
- pagevec_release(&pvec);
- mod_page_state_zone(zone, pgrefill, pgscanned);
- mod_page_state(pgdeactivate, pgdeactivate);
+ pagevec_release(&pvec);
}
/*
* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
*/
-static void
-shrink_zone(struct zone *zone, struct scan_control *sc)
+static unsigned long shrink_zone(int priority, struct zone *zone,
+ struct scan_control *sc)
{
unsigned long nr_active;
unsigned long nr_inactive;
+ unsigned long nr_to_scan;
+ unsigned long nr_reclaimed = 0;
atomic_inc(&zone->reclaim_in_progress);
* Add one to `nr_to_scan' just to make sure that the kernel will
* slowly sift through the active list.
*/
- zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
+ zone->nr_scan_active +=
+ (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
nr_active = zone->nr_scan_active;
if (nr_active >= sc->swap_cluster_max)
zone->nr_scan_active = 0;
else
nr_active = 0;
- zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
+ zone->nr_scan_inactive +=
+ (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
nr_inactive = zone->nr_scan_inactive;
if (nr_inactive >= sc->swap_cluster_max)
zone->nr_scan_inactive = 0;
else
nr_inactive = 0;
- sc->nr_to_reclaim = sc->swap_cluster_max;
-
while (nr_active || nr_inactive) {
if (nr_active) {
- sc->nr_to_scan = min(nr_active,
+ nr_to_scan = min(nr_active,
(unsigned long)sc->swap_cluster_max);
- nr_active -= sc->nr_to_scan;
- refill_inactive_zone(zone, sc);
+ nr_active -= nr_to_scan;
+ shrink_active_list(nr_to_scan, zone, sc, priority);
}
if (nr_inactive) {
- sc->nr_to_scan = min(nr_inactive,
+ nr_to_scan = min(nr_inactive,
(unsigned long)sc->swap_cluster_max);
- nr_inactive -= sc->nr_to_scan;
- shrink_cache(zone, sc);
- if (sc->nr_to_reclaim <= 0)
- break;
+ nr_inactive -= nr_to_scan;
+ nr_reclaimed += shrink_inactive_list(nr_to_scan, zone,
+ sc);
}
}
- throttle_vm_writeout();
+ throttle_vm_writeout(sc->gfp_mask);
atomic_dec(&zone->reclaim_in_progress);
+ return nr_reclaimed;
}
/*
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static void
-shrink_caches(struct zone **zones, struct scan_control *sc)
+static unsigned long shrink_zones(int priority, struct zone **zones,
+ struct scan_control *sc)
{
+ unsigned long nr_reclaimed = 0;
int i;
+ sc->all_unreclaimable = 1;
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- zone->temp_priority = sc->priority;
- if (zone->prev_priority > sc->priority)
- zone->prev_priority = sc->priority;
+ note_zone_scanning_priority(zone, priority);
- if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
- shrink_zone(zone, sc);
+ sc->all_unreclaimable = 0;
+
+ nr_reclaimed += shrink_zone(priority, zone, sc);
}
+ return nr_reclaimed;
}
/*
* holds filesystem locks which prevent writeout this might not work, and the
* allocation attempt will fail.
*/
-int try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
+unsigned long try_to_free_pages(struct zone **zones, gfp_t gfp_mask)
{
int priority;
int ret = 0;
- int total_scanned = 0, total_reclaimed = 0;
+ unsigned long total_scanned = 0;
+ unsigned long nr_reclaimed = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
- struct scan_control sc;
unsigned long lru_pages = 0;
int i;
+ struct scan_control sc = {
+ .gfp_mask = gfp_mask,
+ .may_writepage = !laptop_mode,
+ .swap_cluster_max = SWAP_CLUSTER_MAX,
+ .may_swap = 1,
+ .swappiness = vm_swappiness,
+ };
- sc.gfp_mask = gfp_mask;
- sc.may_writepage = 0;
- sc.may_swap = 1;
-
- inc_page_state(allocstall);
+ count_vm_event(ALLOCSTALL);
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- zone->temp_priority = DEF_PRIORITY;
- lru_pages += zone->nr_active + zone->nr_inactive;
+ lru_pages += zone_page_state(zone, NR_ACTIVE)
+ + zone_page_state(zone, NR_INACTIVE);
}
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
- sc.nr_mapped = read_page_state(nr_mapped);
sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- sc.priority = priority;
- sc.swap_cluster_max = SWAP_CLUSTER_MAX;
if (!priority)
disable_swap_token();
- shrink_caches(zones, &sc);
+ nr_reclaimed += shrink_zones(priority, zones, &sc);
shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
if (reclaim_state) {
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+ nr_reclaimed += reclaim_state->reclaimed_slab;
reclaim_state->reclaimed_slab = 0;
}
total_scanned += sc.nr_scanned;
- total_reclaimed += sc.nr_reclaimed;
- if (total_reclaimed >= sc.swap_cluster_max) {
+ if (nr_reclaimed >= sc.swap_cluster_max) {
ret = 1;
goto out;
}
* that's undesirable in laptop mode, where we *want* lumpy
* writeout. So in laptop mode, write out the whole world.
*/
- if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
+ if (total_scanned > sc.swap_cluster_max +
+ sc.swap_cluster_max / 2) {
wakeup_pdflush(laptop_mode ? 0 : total_scanned);
sc.may_writepage = 1;
}
/* Take a nap, wait for some writeback to complete */
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
- blk_congestion_wait(WRITE, HZ/10);
+ congestion_wait(WRITE, HZ/10);
}
+ /* top priority shrink_caches still had more to do? don't OOM, then */
+ if (!sc.all_unreclaimable)
+ ret = 1;
out:
+ /*
+ * Now that we've scanned all the zones at this priority level, note
+ * that level within the zone so that the next thread which performs
+ * scanning of this zone will immediately start out at this priority
+ * level. This affects only the decision whether or not to bring
+ * mapped pages onto the inactive list.
+ */
+ if (priority < 0)
+ priority = 0;
for (i = 0; zones[i] != 0; i++) {
struct zone *zone = zones[i];
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- zone->prev_priority = zone->temp_priority;
+ zone->prev_priority = priority;
}
return ret;
}
* For kswapd, balance_pgdat() will work across all this node's zones until
* they are all at pages_high.
*
- * If `nr_pages' is non-zero then it is the number of pages which are to be
- * reclaimed, regardless of the zone occupancies. This is a software suspend
- * special.
- *
* Returns the number of pages which were actually freed.
*
* There is special handling here for zones which are full of pinned pages.
* the page allocator fallback scheme to ensure that aging of pages is balanced
* across the zones.
*/
-static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
+static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
{
- int to_free = nr_pages;
int all_zones_ok;
int priority;
int i;
- int total_scanned, total_reclaimed;
+ unsigned long total_scanned;
+ unsigned long nr_reclaimed;
struct reclaim_state *reclaim_state = current->reclaim_state;
- struct scan_control sc;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .may_swap = 1,
+ .swap_cluster_max = SWAP_CLUSTER_MAX,
+ .swappiness = vm_swappiness,
+ };
+ /*
+ * temp_priority is used to remember the scanning priority at which
+ * this zone was successfully refilled to free_pages == pages_high.
+ */
+ int temp_priority[MAX_NR_ZONES];
loop_again:
total_scanned = 0;
- total_reclaimed = 0;
- sc.gfp_mask = GFP_KERNEL;
- sc.may_writepage = 0;
- sc.may_swap = 1;
- sc.nr_mapped = read_page_state(nr_mapped);
-
- inc_page_state(pageoutrun);
+ nr_reclaimed = 0;
+ sc.may_writepage = !laptop_mode;
+ count_vm_event(PAGEOUTRUN);
- for (i = 0; i < pgdat->nr_zones; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- zone->temp_priority = DEF_PRIORITY;
- }
+ for (i = 0; i < pgdat->nr_zones; i++)
+ temp_priority[i] = DEF_PRIORITY;
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
all_zones_ok = 1;
- if (nr_pages == 0) {
- /*
- * Scan in the highmem->dma direction for the highest
- * zone which needs scanning
- */
- for (i = pgdat->nr_zones - 1; i >= 0; i--) {
- struct zone *zone = pgdat->node_zones + i;
+ /*
+ * Scan in the highmem->dma direction for the highest
+ * zone which needs scanning
+ */
+ for (i = pgdat->nr_zones - 1; i >= 0; i--) {
+ struct zone *zone = pgdat->node_zones + i;
- if (zone->present_pages == 0)
- continue;
+ if (!populated_zone(zone))
+ continue;
- if (zone->all_unreclaimable &&
- priority != DEF_PRIORITY)
- continue;
+ if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+ continue;
- if (!zone_watermark_ok(zone, order,
- zone->pages_high, 0, 0)) {
- end_zone = i;
- goto scan;
- }
+ if (!zone_watermark_ok(zone, order, zone->pages_high,
+ 0, 0)) {
+ end_zone = i;
+ break;
}
- goto out;
- } else {
- end_zone = pgdat->nr_zones - 1;
}
-scan:
+ if (i < 0)
+ goto out;
+
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
- lru_pages += zone->nr_active + zone->nr_inactive;
+ lru_pages += zone_page_state(zone, NR_ACTIVE)
+ + zone_page_state(zone, NR_INACTIVE);
}
/*
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
continue;
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue;
- if (nr_pages == 0) { /* Not software suspend */
- if (!zone_watermark_ok(zone, order,
- zone->pages_high, end_zone, 0))
- all_zones_ok = 0;
- }
- zone->temp_priority = priority;
- if (zone->prev_priority > priority)
- zone->prev_priority = priority;
+ if (!zone_watermark_ok(zone, order, zone->pages_high,
+ end_zone, 0))
+ all_zones_ok = 0;
+ temp_priority[i] = priority;
sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- sc.priority = priority;
- sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
- atomic_inc(&zone->reclaim_in_progress);
- shrink_zone(zone, &sc);
- atomic_dec(&zone->reclaim_in_progress);
+ note_zone_scanning_priority(zone, priority);
+ nr_reclaimed += shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
lru_pages);
- sc.nr_reclaimed += reclaim_state->reclaimed_slab;
- total_reclaimed += sc.nr_reclaimed;
+ nr_reclaimed += reclaim_state->reclaimed_slab;
total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
if (nr_slab == 0 && zone->pages_scanned >=
- (zone->nr_active + zone->nr_inactive) * 4)
- zone->all_unreclaimable = 1;
+ (zone_page_state(zone, NR_ACTIVE)
+ + zone_page_state(zone, NR_INACTIVE)) * 6)
+ zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
* even in laptop mode
*/
if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
- total_scanned > total_reclaimed+total_reclaimed/2)
+ total_scanned > nr_reclaimed + nr_reclaimed / 2)
sc.may_writepage = 1;
}
- if (nr_pages && to_free > total_reclaimed)
- continue; /* swsusp: need to do more work */
if (all_zones_ok)
break; /* kswapd: all done */
/*
* another pass across the zones.
*/
if (total_scanned && priority < DEF_PRIORITY - 2)
- blk_congestion_wait(WRITE, HZ/10);
+ congestion_wait(WRITE, HZ/10);
/*
* We do this so kswapd doesn't build up large priorities for
* matches the direct reclaim path behaviour in terms of impact
* on zone->*_priority.
*/
- if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
+ if (nr_reclaimed >= SWAP_CLUSTER_MAX)
break;
}
out:
+ /*
+ * Note within each zone the priority level at which this zone was
+ * brought into a happy state. So that the next thread which scans this
+ * zone will start out at that priority level.
+ */
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
- zone->prev_priority = zone->temp_priority;
+ zone->prev_priority = temp_priority[i];
}
if (!all_zones_ok) {
cond_resched();
+
+ try_to_freeze();
+
goto loop_again;
}
- return total_reclaimed;
+ return nr_reclaimed;
}
/*
};
cpumask_t cpumask;
- daemonize("kswapd%d", pgdat->node_id);
cpumask = node_to_cpumask(pgdat->node_id);
if (!cpus_empty(cpumask))
set_cpus_allowed(tsk, cpumask);
* us from recursively trying to free more memory as we're
* trying to free the first piece of memory in the first place).
*/
- tsk->flags |= PF_MEMALLOC|PF_KSWAPD;
+ tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
order = 0;
for ( ; ; ) {
unsigned long new_order;
- try_to_freeze();
-
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
new_order = pgdat->kswapd_max_order;
pgdat->kswapd_max_order = 0;
*/
order = new_order;
} else {
- schedule();
+ if (!freezing(current))
+ schedule();
+
order = pgdat->kswapd_max_order;
}
finish_wait(&pgdat->kswapd_wait, &wait);
- balance_pgdat(pgdat, 0, order);
+ if (!try_to_freeze()) {
+ /* We can speed up thawing tasks if we don't call
+ * balance_pgdat after returning from the refrigerator
+ */
+ balance_pgdat(pgdat, order);
+ }
}
return 0;
}
{
pg_data_t *pgdat;
- if (zone->present_pages == 0)
+ if (!populated_zone(zone))
return;
pgdat = zone->zone_pgdat;
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
#ifdef CONFIG_PM
/*
- * Try to free `nr_pages' of memory, system-wide. Returns the number of freed
- * pages.
+ * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
+ * from LRU lists system-wide, for given pass and priority, and returns the
+ * number of reclaimed pages
+ *
+ * For pass > 3 we also try to shrink the LRU lists that contain a few pages
*/
-int shrink_all_memory(int nr_pages)
+static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+ int pass, struct scan_control *sc)
{
- pg_data_t *pgdat;
- int nr_to_free = nr_pages;
- int ret = 0;
- struct reclaim_state reclaim_state = {
- .reclaimed_slab = 0,
+ struct zone *zone;
+ unsigned long nr_to_scan, ret = 0;
+
+ for_each_zone(zone) {
+
+ if (!populated_zone(zone))
+ continue;
+
+ if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+ continue;
+
+ /* For pass = 0 we don't shrink the active list */
+ if (pass > 0) {
+ zone->nr_scan_active +=
+ (zone_page_state(zone, NR_ACTIVE) >> prio) + 1;
+ if (zone->nr_scan_active >= nr_pages || pass > 3) {
+ zone->nr_scan_active = 0;
+ nr_to_scan = min(nr_pages,
+ zone_page_state(zone, NR_ACTIVE));
+ shrink_active_list(nr_to_scan, zone, sc, prio);
+ }
+ }
+
+ zone->nr_scan_inactive +=
+ (zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
+ if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
+ zone->nr_scan_inactive = 0;
+ nr_to_scan = min(nr_pages,
+ zone_page_state(zone, NR_INACTIVE));
+ ret += shrink_inactive_list(nr_to_scan, zone, sc);
+ if (ret >= nr_pages)
+ return ret;
+ }
+ }
+
+ return ret;
+}
+
+static unsigned long count_lru_pages(void)
+{
+ return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
+}
+
+/*
+ * Try to free `nr_pages' of memory, system-wide, and return the number of
+ * freed pages.
+ *
+ * Rather than trying to age LRUs the aim is to preserve the overall
+ * LRU order by reclaiming preferentially
+ * inactive > active > active referenced > active mapped
+ */
+unsigned long shrink_all_memory(unsigned long nr_pages)
+{
+ unsigned long lru_pages, nr_slab;
+ unsigned long ret = 0;
+ int pass;
+ struct reclaim_state reclaim_state;
+ struct scan_control sc = {
+ .gfp_mask = GFP_KERNEL,
+ .may_swap = 0,
+ .swap_cluster_max = nr_pages,
+ .may_writepage = 1,
+ .swappiness = vm_swappiness,
};
current->reclaim_state = &reclaim_state;
- for_each_pgdat(pgdat) {
- int freed;
- freed = balance_pgdat(pgdat, nr_to_free, 0);
- ret += freed;
- nr_to_free -= freed;
- if (nr_to_free <= 0)
+
+ lru_pages = count_lru_pages();
+ nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
+ /* If slab caches are huge, it's better to hit them first */
+ while (nr_slab >= lru_pages) {
+ reclaim_state.reclaimed_slab = 0;
+ shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+ if (!reclaim_state.reclaimed_slab)
break;
+
+ ret += reclaim_state.reclaimed_slab;
+ if (ret >= nr_pages)
+ goto out;
+
+ nr_slab -= reclaim_state.reclaimed_slab;
+ }
+
+ /*
+ * We try to shrink LRUs in 5 passes:
+ * 0 = Reclaim from inactive_list only
+ * 1 = Reclaim from active list but don't reclaim mapped
+ * 2 = 2nd pass of type 1
+ * 3 = Reclaim mapped (normal reclaim)
+ * 4 = 2nd pass of type 3
+ */
+ for (pass = 0; pass < 5; pass++) {
+ int prio;
+
+ /* Force reclaiming mapped pages in the passes #3 and #4 */
+ if (pass > 2) {
+ sc.may_swap = 1;
+ sc.swappiness = 100;
+ }
+
+ for (prio = DEF_PRIORITY; prio >= 0; prio--) {
+ unsigned long nr_to_scan = nr_pages - ret;
+
+ sc.nr_scanned = 0;
+ ret += shrink_all_zones(nr_to_scan, prio, pass, &sc);
+ if (ret >= nr_pages)
+ goto out;
+
+ reclaim_state.reclaimed_slab = 0;
+ shrink_slab(sc.nr_scanned, sc.gfp_mask,
+ count_lru_pages());
+ ret += reclaim_state.reclaimed_slab;
+ if (ret >= nr_pages)
+ goto out;
+
+ if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
+ congestion_wait(WRITE, HZ / 10);
+ }
}
+
+ /*
+ * If ret = 0, we could not shrink LRUs, but there may be something
+ * in slab caches
+ */
+ if (!ret) {
+ do {
+ reclaim_state.reclaimed_slab = 0;
+ shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
+ ret += reclaim_state.reclaimed_slab;
+ } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+ }
+
+out:
current->reclaim_state = NULL;
+
return ret;
}
#endif
-#ifdef CONFIG_HOTPLUG_CPU
/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
restore their cpu bindings. */
static int __devinit cpu_callback(struct notifier_block *nfb,
- unsigned long action,
- void *hcpu)
+ unsigned long action, void *hcpu)
{
pg_data_t *pgdat;
cpumask_t mask;
- if (action == CPU_ONLINE) {
- for_each_pgdat(pgdat) {
+ if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
+ for_each_online_pgdat(pgdat) {
mask = node_to_cpumask(pgdat->node_id);
if (any_online_cpu(mask) != NR_CPUS)
/* One of our CPUs online: restore mask */
}
return NOTIFY_OK;
}
-#endif /* CONFIG_HOTPLUG_CPU */
+
+/*
+ * This kswapd start function will be called by init and node-hot-add.
+ * On node-hot-add, kswapd will moved to proper cpus if cpus are hot-added.
+ */
+int kswapd_run(int nid)
+{
+ pg_data_t *pgdat = NODE_DATA(nid);
+ int ret = 0;
+
+ if (pgdat->kswapd)
+ return 0;
+
+ pgdat->kswapd = kthread_run(kswapd, pgdat, "kswapd%d", nid);
+ if (IS_ERR(pgdat->kswapd)) {
+ /* failure at boot is fatal */
+ BUG_ON(system_state == SYSTEM_BOOTING);
+ printk("Failed to start kswapd on node %d\n",nid);
+ ret = -1;
+ }
+ return ret;
+}
static int __init kswapd_init(void)
{
- pg_data_t *pgdat;
+ int nid;
+
swap_setup();
- for_each_pgdat(pgdat)
- pgdat->kswapd
- = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
- total_memory = nr_free_pagecache_pages();
+ for_each_online_node(nid)
+ kswapd_run(nid);
hotcpu_notifier(cpu_callback, 0);
return 0;
}
module_init(kswapd_init)
+#ifdef CONFIG_NUMA
+/*
+ * Zone reclaim mode
+ *
+ * If non-zero call zone_reclaim when the number of free pages falls below
+ * the watermarks.
+ */
+int zone_reclaim_mode __read_mostly;
+
+#define RECLAIM_OFF 0
+#define RECLAIM_ZONE (1<<0) /* Run shrink_cache on the zone */
+#define RECLAIM_WRITE (1<<1) /* Writeout pages during reclaim */
+#define RECLAIM_SWAP (1<<2) /* Swap pages out during reclaim */
+
+/*
+ * Priority for ZONE_RECLAIM. This determines the fraction of pages
+ * of a node considered for each zone_reclaim. 4 scans 1/16th of
+ * a zone.
+ */
+#define ZONE_RECLAIM_PRIORITY 4
+
+/*
+ * Percentage of pages in a zone that must be unmapped for zone_reclaim to
+ * occur.
+ */
+int sysctl_min_unmapped_ratio = 1;
+
+/*
+ * If the number of slab pages in a zone grows beyond this percentage then
+ * slab reclaim needs to occur.
+ */
+int sysctl_min_slab_ratio = 5;
/*
* Try to free up some pages from this zone through reclaim.
*/
-int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
+static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
- struct scan_control sc;
- int nr_pages = 1 << order;
- int total_reclaimed = 0;
-
- /* The reclaim may sleep, so don't do it if sleep isn't allowed */
- if (!(gfp_mask & __GFP_WAIT))
- return 0;
- if (zone->all_unreclaimable)
- return 0;
+ /* Minimum pages needed in order to stay on node */
+ const unsigned long nr_pages = 1 << order;
+ struct task_struct *p = current;
+ struct reclaim_state reclaim_state;
+ int priority;
+ unsigned long nr_reclaimed = 0;
+ struct scan_control sc = {
+ .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
+ .may_swap = !!(zone_reclaim_mode & RECLAIM_SWAP),
+ .swap_cluster_max = max_t(unsigned long, nr_pages,
+ SWAP_CLUSTER_MAX),
+ .gfp_mask = gfp_mask,
+ .swappiness = vm_swappiness,
+ };
+ unsigned long slab_reclaimable;
- sc.gfp_mask = gfp_mask;
- sc.may_writepage = 0;
- sc.may_swap = 0;
- sc.nr_mapped = read_page_state(nr_mapped);
- sc.nr_scanned = 0;
- sc.nr_reclaimed = 0;
- /* scan at the highest priority */
- sc.priority = 0;
disable_swap_token();
+ cond_resched();
+ /*
+ * We need to be able to allocate from the reserves for RECLAIM_SWAP
+ * and we also need to be able to write out pages for RECLAIM_WRITE
+ * and RECLAIM_SWAP.
+ */
+ p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+ reclaim_state.reclaimed_slab = 0;
+ p->reclaim_state = &reclaim_state;
- if (nr_pages > SWAP_CLUSTER_MAX)
- sc.swap_cluster_max = nr_pages;
- else
- sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+ if (zone_page_state(zone, NR_FILE_PAGES) -
+ zone_page_state(zone, NR_FILE_MAPPED) >
+ zone->min_unmapped_pages) {
+ /*
+ * Free memory by calling shrink zone with increasing
+ * priorities until we have enough memory freed.
+ */
+ priority = ZONE_RECLAIM_PRIORITY;
+ do {
+ note_zone_scanning_priority(zone, priority);
+ nr_reclaimed += shrink_zone(priority, zone, &sc);
+ priority--;
+ } while (priority >= 0 && nr_reclaimed < nr_pages);
+ }
- /* Don't reclaim the zone if there are other reclaimers active */
- if (atomic_read(&zone->reclaim_in_progress) > 0)
- goto out;
+ slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (slab_reclaimable > zone->min_slab_pages) {
+ /*
+ * shrink_slab() does not currently allow us to determine how
+ * many pages were freed in this zone. So we take the current
+ * number of slab pages and shake the slab until it is reduced
+ * by the same nr_pages that we used for reclaiming unmapped
+ * pages.
+ *
+ * Note that shrink_slab will free memory on all zones and may
+ * take a long time.
+ */
+ while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
+ slab_reclaimable - nr_pages)
+ ;
- shrink_zone(zone, &sc);
- total_reclaimed = sc.nr_reclaimed;
+ /*
+ * Update nr_reclaimed by the number of slab pages we
+ * reclaimed from this zone.
+ */
+ nr_reclaimed += slab_reclaimable -
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ }
- out:
- return total_reclaimed;
+ p->reclaim_state = NULL;
+ current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+ return nr_reclaimed >= nr_pages;
}
-asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
- unsigned int state)
+int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
- struct zone *z;
- int i;
-
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- if (node >= MAX_NUMNODES || !node_online(node))
- return -EINVAL;
-
- /* This will break if we ever add more zones */
- if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
- return -EINVAL;
-
- for (i = 0; i < MAX_NR_ZONES; i++) {
- if (!(zone & 1<<i))
- continue;
+ cpumask_t mask;
+ int node_id;
- z = &NODE_DATA(node)->node_zones[i];
+ /*
+ * Zone reclaim reclaims unmapped file backed pages and
+ * slab pages if we are over the defined limits.
+ *
+ * A small portion of unmapped file backed pages is needed for
+ * file I/O otherwise pages read by file I/O will be immediately
+ * thrown out if the zone is overallocated. So we do not reclaim
+ * if less than a specified percentage of the zone is used by
+ * unmapped file backed pages.
+ */
+ if (zone_page_state(zone, NR_FILE_PAGES) -
+ zone_page_state(zone, NR_FILE_MAPPED) <= zone->min_unmapped_pages
+ && zone_page_state(zone, NR_SLAB_RECLAIMABLE)
+ <= zone->min_slab_pages)
+ return 0;
- if (state)
- z->reclaim_pages = 1;
- else
- z->reclaim_pages = 0;
- }
+ /*
+ * Avoid concurrent zone reclaims, do not reclaim in a zone that does
+ * not have reclaimable pages and if we should not delay the allocation
+ * then do not scan.
+ */
+ if (!(gfp_mask & __GFP_WAIT) ||
+ zone->all_unreclaimable ||
+ atomic_read(&zone->reclaim_in_progress) > 0 ||
+ (current->flags & PF_MEMALLOC))
+ return 0;
- return 0;
+ /*
+ * Only run zone reclaim on the local zone or on zones that do not
+ * have associated processors. This will favor the local processor
+ * over remote processors and spread off node memory allocations
+ * as wide as possible.
+ */
+ node_id = zone_to_nid(zone);
+ mask = node_to_cpumask(node_id);
+ if (!cpus_empty(mask) && node_id != numa_node_id())
+ return 0;
+ return __zone_reclaim(zone, gfp_mask, order);
}
+#endif