#include <linux/pagemap.h>
#include <linux/init.h>
#include <linux/highmem.h>
+#include <linux/vmstat.h>
#include <linux/file.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/rwsem.h>
#include <linux/delay.h>
#include <linux/kthread.h>
+#include <linux/freezer.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
struct page *page, int error)
{
lock_page(page);
- if (page_mapping(page) == mapping) {
- if (error == -ENOSPC)
- set_bit(AS_ENOSPC, &mapping->flags);
- else
- set_bit(AS_EIO, &mapping->flags);
- }
+ if (page_mapping(page) == mapping)
+ mapping_set_error(mapping, error);
unlock_page(page);
}
/* synchronous write or broken a_ops? */
ClearPageReclaim(page);
}
-
+ inc_zone_page_state(page, NR_VMSCAN_WRITE);
return PAGE_SUCCESS;
}
return PAGE_CLEAN;
}
+/*
+ * Attempt to detach a locked page from its ->mapping. If it is dirty or if
+ * someone else has a ref on the page, abort and return 0. If it was
+ * successfully detached, return 1. Assumes the caller has a single ref on
+ * this page.
+ */
int remove_mapping(struct address_space *mapping, struct page *page)
{
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
write_lock_irq(&mapping->tree_lock);
-
/*
- * The non-racy check for busy page. It is critical to check
- * PageDirty _after_ making sure that the page is freeable and
- * not in use by anybody. (pagecache + us == 2)
+ * The non racy check for a busy page.
+ *
+ * Must be careful with the order of the tests. When someone has
+ * a ref to the page, it may be possible that they dirty it then
+ * drop the reference. So if PageDirty is tested before page_count
+ * here, then the following race may occur:
+ *
+ * get_user_pages(&page);
+ * [user mapping goes away]
+ * write_to(page);
+ * !PageDirty(page) [good]
+ * SetPageDirty(page);
+ * put_page(page);
+ * !page_count(page) [good, discard it]
+ *
+ * [oops, our write_to data is lost]
+ *
+ * Reversing the order of the tests ensures such a situation cannot
+ * escape unnoticed. The smp_rmb is needed to ensure the page->flags
+ * load is not satisfied before that of page->_count.
+ *
+ * Note that if SetPageDirty is always performed via set_page_dirty,
+ * and thus under tree_lock, then this ordering is not required.
*/
if (unlikely(page_count(page) != 2))
goto cannot_free;
nr_taken = isolate_lru_pages(sc->swap_cluster_max,
&zone->inactive_list,
&page_list, &nr_scan);
- zone->nr_inactive -= nr_taken;
+ __mod_zone_page_state(zone, NR_INACTIVE, -nr_taken);
zone->pages_scanned += nr_scan;
spin_unlock_irq(&zone->lru_lock);
__count_vm_events(KSWAPD_STEAL, nr_freed);
} else
__count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
- __count_vm_events(PGACTIVATE, nr_freed);
+ __count_zone_vm_events(PGSTEAL, zone, nr_freed);
if (nr_taken == 0)
goto done;
return nr_reclaimed;
}
+/*
+ * We are about to scan this zone at a certain priority level. If that priority
+ * level is smaller (ie: more urgent) than the previous priority, then note
+ * that priority level within the zone. This is done so that when the next
+ * process comes in to scan this zone, it will immediately start out at this
+ * priority level rather than having to build up its own scanning priority.
+ * Here, this priority affects only the reclaim-mapped threshold.
+ */
+static inline void note_zone_scanning_priority(struct zone *zone, int priority)
+{
+ if (priority < zone->prev_priority)
+ zone->prev_priority = priority;
+}
+
static inline int zone_is_near_oom(struct zone *zone)
{
- return zone->pages_scanned >= (zone->nr_active + zone->nr_inactive)*3;
+ return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE)
+ + zone_page_state(zone, NR_INACTIVE))*3;
}
/*
* But we had to alter page->flags anyway.
*/
static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
- struct scan_control *sc)
+ struct scan_control *sc, int priority)
{
unsigned long pgmoved;
int pgdeactivate = 0;
* `distress' is a measure of how much trouble we're having
* reclaiming pages. 0 -> no problems. 100 -> great trouble.
*/
- distress = 100 >> zone->prev_priority;
+ distress = 100 >> min(zone->prev_priority, priority);
/*
* The point of this algorithm is to decide when to start
pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
&l_hold, &pgscanned);
zone->pages_scanned += pgscanned;
- zone->nr_active -= pgmoved;
+ __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
spin_unlock_irq(&zone->lru_lock);
while (!list_empty(&l_hold)) {
list_move(&page->lru, &zone->inactive_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
- zone->nr_inactive += pgmoved;
+ __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
spin_unlock_irq(&zone->lru_lock);
pgdeactivate += pgmoved;
pgmoved = 0;
spin_lock_irq(&zone->lru_lock);
}
}
- zone->nr_inactive += pgmoved;
+ __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
pgdeactivate += pgmoved;
if (buffer_heads_over_limit) {
spin_unlock_irq(&zone->lru_lock);
list_move(&page->lru, &zone->active_list);
pgmoved++;
if (!pagevec_add(&pvec, page)) {
- zone->nr_active += pgmoved;
+ __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
pgmoved = 0;
spin_unlock_irq(&zone->lru_lock);
__pagevec_release(&pvec);
spin_lock_irq(&zone->lru_lock);
}
}
- zone->nr_active += pgmoved;
+ __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
__count_zone_vm_events(PGREFILL, zone, pgscanned);
__count_vm_events(PGDEACTIVATE, pgdeactivate);
* Add one to `nr_to_scan' just to make sure that the kernel will
* slowly sift through the active list.
*/
- zone->nr_scan_active += (zone->nr_active >> priority) + 1;
+ zone->nr_scan_active +=
+ (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
nr_active = zone->nr_scan_active;
if (nr_active >= sc->swap_cluster_max)
zone->nr_scan_active = 0;
else
nr_active = 0;
- zone->nr_scan_inactive += (zone->nr_inactive >> priority) + 1;
+ zone->nr_scan_inactive +=
+ (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
nr_inactive = zone->nr_scan_inactive;
if (nr_inactive >= sc->swap_cluster_max)
zone->nr_scan_inactive = 0;
nr_to_scan = min(nr_active,
(unsigned long)sc->swap_cluster_max);
nr_active -= nr_to_scan;
- shrink_active_list(nr_to_scan, zone, sc);
+ shrink_active_list(nr_to_scan, zone, sc, priority);
}
if (nr_inactive) {
}
}
- throttle_vm_writeout();
+ throttle_vm_writeout(sc->gfp_mask);
atomic_dec(&zone->reclaim_in_progress);
return nr_reclaimed;
if (!populated_zone(zone))
continue;
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- zone->temp_priority = priority;
- if (zone->prev_priority > priority)
- zone->prev_priority = priority;
+ note_zone_scanning_priority(zone, priority);
if (zone->all_unreclaimable && priority != DEF_PRIORITY)
continue; /* Let kswapd poll it */
for (i = 0; zones[i] != NULL; i++) {
struct zone *zone = zones[i];
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- zone->temp_priority = DEF_PRIORITY;
- lru_pages += zone->nr_active + zone->nr_inactive;
+ lru_pages += zone_page_state(zone, NR_ACTIVE)
+ + zone_page_state(zone, NR_INACTIVE);
}
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
/* Take a nap, wait for some writeback to complete */
if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
- blk_congestion_wait(WRITE, HZ/10);
+ congestion_wait(WRITE, HZ/10);
}
/* top priority shrink_caches still had more to do? don't OOM, then */
if (!sc.all_unreclaimable)
ret = 1;
out:
+ /*
+ * Now that we've scanned all the zones at this priority level, note
+ * that level within the zone so that the next thread which performs
+ * scanning of this zone will immediately start out at this priority
+ * level. This affects only the decision whether or not to bring
+ * mapped pages onto the inactive list.
+ */
+ if (priority < 0)
+ priority = 0;
for (i = 0; zones[i] != 0; i++) {
struct zone *zone = zones[i];
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
continue;
- zone->prev_priority = zone->temp_priority;
+ zone->prev_priority = priority;
}
return ret;
}
.swap_cluster_max = SWAP_CLUSTER_MAX,
.swappiness = vm_swappiness,
};
+ /*
+ * temp_priority is used to remember the scanning priority at which
+ * this zone was successfully refilled to free_pages == pages_high.
+ */
+ int temp_priority[MAX_NR_ZONES];
loop_again:
total_scanned = 0;
sc.may_writepage = !laptop_mode;
count_vm_event(PAGEOUTRUN);
- for (i = 0; i < pgdat->nr_zones; i++) {
- struct zone *zone = pgdat->node_zones + i;
-
- zone->temp_priority = DEF_PRIORITY;
- }
+ for (i = 0; i < pgdat->nr_zones; i++)
+ temp_priority[i] = DEF_PRIORITY;
for (priority = DEF_PRIORITY; priority >= 0; priority--) {
int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
if (!zone_watermark_ok(zone, order, zone->pages_high,
0, 0)) {
end_zone = i;
- goto scan;
+ break;
}
}
- goto out;
-scan:
+ if (i < 0)
+ goto out;
+
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
- lru_pages += zone->nr_active + zone->nr_inactive;
+ lru_pages += zone_page_state(zone, NR_ACTIVE)
+ + zone_page_state(zone, NR_INACTIVE);
}
/*
if (!zone_watermark_ok(zone, order, zone->pages_high,
end_zone, 0))
all_zones_ok = 0;
- zone->temp_priority = priority;
- if (zone->prev_priority > priority)
- zone->prev_priority = priority;
+ temp_priority[i] = priority;
sc.nr_scanned = 0;
+ note_zone_scanning_priority(zone, priority);
nr_reclaimed += shrink_zone(priority, zone, &sc);
reclaim_state->reclaimed_slab = 0;
nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
if (zone->all_unreclaimable)
continue;
if (nr_slab == 0 && zone->pages_scanned >=
- (zone->nr_active + zone->nr_inactive) * 6)
- zone->all_unreclaimable = 1;
+ (zone_page_state(zone, NR_ACTIVE)
+ + zone_page_state(zone, NR_INACTIVE)) * 6)
+ zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
* the reclaim ratio is low, start doing writepage
* another pass across the zones.
*/
if (total_scanned && priority < DEF_PRIORITY - 2)
- blk_congestion_wait(WRITE, HZ/10);
+ congestion_wait(WRITE, HZ/10);
/*
* We do this so kswapd doesn't build up large priorities for
break;
}
out:
+ /*
+ * Note within each zone the priority level at which this zone was
+ * brought into a happy state. So that the next thread which scans this
+ * zone will start out at that priority level.
+ */
for (i = 0; i < pgdat->nr_zones; i++) {
struct zone *zone = pgdat->node_zones + i;
- zone->prev_priority = zone->temp_priority;
+ zone->prev_priority = temp_priority[i];
}
if (!all_zones_ok) {
cond_resched();
+
+ try_to_freeze();
+
goto loop_again;
}
for ( ; ; ) {
unsigned long new_order;
- try_to_freeze();
-
prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
new_order = pgdat->kswapd_max_order;
pgdat->kswapd_max_order = 0;
*/
order = new_order;
} else {
- schedule();
+ if (!freezing(current))
+ schedule();
+
order = pgdat->kswapd_max_order;
}
finish_wait(&pgdat->kswapd_wait, &wait);
- balance_pgdat(pgdat, order);
+ if (!try_to_freeze()) {
+ /* We can speed up thawing tasks if we don't call
+ * balance_pgdat after returning from the refrigerator
+ */
+ balance_pgdat(pgdat, order);
+ }
}
return 0;
}
return;
if (pgdat->kswapd_max_order < order)
pgdat->kswapd_max_order = order;
- if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
return;
if (!waitqueue_active(&pgdat->kswapd_wait))
return;
*
* For pass > 3 we also try to shrink the LRU lists that contain a few pages
*/
-static unsigned long shrink_all_zones(unsigned long nr_pages, int pass,
- int prio, struct scan_control *sc)
+static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
+ int pass, struct scan_control *sc)
{
struct zone *zone;
unsigned long nr_to_scan, ret = 0;
/* For pass = 0 we don't shrink the active list */
if (pass > 0) {
- zone->nr_scan_active += (zone->nr_active >> prio) + 1;
+ zone->nr_scan_active +=
+ (zone_page_state(zone, NR_ACTIVE) >> prio) + 1;
if (zone->nr_scan_active >= nr_pages || pass > 3) {
zone->nr_scan_active = 0;
- nr_to_scan = min(nr_pages, zone->nr_active);
- shrink_active_list(nr_to_scan, zone, sc);
+ nr_to_scan = min(nr_pages,
+ zone_page_state(zone, NR_ACTIVE));
+ shrink_active_list(nr_to_scan, zone, sc, prio);
}
}
- zone->nr_scan_inactive += (zone->nr_inactive >> prio) + 1;
+ zone->nr_scan_inactive +=
+ (zone_page_state(zone, NR_INACTIVE) >> prio) + 1;
if (zone->nr_scan_inactive >= nr_pages || pass > 3) {
zone->nr_scan_inactive = 0;
- nr_to_scan = min(nr_pages, zone->nr_inactive);
+ nr_to_scan = min(nr_pages,
+ zone_page_state(zone, NR_INACTIVE));
ret += shrink_inactive_list(nr_to_scan, zone, sc);
if (ret >= nr_pages)
return ret;
return ret;
}
+static unsigned long count_lru_pages(void)
+{
+ return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE);
+}
+
/*
* Try to free `nr_pages' of memory, system-wide, and return the number of
* freed pages.
unsigned long ret = 0;
int pass;
struct reclaim_state reclaim_state;
- struct zone *zone;
struct scan_control sc = {
.gfp_mask = GFP_KERNEL,
.may_swap = 0,
current->reclaim_state = &reclaim_state;
- lru_pages = 0;
- for_each_zone(zone)
- lru_pages += zone->nr_active + zone->nr_inactive;
-
+ lru_pages = count_lru_pages();
nr_slab = global_page_state(NR_SLAB_RECLAIMABLE);
/* If slab caches are huge, it's better to hit them first */
while (nr_slab >= lru_pages) {
for (pass = 0; pass < 5; pass++) {
int prio;
- /* Needed for shrinking slab caches later on */
- if (!lru_pages)
- for_each_zone(zone) {
- lru_pages += zone->nr_active;
- lru_pages += zone->nr_inactive;
- }
-
/* Force reclaiming mapped pages in the passes #3 and #4 */
if (pass > 2) {
sc.may_swap = 1;
goto out;
reclaim_state.reclaimed_slab = 0;
- shrink_slab(sc.nr_scanned, sc.gfp_mask, lru_pages);
+ shrink_slab(sc.nr_scanned, sc.gfp_mask,
+ count_lru_pages());
ret += reclaim_state.reclaimed_slab;
if (ret >= nr_pages)
goto out;
if (sc.nr_scanned && prio < DEF_PRIORITY - 2)
- blk_congestion_wait(WRITE, HZ / 10);
+ congestion_wait(WRITE, HZ / 10);
}
-
- lru_pages = 0;
}
/*
* If ret = 0, we could not shrink LRUs, but there may be something
* in slab caches
*/
- if (!ret)
+ if (!ret) {
do {
reclaim_state.reclaimed_slab = 0;
- shrink_slab(nr_pages, sc.gfp_mask, lru_pages);
+ shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages());
ret += reclaim_state.reclaimed_slab;
} while (ret < nr_pages && reclaim_state.reclaimed_slab > 0);
+ }
out:
current->reclaim_state = NULL;
}
#endif
-#ifdef CONFIG_HOTPLUG_CPU
/* It's optimal to keep kswapds on the same CPUs as their memory, but
not required for correctness. So if the last cpu in a node goes
away, we get changed to run anywhere: as the first one comes back,
}
return NOTIFY_OK;
}
-#endif /* CONFIG_HOTPLUG_CPU */
/*
* This kswapd start function will be called by init and node-hot-add.
.gfp_mask = gfp_mask,
.swappiness = vm_swappiness,
};
+ unsigned long slab_reclaimable;
disable_swap_token();
cond_resched();
*/
priority = ZONE_RECLAIM_PRIORITY;
do {
+ note_zone_scanning_priority(zone, priority);
nr_reclaimed += shrink_zone(priority, zone, &sc);
priority--;
} while (priority >= 0 && nr_reclaimed < nr_pages);
}
- if (zone_page_state(zone, NR_SLAB_RECLAIMABLE) > zone->min_slab_pages) {
+ slab_reclaimable = zone_page_state(zone, NR_SLAB_RECLAIMABLE);
+ if (slab_reclaimable > zone->min_slab_pages) {
/*
* shrink_slab() does not currently allow us to determine how
* many pages were freed in this zone. So we take the current
* Note that shrink_slab will free memory on all zones and may
* take a long time.
*/
- unsigned long limit = zone_page_state(zone,
- NR_SLAB_RECLAIMABLE) - nr_pages;
-
while (shrink_slab(sc.nr_scanned, gfp_mask, order) &&
- zone_page_state(zone, NR_SLAB_RECLAIMABLE) > limit)
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE) >
+ slab_reclaimable - nr_pages)
;
+
+ /*
+ * Update nr_reclaimed by the number of slab pages we
+ * reclaimed from this zone.
+ */
+ nr_reclaimed += slab_reclaimable -
+ zone_page_state(zone, NR_SLAB_RECLAIMABLE);
}
p->reclaim_state = NULL;
* over remote processors and spread off node memory allocations
* as wide as possible.
*/
- node_id = zone->zone_pgdat->node_id;
+ node_id = zone_to_nid(zone);
mask = node_to_cpumask(node_id);
if (!cpus_empty(mask) && node_id != numa_node_id())
return 0;