Merge branch 'for-2.6.35' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie...
[safe/jmp/linux-2.6] / mm / vmscan.c
index d55d106..3ff3311 100644 (file)
@@ -13,7 +13,7 @@
 
 #include <linux/mm.h>
 #include <linux/module.h>
-#include <linux/slab.h>
+#include <linux/gfp.h>
 #include <linux/kernel_stat.h>
 #include <linux/swap.h>
 #include <linux/pagemap.h>
@@ -71,12 +71,6 @@ struct scan_control {
        /* Can pages be swapped as part of reclaim? */
        int may_swap;
 
-       /* This context's SWAP_CLUSTER_MAX. If freeing memory for
-        * suspend, we effectively ignore SWAP_CLUSTER_MAX.
-        * In this context, it doesn't matter that we scan the
-        * whole list at once. */
-       int swap_cluster_max;
-
        int swappiness;
 
        int all_unreclaimable;
@@ -268,27 +262,6 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
        return ret;
 }
 
-/* Called without lock on whether page is mapped, so answer is unstable */
-static inline int page_mapping_inuse(struct page *page)
-{
-       struct address_space *mapping;
-
-       /* Page is in somebody's page tables. */
-       if (page_mapped(page))
-               return 1;
-
-       /* Be more reluctant to reclaim swapcache than pagecache */
-       if (PageSwapCache(page))
-               return 1;
-
-       mapping = page_mapping(page);
-       if (!mapping)
-               return 0;
-
-       /* File is mmap'd by somebody? */
-       return mapping_mapped(mapping);
-}
-
 static inline int is_page_cache_freeable(struct page *page)
 {
        /*
@@ -585,6 +558,65 @@ redo:
        put_page(page);         /* drop ref from isolate */
 }
 
+enum page_references {
+       PAGEREF_RECLAIM,
+       PAGEREF_RECLAIM_CLEAN,
+       PAGEREF_KEEP,
+       PAGEREF_ACTIVATE,
+};
+
+static enum page_references page_check_references(struct page *page,
+                                                 struct scan_control *sc)
+{
+       int referenced_ptes, referenced_page;
+       unsigned long vm_flags;
+
+       referenced_ptes = page_referenced(page, 1, sc->mem_cgroup, &vm_flags);
+       referenced_page = TestClearPageReferenced(page);
+
+       /* Lumpy reclaim - ignore references */
+       if (sc->order > PAGE_ALLOC_COSTLY_ORDER)
+               return PAGEREF_RECLAIM;
+
+       /*
+        * Mlock lost the isolation race with us.  Let try_to_unmap()
+        * move the page to the unevictable list.
+        */
+       if (vm_flags & VM_LOCKED)
+               return PAGEREF_RECLAIM;
+
+       if (referenced_ptes) {
+               if (PageAnon(page))
+                       return PAGEREF_ACTIVATE;
+               /*
+                * All mapped pages start out with page table
+                * references from the instantiating fault, so we need
+                * to look twice if a mapped file page is used more
+                * than once.
+                *
+                * Mark it and spare it for another trip around the
+                * inactive list.  Another page table reference will
+                * lead to its activation.
+                *
+                * Note: the mark is set for activated pages as well
+                * so that recently deactivated but used pages are
+                * quickly recovered.
+                */
+               SetPageReferenced(page);
+
+               if (referenced_page)
+                       return PAGEREF_ACTIVATE;
+
+               return PAGEREF_KEEP;
+       }
+
+       /* Reclaim if clean, defer dirty pages to writeback */
+       if (referenced_page)
+               return PAGEREF_RECLAIM_CLEAN;
+
+       return PAGEREF_RECLAIM;
+}
+
 /*
  * shrink_page_list() returns the number of reclaimed pages
  */
@@ -596,16 +628,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
        struct pagevec freed_pvec;
        int pgactivate = 0;
        unsigned long nr_reclaimed = 0;
-       unsigned long vm_flags;
 
        cond_resched();
 
        pagevec_init(&freed_pvec, 1);
        while (!list_empty(page_list)) {
+               enum page_references references;
                struct address_space *mapping;
                struct page *page;
                int may_enter_fs;
-               int referenced;
 
                cond_resched();
 
@@ -647,17 +678,16 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                goto keep_locked;
                }
 
-               referenced = page_referenced(page, 1,
-                                               sc->mem_cgroup, &vm_flags);
-               /*
-                * In active use or really unfreeable?  Activate it.
-                * If page which have PG_mlocked lost isoltation race,
-                * try_to_unmap moves it to unevictable list
-                */
-               if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
-                                       referenced && page_mapping_inuse(page)
-                                       && !(vm_flags & VM_LOCKED))
+               references = page_check_references(page, sc);
+               switch (references) {
+               case PAGEREF_ACTIVATE:
                        goto activate_locked;
+               case PAGEREF_KEEP:
+                       goto keep_locked;
+               case PAGEREF_RECLAIM:
+               case PAGEREF_RECLAIM_CLEAN:
+                       ; /* try to reclaim the page below */
+               }
 
                /*
                 * Anonymous process memory has backing store?
@@ -691,7 +721,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                }
 
                if (PageDirty(page)) {
-                       if (sc->order <= PAGE_ALLOC_COSTLY_ORDER && referenced)
+                       if (references == PAGEREF_RECLAIM_CLEAN)
                                goto keep_locked;
                        if (!may_enter_fs)
                                goto keep_locked;
@@ -1137,7 +1167,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                unsigned long nr_anon;
                unsigned long nr_file;
 
-               nr_taken = sc->isolate_pages(sc->swap_cluster_max,
+               nr_taken = sc->isolate_pages(SWAP_CLUSTER_MAX,
                             &page_list, &nr_scan, sc->order, mode,
                                zone, sc->mem_cgroup, 0, file);
 
@@ -1171,10 +1201,8 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                __mod_zone_page_state(zone, NR_ISOLATED_ANON, nr_anon);
                __mod_zone_page_state(zone, NR_ISOLATED_FILE, nr_file);
 
-               reclaim_stat->recent_scanned[0] += count[LRU_INACTIVE_ANON];
-               reclaim_stat->recent_scanned[0] += count[LRU_ACTIVE_ANON];
-               reclaim_stat->recent_scanned[1] += count[LRU_INACTIVE_FILE];
-               reclaim_stat->recent_scanned[1] += count[LRU_ACTIVE_FILE];
+               reclaim_stat->recent_scanned[0] += nr_anon;
+               reclaim_stat->recent_scanned[1] += nr_file;
 
                spin_unlock_irq(&zone->lru_lock);
 
@@ -1358,9 +1386,7 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                        continue;
                }
 
-               /* page_referenced clears PageReferenced */
-               if (page_mapping_inuse(page) &&
-                   page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
+               if (page_referenced(page, 0, sc->mem_cgroup, &vm_flags)) {
                        nr_rotated++;
                        /*
                         * Identify referenced, file-backed active pages and
@@ -1469,20 +1495,26 @@ static int inactive_file_is_low(struct zone *zone, struct scan_control *sc)
        return low;
 }
 
+static int inactive_list_is_low(struct zone *zone, struct scan_control *sc,
+                               int file)
+{
+       if (file)
+               return inactive_file_is_low(zone, sc);
+       else
+               return inactive_anon_is_low(zone, sc);
+}
+
 static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan,
        struct zone *zone, struct scan_control *sc, int priority)
 {
        int file = is_file_lru(lru);
 
-       if (lru == LRU_ACTIVE_FILE && inactive_file_is_low(zone, sc)) {
-               shrink_active_list(nr_to_scan, zone, sc, priority, file);
+       if (is_active_lru(lru)) {
+               if (inactive_list_is_low(zone, sc, file))
+                   shrink_active_list(nr_to_scan, zone, sc, priority, file);
                return 0;
        }
 
-       if (lru == LRU_ACTIVE_ANON && inactive_anon_is_low(zone, sc)) {
-               shrink_active_list(nr_to_scan, zone, sc, priority, file);
-               return 0;
-       }
        return shrink_inactive_list(nr_to_scan, zone, sc, priority, file);
 }
 
@@ -1572,15 +1604,14 @@ static void get_scan_ratio(struct zone *zone, struct scan_control *sc,
  * until we collected @swap_cluster_max pages to scan.
  */
 static unsigned long nr_scan_try_batch(unsigned long nr_to_scan,
-                                      unsigned long *nr_saved_scan,
-                                      unsigned long swap_cluster_max)
+                                      unsigned long *nr_saved_scan)
 {
        unsigned long nr;
 
        *nr_saved_scan += nr_to_scan;
        nr = *nr_saved_scan;
 
-       if (nr >= swap_cluster_max)
+       if (nr >= SWAP_CLUSTER_MAX)
                *nr_saved_scan = 0;
        else
                nr = 0;
@@ -1599,7 +1630,6 @@ static void shrink_zone(int priority, struct zone *zone,
        unsigned long percent[2];       /* anon @ 0; file @ 1 */
        enum lru_list l;
        unsigned long nr_reclaimed = sc->nr_reclaimed;
-       unsigned long swap_cluster_max = sc->swap_cluster_max;
        unsigned long nr_to_reclaim = sc->nr_to_reclaim;
        struct zone_reclaim_stat *reclaim_stat = get_reclaim_stat(zone, sc);
        int noswap = 0;
@@ -1622,15 +1652,15 @@ static void shrink_zone(int priority, struct zone *zone,
                        scan = (scan * percent[file]) / 100;
                }
                nr[l] = nr_scan_try_batch(scan,
-                                         &reclaim_stat->nr_saved_scan[l],
-                                         swap_cluster_max);
+                                         &reclaim_stat->nr_saved_scan[l]);
        }
 
        while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
                                        nr[LRU_INACTIVE_FILE]) {
                for_each_evictable_lru(l) {
                        if (nr[l]) {
-                               nr_to_scan = min(nr[l], swap_cluster_max);
+                               nr_to_scan = min_t(unsigned long,
+                                                  nr[l], SWAP_CLUSTER_MAX);
                                nr[l] -= nr_to_scan;
 
                                nr_reclaimed += shrink_list(l, nr_to_scan,
@@ -1645,7 +1675,7 @@ static void shrink_zone(int priority, struct zone *zone,
                 * with multiple processes reclaiming pages, the total
                 * freeing target can get unreasonably large.
                 */
-               if (nr_reclaimed > nr_to_reclaim && priority < DEF_PRIORITY)
+               if (nr_reclaimed >= nr_to_reclaim && priority < DEF_PRIORITY)
                        break;
        }
 
@@ -1698,8 +1728,7 @@ static void shrink_zones(int priority, struct zonelist *zonelist,
                                continue;
                        note_zone_scanning_priority(zone, priority);
 
-                       if (zone_is_all_unreclaimable(zone) &&
-                                               priority != DEF_PRIORITY)
+                       if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;       /* Let kswapd poll it */
                        sc->all_unreclaimable = 0;
                } else {
@@ -1838,7 +1867,6 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
        struct scan_control sc = {
                .gfp_mask = gfp_mask,
                .may_writepage = !laptop_mode,
-               .swap_cluster_max = SWAP_CLUSTER_MAX,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .may_unmap = 1,
                .may_swap = 1,
@@ -1863,7 +1891,6 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = !noswap,
-               .swap_cluster_max = SWAP_CLUSTER_MAX,
                .swappiness = swappiness,
                .order = 0,
                .mem_cgroup = mem,
@@ -1897,7 +1924,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
                .may_writepage = !laptop_mode,
                .may_unmap = 1,
                .may_swap = !noswap,
-               .swap_cluster_max = SWAP_CLUSTER_MAX,
                .nr_to_reclaim = SWAP_CLUSTER_MAX,
                .swappiness = swappiness,
                .order = 0,
@@ -1929,6 +1955,9 @@ static int sleeping_prematurely(pg_data_t *pgdat, int order, long remaining)
                if (!populated_zone(zone))
                        continue;
 
+               if (zone->all_unreclaimable)
+                       continue;
+
                if (!zone_watermark_ok(zone, order, high_wmark_pages(zone),
                                                                0, 0))
                        return 1;
@@ -1969,7 +1998,6 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
                .gfp_mask = GFP_KERNEL,
                .may_unmap = 1,
                .may_swap = 1,
-               .swap_cluster_max = SWAP_CLUSTER_MAX,
                /*
                 * kswapd doesn't want to be bailed out while reclaim. because
                 * we want to put equal scanning pressure on each zone.
@@ -2017,8 +2045,7 @@ loop_again:
                        if (!populated_zone(zone))
                                continue;
 
-                       if (zone_is_all_unreclaimable(zone) &&
-                           priority != DEF_PRIORITY)
+                       if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;
 
                        /*
@@ -2061,13 +2088,9 @@ loop_again:
                        if (!populated_zone(zone))
                                continue;
 
-                       if (zone_is_all_unreclaimable(zone) &&
-                                       priority != DEF_PRIORITY)
+                       if (zone->all_unreclaimable && priority != DEF_PRIORITY)
                                continue;
 
-                       if (!zone_watermark_ok(zone, order,
-                                       high_wmark_pages(zone), end_zone, 0))
-                               all_zones_ok = 0;
                        temp_priority[i] = priority;
                        sc.nr_scanned = 0;
                        note_zone_scanning_priority(zone, priority);
@@ -2092,12 +2115,11 @@ loop_again:
                                                lru_pages);
                        sc.nr_reclaimed += reclaim_state->reclaimed_slab;
                        total_scanned += sc.nr_scanned;
-                       if (zone_is_all_unreclaimable(zone))
+                       if (zone->all_unreclaimable)
                                continue;
-                       if (nr_slab == 0 && zone->pages_scanned >=
-                                       (zone_reclaimable_pages(zone) * 6))
-                                       zone_set_flag(zone,
-                                                     ZONE_ALL_UNRECLAIMABLE);
+                       if (nr_slab == 0 &&
+                           zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
+                               zone->all_unreclaimable = 1;
                        /*
                         * If we've done a decent amount of scanning and
                         * the reclaim ratio is low, start doing writepage
@@ -2107,13 +2129,18 @@ loop_again:
                            total_scanned > sc.nr_reclaimed + sc.nr_reclaimed / 2)
                                sc.may_writepage = 1;
 
-                       /*
-                        * We are still under min water mark. it mean we have
-                        * GFP_ATOMIC allocation failure risk. Hurry up!
-                        */
-                       if (!zone_watermark_ok(zone, order, min_wmark_pages(zone),
-                                             end_zone, 0))
-                               has_under_min_watermark_zone = 1;
+                       if (!zone_watermark_ok(zone, order,
+                                       high_wmark_pages(zone), end_zone, 0)) {
+                               all_zones_ok = 0;
+                               /*
+                                * We are still under min water mark.  This
+                                * means that we have a GFP_ATOMIC allocation
+                                * failure risk. Hurry up!
+                                */
+                               if (!zone_watermark_ok(zone, order,
+                                           min_wmark_pages(zone), end_zone, 0))
+                                       has_under_min_watermark_zone = 1;
+                       }
 
                }
                if (all_zones_ok)
@@ -2354,7 +2381,6 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim)
                .may_swap = 1,
                .may_unmap = 1,
                .may_writepage = 1,
-               .swap_cluster_max = SWAP_CLUSTER_MAX,
                .nr_to_reclaim = nr_to_reclaim,
                .hibernation_mode = 1,
                .swappiness = vm_swappiness,
@@ -2539,7 +2565,6 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                .may_writepage = !!(zone_reclaim_mode & RECLAIM_WRITE),
                .may_unmap = !!(zone_reclaim_mode & RECLAIM_SWAP),
                .may_swap = 1,
-               .swap_cluster_max = SWAP_CLUSTER_MAX,
                .nr_to_reclaim = max_t(unsigned long, nr_pages,
                                       SWAP_CLUSTER_MAX),
                .gfp_mask = gfp_mask,
@@ -2557,6 +2582,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         * and RECLAIM_SWAP.
         */
        p->flags |= PF_MEMALLOC | PF_SWAPWRITE;
+       lockdep_set_current_reclaim_state(gfp_mask);
        reclaim_state.reclaimed_slab = 0;
        p->reclaim_state = &reclaim_state;
 
@@ -2600,6 +2626,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
        p->reclaim_state = NULL;
        current->flags &= ~(PF_MEMALLOC | PF_SWAPWRITE);
+       lockdep_clear_current_reclaim_state();
        return sc.nr_reclaimed >= nr_pages;
 }
 
@@ -2622,7 +2649,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
            zone_page_state(zone, NR_SLAB_RECLAIMABLE) <= zone->min_slab_pages)
                return ZONE_RECLAIM_FULL;
 
-       if (zone_is_all_unreclaimable(zone))
+       if (zone->all_unreclaimable)
                return ZONE_RECLAIM_FULL;
 
        /*