diff --git a/mm/vmscan.c b/mm/vmscan.c
index 2225b7c..1ff1a58 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -37,6 +37,8 @@
 #include <linux/delay.h>
 #include <linux/kthread.h>
 #include <linux/freezer.h>
+#include <linux/memcontrol.h>
+#include <linux/delayacct.h>
 
 #include <asm/tlbflush.h>
 #include <asm/div64.h>
@@ -68,6 +70,15 @@ struct scan_control {
        int all_unreclaimable;
 
        int order;
+
+       /* Which cgroup do we reclaim from */
+       struct mem_cgroup *mem_cgroup;
+
+       /* Pluggable isolate pages callback */
+       unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst,
+                       unsigned long *scanned, int order, int mode,
+                       struct zone *z, struct mem_cgroup *mem_cont,
+                       int active);
 };
 
 #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
@@ -109,6 +120,12 @@ long vm_total_pages;       /* The total number of pages which the VM controls */
 static LIST_HEAD(shrinker_list);
 static DECLARE_RWSEM(shrinker_rwsem);
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+#define scan_global_lru(sc)    (!(sc)->mem_cgroup)
+#else
+#define scan_global_lru(sc)    (1)
+#endif
+
 /*
  * Add a shrinker callback to be called from the vm
  */
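
The mem_cgroup pointer, the pluggable isolate_pages callback and the scan_global_lru() test above are the hinge of this series: global reclaim leaves mem_cgroup NULL, memory-controller reclaim fills in its cgroup and its own isolation routine, and everything that touches per-zone statistics is guarded by scan_global_lru(). A minimal sketch of how a scan_control is expected to be set up for the two cases (compare the try_to_free_pages() and try_to_free_mem_cgroup_pages() initializers later in this diff; mem_cgroup_isolate_pages lives in mm/memcontrol.c and only exists under CONFIG_CGROUP_MEM_RES_CTLR):

/* Sketch only; not part of the patch. */
static void setup_scan_control(struct scan_control *sc, struct mem_cgroup *memcg)
{
	sc->mem_cgroup = memcg;			/* NULL => global LRU reclaim */
	sc->isolate_pages = memcg ? mem_cgroup_isolate_pages	/* per-cgroup LRU */
				  : isolate_pages_global;	/* zone LRU lists */

	/*
	 * From here on the rest of vmscan.c only asks scan_global_lru(sc):
	 * zone->pages_scanned, zone->prev_priority and shrink_slab() are
	 * touched only when it evaluates true.
	 */
}
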
@@ -141,7 +158,7 @@ EXPORT_SYMBOL(unregister_shrinker);
  * percentages of the lru and ageable caches.  This should balance the seeks
  * generated by these structures.
  *
- * If the vm encounted mapped pages on the LRU it increase the pressure on
+ * If the vm encountered mapped pages on the LRU it increase the pressure on
  * slab to avoid swapping.
  *
  * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits.
@@ -175,7 +192,7 @@ unsigned long shrink_slab(unsigned long scanned, gfp_t gfp_mask,
                shrinker->nr += delta;
                if (shrinker->nr < 0) {
                        printk(KERN_ERR "%s: nr=%ld\n",
-                                       __FUNCTION__, shrinker->nr);
+                                       __func__, shrinker->nr);
                        shrinker->nr = max_pass;
                }
 
@@ -271,6 +288,12 @@ static void handle_write_error(struct address_space *mapping,
        unlock_page(page);
 }
 
+/* Request for sync pageout. */
+enum pageout_io {
+       PAGEOUT_IO_ASYNC,
+       PAGEOUT_IO_SYNC,
+};
+
 /* possible outcome of pageout() */
 typedef enum {
        /* failed to write page out, page is locked */
@@ -287,7 +310,8 @@ typedef enum {
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping,
+                                               enum pageout_io sync_writeback)
 {
        /*
         * If the page is dirty, only perform writeback if that write
@@ -316,7 +340,7 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
                if (PagePrivate(page)) {
                        if (try_to_free_buffers(page)) {
                                ClearPageDirty(page);
-                               printk("%s: orphaned page\n", __FUNCTION__);
+                               printk("%s: orphaned page\n", __func__);
                                return PAGE_CLEAN;
                        }
                }
@@ -346,6 +370,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
                        ClearPageReclaim(page);
                        return PAGE_ACTIVATE;
                }
+
+               /*
+                * Wait on writeback if requested to. This happens when
+                * direct reclaiming a large contiguous area and the
+                * first attempt to free a range of pages fails.
+                */
+               if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
+                       wait_on_page_writeback(page);
+
                if (!PageWriteback(page)) {
                        /* synchronous write or broken a_ops? */
                        ClearPageReclaim(page);
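
The new sync_writeback argument exists so that lumpy (high-order) direct reclaim can make a second pass that waits for the IO it kicked off on the first one. A sketch of the contract a caller gets from the parameter, assuming the existing pageout_t return codes (this wrapper is illustrative, not code from the patch):

/* Illustrative wrapper, not in the patch. */
static pageout_t write_one_page_for_reclaim(struct page *page,
					    struct address_space *mapping,
					    int second_pass)
{
	enum pageout_io io = second_pass ? PAGEOUT_IO_SYNC : PAGEOUT_IO_ASYNC;

	/*
	 * PAGEOUT_IO_ASYNC: ->writepage() is started and pageout() returns
	 * immediately; PAGE_SUCCESS pages usually still have PageWriteback
	 * set and cannot be freed on this pass.
	 * PAGEOUT_IO_SYNC:  after ->writepage(), wait_on_page_writeback()
	 * is called, so PAGE_SUCCESS means the write has completed (or was
	 * synchronous to begin with) and the page is reclaimable now.
	 */
	return pageout(page, mapping, io);
}
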
@@ -358,17 +391,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
 }
 
 /*
- * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
- * someone else has a ref on the page, abort and return 0.  If it was
- * successfully detached, return 1.  Assumes the caller has a single ref on
- * this page.
+ * Same as remove_mapping, but if the page is removed from the mapping, it
+ * gets returned with a refcount of 0.
  */
-int remove_mapping(struct address_space *mapping, struct page *page)
+static int __remove_mapping(struct address_space *mapping, struct page *page)
 {
        BUG_ON(!PageLocked(page));
        BUG_ON(mapping != page_mapping(page));
 
-       write_lock_irq(&mapping->tree_lock);
+       spin_lock_irq(&mapping->tree_lock);
        /*
         * The non racy check for a busy page.
         *
@@ -394,28 +425,48 @@ int remove_mapping(struct address_space *mapping, struct page *page)
         * Note that if SetPageDirty is always performed via set_page_dirty,
         * and thus under tree_lock, then this ordering is not required.
         */
-       if (unlikely(page_count(page) != 2))
+       if (!page_freeze_refs(page, 2))
                goto cannot_free;
-       smp_rmb();
-       if (unlikely(PageDirty(page)))
+       /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */
+       if (unlikely(PageDirty(page))) {
+               page_unfreeze_refs(page, 2);
                goto cannot_free;
+       }
 
        if (PageSwapCache(page)) {
                swp_entry_t swap = { .val = page_private(page) };
                __delete_from_swap_cache(page);
-               write_unlock_irq(&mapping->tree_lock);
+               spin_unlock_irq(&mapping->tree_lock);
                swap_free(swap);
-               __put_page(page);       /* The pagecache ref */
-               return 1;
+       } else {
+               __remove_from_page_cache(page);
+               spin_unlock_irq(&mapping->tree_lock);
        }
 
-       __remove_from_page_cache(page);
-       write_unlock_irq(&mapping->tree_lock);
-       __put_page(page);
        return 1;
 
 cannot_free:
-       write_unlock_irq(&mapping->tree_lock);
+       spin_unlock_irq(&mapping->tree_lock);
+       return 0;
+}
+
+/*
+ * Attempt to detach a locked page from its ->mapping.  If it is dirty or if
+ * someone else has a ref on the page, abort and return 0.  If it was
+ * successfully detached, return 1.  Assumes the caller has a single ref on
+ * this page.
+ */
+int remove_mapping(struct address_space *mapping, struct page *page)
+{
+       if (__remove_mapping(mapping, page)) {
+               /*
+                * Unfreezing the refcount with 1 rather than 2 effectively
+                * drops the pagecache ref for us without requiring another
+                * atomic operation.
+                */
+               page_unfreeze_refs(page, 1);
+               return 1;
+       }
        return 0;
 }
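
The split into __remove_mapping()/remove_mapping() works because the old page_count(page) != 2 check is replaced by freezing the refcount: on success the page sits at refcount zero, and remove_mapping() can drop the pagecache reference simply by unfreezing to 1. The helpers come from the lockless-pagecache work elsewhere in this series (include/linux/pagemap.h); a rough sketch of the behaviour the hunk above relies on:

/* Sketch of the assumed helpers, shown only to make the pattern readable. */
static inline int page_freeze_refs(struct page *page, int count)
{
	/*
	 * Succeeds only if exactly 'count' references exist; the count is
	 * then pinned at 0 so concurrent speculative lookups back off.
	 */
	return atomic_cmpxchg(&page->_count, count, 0) == count;
}

static inline void page_unfreeze_refs(struct page *page, int count)
{
	VM_BUG_ON(page_count(page) != 0);
	atomic_set(&page->_count, count);	/* hand the references back */
}
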
 
@@ -423,7 +474,8 @@ cannot_free:
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
-                                       struct scan_control *sc)
+                                       struct scan_control *sc,
+                                       enum pageout_io sync_writeback)
 {
        LIST_HEAD(ret_pages);
        struct pagevec freed_pvec;
@@ -444,7 +496,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                page = lru_to_page(page_list);
                list_del(&page->lru);
 
-               if (TestSetPageLocked(page))
+               if (!trylock_page(page))
                        goto keep;
 
                VM_BUG_ON(PageActive(page));
@@ -458,10 +510,25 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                if (page_mapped(page) || PageSwapCache(page))
                        sc->nr_scanned++;
 
-               if (PageWriteback(page))
-                       goto keep_locked;
+               may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+                       (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+               if (PageWriteback(page)) {
+                       /*
+                        * Synchronous reclaim is performed in two passes,
+                        * first an asynchronous pass over the list to
+                        * start parallel writeback, and a second synchronous
+                        * pass to wait for the IO to complete.  Wait here
+                        * for any page for which writeback has already
+                        * started.
+                        */
+                       if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
+                               wait_on_page_writeback(page);
+                       else
+                               goto keep_locked;
+               }
 
-               referenced = page_referenced(page, 1);
+               referenced = page_referenced(page, 1, sc->mem_cgroup);
                /* In active use or really unfreeable?  Activate it. */
                if (sc->order <= PAGE_ALLOC_COSTLY_ORDER &&
                                        referenced && page_mapping_inuse(page))
@@ -478,8 +545,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 #endif /* CONFIG_SWAP */
 
                mapping = page_mapping(page);
-               may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
-                       (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
                /*
                 * The page is mapped into the page tables of one or more
@@ -505,7 +570,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                goto keep_locked;
 
                        /* Page is dirty, try to write it out here */
-                       switch(pageout(page, mapping)) {
+                       switch (pageout(page, mapping, sync_writeback)) {
                        case PAGE_KEEP:
                                goto keep_locked;
                        case PAGE_ACTIVATE:
@@ -517,7 +582,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                 * A synchronous write - probably a ramdisk.  Go
                                 * ahead and try to reclaim the page.
                                 */
-                               if (TestSetPageLocked(page))
+                               if (!trylock_page(page))
                                        goto keep;
                                if (PageDirty(page) || PageWriteback(page))
                                        goto keep_locked;
@@ -551,18 +616,34 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                if (PagePrivate(page)) {
                        if (!try_to_release_page(page, sc->gfp_mask))
                                goto activate_locked;
-                       if (!mapping && page_count(page) == 1)
-                               goto free_it;
+                       if (!mapping && page_count(page) == 1) {
+                               unlock_page(page);
+                               if (put_page_testzero(page))
+                                       goto free_it;
+                               else {
+                                       /*
+                                        * rare race with speculative reference.
+                                        * the speculative reference will free
+                                        * this page shortly, so we may
+                                        * increment nr_reclaimed here (and
+                                        * leave it off the LRU).
+                                        */
+                                       nr_reclaimed++;
+                                       continue;
+                               }
+                       }
                }
 
-               if (!mapping || !remove_mapping(mapping, page))
+               if (!mapping || !__remove_mapping(mapping, page))
                        goto keep_locked;
 
-free_it:
                unlock_page(page);
+free_it:
                nr_reclaimed++;
-               if (!pagevec_add(&freed_pvec, page))
-                       __pagevec_release_nonlru(&freed_pvec);
+               if (!pagevec_add(&freed_pvec, page)) {
+                       __pagevec_free(&freed_pvec);
+                       pagevec_reinit(&freed_pvec);
+               }
                continue;
 
 activate_locked:
@@ -576,7 +657,7 @@ keep:
        }
        list_splice(&ret_pages, page_list);
        if (pagevec_count(&freed_pvec))
-               __pagevec_release_nonlru(&freed_pvec);
+               __pagevec_free(&freed_pvec);
        count_vm_events(PGACTIVATE, pgactivate);
        return nr_reclaimed;
 }
@@ -596,7 +677,7 @@ keep:
  *
  * returns 0 on success, -ve errno on failure.
  */
-static int __isolate_lru_page(struct page *page, int mode)
+int __isolate_lru_page(struct page *page, int mode)
 {
        int ret = -EINVAL;
 
@@ -730,6 +811,21 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
        return nr_taken;
 }
 
+static unsigned long isolate_pages_global(unsigned long nr,
+                                       struct list_head *dst,
+                                       unsigned long *scanned, int order,
+                                       int mode, struct zone *z,
+                                       struct mem_cgroup *mem_cont,
+                                       int active)
+{
+       if (active)
+               return isolate_lru_pages(nr, &z->active_list, dst,
+                                               scanned, order, mode);
+       else
+               return isolate_lru_pages(nr, &z->inactive_list, dst,
+                                               scanned, order, mode);
+}
+
 /*
  * clear_active_flags() is a helper for shrink_active_list(), clearing
  * any active bits from the pages in the list.
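
isolate_pages_global() simply adapts the existing isolate_lru_pages() to the sc->isolate_pages signature, picking the zone's active or inactive list. The memory controller supplies a counterpart with the same prototype that walks its per-cgroup LRU instead; its body is in mm/memcontrol.c, so only the expected declaration is shown here:

/* Assumed declaration of the memcg-side callback wired up later in this diff. */
unsigned long mem_cgroup_isolate_pages(unsigned long nr, struct list_head *dst,
				       unsigned long *scanned, int order,
				       int mode, struct zone *z,
				       struct mem_cgroup *mem_cont,
				       int active);
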
@@ -771,28 +867,53 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                unsigned long nr_freed;
                unsigned long nr_active;
 
-               nr_taken = isolate_lru_pages(sc->swap_cluster_max,
-                            &zone->inactive_list,
+               nr_taken = sc->isolate_pages(sc->swap_cluster_max,
                             &page_list, &nr_scan, sc->order,
                             (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
-                                            ISOLATE_BOTH : ISOLATE_INACTIVE);
+                                            ISOLATE_BOTH : ISOLATE_INACTIVE,
+                               zone, sc->mem_cgroup, 0);
                nr_active = clear_active_flags(&page_list);
+               __count_vm_events(PGDEACTIVATE, nr_active);
 
                __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
                __mod_zone_page_state(zone, NR_INACTIVE,
                                                -(nr_taken - nr_active));
-               zone->pages_scanned += nr_scan;
+               if (scan_global_lru(sc))
+                       zone->pages_scanned += nr_scan;
                spin_unlock_irq(&zone->lru_lock);
 
                nr_scanned += nr_scan;
-               nr_freed = shrink_page_list(&page_list, sc);
+               nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+               /*
+                * If we are direct reclaiming for contiguous pages and we do
+                * not reclaim everything in the list, try again and wait
+                * for IO to complete. This will stall high-order allocations
+                * but that should be acceptable to the caller
+                */
+               if (nr_freed < nr_taken && !current_is_kswapd() &&
+                                       sc->order > PAGE_ALLOC_COSTLY_ORDER) {
+                       congestion_wait(WRITE, HZ/10);
+
+                       /*
+                        * The attempt at page out may have made some
+                        * of the pages active, mark them inactive again.
+                        */
+                       nr_active = clear_active_flags(&page_list);
+                       count_vm_events(PGDEACTIVATE, nr_active);
+
+                       nr_freed += shrink_page_list(&page_list, sc,
+                                                       PAGEOUT_IO_SYNC);
+               }
+
                nr_reclaimed += nr_freed;
                local_irq_disable();
                if (current_is_kswapd()) {
                        __count_zone_vm_events(PGSCAN_KSWAPD, zone, nr_scan);
                        __count_vm_events(KSWAPD_STEAL, nr_freed);
-               } else
+               } else if (scan_global_lru(sc))
                        __count_zone_vm_events(PGSCAN_DIRECT, zone, nr_scan);
+
                __count_zone_vm_events(PGSTEAL, zone, nr_freed);
 
                if (nr_taken == 0)
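
To make the retry condition above concrete: PAGE_ALLOC_COSTLY_ORDER is 3, so this path is only taken for direct reclaim on behalf of allocations of order 4 and up (64 KiB and larger with 4 KiB pages). If such a reclaimer isolates 32 pages but the asynchronous shrink_page_list() pass frees only, say, 20 of them, it sleeps in congestion_wait(WRITE, HZ/10), clears any active bits a failed pageout may have set, and runs the list again with PAGEOUT_IO_SYNC so the second pass waits for the writeback started by the first. kswapd is excluded by current_is_kswapd(), so only the high-order allocator itself is stalled.
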
@@ -846,6 +967,113 @@ static inline int zone_is_near_oom(struct zone *zone)
 }
 
 /*
+ * Determine we should try to reclaim mapped pages.
+ * This is called only when sc->mem_cgroup is NULL.
+ */
+static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone,
+                               int priority)
+{
+       long mapped_ratio;
+       long distress;
+       long swap_tendency;
+       long imbalance;
+       int reclaim_mapped = 0;
+       int prev_priority;
+
+       if (scan_global_lru(sc) && zone_is_near_oom(zone))
+               return 1;
+       /*
+        * `distress' is a measure of how much trouble we're having
+        * reclaiming pages.  0 -> no problems.  100 -> great trouble.
+        */
+       if (scan_global_lru(sc))
+               prev_priority = zone->prev_priority;
+       else
+               prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup);
+
+       distress = 100 >> min(prev_priority, priority);
+
+       /*
+        * The point of this algorithm is to decide when to start
+        * reclaiming mapped memory instead of just pagecache.  Work out
+        * how much memory
+        * is mapped.
+        */
+       if (scan_global_lru(sc))
+               mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
+                               global_page_state(NR_ANON_PAGES)) * 100) /
+                                       vm_total_pages;
+       else
+               mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup);
+
+       /*
+        * Now decide how much we really want to unmap some pages.  The
+        * mapped ratio is downgraded - just because there's a lot of
+        * mapped memory doesn't necessarily mean that page reclaim
+        * isn't succeeding.
+        *
+        * The distress ratio is important - we don't want to start
+        * going oom.
+        *
+        * A 100% value of vm_swappiness overrides this algorithm
+        * altogether.
+        */
+       swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
+
+       /*
+        * If there's huge imbalance between active and inactive
+        * (think active 100 times larger than inactive) we should
+        * become more permissive, or the system will take too much
+        * cpu before it start swapping during memory pressure.
+        * Distress is about avoiding early-oom, this is about
+        * making swappiness graceful despite setting it to low
+        * values.
+        *
+        * Avoid div by zero with nr_inactive+1, and max resulting
+        * value is vm_total_pages.
+        */
+       if (scan_global_lru(sc)) {
+               imbalance  = zone_page_state(zone, NR_ACTIVE);
+               imbalance /= zone_page_state(zone, NR_INACTIVE) + 1;
+       } else
+               imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup);
+
+       /*
+        * Reduce the effect of imbalance if swappiness is low,
+        * this means for a swappiness very low, the imbalance
+        * must be much higher than 100 for this logic to make
+        * the difference.
+        *
+        * Max temporary value is vm_total_pages*100.
+        */
+       imbalance *= (vm_swappiness + 1);
+       imbalance /= 100;
+
+       /*
+        * If not much of the ram is mapped, makes the imbalance
+        * less relevant, it's high priority we refill the inactive
+        * list with mapped pages only in presence of high ratio of
+        * mapped pages.
+        *
+        * Max temporary value is vm_total_pages*100.
+        */
+       imbalance *= mapped_ratio;
+       imbalance /= 100;
+
+       /* apply imbalance feedback to swap_tendency */
+       swap_tendency += imbalance;
+
+       /*
+        * Now use this metric to decide whether to start moving mapped
+        * memory onto the inactive list.
+        */
+       if (swap_tendency >= 100)
+               reclaim_mapped = 1;
+
+       return reclaim_mapped;
+}
+
+/*
  * This moves pages from the active list to the inactive list.
  *
  * We move them the other way if the page is referenced by one or more
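
A worked instance of the swap_tendency arithmetic in calc_reclaim_mapped(), using the default vm_swappiness of 60 and ignoring the imbalance term (it rounds to zero when the active and inactive lists are of comparable size): with half of memory mapped, mapped_ratio = 50, and while reclaim is still easy (priority = prev_priority = DEF_PRIORITY = 12), distress = 100 >> 12 = 0, so swap_tendency = 50/2 + 0 + 60 = 85 < 100 and only unmapped pagecache is reclaimed. Once the scan priority has fallen to 2, distress = 100 >> 2 = 25 and swap_tendency = 25 + 25 + 60 = 110 >= 100, so reclaim_mapped flips to 1 and mapped pages start being deactivated as well. Under memory-controller reclaim the same arithmetic runs, but fed by mem_cgroup_get_reclaim_priority(), mem_cgroup_calc_mapped_ratio() and mem_cgroup_reclaim_imbalance() instead of the zone counters.
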
@@ -862,6 +1090,8 @@ static inline int zone_is_near_oom(struct zone *zone)
  * The downside is that we have to touch page->_count against each page.
  * But we had to alter page->flags anyway.
  */
+
+
 static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
                                struct scan_control *sc, int priority)
 {
@@ -875,58 +1105,21 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone,
        struct pagevec pvec;
        int reclaim_mapped = 0;
 
-       if (sc->may_swap) {
-               long mapped_ratio;
-               long distress;
-               long swap_tendency;
-
-               if (zone_is_near_oom(zone))
-                       goto force_reclaim_mapped;
-
-               /*
-                * `distress' is a measure of how much trouble we're having
-                * reclaiming pages.  0 -> no problems.  100 -> great trouble.
-                */
-               distress = 100 >> min(zone->prev_priority, priority);
-
-               /*
-                * The point of this algorithm is to decide when to start
-                * reclaiming mapped memory instead of just pagecache.  Work out
-                * how much memory
-                * is mapped.
-                */
-               mapped_ratio = ((global_page_state(NR_FILE_MAPPED) +
-                               global_page_state(NR_ANON_PAGES)) * 100) /
-                                       vm_total_pages;
-
-               /*
-                * Now decide how much we really want to unmap some pages.  The
-                * mapped ratio is downgraded - just because there's a lot of
-                * mapped memory doesn't necessarily mean that page reclaim
-                * isn't succeeding.
-                *
-                * The distress ratio is important - we don't want to start
-                * going oom.
-                *
-                * A 100% value of vm_swappiness overrides this algorithm
-                * altogether.
-                */
-               swap_tendency = mapped_ratio / 2 + distress + sc->swappiness;
-
-               /*
-                * Now use this metric to decide whether to start moving mapped
-                * memory onto the inactive list.
-                */
-               if (swap_tendency >= 100)
-force_reclaim_mapped:
-                       reclaim_mapped = 1;
-       }
+       if (sc->may_swap)
+               reclaim_mapped = calc_reclaim_mapped(sc, zone, priority);
 
        lru_add_drain();
        spin_lock_irq(&zone->lru_lock);
-       pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
-                           &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE);
-       zone->pages_scanned += pgscanned;
+       pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order,
+                                       ISOLATE_ACTIVE, zone,
+                                       sc->mem_cgroup, 1);
+       /*
+        * zone->pages_scanned is used for detect zone's oom
+        * mem_cgroup remembers nr_scan by itself.
+        */
+       if (scan_global_lru(sc))
+               zone->pages_scanned += pgscanned;
+
        __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved);
        spin_unlock_irq(&zone->lru_lock);
 
@@ -937,7 +1130,7 @@ force_reclaim_mapped:
                if (page_mapped(page)) {
                        if (!reclaim_mapped ||
                            (total_swap_pages == 0 && PageAnon(page)) ||
-                           page_referenced(page, 0)) {
+                           page_referenced(page, 0, sc->mem_cgroup)) {
                                list_add(&page->lru, &l_active);
                                continue;
                        }
@@ -957,6 +1150,7 @@ force_reclaim_mapped:
                ClearPageActive(page);
 
                list_move(&page->lru, &zone->inactive_list);
+               mem_cgroup_move_lists(page, false);
                pgmoved++;
                if (!pagevec_add(&pvec, page)) {
                        __mod_zone_page_state(zone, NR_INACTIVE, pgmoved);
@@ -984,7 +1178,9 @@ force_reclaim_mapped:
                VM_BUG_ON(PageLRU(page));
                SetPageLRU(page);
                VM_BUG_ON(!PageActive(page));
+
                list_move(&page->lru, &zone->active_list);
+               mem_cgroup_move_lists(page, true);
                pgmoved++;
                if (!pagevec_add(&pvec, page)) {
                        __mod_zone_page_state(zone, NR_ACTIVE, pgmoved);
@@ -1014,27 +1210,39 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
        unsigned long nr_to_scan;
        unsigned long nr_reclaimed = 0;
 
-       atomic_inc(&zone->reclaim_in_progress);
+       if (scan_global_lru(sc)) {
+               /*
+                * Add one to nr_to_scan just to make sure that the kernel
+                * will slowly sift through the active list.
+                */
+               zone->nr_scan_active +=
+                       (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
+               nr_active = zone->nr_scan_active;
+               zone->nr_scan_inactive +=
+                       (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
+               nr_inactive = zone->nr_scan_inactive;
+               if (nr_inactive >= sc->swap_cluster_max)
+                       zone->nr_scan_inactive = 0;
+               else
+                       nr_inactive = 0;
+
+               if (nr_active >= sc->swap_cluster_max)
+                       zone->nr_scan_active = 0;
+               else
+                       nr_active = 0;
+       } else {
+               /*
+                * This reclaim occurs not because zone memory shortage but
+                * because memory controller hits its limit.
+                * Then, don't modify zone reclaim related data.
+                */
+               nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup,
+                                       zone, priority);
 
-       /*
-        * Add one to `nr_to_scan' just to make sure that the kernel will
-        * slowly sift through the active list.
-        */
-       zone->nr_scan_active +=
-               (zone_page_state(zone, NR_ACTIVE) >> priority) + 1;
-       nr_active = zone->nr_scan_active;
-       if (nr_active >= sc->swap_cluster_max)
-               zone->nr_scan_active = 0;
-       else
-               nr_active = 0;
+               nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup,
+                                       zone, priority);
+       }
 
-       zone->nr_scan_inactive +=
-               (zone_page_state(zone, NR_INACTIVE) >> priority) + 1;
-       nr_inactive = zone->nr_scan_inactive;
-       if (nr_inactive >= sc->swap_cluster_max)
-               zone->nr_scan_inactive = 0;
-       else
-               nr_inactive = 0;
 
        while (nr_active || nr_inactive) {
                if (nr_active) {
@@ -1054,8 +1262,6 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
        }
 
        throttle_vm_writeout(sc->gfp_mask);
-
-       atomic_dec(&zone->reclaim_in_progress);
        return nr_reclaimed;
 }
 
@@ -1075,31 +1281,44 @@ static unsigned long shrink_zone(int priority, struct zone *zone,
  * If a zone is deemed to be full of pinned pages then just give it a light
  * scan then give up on it.
  */
-static unsigned long shrink_zones(int priority, struct zone **zones,
+static unsigned long shrink_zones(int priority, struct zonelist *zonelist,
                                        struct scan_control *sc)
 {
+       enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
        unsigned long nr_reclaimed = 0;
-       int i;
+       struct zoneref *z;
+       struct zone *zone;
 
        sc->all_unreclaimable = 1;
-       for (i = 0; zones[i] != NULL; i++) {
-               struct zone *zone = zones[i];
-
+       for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
                if (!populated_zone(zone))
                        continue;
+               /*
+                * Take care memory controller reclaiming has small influence
+                * to global LRU.
+                */
+               if (scan_global_lru(sc)) {
+                       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+                               continue;
+                       note_zone_scanning_priority(zone, priority);
 
-               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-                       continue;
-
-               note_zone_scanning_priority(zone, priority);
-
-               if (zone->all_unreclaimable && priority != DEF_PRIORITY)
-                       continue;       /* Let kswapd poll it */
-
-               sc->all_unreclaimable = 0;
+                       if (zone_is_all_unreclaimable(zone) &&
+                                               priority != DEF_PRIORITY)
+                               continue;       /* Let kswapd poll it */
+                       sc->all_unreclaimable = 0;
+               } else {
+                       /*
+                        * Ignore cpuset limitation here. We just want to reduce
+                        * # of used pages by us regardless of memory shortage.
+                        */
+                       sc->all_unreclaimable = 0;
+                       mem_cgroup_note_reclaim_priority(sc->mem_cgroup,
+                                                       priority);
+               }
 
                nr_reclaimed += shrink_zone(priority, zone, sc);
        }
+
        return nr_reclaimed;
 }
  
@@ -1115,50 +1334,60 @@ static unsigned long shrink_zones(int priority, struct zone **zones,
  * hope that some of these pages can be written.  But if the allocating task
  * holds filesystem locks which prevent writeout this might not work, and the
  * allocation attempt will fail.
+ *
+ * returns:    0, if no pages reclaimed
+ *             else, the number of pages reclaimed
  */
-unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
+static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
+                                       struct scan_control *sc)
 {
        int priority;
-       int ret = 0;
+       unsigned long ret = 0;
        unsigned long total_scanned = 0;
        unsigned long nr_reclaimed = 0;
        struct reclaim_state *reclaim_state = current->reclaim_state;
        unsigned long lru_pages = 0;
-       int i;
-       struct scan_control sc = {
-               .gfp_mask = gfp_mask,
-               .may_writepage = !laptop_mode,
-               .swap_cluster_max = SWAP_CLUSTER_MAX,
-               .may_swap = 1,
-               .swappiness = vm_swappiness,
-               .order = order,
-       };
+       struct zoneref *z;
+       struct zone *zone;
+       enum zone_type high_zoneidx = gfp_zone(sc->gfp_mask);
 
-       count_vm_event(ALLOCSTALL);
+       delayacct_freepages_start();
 
-       for (i = 0; zones[i] != NULL; i++) {
-               struct zone *zone = zones[i];
+       if (scan_global_lru(sc))
+               count_vm_event(ALLOCSTALL);
+       /*
+        * mem_cgroup will not do shrink_slab.
+        */
+       if (scan_global_lru(sc)) {
+               for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
 
-               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-                       continue;
+                       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+                               continue;
 
-               lru_pages += zone_page_state(zone, NR_ACTIVE)
-                               + zone_page_state(zone, NR_INACTIVE);
+                       lru_pages += zone_page_state(zone, NR_ACTIVE)
+                                       + zone_page_state(zone, NR_INACTIVE);
+               }
        }
 
        for (priority = DEF_PRIORITY; priority >= 0; priority--) {
-               sc.nr_scanned = 0;
+               sc->nr_scanned = 0;
                if (!priority)
                        disable_swap_token();
-               nr_reclaimed += shrink_zones(priority, zones, &sc);
-               shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
-               if (reclaim_state) {
-                       nr_reclaimed += reclaim_state->reclaimed_slab;
-                       reclaim_state->reclaimed_slab = 0;
+               nr_reclaimed += shrink_zones(priority, zonelist, sc);
+               /*
+                * Don't shrink slabs when reclaiming memory from
+                * over limit cgroups
+                */
+               if (scan_global_lru(sc)) {
+                       shrink_slab(sc->nr_scanned, sc->gfp_mask, lru_pages);
+                       if (reclaim_state) {
+                               nr_reclaimed += reclaim_state->reclaimed_slab;
+                               reclaim_state->reclaimed_slab = 0;
+                       }
                }
-               total_scanned += sc.nr_scanned;
-               if (nr_reclaimed >= sc.swap_cluster_max) {
-                       ret = 1;
+               total_scanned += sc->nr_scanned;
+               if (nr_reclaimed >= sc->swap_cluster_max) {
+                       ret = nr_reclaimed;
                        goto out;
                }
 
@@ -1169,19 +1398,19 @@ unsigned long try_to_free_pages(struct zone **zones, int order, gfp_t gfp_mask)
                 * that's undesirable in laptop mode, where we *want* lumpy
                 * writeout.  So in laptop mode, write out the whole world.
                 */
-               if (total_scanned > sc.swap_cluster_max +
-                                       sc.swap_cluster_max / 2) {
+               if (total_scanned > sc->swap_cluster_max +
+                                       sc->swap_cluster_max / 2) {
                        wakeup_pdflush(laptop_mode ? 0 : total_scanned);
-                       sc.may_writepage = 1;
+                       sc->may_writepage = 1;
                }
 
                /* Take a nap, wait for some writeback to complete */
-               if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
+               if (sc->nr_scanned && priority < DEF_PRIORITY - 2)
                        congestion_wait(WRITE, HZ/10);
        }
-       /* top priority shrink_caches still had more to do? don't OOM, then */
-       if (!sc.all_unreclaimable)
-               ret = 1;
+       /* top priority shrink_zones still had more to do? don't OOM, then */
+       if (!sc->all_unreclaimable && scan_global_lru(sc))
+               ret = nr_reclaimed;
 out:
        /*
         * Now that we've scanned all the zones at this priority level, note
@@ -1192,17 +1421,63 @@ out:
         */
        if (priority < 0)
                priority = 0;
-       for (i = 0; zones[i] != 0; i++) {
-               struct zone *zone = zones[i];
 
-               if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
-                       continue;
+       if (scan_global_lru(sc)) {
+               for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+
+                       if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+                               continue;
+
+                       zone->prev_priority = priority;
+               }
+       } else
+               mem_cgroup_record_reclaim_priority(sc->mem_cgroup, priority);
+
+       delayacct_freepages_end();
 
-               zone->prev_priority = priority;
-       }
        return ret;
 }
 
+unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
+                                                               gfp_t gfp_mask)
+{
+       struct scan_control sc = {
+               .gfp_mask = gfp_mask,
+               .may_writepage = !laptop_mode,
+               .swap_cluster_max = SWAP_CLUSTER_MAX,
+               .may_swap = 1,
+               .swappiness = vm_swappiness,
+               .order = order,
+               .mem_cgroup = NULL,
+               .isolate_pages = isolate_pages_global,
+       };
+
+       return do_try_to_free_pages(zonelist, &sc);
+}
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR
+
+unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *mem_cont,
+                                               gfp_t gfp_mask)
+{
+       struct scan_control sc = {
+               .may_writepage = !laptop_mode,
+               .may_swap = 1,
+               .swap_cluster_max = SWAP_CLUSTER_MAX,
+               .swappiness = vm_swappiness,
+               .order = 0,
+               .mem_cgroup = mem_cont,
+               .isolate_pages = mem_cgroup_isolate_pages,
+       };
+       struct zonelist *zonelist;
+
+       sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
+                       (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
+       zonelist = NODE_DATA(numa_node_id())->node_zonelists;
+       return do_try_to_free_pages(zonelist, &sc);
+}
+#endif
+
 /*
  * For kswapd, balance_pgdat() will work across all this node's zones until
  * they are all at pages_high.
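
A rough expansion of the gfp construction in try_to_free_mem_cgroup_pages() for a GFP_KERNEL caller (the exact composition of GFP_RECLAIM_MASK is an assumption here; it is defined alongside the other gfp helpers):

/*
 * gfp_mask                      = GFP_KERNEL = __GFP_WAIT | __GFP_IO | __GFP_FS
 * gfp_mask & GFP_RECLAIM_MASK  ~= __GFP_WAIT | __GFP_IO | __GFP_FS
 *                                 (the caller's reclaim-behaviour bits survive)
 * GFP_HIGHUSER_MOVABLE
 *       & ~GFP_RECLAIM_MASK    ~= __GFP_HIGHMEM | __GFP_HARDWALL | __GFP_MOVABLE
 *                                 (placement bits come from the "movable user
 *                                  page" template)
 *
 * So cgroup reclaim may do as much IO/FS work as its caller, while
 * gfp_zone(sc.gfp_mask) selects the widest usable zone on the local node's
 * zonelist obtained from NODE_DATA(numa_node_id()). The cgroup is over its
 * own limit rather than short of any particular zone, so any node will do.
 */
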
@@ -1238,6 +1513,8 @@ static unsigned long balance_pgdat(pg_data_t *pgdat, int order)
                .swap_cluster_max = SWAP_CLUSTER_MAX,
                .swappiness = vm_swappiness,
                .order = order,
+               .mem_cgroup = NULL,
+               .isolate_pages = isolate_pages_global,
        };
        /*
         * temp_priority is used to remember the scanning priority at which
@@ -1274,7 +1551,8 @@ loop_again:
                        if (!populated_zone(zone))
                                continue;
 
-                       if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+                       if (zone_is_all_unreclaimable(zone) &&
+                           priority != DEF_PRIORITY)
                                continue;
 
                        if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1309,7 +1587,8 @@ loop_again:
                        if (!populated_zone(zone))
                                continue;
 
-                       if (zone->all_unreclaimable && priority != DEF_PRIORITY)
+                       if (zone_is_all_unreclaimable(zone) &&
+                                       priority != DEF_PRIORITY)
                                continue;
 
                        if (!zone_watermark_ok(zone, order, zone->pages_high,
@@ -1318,18 +1597,25 @@ loop_again:
                        temp_priority[i] = priority;
                        sc.nr_scanned = 0;
                        note_zone_scanning_priority(zone, priority);
-                       nr_reclaimed += shrink_zone(priority, zone, &sc);
+                       /*
+                        * We put equal pressure on every zone, unless one
+                        * zone has way too many pages free already.
+                        */
+                       if (!zone_watermark_ok(zone, order, 8*zone->pages_high,
+                                               end_zone, 0))
+                               nr_reclaimed += shrink_zone(priority, zone, &sc);
                        reclaim_state->reclaimed_slab = 0;
                        nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
                                                lru_pages);
                        nr_reclaimed += reclaim_state->reclaimed_slab;
                        total_scanned += sc.nr_scanned;
-                       if (zone->all_unreclaimable)
+                       if (zone_is_all_unreclaimable(zone))
                                continue;
                        if (nr_slab == 0 && zone->pages_scanned >=
                                (zone_page_state(zone, NR_ACTIVE)
                                + zone_page_state(zone, NR_INACTIVE)) * 6)
-                                       zone->all_unreclaimable = 1;
+                                       zone_set_flag(zone,
+                                                     ZONE_ALL_UNRECLAIMABLE);
                        /*
                         * If we've done a decent amount of scanning and
                         * the reclaim ratio is low, start doing writepage
@@ -1401,11 +1687,10 @@ static int kswapd(void *p)
        struct reclaim_state reclaim_state = {
                .reclaimed_slab = 0,
        };
-       cpumask_t cpumask;
+       node_to_cpumask_ptr(cpumask, pgdat->node_id);
 
-       cpumask = node_to_cpumask(pgdat->node_id);
-       if (!cpus_empty(cpumask))
-               set_cpus_allowed(tsk, cpumask);
+       if (!cpus_empty(*cpumask))
+               set_cpus_allowed_ptr(tsk, cpumask);
        current->reclaim_state = &reclaim_state;
 
        /*
@@ -1421,6 +1706,7 @@ static int kswapd(void *p)
         * trying to free the first piece of memory in the first place).
         */
        tsk->flags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
+       set_freezable();
 
        order = 0;
        for ( ; ; ) {
@@ -1494,7 +1780,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
                if (!populated_zone(zone))
                        continue;
 
-               if (zone->all_unreclaimable && prio != DEF_PRIORITY)
+               if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
                        continue;
 
                /* For pass = 0 we don't shrink the active list */
@@ -1549,6 +1835,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages)
                .swap_cluster_max = nr_pages,
                .may_writepage = 1,
                .swappiness = vm_swappiness,
+               .isolate_pages = isolate_pages_global,
        };
 
        current->reclaim_state = &reclaim_state;
@@ -1632,15 +1919,16 @@ out:
 static int __devinit cpu_callback(struct notifier_block *nfb,
                                  unsigned long action, void *hcpu)
 {
-       pg_data_t *pgdat;
-       cpumask_t mask;
+       int nid;
 
        if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) {
-               for_each_online_pgdat(pgdat) {
-                       mask = node_to_cpumask(pgdat->node_id);
-                       if (any_online_cpu(mask) != NR_CPUS)
+               for_each_node_state(nid, N_HIGH_MEMORY) {
+                       pg_data_t *pgdat = NODE_DATA(nid);
+                       node_to_cpumask_ptr(mask, pgdat->node_id);
+
+                       if (any_online_cpu(*mask) < nr_cpu_ids)
                                /* One of our CPUs online: restore mask */
-                               set_cpus_allowed(pgdat->kswapd, mask);
+                               set_cpus_allowed_ptr(pgdat->kswapd, mask);
                }
        }
        return NOTIFY_OK;
@@ -1673,7 +1961,7 @@ static int __init kswapd_init(void)
        int nid;
 
        swap_setup();
-       for_each_online_node(nid)
+       for_each_node_state(nid, N_HIGH_MEMORY)
                kswapd_run(nid);
        hotcpu_notifier(cpu_callback, 0);
        return 0;
@@ -1691,7 +1979,7 @@ module_init(kswapd_init)
 int zone_reclaim_mode __read_mostly;
 
 #define RECLAIM_OFF 0
-#define RECLAIM_ZONE (1<<0)    /* Run shrink_cache on the zone */
+#define RECLAIM_ZONE (1<<0)    /* Run shrink_inactive_list on the zone */
 #define RECLAIM_WRITE (1<<1)   /* Writeout pages during reclaim */
 #define RECLAIM_SWAP (1<<2)    /* Swap pages out during reclaim */
 
@@ -1732,6 +2020,7 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                                        SWAP_CLUSTER_MAX),
                .gfp_mask = gfp_mask,
                .swappiness = vm_swappiness,
+               .isolate_pages = isolate_pages_global,
        };
        unsigned long slab_reclaimable;
 
@@ -1793,8 +2082,8 @@ static int __zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 
 int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
 {
-       cpumask_t mask;
        int node_id;
+       int ret;
 
        /*
         * Zone reclaim reclaims unmapped file backed pages and
@@ -1812,15 +2101,13 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
                        <= zone->min_slab_pages)
                return 0;
 
+       if (zone_is_all_unreclaimable(zone))
+               return 0;
+
        /*
-        * Avoid concurrent zone reclaims, do not reclaim in a zone that does
-        * not have reclaimable pages and if we should not delay the allocation
-        * then do not scan.
+        * Do not scan if the allocation should not be delayed.
         */
-       if (!(gfp_mask & __GFP_WAIT) ||
-               zone->all_unreclaimable ||
-               atomic_read(&zone->reclaim_in_progress) > 0 ||
-               (current->flags & PF_MEMALLOC))
+       if (!(gfp_mask & __GFP_WAIT) || (current->flags & PF_MEMALLOC))
                        return 0;
 
        /*
@@ -1830,9 +2117,14 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order)
         * as wide as possible.
         */
        node_id = zone_to_nid(zone);
-       mask = node_to_cpumask(node_id);
-       if (!cpus_empty(mask) && node_id != numa_node_id())
+       if (node_state(node_id, N_CPU) && node_id != numa_node_id())
                return 0;
-       return __zone_reclaim(zone, gfp_mask, order);
+
+       if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
+               return 0;
+       ret = __zone_reclaim(zone, gfp_mask, order);
+       zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);
+
+       return ret;
 }
 #endif
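
The tail of zone_reclaim() replaces the old zone->reclaim_in_progress atomic (whose increment and decrement the earlier shrink_zone() hunk removed) with a per-zone flag used as a try-lock, so concurrent allocators cannot pile into __zone_reclaim() for the same zone. The idiom in isolation (zone_test_and_set_flag()/zone_clear_flag() operate on zone->flags and come from the ZONE_ALL_UNRECLAIMABLE changes this diff builds on):

/* Sketch of the try-lock idiom used above; not additional code. */
static int zone_reclaim_locked(struct zone *zone, gfp_t gfp_mask, unsigned int order)
{
	int ret;

	if (zone_test_and_set_flag(zone, ZONE_RECLAIM_LOCKED))
		return 0;	/* someone else is already reclaiming this zone */

	ret = __zone_reclaim(zone, gfp_mask, order);
	zone_clear_flag(zone, ZONE_RECLAIM_LOCKED);	/* release */

	return ret;
}
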