memcg: mem+swap controller Kconfig
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 49234d9..59dd8c1 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -21,6 +21,7 @@
 #include <linux/memcontrol.h>
 #include <linux/cgroup.h>
 #include <linux/mm.h>
+#include <linux/pagemap.h>
 #include <linux/smp.h>
 #include <linux/page-flags.h>
 #include <linux/backing-dev.h>
 struct cgroup_subsys mem_cgroup_subsys __read_mostly;
 #define MEM_CGROUP_RECLAIM_RETRIES     5
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
+int do_swap_account __read_mostly;
+static int really_do_swap_account __initdata = 1; /* remembers the boot option */
+#else
+#define do_swap_account                (0)
+#endif
+
+
 /*
  * Statistics for memory cgroup.
  */
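
When CONFIG_CGROUP_MEM_RES_CTLR_SWAP is not set, do_swap_account is the compile-time constant (0), so branches guarded by it are discarded by the compiler; when it is set, the flag is raised at boot by enable_swap_cgroup() further down. A minimal sketch of how later code can use the guard (swap_accounting_hook() is a hypothetical name, not part of this patch):

    /* Hypothetical user of the do_swap_account guard; with the config
     * option off, this whole branch is constant-folded away. */
    static void swap_accounting_hook(struct page *page)
    {
            if (!do_swap_account)
                    return;
            /* swap-ownership recording would go here (later patches) */
    }
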
@@ -60,7 +70,7 @@ struct mem_cgroup_stat_cpu {
 } ____cacheline_aligned_in_smp;
 
 struct mem_cgroup_stat {
-       struct mem_cgroup_stat_cpu cpustat[NR_CPUS];
+       struct mem_cgroup_stat_cpu cpustat[0];
 };
 
 /*
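
Shrinking cpustat[] from NR_CPUS entries to a zero-length trailing array means each mem_cgroup carries only nr_cpu_ids per-cpu slots, sized at allocation time by mem_cgroup_size() later in this diff; it is also why the stat member must remain the last field of struct mem_cgroup. A sketch of how such a trailing array is sized and indexed (stat_add() is illustrative, not the file's exact helper):

    /* Illustrative sizing/indexing of the trailing per-cpu array. */
    static inline size_t memcg_alloc_size(void)
    {
            return sizeof(struct mem_cgroup) +
                   nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
    }

    static inline void stat_add(struct mem_cgroup *mem, int idx, s64 val)
    {
            /* valid because cpustat[] overlays the allocation's tail */
            mem->stat.cpustat[smp_processor_id()].count[idx] += val;
    }
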
@@ -129,17 +139,17 @@ struct mem_cgroup {
 
        int     prev_priority;  /* for recording reclaim priority */
        /*
-        * statistics.
+        * statistics. This must be placed at the end of memcg.
         */
        struct mem_cgroup_stat stat;
 };
-static struct mem_cgroup init_mem_cgroup;
 
 enum charge_type {
        MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
        MEM_CGROUP_CHARGE_TYPE_MAPPED,
        MEM_CGROUP_CHARGE_TYPE_SHMEM,   /* used by page migration of shmem */
        MEM_CGROUP_CHARGE_TYPE_FORCE,   /* used by force_empty */
+       MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
        NR_CHARGE_TYPE,
 };
 
@@ -781,6 +791,33 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
                                MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
 }
 
+#ifdef CONFIG_SWAP
+int mem_cgroup_cache_charge_swapin(struct page *page,
+                       struct mm_struct *mm, gfp_t mask, bool locked)
+{
+       int ret = 0;
+
+       if (mem_cgroup_subsys.disabled)
+               return 0;
+       if (unlikely(!mm))
+               mm = &init_mm;
+       if (!locked)
+               lock_page(page);
+       /*
+        * If not locked, the page can be dropped from the swap cache
+        * before we reach here, so recheck PageSwapCache() below.
+        */
+       if (PageSwapCache(page)) {
+               ret = mem_cgroup_charge_common(page, mm, mask,
+                               MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL);
+       }
+       if (!locked)
+               unlock_page(page);
+
+       return ret;
+}
+#endif
+
 void mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr)
 {
        struct page_cgroup *pc;
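
The locked argument tells mem_cgroup_cache_charge_swapin() whether the caller already holds the page lock; if not, the helper locks the page itself and re-checks PageSwapCache(), since an unlocked page may leave the swap cache at any moment. A hypothetical call site on a swapin path (the error handling is illustrative):

    /* Hypothetical swapin call site; the page comes back unlocked. */
    static int charge_swapin_page(swp_entry_t entry)
    {
            struct page *page = lookup_swap_cache(entry);

            if (!page)
                    return -ENOENT;
            if (mem_cgroup_cache_charge_swapin(page, current->mm,
                                               GFP_KERNEL, false)) {
                    page_cache_release(page);
                    return -ENOMEM;
            }
            return 0;
    }
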
@@ -818,6 +855,9 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
        if (mem_cgroup_subsys.disabled)
                return;
 
+       if (PageSwapCache(page))
+               return;
+
        /*
         * Check if our page_cgroup is valid
         */
@@ -826,12 +866,26 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
                return;
 
        lock_page_cgroup(pc);
-       if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page))
-            || !PageCgroupUsed(pc)) {
-               /* This happens at race in zap_pte_range() and do_swap_page()*/
-               unlock_page_cgroup(pc);
-               return;
+
+       if (!PageCgroupUsed(pc))
+               goto unlock_out;
+
+       switch (ctype) {
+       case MEM_CGROUP_CHARGE_TYPE_MAPPED:
+               if (page_mapped(page))
+                       goto unlock_out;
+               break;
+       case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
+               if (!PageAnon(page)) {  /* Shared memory */
+                       if (page->mapping && !page_is_file_cache(page))
+                               goto unlock_out;
+               } else if (page_mapped(page)) /* Anon */
+                               goto unlock_out;
+               break;
+       default:
+               break;
        }
+
        ClearPageCgroupUsed(pc);
        mem = pc->mem_cgroup;
 
@@ -845,6 +899,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
        css_put(&mem->css);
 
        return;
+
+unlock_out:
+       unlock_page_cgroup(pc);
+       return;
 }
 
 void mem_cgroup_uncharge_page(struct page *page)
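
Recasting the old compound condition as a switch makes the per-type rule explicit: a MAPPED uncharge proceeds only once the page has no mappings left, and a SWAPOUT uncharge skips shmem pages still on the radix-tree as well as anon pages that are still mapped. The same rules, distilled into a hypothetical predicate (not part of the patch):

    /* Hypothetical restatement of the switch in the hunk above. */
    static bool uncharge_applies(struct page *page, enum charge_type ctype)
    {
            switch (ctype) {
            case MEM_CGROUP_CHARGE_TYPE_MAPPED:
                    return !page_mapped(page);
            case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
                    if (PageAnon(page))
                            return !page_mapped(page);
                    /* shmem still in swap/page cache: keep the charge */
                    return !(page->mapping && !page_is_file_cache(page));
            default:
                    return true;
            }
    }
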
@@ -864,6 +922,11 @@ void mem_cgroup_uncharge_cache_page(struct page *page)
        __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE);
 }
 
+void mem_cgroup_uncharge_swapcache(struct page *page)
+{
+       __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_SWAPOUT);
+}
+
 /*
  * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
  * page belongs to.
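
Note the interplay with the PageSwapCache() early return added above: __mem_cgroup_uncharge_common() refuses to touch a page that is still in the swap cache, so this hook only has an effect once the page has actually been removed from it. A hypothetical call site (the companion swap-cache change is not part of this diff):

    /* Hypothetical: uncharge only after the page leaves the swap cache. */
    static void drop_swapcache_page(struct page *page)
    {
            delete_from_swap_cache(page);           /* clears PageSwapCache */
            mem_cgroup_uncharge_swapcache(page);    /* now the charge can go */
    }
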
@@ -921,7 +984,7 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
                ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
 
        /* unused page is not on radix-tree now. */
-       if (unused && ctype != MEM_CGROUP_CHARGE_TYPE_MAPPED)
+       if (unused)
                __mem_cgroup_uncharge_common(unused, ctype);
 
        pc = lookup_page_cgroup(target);
@@ -1063,21 +1126,27 @@ static int mem_cgroup_force_empty_list(struct mem_cgroup *mem,
  * make mem_cgroup's charge to be 0 if there is no task.
  * This enables deleting this mem_cgroup.
  */
-static int mem_cgroup_force_empty(struct mem_cgroup *mem)
+static int mem_cgroup_force_empty(struct mem_cgroup *mem, bool free_all)
 {
        int ret;
        int node, zid, shrink;
        int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+       struct cgroup *cgrp = mem->css.cgroup;
 
        css_get(&mem->css);
 
        shrink = 0;
+       /* should we free all pages? */
+       if (free_all)
+               goto try_to_free;
 move_account:
        while (mem->res.usage > 0) {
                ret = -EBUSY;
-               if (atomic_read(&mem->css.cgroup->count) > 0)
+               if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children))
+                       goto out;
+               ret = -EINTR;
+               if (signal_pending(current))
                        goto out;
-
                /* This is for making all *used* pages to be on LRU. */
                lru_add_drain_all();
                ret = 0;
@@ -1107,19 +1176,29 @@ out:
        return ret;
 
 try_to_free:
-       /* returns EBUSY if we come here twice. */
-       if (shrink)  {
+       /* returns EBUSY if there is a task or if we come here twice. */
+       if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children) || shrink) {
                ret = -EBUSY;
                goto out;
        }
+       /* we call try-to-free pages to make this cgroup empty */
+       lru_add_drain_all();
        /* try to free all pages in this cgroup */
        shrink = 1;
        while (nr_retries && mem->res.usage > 0) {
                int progress;
+
+               if (signal_pending(current)) {
+                       ret = -EINTR;
+                       goto out;
+               }
                progress = try_to_free_mem_cgroup_pages(mem,
                                                  GFP_HIGHUSER_MOVABLE);
-               if (!progress)
+               if (!progress) {
                        nr_retries--;
+                       /* maybe some writeback is necessary */
+                       congestion_wait(WRITE, HZ/10);
+               }
 
        }
        /* try move_account...there may be some *locked* pages. */
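
The freeing loop is the usual bounded-retry reclaim idiom: bail out on a pending signal, spend a retry only when reclaim makes no progress, and throttle with congestion_wait() so writeback can catch up. Its skeleton, pulled out for clarity (free_all_pages() is a hypothetical name):

    /* Skeleton of the try_to_free loop above (hypothetical helper). */
    static int free_all_pages(struct mem_cgroup *mem)
    {
            int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;

            while (nr_retries && mem->res.usage > 0) {
                    if (signal_pending(current))
                            return -EINTR;
                    if (!try_to_free_mem_cgroup_pages(mem,
                                                      GFP_HIGHUSER_MOVABLE)) {
                            nr_retries--;
                            congestion_wait(WRITE, HZ / 10);
                    }
            }
            return mem->res.usage > 0 ? -EBUSY : 0;
    }
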
@@ -1129,6 +1208,12 @@ try_to_free:
        goto out;
 }
 
+int mem_cgroup_force_empty_write(struct cgroup *cont, unsigned int event)
+{
+       return mem_cgroup_force_empty(mem_cgroup_from_cont(cont), true);
+}
+
+
 static u64 mem_cgroup_read(struct cgroup *cont, struct cftype *cft)
 {
        return res_counter_read_u64(&mem_cgroup_from_cont(cont)->res,
@@ -1226,6 +1311,7 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft,
        return 0;
 }
 
+
 static struct cftype mem_cgroup_files[] = {
        {
                .name = "usage_in_bytes",
@@ -1254,6 +1340,10 @@ static struct cftype mem_cgroup_files[] = {
                .name = "stat",
                .read_map = mem_control_stat_show,
        },
+       {
+               .name = "force_empty",
+               .trigger = mem_cgroup_force_empty_write,
+       },
 };
 
 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
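
With the cftype registered, each group directory gains a memory.force_empty file whose write handler is mem_cgroup_force_empty_write(); any write triggers a full drain, returning -EBUSY while tasks or children remain and -EINTR if interrupted. A userspace sketch (the /cgroups mount point and group layout are assumptions):

    /* Hypothetical userspace trigger for memory.force_empty. */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int force_empty(const char *group)
    {
            char path[256];
            int fd, ok;

            snprintf(path, sizeof(path),
                     "/cgroups/%s/memory.force_empty", group);
            fd = open(path, O_WRONLY);
            if (fd < 0)
                    return -1;
            ok = (write(fd, "0", 1) == 1);  /* may fail: EBUSY, EINTR */
            close(fd);
            return ok ? 0 : -1;
    }
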
@@ -1293,55 +1383,72 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
        kfree(mem->info.nodeinfo[node]);
 }
 
+static int mem_cgroup_size(void)
+{
+       int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
+       return sizeof(struct mem_cgroup) + cpustat_size;
+}
+
 static struct mem_cgroup *mem_cgroup_alloc(void)
 {
        struct mem_cgroup *mem;
+       int size = mem_cgroup_size();
 
-       if (sizeof(*mem) < PAGE_SIZE)
-               mem = kmalloc(sizeof(*mem), GFP_KERNEL);
+       if (size < PAGE_SIZE)
+               mem = kmalloc(size, GFP_KERNEL);
        else
-               mem = vmalloc(sizeof(*mem));
+               mem = vmalloc(size);
 
        if (mem)
-               memset(mem, 0, sizeof(*mem));
+               memset(mem, 0, size);
        return mem;
 }
 
 static void mem_cgroup_free(struct mem_cgroup *mem)
 {
-       if (sizeof(*mem) < PAGE_SIZE)
+       if (mem_cgroup_size() < PAGE_SIZE)
                kfree(mem);
        else
                vfree(mem);
 }
 
 
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+static void __init enable_swap_cgroup(void)
+{
+       if (!mem_cgroup_subsys.disabled && really_do_swap_account)
+               do_swap_account = 1;
+}
+#else
+static void __init enable_swap_cgroup(void)
+{
+}
+#endif
+
 static struct cgroup_subsys_state *
 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 {
        struct mem_cgroup *mem;
        int node;
 
-       if (unlikely((cont->parent) == NULL)) {
-               mem = &init_mem_cgroup;
-       } else {
-               mem = mem_cgroup_alloc();
-               if (!mem)
-                       return ERR_PTR(-ENOMEM);
-       }
+       mem = mem_cgroup_alloc();
+       if (!mem)
+               return ERR_PTR(-ENOMEM);
 
        res_counter_init(&mem->res);
 
        for_each_node_state(node, N_POSSIBLE)
                if (alloc_mem_cgroup_per_zone_info(mem, node))
                        goto free_out;
+       /* root ? */
+       if (cont->parent == NULL)
+               enable_swap_cgroup();
 
        return &mem->css;
 free_out:
        for_each_node_state(node, N_POSSIBLE)
                free_mem_cgroup_per_zone_info(mem, node);
-       if (cont->parent != NULL)
-               mem_cgroup_free(mem);
+       mem_cgroup_free(mem);
        return ERR_PTR(-ENOMEM);
 }
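
With init_mem_cgroup gone, every group, the root included, comes from mem_cgroup_alloc(), and since the size now depends on nr_cpu_ids the kmalloc-or-vmalloc decision keys off mem_cgroup_size() rather than sizeof(*mem). The idiom in isolation (alloc_sized() is a hypothetical name):

    /* Sub-page allocations use kmalloc; larger ones fall back to
     * vmalloc, which tolerates big sizes via virtually contiguous memory. */
    static void *alloc_sized(size_t size)
    {
            void *p = (size < PAGE_SIZE) ? kmalloc(size, GFP_KERNEL)
                                         : vmalloc(size);
            if (p)
                    memset(p, 0, size);
            return p;
    }
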
 
@@ -1349,7 +1456,7 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
                                        struct cgroup *cont)
 {
        struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
-       mem_cgroup_force_empty(mem);
+       mem_cgroup_force_empty(mem, false);
 }
 
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
@@ -1407,3 +1514,13 @@ struct cgroup_subsys mem_cgroup_subsys = {
        .attach = mem_cgroup_move_task,
        .early_init = 0,
 };
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_SWAP
+
+static int __init disable_swap_account(char *s)
+{
+       really_do_swap_account = 0;
+       return 1;
+}
+__setup("noswapaccount", disable_swap_account);
+#endif
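
__setup() registers a kernel-command-line hook: booting with noswapaccount clears really_do_swap_account before the root cgroup is created, so enable_swap_cgroup() leaves do_swap_account at 0. Because the flag lives in __initdata it must never be read after boot; only do_swap_account survives. The idiom in general form (nofoo and foo_enabled are hypothetical):

    /* The __setup() idiom: a boot parameter flips an __initdata flag;
     * returning 1 marks the option as consumed. */
    static int foo_enabled __initdata = 1;

    static int __init disable_foo(char *s)
    {
            foo_enabled = 0;
            return 1;
    }
    __setup("nofoo", disable_foo);
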