usb: atm: speedtch: use new hex_to_bin() method

[safe/jmp/linux-2.6] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 649df43..c8569bc 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -63,8 +63,15 @@ static int really_do_swap_account __initdata = 1; /* for remember boot option*/
  #define do_swap_account                (0)
  #endif
  
-#define SOFTLIMIT_EVENTS_THRESH (1000)
-#define THRESHOLDS_EVENTS_THRESH (100)
+/*
+ * Per memcg event counter is incremented at every pagein/pageout. This counter
+ * is used to trigger some periodic events. This is straightforward and better
+ * than using jiffies etc. to handle periodic memcg events.
+ *
+ * These values will be used as !((event) & ((1 <<(thresh)) - 1))
+ */
+#define THRESHOLDS_EVENTS_THRESH (7) /* once in 128 */
+#define SOFTLIMIT_EVENTS_THRESH (10) /* once in 1024 */
  
  /*
   * Statistics for memory cgroup.
@@ -79,64 +86,15 @@ enum mem_cgroup_stat_index {
         MEM_CGROUP_STAT_PGPGIN_COUNT,   /* # of pages paged in */
         MEM_CGROUP_STAT_PGPGOUT_COUNT,  /* # of pages paged out */
         MEM_CGROUP_STAT_SWAPOUT, /* # of pages, swapped out */
-       MEM_CGROUP_STAT_SOFTLIMIT, /* decrements on each page in/out.
-                                       used by soft limit implementation */
-       MEM_CGROUP_STAT_THRESHOLDS, /* decrements on each page in/out.
-                                       used by threshold implementation */
+       MEM_CGROUP_EVENTS,      /* incremented at every  pagein/pageout */
  
         MEM_CGROUP_STAT_NSTATS,
  };
  
  struct mem_cgroup_stat_cpu {
         s64 count[MEM_CGROUP_STAT_NSTATS];
-} ____cacheline_aligned_in_smp;
-
-struct mem_cgroup_stat {
-       struct mem_cgroup_stat_cpu cpustat[0];
  };
  
-static inline void
-__mem_cgroup_stat_set_safe(struct mem_cgroup_stat_cpu *stat,
-                               enum mem_cgroup_stat_index idx, s64 val)
-{
-       stat->count[idx] = val;
-}
-
-static inline s64
-__mem_cgroup_stat_read_local(struct mem_cgroup_stat_cpu *stat,
-                               enum mem_cgroup_stat_index idx)
-{
-       return stat->count[idx];
-}
-
-/*
- * For accounting under irq disable, no need for increment preempt count.
- */
-static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat,
-               enum mem_cgroup_stat_index idx, int val)
-{
-       stat->count[idx] += val;
-}
-
-static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat,
-               enum mem_cgroup_stat_index idx)
-{
-       int cpu;
-       s64 ret = 0;
-       for_each_possible_cpu(cpu)
-               ret += stat->cpustat[cpu].count[idx];
-       return ret;
-}
-
-static s64 mem_cgroup_local_usage(struct mem_cgroup_stat *stat)
-{
-       s64 ret;
-
-       ret = mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_CACHE);
-       ret += mem_cgroup_read_stat(stat, MEM_CGROUP_STAT_RSS);
-       return ret;
-}
-
  /*
   * per-zone information in memory controller.
   */
@@ -200,7 +158,6 @@ struct mem_cgroup_threshold_ary {
         struct mem_cgroup_threshold entries[0];
  };
  
-static bool mem_cgroup_threshold_check(struct mem_cgroup *mem);
  static void mem_cgroup_threshold(struct mem_cgroup *mem);
  
  /*
@@ -246,7 +203,7 @@ struct mem_cgroup {
          * Should the accounting and control be hierarchical, per subtree?
          */
         bool use_hierarchy;
-       unsigned long   last_oom_jiffies;
+       atomic_t        oom_lock;
         atomic_t        refcnt;
  
         unsigned int    swappiness;
@@ -270,9 +227,9 @@ struct mem_cgroup {
         unsigned long   move_charge_at_immigrate;
  
         /*
-        * statistics. This must be placed at the end of memcg.
+        * percpu counter.
          */
-       struct mem_cgroup_stat stat;
+       struct mem_cgroup_stat_cpu *stat;
  };
  
  /* Stuffs for move charges at task migration. */
@@ -438,24 +395,6 @@ mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
         spin_unlock(&mctz->lock);
  }
  
-static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
-{
-       bool ret = false;
-       int cpu;
-       s64 val;
-       struct mem_cgroup_stat_cpu *cpustat;
-
-       cpu = get_cpu();
-       cpustat = &mem->stat.cpustat[cpu];
-       val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_SOFTLIMIT);
-       if (unlikely(val < 0)) {
-               __mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT,
-                               SOFTLIMIT_EVENTS_THRESH);
-               ret = true;
-       }
-       put_cpu();
-       return ret;
-}
  
  static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
  {
@@ -549,17 +488,31 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
         return mz;
  }
  
+static s64 mem_cgroup_read_stat(struct mem_cgroup *mem,
+               enum mem_cgroup_stat_index idx)
+{
+       int cpu;
+       s64 val = 0;    /* running total of counter idx */
+
+       for_each_possible_cpu(cpu)      /* sum over every possible CPU's copy */
+               val += per_cpu(mem->stat->count[idx], cpu);
+       return val;
+}
+
+static s64 mem_cgroup_local_usage(struct mem_cgroup *mem)
+{
+       s64 ret;        /* RSS + CACHE counters of this memcg, summed over CPUs */
+
+       ret = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
+       ret += mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
+       return ret;
+}
+
  static void mem_cgroup_swap_statistics(struct mem_cgroup *mem,
                                          bool charge)
  {
         int val = (charge) ? 1 : -1;
-       struct mem_cgroup_stat *stat = &mem->stat;
-       struct mem_cgroup_stat_cpu *cpustat;
-       int cpu = get_cpu();
-
-       cpustat = &stat->cpustat[cpu];
-       __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SWAPOUT, val);
-       put_cpu();
+       this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_SWAPOUT], val);
  }
  
  static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
@@ -567,26 +520,21 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *mem,
                                          bool charge)
  {
         int val = (charge) ? 1 : -1;
-       struct mem_cgroup_stat *stat = &mem->stat;
-       struct mem_cgroup_stat_cpu *cpustat;
-       int cpu = get_cpu();
  
-       cpustat = &stat->cpustat[cpu];
+       preempt_disable();
+
         if (PageCgroupCache(pc))
-               __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val);
+               __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_CACHE], val);
         else
-               __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val);
+               __this_cpu_add(mem->stat->count[MEM_CGROUP_STAT_RSS], val);
  
         if (charge)
-               __mem_cgroup_stat_add_safe(cpustat,
-                               MEM_CGROUP_STAT_PGPGIN_COUNT, 1);
+               __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGIN_COUNT]);
         else
-               __mem_cgroup_stat_add_safe(cpustat,
-                               MEM_CGROUP_STAT_PGPGOUT_COUNT, 1);
-       __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_SOFTLIMIT, -1);
-       __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS, -1);
+               __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_PGPGOUT_COUNT]);
+       __this_cpu_inc(mem->stat->count[MEM_CGROUP_EVENTS]);
  
-       put_cpu();
+       preempt_enable();
  }
  
  static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
@@ -604,6 +552,29 @@ static unsigned long mem_cgroup_get_local_zonestat(struct mem_cgroup *mem,
         return total;
  }
  
+static bool __memcg_event_check(struct mem_cgroup *mem, int event_mask_shift)
+{
+       s64 val;
+
+       val = this_cpu_read(mem->stat->count[MEM_CGROUP_EVENTS]);       /* this CPU's event counter only */
+
+       return !(val & ((1 << event_mask_shift) - 1));  /* true when the low event_mask_shift bits are all zero */
+}
+
+/*
+ * Check events in order.
+ *
+ * The threshold check runs first. Only when it fires is
+ * mem_cgroup_threshold() called and the soft limit check tried; if that
+ * one fires too, mem_cgroup_update_tree() is called for @mem and @page.
+ */
+static void memcg_check_events(struct mem_cgroup *mem, struct page *page)
+{
+       /* threshold event is triggered in finer grain than soft limit */
+       if (unlikely(__memcg_event_check(mem, THRESHOLDS_EVENTS_THRESH))) {
+               mem_cgroup_threshold(mem);
+               if (unlikely(__memcg_event_check(mem, SOFTLIMIT_EVENTS_THRESH)))
+                       mem_cgroup_update_tree(mem, page);
+       }
+}
+
  static struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
  {
         return container_of(cgroup_subsys_state(cont,
@@ -1070,7 +1041,7 @@ static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
  }
  
  /**
- * mem_cgroup_print_mem_info: Called from OOM with tasklist_lock held in read mode.
+ * mem_cgroup_print_oom_info: Called from OOM with tasklist_lock held in read mode.
   * @memcg: The memory cgroup that went over limit
   * @p: Task that is going to be killed
   *
@@ -1244,7 +1215,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
                                 }
                         }
                 }
-               if (!mem_cgroup_local_usage(&victim->stat)) {
+               if (!mem_cgroup_local_usage(victim)) {
                         /* this cgroup's local usage == 0 */
                         css_put(&victim->css);
                         continue;
@@ -1275,32 +1246,102 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
         return total;
  }
  
-bool mem_cgroup_oom_called(struct task_struct *task)
+static int mem_cgroup_oom_lock_cb(struct mem_cgroup *mem, void *data)
  {
-       bool ret = false;
-       struct mem_cgroup *mem;
-       struct mm_struct *mm;
+       int *val = (int *)data;
+       int x;
+       /*
+        * Logically, we can stop scanning immediately when we find
+        * a memcg that is already locked. But considering unlock ops and
+        * creation/removal of memcg, scanning all of them is the simple
+        * operation. *val ends up holding the largest oom_lock count seen.
+        */
+       x = atomic_inc_return(&mem->oom_lock);
+       *val = max(x, *val);
+       return 0;
+}
+/*
+ * Check whether the OOM-Killer is already running under our hierarchy.
+ * mem_cgroup_oom_lock_cb() increments the oom_lock count of each memcg it is
+ * called on and records the largest resulting count. Return true only when
+ * that largest count is 1, i.e. nobody else held the lock; if someone is
+ * already running, return false.
+ */
+static bool mem_cgroup_oom_lock(struct mem_cgroup *mem)
+{
+       int lock_count = 0;
  
-       rcu_read_lock();
-       mm = task->mm;
-       if (!mm)
-               mm = &init_mm;
-       mem = mem_cgroup_from_task(rcu_dereference(mm->owner));
-       if (mem && time_before(jiffies, mem->last_oom_jiffies + HZ/10))
-               ret = true;
-       rcu_read_unlock();
-       return ret;
+       mem_cgroup_walk_tree(mem, &lock_count, mem_cgroup_oom_lock_cb);
+
+       if (lock_count == 1)
+               return true;
+       return false;
  }
  
-static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
+static int mem_cgroup_oom_unlock_cb(struct mem_cgroup *mem, void *data)
  {
-       mem->last_oom_jiffies = jiffies;
+       /*
+        * When a new child is created while the hierarchy is under oom,
+        * mem_cgroup_oom_lock() may not have been called for it, so its
+        * count can still be 0. Use atomic_add_unless() so that the count
+        * is decremented only when it is not already 0.
+        */
+       atomic_add_unless(&mem->oom_lock, -1, 0);
         return 0;
  }
  
-static void record_last_oom(struct mem_cgroup *mem)
+static void mem_cgroup_oom_unlock(struct mem_cgroup *mem)
+{
+       mem_cgroup_walk_tree(mem, NULL, mem_cgroup_oom_unlock_cb);      /* callback needs no data, hence NULL */
+}
+
+static DEFINE_MUTEX(memcg_oom_mutex);
+static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
+
+/*
+ * Try to call the OOM killer for the hierarchy under @mem.
+ *
+ * If we get the OOM lock we run the OOM killer ourselves; otherwise we sleep
+ * (TASK_KILLABLE) on memcg_oom_waitq until whoever holds the lock wakes us.
+ * Either way the lock count is dropped and all waiters are woken afterwards.
+ *
+ * Returns false if we should exit the memory-reclaim loop (current has
+ * TIF_MEMDIE set or a fatal signal pending), true if the caller should retry.
+ */
+bool mem_cgroup_handle_oom(struct mem_cgroup *mem, gfp_t mask)
  {
-       mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+       DEFINE_WAIT(wait);
+       bool locked;
+
+       /* At first, try to OOM lock hierarchy under mem. */
+       mutex_lock(&memcg_oom_mutex);
+       locked = mem_cgroup_oom_lock(mem);
+       /*
+        * Even if signal_pending(), we can't quit charge() loop without
+        * accounting. So, UNINTERRUPTIBLE is appropriate. But SIGKILL
+        * under OOM is always welcomed, use TASK_KILLABLE here.
+        */
+       if (!locked)
+               prepare_to_wait(&memcg_oom_waitq, &wait, TASK_KILLABLE);
+       mutex_unlock(&memcg_oom_mutex);
+
+       if (locked)
+               mem_cgroup_out_of_memory(mem, mask);
+       else {
+               schedule();
+               finish_wait(&memcg_oom_waitq, &wait);
+       }
+       mutex_lock(&memcg_oom_mutex);
+       mem_cgroup_oom_unlock(mem);
+       /*
+        * Here, we use a global waitq. Would a more fine grained waitq help?
+        * Assume following hierarchy.
+        * A/
+        *   01
+        *   02
+        * assume OOM happens both in A and 01 at the same time. They are
+        * mutually exclusive by lock. (kill in 01 helps A.)
+        * When we use per memcg waitq, we have to wake up waiters on A and 02
+        * in addition to waiters on 01. We use global waitq for avoiding mess.
+        * It will not be a big problem.
+        * (And a task may be moved to other groups while it's waiting for OOM.)
+        */
+       wake_up_all(&memcg_oom_waitq);
+       mutex_unlock(&memcg_oom_mutex);
+
+       if (test_thread_flag(TIF_MEMDIE) || fatal_signal_pending(current))
+               return false;
+       /*
+        * Give chance to dying process. We are TASK_RUNNING here, and plain
+        * schedule_timeout() returns immediately unless the task state has
+        * been set, so use the variant that sets the state before sleeping.
+        */
+       schedule_timeout_uninterruptible(1);
+       return true;
  }
  
  /*
@@ -1310,9 +1351,6 @@ static void record_last_oom(struct mem_cgroup *mem)
  void mem_cgroup_update_file_mapped(struct page *page, int val)
  {
         struct mem_cgroup *mem;
-       struct mem_cgroup_stat *stat;
-       struct mem_cgroup_stat_cpu *cpustat;
-       int cpu;
         struct page_cgroup *pc;
  
         pc = lookup_page_cgroup(page);
@@ -1321,20 +1359,20 @@ void mem_cgroup_update_file_mapped(struct page *page, int val)
  
         lock_page_cgroup(pc);
         mem = pc->mem_cgroup;
-       if (!mem)
-               goto done;
-
-       if (!PageCgroupUsed(pc))
+       if (!mem || !PageCgroupUsed(pc))
                 goto done;
  
         /*
-        * Preemption is already disabled, we don't need get_cpu()
+        * Preemption is already disabled. We can use __this_cpu_xxx
          */
-       cpu = smp_processor_id();
-       stat = &mem->stat;
-       cpustat = &stat->cpustat[cpu];
+       if (val > 0) {
+               __this_cpu_inc(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+               SetPageCgroupFileMapped(pc);
+       } else {
+               __this_cpu_dec(mem->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+               ClearPageCgroupFileMapped(pc);
+       }
  
-       __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED, val);
  done:
         unlock_page_cgroup(pc);
  }
@@ -1400,7 +1438,7 @@ static void drain_local_stock(struct work_struct *dummy)
  
  /*
   * Cache charges(val) which is from res_counter, to local per_cpu area.
- * This will be consumed by consumt_stock() function, later.
+ * This will be consumed by consume_stock() function, later.
   */
  static void refill_stock(struct mem_cgroup *mem, int val)
  {
@@ -1471,19 +1509,21 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
   * oom-killer can be invoked.
   */
  static int __mem_cgroup_try_charge(struct mm_struct *mm,
-                       gfp_t gfp_mask, struct mem_cgroup **memcg,
-                       bool oom, struct page *page)
+                       gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
  {
         struct mem_cgroup *mem, *mem_over_limit;
         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
         struct res_counter *fail_res;
         int csize = CHARGE_SIZE;
  
-       if (unlikely(test_thread_flag(TIF_MEMDIE))) {
-               /* Don't account this! */
-               *memcg = NULL;
-               return 0;
-       }
+       /*
+        * Unlike the global VM's OOM-kill, we're not in memory shortage
+        * at the system level. So, allow a dying process to go ahead in
+        * addition to a MEMDIE process.
+        */
+       if (unlikely(test_thread_flag(TIF_MEMDIE)
+                    || fatal_signal_pending(current)))
+               goto bypass;
  
         /*
          * We always charge the cgroup the mm_struct belongs to.
@@ -1510,7 +1550,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                 unsigned long flags = 0;
  
                 if (consume_stock(mem))
-                       goto charged;
+                       goto done;
  
                 ret = res_counter_charge(&mem->res, csize, &fail_res);
                 if (likely(!ret)) {
@@ -1561,7 +1601,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                          * There is a small race that "from" or "to" can be
                          * freed by rmdir, so we use css_tryget().
                          */
-                       rcu_read_lock();
                         from = mc.from;
                         to = mc.to;
                         if (from && css_tryget(&from->css)) {
@@ -1582,7 +1621,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                                         do_continue = (to == mem_over_limit);
                                 css_put(&to->css);
                         }
-                       rcu_read_unlock();
                         if (do_continue) {
                                 DEFINE_WAIT(wait);
                                 prepare_to_wait(&mc.waitq, &wait,
@@ -1596,29 +1634,27 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
                 }
  
                 if (!nr_retries--) {
-                       if (oom) {
-                               mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
-                               record_last_oom(mem_over_limit);
+                       if (!oom)
+                               goto nomem;
+                       if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
+                               nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+                               continue;
                         }
-                       goto nomem;
+                       /* When we reach here, current task is dying. */
+                       css_put(&mem->css);
+                       goto bypass;
                 }
         }
         if (csize > PAGE_SIZE)
                 refill_stock(mem, csize - PAGE_SIZE);
-charged:
-       /*
-        * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
-        * if they exceeds softlimit.
-        */
-       if (page && mem_cgroup_soft_limit_check(mem))
-               mem_cgroup_update_tree(mem, page);
  done:
-       if (mem_cgroup_threshold_check(mem))
-               mem_cgroup_threshold(mem);
         return 0;
  nomem:
         css_put(&mem->css);
         return -ENOMEM;
+bypass:
+       *memcg = NULL;
+       return 0;
  }
  
  /*
@@ -1738,6 +1774,12 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
         mem_cgroup_charge_statistics(mem, pc, true);
  
         unlock_page_cgroup(pc);
+       /*
+        * "charge_statistics" updated the event counter, so check it now.
+        * Insert the ancestor (and the ancestor's ancestors) into the softlimit
+        * RB-tree if they exceed the softlimit.
+        */
+       memcg_check_events(mem, pc->page);
  }
  
  /**
@@ -1760,31 +1802,18 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
  static void __mem_cgroup_move_account(struct page_cgroup *pc,
         struct mem_cgroup *from, struct mem_cgroup *to, bool uncharge)
  {
-       struct page *page;
-       int cpu;
-       struct mem_cgroup_stat *stat;
-       struct mem_cgroup_stat_cpu *cpustat;
-
         VM_BUG_ON(from == to);
         VM_BUG_ON(PageLRU(pc->page));
         VM_BUG_ON(!PageCgroupLocked(pc));
         VM_BUG_ON(!PageCgroupUsed(pc));
         VM_BUG_ON(pc->mem_cgroup != from);
  
-       page = pc->page;
-       if (page_mapped(page) && !PageAnon(page)) {
-               cpu = smp_processor_id();
-               /* Update mapped_file data for mem_cgroup "from" */
-               stat = &from->stat;
-               cpustat = &stat->cpustat[cpu];
-               __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
-                                               -1);
-
-               /* Update mapped_file data for mem_cgroup "to" */
-               stat = &to->stat;
-               cpustat = &stat->cpustat[cpu];
-               __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_FILE_MAPPED,
-                                               1);
+       if (PageCgroupFileMapped(pc)) {
+               /* Update mapped_file data for mem_cgroup */
+               preempt_disable();
+               __this_cpu_dec(from->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+               __this_cpu_inc(to->stat->count[MEM_CGROUP_STAT_FILE_MAPPED]);
+               preempt_enable();
         }
         mem_cgroup_charge_statistics(from, pc, false);
         if (uncharge)
@@ -1817,6 +1846,11 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
                 ret = 0;
         }
         unlock_page_cgroup(pc);
+       /*
+        * check events
+        */
+       memcg_check_events(to, pc->page);
+       memcg_check_events(from, pc->page);
         return ret;
  }
  
@@ -1845,7 +1879,7 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc,
                 goto put;
  
         parent = mem_cgroup_from_cont(pcg);
-       ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false, page);
+       ret = __mem_cgroup_try_charge(NULL, gfp_mask, &parent, false);
         if (ret || !parent)
                 goto put_back;
  
@@ -1881,7 +1915,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
         prefetchw(pc);
  
         mem = memcg;
-       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true, page);
+       ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
         if (ret || !mem)
                 return ret;
  
@@ -2001,14 +2035,14 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
         if (!mem)
                 goto charge_cur_mm;
         *ptr = mem;
-       ret = __mem_cgroup_try_charge(NULL, mask, ptr, true, page);
+       ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
         /* drop extra refcnt from tryget */
         css_put(&mem->css);
         return ret;
  charge_cur_mm:
         if (unlikely(!mm))
                 mm = &init_mm;
-       return __mem_cgroup_try_charge(mm, mask, ptr, true, page);
+       return __mem_cgroup_try_charge(mm, mask, ptr, true);
  }
  
  static void
@@ -2185,10 +2219,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
         mz = page_cgroup_zoneinfo(pc);
         unlock_page_cgroup(pc);
  
-       if (mem_cgroup_soft_limit_check(mem))
-               mem_cgroup_update_tree(mem, page);
-       if (mem_cgroup_threshold_check(mem))
-               mem_cgroup_threshold(mem);
+       memcg_check_events(mem, page);
         /* at swapout, this memcg will be accessed to record to swap */
         if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
                 css_put(&mem->css);
@@ -2396,12 +2427,11 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
         }
         unlock_page_cgroup(pc);
  
+       *ptr = mem;
         if (mem) {
-               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false,
-                                               page);
+               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
                 css_put(&mem->css);
         }
-       *ptr = mem;
         return ret;
  }
  
@@ -2885,7 +2915,7 @@ static int
  mem_cgroup_get_idx_stat(struct mem_cgroup *mem, void *data)
  {
         struct mem_cgroup_idx_data *d = data;
-       d->val += mem_cgroup_read_stat(&mem->stat, d->idx);
+       d->val += mem_cgroup_read_stat(mem, d->idx);
         return 0;
  }
  
@@ -3134,18 +3164,18 @@ static int mem_cgroup_get_local_stat(struct mem_cgroup *mem, void *data)
         s64 val;
  
         /* per cpu stat */
-       val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_CACHE);
+       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_CACHE);
         s->stat[MCS_CACHE] += val * PAGE_SIZE;
-       val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_RSS);
+       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_RSS);
         s->stat[MCS_RSS] += val * PAGE_SIZE;
-       val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_FILE_MAPPED);
+       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_FILE_MAPPED);
         s->stat[MCS_FILE_MAPPED] += val * PAGE_SIZE;
-       val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGIN_COUNT);
+       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGIN_COUNT);
         s->stat[MCS_PGPGIN] += val;
-       val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_PGPGOUT_COUNT);
+       val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_PGPGOUT_COUNT);
         s->stat[MCS_PGPGOUT] += val;
         if (do_swap_account) {
-               val = mem_cgroup_read_stat(&mem->stat, MEM_CGROUP_STAT_SWAPOUT);
+               val = mem_cgroup_read_stat(mem, MEM_CGROUP_STAT_SWAPOUT);
                 s->stat[MCS_SWAP] += val * PAGE_SIZE;
         }
  
@@ -3273,25 +3303,6 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft,
         return 0;
  }
  
-static bool mem_cgroup_threshold_check(struct mem_cgroup *mem)
-{
-       bool ret = false;
-       int cpu;
-       s64 val;
-       struct mem_cgroup_stat_cpu *cpustat;
-
-       cpu = get_cpu();
-       cpustat = &mem->stat.cpustat[cpu];
-       val = __mem_cgroup_stat_read_local(cpustat, MEM_CGROUP_STAT_THRESHOLDS);
-       if (unlikely(val < 0)) {
-               __mem_cgroup_stat_set_safe(cpustat, MEM_CGROUP_STAT_THRESHOLDS,
-                               THRESHOLDS_EVENTS_THRESH);
-               ret = true;
-       }
-       put_cpu();
-       return ret;
-}
-
  static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
  {
         struct mem_cgroup_threshold_ary *t;
@@ -3428,12 +3439,6 @@ static int mem_cgroup_register_event(struct cgroup *cgrp, struct cftype *cft,
                 }
         }
  
-       /*
-        * We need to increment refcnt to be sure that all thresholds
-        * will be unregistered before calling __mem_cgroup_free()
-        */
-       mem_cgroup_get(memcg);
-
         if (type == _MEM)
                 rcu_assign_pointer(memcg->thresholds, thresholds_new);
         else
@@ -3527,9 +3532,6 @@ assign:
         /* To be sure that nobody uses thresholds before freeing it */
         synchronize_rcu();
  
-       for (i = 0; i < thresholds->size - size; i++)
-               mem_cgroup_put(memcg);
-
         kfree(thresholds);
  unlock:
         mutex_unlock(&memcg->thresholds_lock);
@@ -3676,24 +3678,29 @@ static void free_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node)
         kfree(mem->info.nodeinfo[node]);
  }
  
-static int mem_cgroup_size(void)
-{
-       int cpustat_size = nr_cpu_ids * sizeof(struct mem_cgroup_stat_cpu);
-       return sizeof(struct mem_cgroup) + cpustat_size;
-}
-
  static struct mem_cgroup *mem_cgroup_alloc(void)
  {
         struct mem_cgroup *mem;
-       int size = mem_cgroup_size();
+       int size = sizeof(struct mem_cgroup);
  
+       /* Can be very big if MAX_NUMNODES is very big, hence the vmalloc() branch */
         if (size < PAGE_SIZE)
                 mem = kmalloc(size, GFP_KERNEL);
         else
                 mem = vmalloc(size);
  
-       if (mem)
-               memset(mem, 0, size);
+       if (!mem)
+               return NULL;
+
+       memset(mem, 0, size);
+       mem->stat = alloc_percpu(struct mem_cgroup_stat_cpu);
+       if (!mem->stat) {       /* percpu allocation failed: release mem with the matching free */
+               if (size < PAGE_SIZE)
+                       kfree(mem);
+               else
+                       vfree(mem);
+               mem = NULL;
+       }
         return mem;
  }
  
@@ -3718,7 +3725,8 @@ static void __mem_cgroup_free(struct mem_cgroup *mem)
         for_each_node_state(node, N_POSSIBLE)
                 free_mem_cgroup_per_zone_info(mem, node);
  
-       if (mem_cgroup_size() < PAGE_SIZE)
+       free_percpu(mem->stat);
+       if (sizeof(struct mem_cgroup) < PAGE_SIZE)
                 kfree(mem);
         else
                 vfree(mem);
@@ -3930,8 +3938,7 @@ one_by_one:
                         batch_count = PRECHARGE_COUNT_AT_ONCE;
                         cond_resched();
                 }
-               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem,
-                                                               false, NULL);
+               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, &mem, false);
                 if (ret || !mem)
                         /* mem_cgroup_clear_mc() will do uncharge later */
                         return -ENOMEM;
@@ -3939,28 +3946,6 @@ one_by_one:
         }
         return ret;
  }
-#else  /* !CONFIG_MMU */
-static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
-                               struct cgroup *cgroup,
-                               struct task_struct *p,
-                               bool threadgroup)
-{
-       return 0;
-}
-static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
-                               struct cgroup *cgroup,
-                               struct task_struct *p,
-                               bool threadgroup)
-{
-}
-static void mem_cgroup_move_task(struct cgroup_subsys *ss,
-                               struct cgroup *cont,
-                               struct cgroup *old_cont,
-                               struct task_struct *p,
-                               bool threadgroup)
-{
-}
-#endif
  
  /**
   * is_target_pte_for_mc - check a pte whether it is valid for move charge
@@ -4323,6 +4308,28 @@ static void mem_cgroup_move_task(struct cgroup_subsys *ss,
         }
         mem_cgroup_clear_mc();
  }
+#else  /* !CONFIG_MMU */
+static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
+                               struct cgroup *cgroup,
+                               struct task_struct *p,
+                               bool threadgroup)
+{
+       return 0;       /* !CONFIG_MMU stub: unconditionally returns 0 */
+}
+static void mem_cgroup_cancel_attach(struct cgroup_subsys *ss,
+                               struct cgroup *cgroup,
+                               struct task_struct *p,
+                               bool threadgroup)
+{      /* !CONFIG_MMU stub: intentionally empty */
+}
+static void mem_cgroup_move_task(struct cgroup_subsys *ss,
+                               struct cgroup *cont,
+                               struct cgroup *old_cont,
+                               struct task_struct *p,
+                               bool threadgroup)
+{      /* !CONFIG_MMU stub: intentionally empty */
+}
+#endif
  
  struct cgroup_subsys mem_cgroup_subsys = {
         .name = "memory",