memcg: fix mis-accounting of file mapped racy with migration

[safe/jmp/linux-2.6] / mm / memcontrol.c
diff --git a/mm/memcontrol.c b/mm/memcontrol.c

index 8c200e8..df1234c 100644 (file)
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2258,7 +2258,8 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
         switch (ctype) {
         case MEM_CGROUP_CHARGE_TYPE_MAPPED:
         case MEM_CGROUP_CHARGE_TYPE_DROP:
-               if (page_mapped(page))
+               /* See mem_cgroup_prepare_migration() */
+               if (page_mapped(page) || PageCgroupMigration(pc))
                         goto unlock_out;
                 break;
         case MEM_CGROUP_CHARGE_TYPE_SWAPOUT:
@@ -2481,10 +2482,12 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
   * Before starting migration, account PAGE_SIZE to mem_cgroup that the old
   * page belongs to.
   */
-int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
+int mem_cgroup_prepare_migration(struct page *page,
+       struct page *newpage, struct mem_cgroup **ptr)
  {
         struct page_cgroup *pc;
         struct mem_cgroup *mem = NULL;
+       enum charge_type ctype;
         int ret = 0;
  
         if (mem_cgroup_disabled())
@@ -2495,69 +2498,125 @@ int mem_cgroup_prepare_migration(struct page *page, struct mem_cgroup **ptr)
         if (PageCgroupUsed(pc)) {
                 mem = pc->mem_cgroup;
                 css_get(&mem->css);
+               /*
+                * When migrating an anonymous page, its mapcount goes down
+                * to 0 and uncharge() will be called. But, even if it's fully
+                * unmapped, migration may fail and this page has to be
+                * charged again. We set the MIGRATION flag here and delay
+                * uncharge until end_migration() is called.
+                *
+                * Corner Case Thinking
+                * A)
+                * When the old page was mapped as Anon and it's unmapped and
+                * freed while migration is ongoing.
+                * If unmap finds the old page, uncharge() of it will be delayed
+                * until end_migration(). If unmap finds the new page, it's
+                * uncharged when its mapcount goes 1->0. If the unmap code
+                * finds swap_migration_entry, the new page will not be mapped
+                * and end_migration() will find it (mapcount==0).
+                *
+                * B)
+                * When the old page was mapped but migration fails, the kernel
+                * remaps it. The charge for it is kept by the MIGRATION flag
+                * even if mapcount goes down to 0. We can remap successfully
+                * without charging it again.
+                *
+                * C)
+                * The "old" page is under lock_page() until the end of
+                * migration, so the old page itself will not be swapped out.
+                * If the new page is swapped out before end_migration(), our
+                * hook in the usual swap-out path will catch the event.
+                */
+               if (PageAnon(page))
+                       SetPageCgroupMigration(pc);
         }
         unlock_page_cgroup(pc);
+       /*
+        * If the page is not charged at this point,
+        * we return here.
+        */
+       if (!mem)
+               return 0;
  
         *ptr = mem;
-       if (mem) {
-               ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
-               css_put(&mem->css);
+       ret = __mem_cgroup_try_charge(NULL, GFP_KERNEL, ptr, false);
+       css_put(&mem->css);/* drop extra refcnt */
+       if (ret || *ptr == NULL) {
+               if (PageAnon(page)) {
+                       lock_page_cgroup(pc);
+                       ClearPageCgroupMigration(pc);
+                       unlock_page_cgroup(pc);
+                       /*
+                        * The old page may be fully unmapped while we kept it.
+                        */
+                       mem_cgroup_uncharge_page(page);
+               }
+               return -ENOMEM;
         }
+       /*
+        * We charge new page before it's used/mapped. So, even if unlock_page()
+        * is called before end_migration, we can catch all events on this new
+        * page. In the case new page is migrated but not remapped, new page's
+        * mapcount will be finally 0 and we call uncharge in end_migration().
+        */
+       pc = lookup_page_cgroup(newpage);
+       if (PageAnon(page))
+               ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
+       else if (page_is_file_cache(page))
+               ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
+       else
+               ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
+       __mem_cgroup_commit_charge(mem, pc, ctype);
         return ret;
  }
  
  /* remove redundant charge if migration failed*/
  void mem_cgroup_end_migration(struct mem_cgroup *mem,
-               struct page *oldpage, struct page *newpage)
+       struct page *oldpage, struct page *newpage)
  {
-       struct page *target, *unused;
+       struct page *used, *unused;
         struct page_cgroup *pc;
-       enum charge_type ctype;
  
         if (!mem)
                 return;
+       /* blocks rmdir() */
         cgroup_exclude_rmdir(&mem->css);
         /* at migration success, oldpage->mapping is NULL. */
         if (oldpage->mapping) {
-               target = oldpage;
-               unused = NULL;
+               used = oldpage;
+               unused = newpage;
         } else {
-               target = newpage;
+               used = newpage;
                 unused = oldpage;
         }
-
-       if (PageAnon(target))
-               ctype = MEM_CGROUP_CHARGE_TYPE_MAPPED;
-       else if (page_is_file_cache(target))
-               ctype = MEM_CGROUP_CHARGE_TYPE_CACHE;
-       else
-               ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM;
-
-       /* unused page is not on radix-tree now. */
-       if (unused)
-               __mem_cgroup_uncharge_common(unused, ctype);
-
-       pc = lookup_page_cgroup(target);
         /*
-        * __mem_cgroup_commit_charge() check PCG_USED bit of page_cgroup.
-        * So, double-counting is effectively avoided.
+        * We disallowed uncharge of pages under migration because mapcount
+        * of the page goes down to zero, temporarily.
+        * Clear the flag and check whether the page should be charged.
          */
-       __mem_cgroup_commit_charge(mem, pc, ctype);
+       pc = lookup_page_cgroup(oldpage);
+       lock_page_cgroup(pc);
+       ClearPageCgroupMigration(pc);
+       unlock_page_cgroup(pc);
  
+       if (unused != oldpage)
+               pc = lookup_page_cgroup(unused);
+       __mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
+
+       pc = lookup_page_cgroup(used);
         /*
-        * Both of oldpage and newpage are still under lock_page().
-        * Then, we don't have to care about race in radix-tree.
-        * But we have to be careful that this page is unmapped or not.
-        *
-        * There is a case for !page_mapped(). At the start of
-        * migration, oldpage was mapped. But now, it's zapped.
-        * But we know *target* page is not freed/reused under us.
-        * mem_cgroup_uncharge_page() does all necessary checks.
+        * If a page is a file cache, radix-tree replacement is atomic
+        * and we can skip this check. When it was an Anon page, its mapcount
+        * goes down to 0. But because we added the MIGRATION flag, it's not
+        * uncharged yet. There are several cases, but the page->mapcount check
+        * and USED bit check in mem_cgroup_uncharge_page() will do enough
+        * checking. (see mem_cgroup_prepare_migration() also)
          */
-       if (ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED)
-               mem_cgroup_uncharge_page(target);
+       if (PageAnon(used))
+               mem_cgroup_uncharge_page(used);
         /*
-        * At migration, we may charge account against cgroup which has no tasks
+        * At migration, we may charge account against cgroup which has no
+        * tasks.
          * So, rmdir()->pre_destroy() can be called while we do this charge.
          * In that case, we need to call pre_destroy() again. check it here.
          */