SAFE public projects git trees. - safe/jmp/linux-2.6/blob - mm/memcontrol.c

   1 /* memcontrol.c - Memory Controller
   2  *
   3  * Copyright IBM Corporation, 2007
   4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
   5  *
   6  * Copyright 2007 OpenVZ SWsoft Inc
   7  * Author: Pavel Emelianov <xemul@openvz.org>
   8  *
   9  * This program is free software; you can redistribute it and/or modify
  10  * it under the terms of the GNU General Public License as published by
  11  * the Free Software Foundation; either version 2 of the License, or
  12  * (at your option) any later version.
  13  *
  14  * This program is distributed in the hope that it will be useful,
  15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  17  * GNU General Public License for more details.
  18  */
  19
#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/swap.h>
#include <linux/spinlock.h>
#include <linux/fs.h>
#include <linux/err.h>
  31
/* Memory controller subsystem descriptor; its initializer appears later in this file. */
struct cgroup_subsys mem_cgroup_subsys;
/*
 * Initial value of the retry budget in mem_cgroup_charge(): how many times
 * the charge path waits on congestion and tries again before giving up.
 */
static const int MEM_CGROUP_RECLAIM_RETRIES = 5;
  34
/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik Van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
        /*
         * Embedded subsystem state. mem_cgroup_from_cont() and
         * mem_cgroup_from_task() use container_of() on this member to get
         * back to the enclosing mem_cgroup.
         */
        struct cgroup_subsys_state css;
        /*
         * the counter to account for memory usage
         */
        struct res_counter res;
        /*
         * Per cgroup active and inactive list, similar to the
         * per zone LRU lists. page_cgroup entries are linked here
         * through page_cgroup::lru.
         * TODO: Consider making these lists per zone
         */
        struct list_head active_list;
        struct list_head inactive_list;
        /*
         * spin_lock to protect the per cgroup LRU
         */
        spinlock_t lru_lock;
};
  64
/*
 * We use the lowest bit of the page->page_cgroup word as a bit spin
 * lock. We need to ensure that the page_cgroup pointer stored there is
 * at least two byte aligned so that bit is free (based on comments from
 * Nick Piggin).
 *
 * PAGE_CGROUP_LOCK_BIT is the bit number handed to the bit_spin_*()
 * helpers; PAGE_CGROUP_LOCK is the matching mask used to strip or keep
 * the lock bit when reading or writing the pointer.
 */
#define PAGE_CGROUP_LOCK_BIT    0x0
#define PAGE_CGROUP_LOCK                (1 << PAGE_CGROUP_LOCK_BIT)
  72
/*
 * A page_cgroup is associated with every charged page descriptor. The
 * page_cgroup helps us identify information about the cgroup the page
 * is charged to.
 */
struct page_cgroup {
        struct list_head lru;           /* per cgroup LRU list */
        struct page *page;              /* the page this entry accounts for */
        struct mem_cgroup *mem_cgroup;  /* cgroup the page was charged to */
        atomic_t ref_cnt;               /* Helpful when pages move b/w  */
                                        /* mapped and cached states;    */
                                        /* the entry is freed by        */
                                        /* mem_cgroup_uncharge() when   */
                                        /* this drops to zero           */
};
  84
  85
  86 static inline
  87 struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont)
  88 {
  89         return container_of(cgroup_subsys_state(cont,
  90                                 mem_cgroup_subsys_id), struct mem_cgroup,
  91                                 css);
  92 }
  93
  94 static inline
  95 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
  96 {
  97         return container_of(task_subsys_state(p, mem_cgroup_subsys_id),
  98                                 struct mem_cgroup, css);
  99 }
 100
 101 void mm_init_cgroup(struct mm_struct *mm, struct task_struct *p)
 102 {
 103         struct mem_cgroup *mem;
 104
 105         mem = mem_cgroup_from_task(p);
 106         css_get(&mem->css);
 107         mm->mem_cgroup = mem;
 108 }
 109
 110 void mm_free_cgroup(struct mm_struct *mm)
 111 {
 112         css_put(&mm->mem_cgroup->css);
 113 }
 114
 115 static inline int page_cgroup_locked(struct page *page)
 116 {
 117         return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT,
 118                                         &page->page_cgroup);
 119 }
 120
 121 void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc)
 122 {
 123         int locked;
 124
 125         /*
 126          * While resetting the page_cgroup we might not hold the
 127          * page_cgroup lock. free_hot_cold_page() is an example
 128          * of such a scenario
 129          */
 130         if (pc)
 131                 VM_BUG_ON(!page_cgroup_locked(page));
 132         locked = (page->page_cgroup & PAGE_CGROUP_LOCK);
 133         page->page_cgroup = ((unsigned long)pc | locked);
 134 }
 135
 136 struct page_cgroup *page_get_page_cgroup(struct page *page)
 137 {
 138         return (struct page_cgroup *)
 139                 (page->page_cgroup & ~PAGE_CGROUP_LOCK);
 140 }
 141
 142 void __always_inline lock_page_cgroup(struct page *page)
 143 {
 144         bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 145         VM_BUG_ON(!page_cgroup_locked(page));
 146 }
 147
 148 void __always_inline unlock_page_cgroup(struct page *page)
 149 {
 150         bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
 151 }
 152
 153 void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
 154 {
 155         if (active)
 156                 list_move(&pc->lru, &pc->mem_cgroup->active_list);
 157         else
 158                 list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
 159 }
 160
 161 /*
 162  * This routine assumes that the appropriate zone's lru lock is already held
 163  */
 164 void mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
 165 {
 166         struct mem_cgroup *mem;
 167         if (!pc)
 168                 return;
 169
 170         mem = pc->mem_cgroup;
 171
 172         spin_lock(&mem->lru_lock);
 173         __mem_cgroup_move_lists(pc, active);
 174         spin_unlock(&mem->lru_lock);
 175 }
 176
 177 unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
 178                                         struct list_head *dst,
 179                                         unsigned long *scanned, int order,
 180                                         int mode, struct zone *z,
 181                                         struct mem_cgroup *mem_cont,
 182                                         int active)
 183 {
 184         unsigned long nr_taken = 0;
 185         struct page *page;
 186         unsigned long scan;
 187         LIST_HEAD(pc_list);
 188         struct list_head *src;
 189         struct page_cgroup *pc;
 190
 191         if (active)
 192                 src = &mem_cont->active_list;
 193         else
 194                 src = &mem_cont->inactive_list;
 195
 196         spin_lock(&mem_cont->lru_lock);
 197         for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
 198                 pc = list_entry(src->prev, struct page_cgroup, lru);
 199                 page = pc->page;
 200                 VM_BUG_ON(!pc);
 201
 202                 if (PageActive(page) && !active) {
 203                         __mem_cgroup_move_lists(pc, true);
 204                         scan--;
 205                         continue;
 206                 }
 207                 if (!PageActive(page) && active) {
 208                         __mem_cgroup_move_lists(pc, false);
 209                         scan--;
 210                         continue;
 211                 }
 212
 213                 /*
 214                  * Reclaim, per zone
 215                  * TODO: make the active/inactive lists per zone
 216                  */
 217                 if (page_zone(page) != z)
 218                         continue;
 219
 220                 /*
 221                  * Check if the meta page went away from under us
 222                  */
 223                 if (!list_empty(&pc->lru))
 224                         list_move(&pc->lru, &pc_list);
 225                 else
 226                         continue;
 227
 228                 if (__isolate_lru_page(page, mode) == 0) {
 229                         list_move(&page->lru, dst);
 230                         nr_taken++;
 231                 }
 232         }
 233
 234         list_splice(&pc_list, src);
 235         spin_unlock(&mem_cont->lru_lock);
 236
 237         *scanned = scan;
 238         return nr_taken;
 239 }
 240
 241 /*
 242  * Charge the memory controller for page usage.
 243  * Return
 244  * 0 if the charge was successful
 245  * < 0 if the cgroup is over its limit
 246  */
 247 int mem_cgroup_charge(struct page *page, struct mm_struct *mm)
 248 {
 249         struct mem_cgroup *mem;
 250         struct page_cgroup *pc, *race_pc;
 251         unsigned long flags;
 252         unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
 253
 254         /*
 255          * Should page_cgroup's go to their own slab?
 256          * One could optimize the performance of the charging routine
 257          * by saving a bit in the page_flags and using it as a lock
 258          * to see if the cgroup page already has a page_cgroup associated
 259          * with it
 260          */
 261 retry:
 262         lock_page_cgroup(page);
 263         pc = page_get_page_cgroup(page);
 264         /*
 265          * The page_cgroup exists and the page has already been accounted
 266          */
 267         if (pc) {
 268                 if (unlikely(!atomic_inc_not_zero(&pc->ref_cnt))) {
 269                         /* this page is under being uncharged ? */
 270                         unlock_page_cgroup(page);
 271                         cpu_relax();
 272                         goto retry;
 273                 } else
 274                         goto done;
 275         }
 276
 277         unlock_page_cgroup(page);
 278
 279         pc = kzalloc(sizeof(struct page_cgroup), GFP_KERNEL);
 280         if (pc == NULL)
 281                 goto err;
 282
 283         rcu_read_lock();
 284         /*
 285          * We always charge the cgroup the mm_struct belongs to
 286          * the mm_struct's mem_cgroup changes on task migration if the
 287          * thread group leader migrates. It's possible that mm is not
 288          * set, if so charge the init_mm (happens for pagecache usage).
 289          */
 290         if (!mm)
 291                 mm = &init_mm;
 292
 293         mem = rcu_dereference(mm->mem_cgroup);
 294         /*
 295          * For every charge from the cgroup, increment reference
 296          * count
 297          */
 298         css_get(&mem->css);
 299         rcu_read_unlock();
 300
 301         /*
 302          * If we created the page_cgroup, we should free it on exceeding
 303          * the cgroup limit.
 304          */
 305         while (res_counter_charge(&mem->res, 1)) {
 306                 if (try_to_free_mem_cgroup_pages(mem))
 307                         continue;
 308
 309                 /*
 310                  * try_to_free_mem_cgroup_pages() might not give us a full
 311                  * picture of reclaim. Some pages are reclaimed and might be
 312                  * moved to swap cache or just unmapped from the cgroup.
 313                  * Check the limit again to see if the reclaim reduced the
 314                  * current usage of the cgroup before giving up
 315                  */
 316                 if (res_counter_check_under_limit(&mem->res))
 317                         continue;
 318                         /*
 319                          * Since we control both RSS and cache, we end up with a
 320                          * very interesting scenario where we end up reclaiming
 321                          * memory (essentially RSS), since the memory is pushed
 322                          * to swap cache, we eventually end up adding those
 323                          * pages back to our list. Hence we give ourselves a
 324                          * few chances before we fail
 325                          */
 326                 else if (nr_retries--) {
 327                         congestion_wait(WRITE, HZ/10);
 328                         continue;
 329                 }
 330
 331                 css_put(&mem->css);
 332                 goto free_pc;
 333         }
 334
 335         lock_page_cgroup(page);
 336         /*
 337          * Check if somebody else beat us to allocating the page_cgroup
 338          */
 339         race_pc = page_get_page_cgroup(page);
 340         if (race_pc) {
 341                 kfree(pc);
 342                 pc = race_pc;
 343                 atomic_inc(&pc->ref_cnt);
 344                 res_counter_uncharge(&mem->res, 1);
 345                 css_put(&mem->css);
 346                 goto done;
 347         }
 348
 349         atomic_set(&pc->ref_cnt, 1);
 350         pc->mem_cgroup = mem;
 351         pc->page = page;
 352         page_assign_page_cgroup(page, pc);
 353
 354         spin_lock_irqsave(&mem->lru_lock, flags);
 355         list_add(&pc->lru, &mem->active_list);
 356         spin_unlock_irqrestore(&mem->lru_lock, flags);
 357
 358 done:
 359         unlock_page_cgroup(page);
 360         return 0;
 361 free_pc:
 362         kfree(pc);
 363 err:
 364         return -ENOMEM;
 365 }
 366
 367 /*
 368  * Uncharging is always a welcome operation, we never complain, simply
 369  * uncharge.
 370  */
 371 void mem_cgroup_uncharge(struct page_cgroup *pc)
 372 {
 373         struct mem_cgroup *mem;
 374         struct page *page;
 375         unsigned long flags;
 376
 377         if (!pc)
 378                 return;
 379
 380         if (atomic_dec_and_test(&pc->ref_cnt)) {
 381                 page = pc->page;
 382                 lock_page_cgroup(page);
 383                 mem = pc->mem_cgroup;
 384                 css_put(&mem->css);
 385                 page_assign_page_cgroup(page, NULL);
 386                 unlock_page_cgroup(page);
 387                 res_counter_uncharge(&mem->res, 1);
 388
 389                 spin_lock_irqsave(&mem->lru_lock, flags);
 390                 list_del_init(&pc->lru);
 391                 spin_unlock_irqrestore(&mem->lru_lock, flags);
 392                 kfree(pc);
 393         }
 394 }
 395
 396 static ssize_t mem_cgroup_read(struct cgroup *cont, struct cftype *cft,
 397                         struct file *file, char __user *userbuf, size_t nbytes,
 398                         loff_t *ppos)
 399 {
 400         return res_counter_read(&mem_cgroup_from_cont(cont)->res,
 401                                 cft->private, userbuf, nbytes, ppos);
 402 }
 403
 404 static ssize_t mem_cgroup_write(struct cgroup *cont, struct cftype *cft,
 405                                 struct file *file, const char __user *userbuf,
 406                                 size_t nbytes, loff_t *ppos)
 407 {
 408         return res_counter_write(&mem_cgroup_from_cont(cont)->res,
 409                                 cft->private, userbuf, nbytes, ppos);
 410 }
 411
 412 static struct cftype mem_cgroup_files[] = {
 413         {
 414                 .name = "usage",
 415                 .private = RES_USAGE,
 416                 .read = mem_cgroup_read,
 417         },
 418         {
 419                 .name = "limit",
 420                 .private = RES_LIMIT,
 421                 .write = mem_cgroup_write,
 422                 .read = mem_cgroup_read,
 423         },
 424         {
 425                 .name = "failcnt",
 426                 .private = RES_FAILCNT,
 427                 .read = mem_cgroup_read,
 428         },
 429 };
 430
 431 static struct mem_cgroup init_mem_cgroup;
 432
 433 static struct cgroup_subsys_state *
 434 mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont)
 435 {
 436         struct mem_cgroup *mem;
 437
 438         if (unlikely((cont->parent) == NULL)) {
 439                 mem = &init_mem_cgroup;
 440                 init_mm.mem_cgroup = mem;
 441         } else
 442                 mem = kzalloc(sizeof(struct mem_cgroup), GFP_KERNEL);
 443
 444         if (mem == NULL)
 445                 return NULL;
 446
 447         res_counter_init(&mem->res);
 448         INIT_LIST_HEAD(&mem->active_list);
 449         INIT_LIST_HEAD(&mem->inactive_list);
 450         spin_lock_init(&mem->lru_lock);
 451         return &mem->css;
 452 }
 453
 454 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
 455                                 struct cgroup *cont)
 456 {
 457         kfree(mem_cgroup_from_cont(cont));
 458 }
 459
 460 static int mem_cgroup_populate(struct cgroup_subsys *ss,
 461                                 struct cgroup *cont)
 462 {
 463         return cgroup_add_files(cont, ss, mem_cgroup_files,
 464                                         ARRAY_SIZE(mem_cgroup_files));
 465 }
 466
 467 static void mem_cgroup_move_task(struct cgroup_subsys *ss,
 468                                 struct cgroup *cont,
 469                                 struct cgroup *old_cont,
 470                                 struct task_struct *p)
 471 {
 472         struct mm_struct *mm;
 473         struct mem_cgroup *mem, *old_mem;
 474
 475         mm = get_task_mm(p);
 476         if (mm == NULL)
 477                 return;
 478
 479         mem = mem_cgroup_from_cont(cont);
 480         old_mem = mem_cgroup_from_cont(old_cont);
 481
 482         if (mem == old_mem)
 483                 goto out;
 484
 485         /*
 486          * Only thread group leaders are allowed to migrate, the mm_struct is
 487          * in effect owned by the leader
 488          */
 489         if (p->tgid != p->pid)
 490                 goto out;
 491
 492         css_get(&mem->css);
 493         rcu_assign_pointer(mm->mem_cgroup, mem);
 494         css_put(&old_mem->css);
 495
 496 out:
 497         mmput(mm);
 498         return;
 499 }
 500
 501 struct cgroup_subsys mem_cgroup_subsys = {
 502         .name = "memory",
 503         .subsys_id = mem_cgroup_subsys_id,
 504         .create = mem_cgroup_create,
 505         .destroy = mem_cgroup_destroy,
 506         .populate = mem_cgroup_populate,
 507         .attach = mem_cgroup_move_task,
 508         .early_init = 1,
 509 };