struct mem_cgroup *mem_cgroup;
atomic_t ref_cnt; /* Helpful when pages move b/w */
/* mapped and cached states */
+ int flags;
};
+#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */
enum {
MEM_CGROUP_TYPE_UNSPEC = 0,
MEM_CGROUP_TYPE_MAX,
};
+enum charge_type {
+ MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
+ MEM_CGROUP_CHARGE_TYPE_MAPPED,
+};
+
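The cache/mapped distinction is only recorded at charge time; nothing in this hunk reads PAGE_CGROUP_FLAG_CACHE yet. As a hedged sketch (not part of this change), a later consumer such as per-type statistics might test the flag like this:

/* Hypothetical helper, not introduced by this patch: report whether a
 * page_cgroup was charged as page cache rather than as a mapped page. */
static inline int page_cgroup_cache(struct page_cgroup *pc)
{
	return pc->flags & PAGE_CGROUP_FLAG_CACHE;
}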
static struct mem_cgroup init_mem_cgroup;
static inline
bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup);
}
+/*
+ * Tie a new page_cgroup to a struct page under lock_page_cgroup().
+ * This can fail if the page is already tied to a page_cgroup.
+ * Returns 0 on success.
+ */
+static inline int
+page_cgroup_assign_new_page_cgroup(struct page *page, struct page_cgroup *pc)
+{
+ int ret = 0;
+
+ lock_page_cgroup(page);
+ if (!page_get_page_cgroup(page))
+ page_assign_page_cgroup(page, pc);
+ else /* A page is tied to other pc. */
+ ret = 1;
+ unlock_page_cgroup(page);
+ return ret;
+}
+
+/*
+ * Clear the page->page_cgroup member under lock_page_cgroup().
+ * If the given "pc" does not match the current page->page_cgroup,
+ * page->page_cgroup is left untouched.
+ * Returns the value page->page_cgroup held when the lock was taken,
+ * so a caller can tell whether the clear succeeded by checking
+ * clear_page_cgroup(page, pc) == pc
+ */
+
+static inline struct page_cgroup *
+clear_page_cgroup(struct page *page, struct page_cgroup *pc)
+{
+ struct page_cgroup *ret;
+ /* lock and clear */
+ lock_page_cgroup(page);
+ ret = page_get_page_cgroup(page);
+ if (likely(ret == pc))
+ page_assign_page_cgroup(page, NULL);
+ unlock_page_cgroup(page);
+ return ret;
+}
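The return-value convention above is what the uncharge and force_empty paths later in this patch rely on. As a compact illustration (not additional patch content), a caller that needs to know whether it won the race to detach the page_cgroup would do:

	struct page_cgroup *pc = page_get_page_cgroup(page);

	if (clear_page_cgroup(page, pc) == pc) {
		/* We detached pc: uncharging and freeing it is our job. */
	} else {
		/* Someone else (e.g. force_empty) already dropped it. */
	}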
+
static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active)
{
if (active)
list_move(&pc->lru, &pc->mem_cgroup->inactive_list);
}
+int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
+{
+ int ret;
+
+ task_lock(task);
+ ret = task->mm && mm_cgroup(task->mm) == mem;
+ task_unlock(task);
+ return ret;
+}
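task_in_mem_cgroup() has no caller inside this hunk; it is presumably intended for code outside memcontrol.c that must decide whether a task belongs to the cgroup being reclaimed or OOM-handled. A hypothetical caller sketch, purely illustrative:

	/* Hypothetical: when handling an over-limit condition for "mem",
	 * skip tasks whose mm is charged to some other memory cgroup. */
	for_each_process(p) {
		if (!task_in_mem_cgroup(p, mem))
			continue;
		/* ... consider p as a candidate ... */
	}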
+
/*
* This routine assumes that the appropriate zone's lru lock is already held
*/
unsigned long scan;
LIST_HEAD(pc_list);
struct list_head *src;
- struct page_cgroup *pc;
+ struct page_cgroup *pc, *tmp;
if (active)
src = &mem_cont->active_list;
src = &mem_cont->inactive_list;
spin_lock(&mem_cont->lru_lock);
- for (scan = 0; scan < nr_to_scan && !list_empty(src); scan++) {
- pc = list_entry(src->prev, struct page_cgroup, lru);
+ scan = 0;
+ list_for_each_entry_safe_reverse(pc, tmp, src, lru) {
+ if (scan >= nr_to_scan)
+ break;
page = pc->page;
VM_BUG_ON(!pc);
+ if (unlikely(!PageLRU(page)))
+ continue;
+
if (PageActive(page) && !active) {
__mem_cgroup_move_lists(pc, true);
- scan--;
continue;
}
if (!PageActive(page) && active) {
__mem_cgroup_move_lists(pc, false);
- scan--;
continue;
}
if (page_zone(page) != z)
continue;
- /*
- * Check if the meta page went away from under us
- */
- if (!list_empty(&pc->lru))
- list_move(&pc->lru, &pc_list);
- else
- continue;
+ scan++;
+ list_move(&pc->lru, &pc_list);
if (__isolate_lru_page(page, mode) == 0) {
list_move(&page->lru, dst);
* 0 if the charge was successful
* < 0 if the cgroup is over its limit
*/
-int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask)
+static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask, enum charge_type ctype)
{
struct mem_cgroup *mem;
- struct page_cgroup *pc, *race_pc;
+ struct page_cgroup *pc;
unsigned long flags;
unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
unlock_page_cgroup(page);
cpu_relax();
goto retry;
- } else
+ } else {
+ unlock_page_cgroup(page);
goto done;
+ }
}
unlock_page_cgroup(page);
goto free_pc;
}
- lock_page_cgroup(page);
- /*
- * Check if somebody else beat us to allocating the page_cgroup
- */
- race_pc = page_get_page_cgroup(page);
- if (race_pc) {
- kfree(pc);
- pc = race_pc;
- atomic_inc(&pc->ref_cnt);
- res_counter_uncharge(&mem->res, PAGE_SIZE);
- css_put(&mem->css);
- goto done;
- }
-
atomic_set(&pc->ref_cnt, 1);
pc->mem_cgroup = mem;
pc->page = page;
- page_assign_page_cgroup(page, pc);
+ pc->flags = 0;
+ if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE)
+ pc->flags |= PAGE_CGROUP_FLAG_CACHE;
+ if (page_cgroup_assign_new_page_cgroup(page, pc)) {
+ /*
+ * Another charge has already been added to this page.
+ * Retrying takes lock_page_cgroup(page) again, reads
+ * page->page_cgroup and increments its refcnt, so just retrying is fine.
+ */
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ css_put(&mem->css);
+ kfree(pc);
+ goto retry;
+ }
spin_lock_irqsave(&mem->lru_lock, flags);
list_add(&pc->lru, &mem->active_list);
spin_unlock_irqrestore(&mem->lru_lock, flags);
done:
- unlock_page_cgroup(page);
return 0;
free_pc:
kfree(pc);
return -ENOMEM;
}
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm,
+ gfp_t gfp_mask)
+{
+ return mem_cgroup_charge_common(page, mm, gfp_mask,
+ MEM_CGROUP_CHARGE_TYPE_MAPPED);
+}
+
/*
* See if the cached pages should be charged at all?
*/
mem = rcu_dereference(mm->mem_cgroup);
if (mem->control_type == MEM_CGROUP_TYPE_ALL)
- return mem_cgroup_charge(page, mm, gfp_mask);
+ return mem_cgroup_charge_common(page, mm, gfp_mask,
+ MEM_CGROUP_CHARGE_TYPE_CACHE);
else
return 0;
}
if (atomic_dec_and_test(&pc->ref_cnt)) {
page = pc->page;
- lock_page_cgroup(page);
- mem = pc->mem_cgroup;
- css_put(&mem->css);
- page_assign_page_cgroup(page, NULL);
- unlock_page_cgroup(page);
- res_counter_uncharge(&mem->res, PAGE_SIZE);
+ /*
+ * Get page->page_cgroup and clear it under lock_page_cgroup().
+ * force_empty can drop page->page_cgroup without checking this refcnt.
+ */
+ if (clear_page_cgroup(page, pc) == pc) {
+ mem = pc->mem_cgroup;
+ css_put(&mem->css);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ spin_lock_irqsave(&mem->lru_lock, flags);
+ list_del_init(&pc->lru);
+ spin_unlock_irqrestore(&mem->lru_lock, flags);
+ kfree(pc);
+ }
+ }
+}
+/*
+ * Returns non-zero if a page (under migration) has a valid page_cgroup
+ * attached; in that case the page_cgroup's refcnt is incremented so it
+ * stays alive until mem_cgroup_end_migration().
+ */
- spin_lock_irqsave(&mem->lru_lock, flags);
- list_del_init(&pc->lru);
- spin_unlock_irqrestore(&mem->lru_lock, flags);
- kfree(pc);
+int mem_cgroup_prepare_migration(struct page *page)
+{
+ struct page_cgroup *pc;
+ int ret = 0;
+
+ lock_page_cgroup(page);
+ pc = page_get_page_cgroup(page);
+ if (pc && atomic_inc_not_zero(&pc->ref_cnt))
+ ret = 1;
+ unlock_page_cgroup(page);
+ return ret;
+}
+
+void mem_cgroup_end_migration(struct page *page)
+{
+ struct page_cgroup *pc = page_get_page_cgroup(page);
+ mem_cgroup_uncharge(pc);
+}
+/*
+ * We know both *page* and *newpage* are not on the LRU and are PG_locked.
+ * There is no race with the uncharge() routines because the page_cgroup
+ * of *page* holds an extra reference taken by mem_cgroup_prepare_migration().
+ */
+
+void mem_cgroup_page_migration(struct page *page, struct page *newpage)
+{
+ struct page_cgroup *pc;
+retry:
+ pc = page_get_page_cgroup(page);
+ if (!pc)
+ return;
+ if (clear_page_cgroup(page, pc) != pc)
+ goto retry;
+ pc->page = newpage;
+ lock_page_cgroup(newpage);
+ page_assign_page_cgroup(newpage, pc);
+ unlock_page_cgroup(newpage);
+ return;
+}
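Taken together, the three hooks bracket a page migration: mem_cgroup_prepare_migration() pins the page_cgroup with an extra reference, mem_cgroup_page_migration() moves it to the new page, and mem_cgroup_end_migration() drops the extra reference through the uncharge path against whichever page ends up owning the page_cgroup. A rough sketch of the expected caller; the real hook sites in mm/migrate.c are not part of this hunk:

	/* Illustrative only; the surrounding migration code is assumed. */
	int charged = mem_cgroup_prepare_migration(page);

	/* ... unmap the page and copy its contents to newpage ... */

	if (charged) {
		mem_cgroup_page_migration(page, newpage);	/* move the charge */
		mem_cgroup_end_migration(newpage);		/* drop the extra ref */
	}
	/* On migration failure, the extra reference would instead be dropped
	 * against the old page via mem_cgroup_end_migration(page). */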
+
+/*
+ * This routine traverses all page_cgroups on the given list and drops them,
+ * ignoring page_cgroup->ref_cnt. It does not reclaim the pages themselves,
+ * it only removes the page_cgroups. The lru_lock is released and the scan
+ * restarted every FORCE_UNCHARGE_BATCH entries to keep lock hold times short.
+ */
+#define FORCE_UNCHARGE_BATCH (128)
+static void
+mem_cgroup_force_empty_list(struct mem_cgroup *mem, struct list_head *list)
+{
+ struct page_cgroup *pc;
+ struct page *page;
+ int count;
+ unsigned long flags;
+
+retry:
+ count = FORCE_UNCHARGE_BATCH;
+ spin_lock_irqsave(&mem->lru_lock, flags);
+
+ while (--count && !list_empty(list)) {
+ pc = list_entry(list->prev, struct page_cgroup, lru);
+ page = pc->page;
+ /* Avoid race with charge */
+ atomic_set(&pc->ref_cnt, 0);
+ if (clear_page_cgroup(page, pc) == pc) {
+ css_put(&mem->css);
+ res_counter_uncharge(&mem->res, PAGE_SIZE);
+ list_del_init(&pc->lru);
+ kfree(pc);
+ } else /* racing with an uncharge; back off and retry */
+ break;
+ }
+ spin_unlock_irqrestore(&mem->lru_lock, flags);
+ if (!list_empty(list)) {
+ cond_resched();
+ goto retry;
}
+ return;
}
+/*
+ * Drop all charges of a mem_cgroup when no task is attached to it.
+ * This makes the mem_cgroup deletable.
+ */
+
+int mem_cgroup_force_empty(struct mem_cgroup *mem)
+{
+ int ret = -EBUSY;
+ css_get(&mem->css);
+ /*
+ * Page reclaim (kswapd etc.) may move pages between active_list
+ * and inactive_list while we do not hold a lock, so loop here
+ * until both lists are empty.
+ */
+ while (!(list_empty(&mem->active_list) &&
+ list_empty(&mem->inactive_list))) {
+ if (atomic_read(&mem->css.cgroup->count) > 0)
+ goto out;
+ /* drop all page_cgroup in active_list */
+ mem_cgroup_force_empty_list(mem, &mem->active_list);
+ /* drop all page_cgroup in inactive_list */
+ mem_cgroup_force_empty_list(mem, &mem->inactive_list);
+ }
+ ret = 0;
+out:
+ css_put(&mem->css);
+ return ret;
+}
+
int mem_cgroup_write_strategy(char *buf, unsigned long long *tmp)
{
*tmp = memparse(buf, &buf);
ppos, buf, s - buf);
}
+
+static ssize_t mem_force_empty_write(struct cgroup *cont,
+ struct cftype *cft, struct file *file,
+ const char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
+ int ret;
+
+ ret = mem_cgroup_force_empty(mem);
+ if (!ret)
+ ret = nbytes;
+ return ret;
+}
+
+/*
+ * Note: this read handler should be removed once cgroup supports write-only files.
+ */
+
+static ssize_t mem_force_empty_read(struct cgroup *cont,
+ struct cftype *cft,
+ struct file *file, char __user *userbuf,
+ size_t nbytes, loff_t *ppos)
+{
+ return -EINVAL;
+}
+
static struct cftype mem_cgroup_files[] = {
{
.name = "usage_in_bytes",
.write = mem_control_type_write,
.read = mem_control_type_read,
},
+ {
+ .name = "force_empty",
+ .write = mem_force_empty_write,
+ .read = mem_force_empty_read,
+ },
};
static struct mem_cgroup init_mem_cgroup;