sched: fix buddie group latency
[safe/jmp/linux-2.6] / kernel / cgroup.c
index 1e49218..c298310 100644 (file)
@@ -84,7 +84,7 @@ struct cgroupfs_root {
        /* Tracks how many cgroups are currently defined in hierarchy.*/
        int number_of_cgroups;
 
-       /* A list running through the mounted hierarchies */
+       /* A list running through the active hierarchies */
        struct list_head root_list;
 
        /* Hierarchy-specific flags */
@@ -116,7 +116,6 @@ static int root_count;
  * be called.
  */
 static int need_forkexit_callback __read_mostly;
-static int need_mm_owner_callback __read_mostly;
 
 /* convenient tests for these bits */
 inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -149,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
 #define for_each_subsys(_root, _ss) \
 list_for_each_entry(_ss, &_root->subsys_list, sibling)
 
-/* for_each_root() allows you to iterate across the active hierarchies */
-#define for_each_root(_root) \
+/* for_each_active_root() allows you to iterate across the active hierarchies */
+#define for_each_active_root(_root) \
 list_for_each_entry(_root, &roots, root_list)
 
 /* the list of cgroups eligible for automatic release. Protected by
@@ -272,7 +271,7 @@ static void __put_css_set(struct css_set *cg, int taskexit)
 
        rcu_read_lock();
        for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-               struct cgroup *cgrp = cg->subsys[i]->cgroup;
+               struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
                if (atomic_dec_and_test(&cgrp->count) &&
                    notify_on_release(cgrp)) {
                        if (taskexit)
@@ -385,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
        return 0;
 }
 
+/**
+ * link_css_set - link a css_set to a cgroup using one preallocated link
+ * @tmp_cg_links: non-empty list from allocate_cg_links(); first entry is used
+ * @cg: the css_set to be linked (the link is added to cg->cg_links)
+ * @cgrp: the destination cgroup (the link is moved to cgrp->css_sets)
+ */
+static void link_css_set(struct list_head *tmp_cg_links,
+                        struct css_set *cg, struct cgroup *cgrp)
+{
+       struct cg_cgroup_link *link;
+
+       BUG_ON(list_empty(tmp_cg_links));
+       link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
+                               cgrp_link_list);
+       link->cg = cg;
+       list_move(&link->cgrp_link_list, &cgrp->css_sets);
+       list_add(&link->cg_link_list, &cg->cg_links);
+}
+
 /*
  * find_css_set() takes an existing cgroup group and a
  * cgroup object, and returns a css_set object that's
@@ -400,7 +418,6 @@ static struct css_set *find_css_set(
        int i;
 
        struct list_head tmp_cg_links;
-       struct cg_cgroup_link *link;
 
        struct hlist_head *hhead;
 
@@ -445,26 +462,11 @@ static struct css_set *find_css_set(
                 * only do it for the first subsystem in each
                 * hierarchy
                 */
-               if (ss->root->subsys_list.next == &ss->sibling) {
-                       BUG_ON(list_empty(&tmp_cg_links));
-                       link = list_entry(tmp_cg_links.next,
-                                         struct cg_cgroup_link,
-                                         cgrp_link_list);
-                       list_del(&link->cgrp_link_list);
-                       list_add(&link->cgrp_link_list, &cgrp->css_sets);
-                       link->cg = res;
-                       list_add(&link->cg_link_list, &res->cg_links);
-               }
-       }
-       if (list_empty(&rootnode.subsys_list)) {
-               link = list_entry(tmp_cg_links.next,
-                                 struct cg_cgroup_link,
-                                 cgrp_link_list);
-               list_del(&link->cgrp_link_list);
-               list_add(&link->cgrp_link_list, &dummytop->css_sets);
-               link->cg = res;
-               list_add(&link->cg_link_list, &res->cg_links);
+               if (ss->root->subsys_list.next == &ss->sibling)
+                       link_css_set(&tmp_cg_links, res, cgrp);
        }
+       if (list_empty(&rootnode.subsys_list))
+               link_css_set(&tmp_cg_links, res, dummytop);
 
        BUG_ON(!list_empty(&tmp_cg_links));
 
@@ -571,9 +573,8 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 
        if (inode) {
                inode->i_mode = mode;
-               inode->i_uid = current->fsuid;
-               inode->i_gid = current->fsgid;
-               inode->i_blocks = 0;
+               inode->i_uid = current_fsuid();
+               inode->i_gid = current_fsgid();
                inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
        }
@@ -588,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
 {
        struct cgroup_subsys *ss;
        for_each_subsys(cgrp->root, ss)
-               if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
+               if (ss->pre_destroy)
                        ss->pre_destroy(ss, cgrp);
        return;
 }
 
+static void free_cgroup_rcu(struct rcu_head *obj)
+{
+       struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
+
+       kfree(cgrp);    /* RCU callback (see call_rcu() in cgroup_diput()) */
+}
+
 static void cgroup_diput(struct dentry *dentry, struct inode *inode)
 {
        /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -612,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
                /*
                 * Release the subsystem state objects.
                 */
-               for_each_subsys(cgrp->root, ss) {
-                       if (cgrp->subsys[ss->subsys_id])
-                               ss->destroy(ss, cgrp);
-               }
+               for_each_subsys(cgrp->root, ss)
+                       ss->destroy(ss, cgrp);
 
                cgrp->root->number_of_cgroups--;
                mutex_unlock(&cgroup_mutex);
 
-               /* Drop the active superblock reference that we took when we
-                * created the cgroup */
+               /*
+                * Drop the active superblock reference that we took when we
+                * created the cgroup
+                */
                deactivate_super(cgrp->root->sb);
 
-               kfree(cgrp);
+               call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
        }
        iput(inode);
 }
@@ -702,7 +710,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
         * any child cgroups exist. This is theoretically supportable
         * but involves complex error handling, so it's being left until
         * later */
-       if (!list_empty(&cgrp->children))
+       if (root->number_of_cgroups > 1)
                return -EBUSY;
 
        /* Process each subsystem */
@@ -714,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                        BUG_ON(cgrp->subsys[i]);
                        BUG_ON(!dummytop->subsys[i]);
                        BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
+                       mutex_lock(&ss->hierarchy_mutex);
                        cgrp->subsys[i] = dummytop->subsys[i];
                        cgrp->subsys[i]->cgroup = cgrp;
-                       list_add(&ss->sibling, &root->subsys_list);
-                       rcu_assign_pointer(ss->root, root);
+                       list_move(&ss->sibling, &root->subsys_list);
+                       ss->root = root;
                        if (ss->bind)
                                ss->bind(ss, cgrp);
-
+                       mutex_unlock(&ss->hierarchy_mutex);
                } else if (bit & removed_bits) {
                        /* We're removing this subsystem */
                        BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
                        BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+                       mutex_lock(&ss->hierarchy_mutex);
                        if (ss->bind)
                                ss->bind(ss, dummytop);
                        dummytop->subsys[i]->cgroup = dummytop;
                        cgrp->subsys[i] = NULL;
-                       rcu_assign_pointer(subsys[i]->root, &rootnode);
-                       list_del(&ss->sibling);
+                       subsys[i]->root = &rootnode;
+                       list_move(&ss->sibling, &rootnode.subsys_list);
+                       mutex_unlock(&ss->hierarchy_mutex);
                } else if (bit & final_bits) {
                        /* Subsystem state should already exist */
                        BUG_ON(!cgrp->subsys[i]);
@@ -868,6 +879,14 @@ static struct super_operations cgroup_ops = {
        .remount_fs = cgroup_remount,
 };
 
+static void init_cgroup_housekeeping(struct cgroup *cgrp)
+{
+       INIT_LIST_HEAD(&cgrp->sibling);
+       INIT_LIST_HEAD(&cgrp->children);
+       INIT_LIST_HEAD(&cgrp->css_sets);        /* cg_cgroup_link entries */
+       INIT_LIST_HEAD(&cgrp->release_list);
+       init_rwsem(&cgrp->pids_mutex);  /* protects the tasks_pids array */
+}
 static void init_cgroup_root(struct cgroupfs_root *root)
 {
        struct cgroup *cgrp = &root->top_cgroup;
@@ -876,10 +895,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
        root->number_of_cgroups = 1;
        cgrp->root = root;
        cgrp->top_cgroup = cgrp;
-       INIT_LIST_HEAD(&cgrp->sibling);
-       INIT_LIST_HEAD(&cgrp->children);
-       INIT_LIST_HEAD(&cgrp->css_sets);
-       INIT_LIST_HEAD(&cgrp->release_list);
+       init_cgroup_housekeeping(cgrp);
 }
 
 static int cgroup_test_super(struct super_block *sb, void *data)
@@ -987,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                root = NULL;
        } else {
                /* New superblock */
-               struct cgroup *cgrp = &root->top_cgroup;
+               struct cgroup *root_cgrp = &root->top_cgroup;
                struct inode *inode;
                int i;
 
@@ -1019,7 +1035,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                if (ret == -EBUSY) {
                        mutex_unlock(&cgroup_mutex);
                        mutex_unlock(&inode->i_mutex);
-                       goto drop_new_super;
+                       goto free_cg_links;
                }
 
                /* EBUSY should be the only error here */
@@ -1028,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                list_add(&root->root_list, &roots);
                root_count++;
 
-               sb->s_root->d_fsdata = &root->top_cgroup;
+               sb->s_root->d_fsdata = root_cgrp;
                root->top_cgroup.dentry = sb->s_root;
 
                /* Link the top cgroup in this hierarchy into all
@@ -1039,39 +1055,29 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                        struct hlist_node *node;
                        struct css_set *cg;
 
-                       hlist_for_each_entry(cg, node, hhead, hlist) {
-                               struct cg_cgroup_link *link;
-
-                               BUG_ON(list_empty(&tmp_cg_links));
-                               link = list_entry(tmp_cg_links.next,
-                                                 struct cg_cgroup_link,
-                                                 cgrp_link_list);
-                               list_del(&link->cgrp_link_list);
-                               link->cg = cg;
-                               list_add(&link->cgrp_link_list,
-                                        &root->top_cgroup.css_sets);
-                               list_add(&link->cg_link_list, &cg->cg_links);
-                       }
+                       hlist_for_each_entry(cg, node, hhead, hlist)
+                               link_css_set(&tmp_cg_links, cg, root_cgrp);
                }
                write_unlock(&css_set_lock);
 
                free_cg_links(&tmp_cg_links);
 
-               BUG_ON(!list_empty(&cgrp->sibling));
-               BUG_ON(!list_empty(&cgrp->children));
+               BUG_ON(!list_empty(&root_cgrp->sibling));
+               BUG_ON(!list_empty(&root_cgrp->children));
                BUG_ON(root->number_of_cgroups != 1);
 
-               cgroup_populate_dir(cgrp);
+               cgroup_populate_dir(root_cgrp);
                mutex_unlock(&inode->i_mutex);
                mutex_unlock(&cgroup_mutex);
        }
 
        return simple_set_mnt(mnt, sb);
 
+ free_cg_links:
+       free_cg_links(&tmp_cg_links);
  drop_new_super:
        up_write(&sb->s_umount);
        deactivate_super(sb);
-       free_cg_links(&tmp_cg_links);
        return ret;
 }
 
@@ -1109,10 +1115,9 @@ static void cgroup_kill_sb(struct super_block *sb) {
        }
        write_unlock(&css_set_lock);
 
-       if (!list_empty(&root->root_list)) {
-               list_del(&root->root_list);
-               root_count--;
-       }
+       list_del(&root->root_list);
+       root_count--;
+
        mutex_unlock(&cgroup_mutex);
 
        kfree(root);
@@ -1141,14 +1146,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
  * @buf: the buffer to write the path into
  * @buflen: the length of the buffer
  *
- * Called with cgroup_mutex held. Writes path of cgroup into buf.
- * Returns 0 on success, -errno on error.
+ * Called with cgroup_mutex held or else with an RCU-protected cgroup
+ * reference.  Writes path of cgroup into buf.  Returns 0 on success,
+ * -errno on error.
  */
 int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 {
        char *start;
+       struct dentry *dentry = rcu_dereference(cgrp->dentry);
 
-       if (cgrp == dummytop) {
+       if (!dentry || cgrp == dummytop) {
                /*
                 * Inactive subsystems have no dentry for their root
                 * cgroup
@@ -1161,13 +1168,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
 
        *--start = '\0';
        for (;;) {
-               int len = cgrp->dentry->d_name.len;
+               int len = dentry->d_name.len;
                if ((start -= len) < buf)
                        return -ENAMETOOLONG;
-               memcpy(start, cgrp->dentry->d_name.name, len);
+               memcpy(start, dentry->d_name.name, len); /* matches len */
                cgrp = cgrp->parent;
                if (!cgrp)
                        break;
+               dentry = rcu_dereference(cgrp->dentry);
                if (!cgrp->parent)
                        continue;
                if (--start < buf)
@@ -1212,7 +1220,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
        int retval = 0;
        struct cgroup_subsys *ss;
        struct cgroup *oldcgrp;
-       struct css_set *cg = tsk->cgroups;
+       struct css_set *cg;
        struct css_set *newcg;
        struct cgroupfs_root *root = cgrp->root;
        int subsys_id;
@@ -1232,11 +1240,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
                }
        }
 
+       task_lock(tsk);
+       cg = tsk->cgroups;
+       get_css_set(cg);
+       task_unlock(tsk);
        /*
         * Locate or allocate a new css_set for this task,
         * based on its final set of cgroups
         */
        newcg = find_css_set(cg, cgrp);
+       put_css_set(cg);
        if (!newcg)
                return -ENOMEM;
 
@@ -1274,6 +1287,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
 {
        struct task_struct *tsk;
+       const struct cred *cred = current_cred(), *tcred;
        int ret;
 
        if (pid) {
@@ -1283,14 +1297,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
                        rcu_read_unlock();
                        return -ESRCH;
                }
-               get_task_struct(tsk);
-               rcu_read_unlock();
 
-               if ((current->euid) && (current->euid != tsk->uid)
-                   && (current->euid != tsk->suid)) {
-                       put_task_struct(tsk);
+               tcred = __task_cred(tsk);
+               if (cred->euid &&
+                   cred->euid != tcred->uid &&
+                   cred->euid != tcred->suid) {
+                       rcu_read_unlock();
                        return -EACCES;
                }
+               get_task_struct(tsk);
+               rcu_read_unlock();
        } else {
                tsk = current;
                get_task_struct(tsk);
@@ -1438,7 +1454,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
        struct cftype *cft = __d_cft(file->f_dentry);
        struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
-       if (!cft || cgroup_is_removed(cgrp))
+       if (cgroup_is_removed(cgrp))
                return -ENODEV;
        if (cft->write)
                return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1483,7 +1499,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
        struct cftype *cft = __d_cft(file->f_dentry);
        struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
 
-       if (!cft || cgroup_is_removed(cgrp))
+       if (cgroup_is_removed(cgrp))
                return -ENODEV;
 
        if (cft->read)
@@ -1547,10 +1563,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
        err = generic_file_open(inode, file);
        if (err)
                return err;
-
        cft = __d_cft(file->f_dentry);
-       if (!cft)
-               return -ENODEV;
+
        if (cft->read_map || cft->read_seq_string) {
                struct cgroup_seqfile_state *state =
                        kzalloc(sizeof(*state), GFP_USER);
@@ -1664,7 +1678,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
        if (!error) {
                dentry->d_fsdata = cgrp;
                inc_nlink(parent->d_inode);
-               cgrp->dentry = dentry;
+               rcu_assign_pointer(cgrp->dentry, dentry);
                dget(dentry);
        }
        dput(dentry);
@@ -1805,6 +1819,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
 {
        struct task_struct *res;
        struct list_head *l = it->task;
+       struct cg_cgroup_link *link;
 
        /* If the iterator cg is NULL, we have no tasks */
        if (!it->cg_link)
@@ -1812,7 +1827,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
        res = list_entry(l, struct task_struct, cg_list);
        /* Advance iterator to find next entry */
        l = l->next;
-       if (l == &res->cgroups->tasks) {
+       link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
+       if (l == &link->cg->tasks) {
                /* We reached the end of this task list - move on to
                 * the next cg_cgroup_link */
                cgroup_advance_iter(cgrp, it);
@@ -1995,16 +2011,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
  * but we cannot guarantee that the information we produce is correct
  * unless we produce it entirely atomically.
  *
- * Upon tasks file open(), a struct ctr_struct is allocated, that
- * will have a pointer to an array (also allocated here).  The struct
- * ctr_struct * is stored in file->private_data.  Its resources will
- * be freed by release() when the file is closed.  The array is used
- * to sprintf the PIDs and then used by read().
  */
-struct ctr_struct {
-       char *buf;
-       int bufsz;
-};
 
 /*
  * Load into 'pidarray' up to 'npids' of the tasks using cgroup
@@ -2015,14 +2022,16 @@ struct ctr_struct {
  */
 static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
 {
-       int n = 0;
+       int n = 0, pid;
        struct cgroup_iter it;
        struct task_struct *tsk;
        cgroup_iter_start(cgrp, &it);
        while ((tsk = cgroup_iter_next(cgrp, &it))) {
                if (unlikely(n == npids))
                        break;
-               pidarray[n++] = task_pid_vnr(tsk);
+               pid = task_pid_vnr(tsk);
+               if (pid > 0)
+                       pidarray[n++] = pid;
        }
        cgroup_iter_end(cgrp, &it);
        return n;
@@ -2043,15 +2052,17 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
        struct cgroup *cgrp;
        struct cgroup_iter it;
        struct task_struct *tsk;
+
        /*
-        * Validate dentry by checking the superblock operations
+        * Validate dentry by checking the superblock operations,
+        * and make sure it's a directory.
         */
-       if (dentry->d_sb->s_op != &cgroup_ops)
+       if (dentry->d_sb->s_op != &cgroup_ops ||
+           !S_ISDIR(dentry->d_inode->i_mode))
                 goto err;
 
        ret = 0;
        cgrp = dentry->d_fsdata;
-       rcu_read_lock();
 
        cgroup_iter_start(cgrp, &it);
        while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2076,7 +2087,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
        }
        cgroup_iter_end(cgrp, &it);
 
-       rcu_read_unlock();
 err:
        return ret;
 }
@@ -2086,42 +2096,132 @@ static int cmppid(const void *a, const void *b)
        return *(pid_t *)a - *(pid_t *)b;
 }
 
+
 /*
- * Convert array 'a' of 'npids' pid_t's to a string of newline separated
- * decimal pids in 'buf'.  Don't write more than 'sz' chars, but return
- * count 'cnt' of how many chars would be written if buf were large enough.
+ * seq_file methods for the "tasks" file. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->tasks_pids array.
  */
-static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
+
+static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
 {
-       int cnt = 0;
-       int i;
+       /*
+        * *pos is 0 (first call, or after a seek to the start) or a pid
+        * value stored by an earlier ->start/->next call. Binary-search
+        * the sorted tasks_pids array for the first entry >= *pos; that
+        * is the next pid to display, if any
+        */
+       struct cgroup *cgrp = s->private;
+       int index = 0, pid = *pos;
+       int *iter;
+
+       down_read(&cgrp->pids_mutex);
+       if (pid) {
+               int end = cgrp->pids_length;
+
+               while (index < end) {
+                       int mid = (index + end) / 2;
+                       if (cgrp->tasks_pids[mid] == pid) {
+                               index = mid;
+                               break;
+                       } else if (cgrp->tasks_pids[mid] <= pid)
+                               index = mid + 1;
+                       else
+                               end = mid;
+               }
+       }
+       /* Off the end of the array: done (->stop() still drops pids_mutex) */
+       if (index >= cgrp->pids_length)
+               return NULL;
+       /* Update the abstract position to be the actual pid that we found */
+       iter = cgrp->tasks_pids + index;
+       *pos = *iter;
+       return iter;
+}
+
+static void cgroup_tasks_stop(struct seq_file *s, void *v)
+{
+       struct cgroup *cgrp = s->private;
+       up_read(&cgrp->pids_mutex);     /* taken in cgroup_tasks_start() */
+}
+
+static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct cgroup *cgrp = s->private;
+       int *p = v;
+       int *end = cgrp->tasks_pids + cgrp->pids_length;
+
+       /*
+        * Advance to the next pid in the array. If this goes off the
+        * end, we're done; otherwise *pos becomes the pid now pointed at
+        */
+       p++;
+       if (p >= end) {
+               return NULL;
+       } else {
+               *pos = *p;
+               return p;
+       }
+}
+
+static int cgroup_tasks_show(struct seq_file *s, void *v)
+{
+       return seq_printf(s, "%d\n", *(int *)v);        /* v: tasks_pids entry */
+}
+
+static struct seq_operations cgroup_tasks_seq_operations = {
+       .start = cgroup_tasks_start,    /* takes pids_mutex for reading */
+       .stop = cgroup_tasks_stop,      /* drops pids_mutex */
+       .next = cgroup_tasks_next,
+       .show = cgroup_tasks_show,      /* prints one pid per line */
+};
+
+static void release_cgroup_pid_array(struct cgroup *cgrp)
+{
+       down_write(&cgrp->pids_mutex);
+       BUG_ON(!cgrp->pids_use_count);  /* release without a matching use */
+       if (!--cgrp->pids_use_count) {  /* last user: drop the cached array */
+               kfree(cgrp->tasks_pids);
+               cgrp->tasks_pids = NULL;
+               cgrp->pids_length = 0;
+       }
+       up_write(&cgrp->pids_mutex);
+}
+
+static int cgroup_tasks_release(struct inode *inode, struct file *file)
+{
+       struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+
+       if (!(file->f_mode & FMODE_READ))
+               return 0;       /* write-only open took no array reference */
 
-       for (i = 0; i < npids; i++)
-               cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
-       return cnt;
+       release_cgroup_pid_array(cgrp); /* pairs with pids_use_count++ in open */
+       return seq_release(inode, file);
 }
 
+static struct file_operations cgroup_tasks_operations = {
+       .read = seq_read,       /* driven by cgroup_tasks_seq_operations */
+       .llseek = seq_lseek,
+       .write = cgroup_file_write,     /* writes keep using cftype handlers */
+       .release = cgroup_tasks_release,
+};
+
 /*
- * Handle an open on 'tasks' file.  Prepare a buffer listing the
+ * Handle an open on 'tasks' file.  Prepare an array containing the
  * process id's of tasks currently attached to the cgroup being opened.
- *
- * Does not require any specific cgroup mutexes, and does not take any.
  */
+
 static int cgroup_tasks_open(struct inode *unused, struct file *file)
 {
        struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-       struct ctr_struct *ctr;
        pid_t *pidarray;
        int npids;
-       char c;
+       int retval;
 
+       /* Nothing to do for write-only files */
        if (!(file->f_mode & FMODE_READ))
                return 0;
 
-       ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
-       if (!ctr)
-               goto err0;
-
        /*
         * If cgroup gets more users after we read count, we won't have
         * enough space - tough.  This race is indistinguishable to the
@@ -2129,57 +2229,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
         * show up until sometime later on.
         */
        npids = cgroup_task_count(cgrp);
-       if (npids) {
-               pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
-               if (!pidarray)
-                       goto err1;
-
-               npids = pid_array_load(pidarray, npids, cgrp);
-               sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
-               /* Call pid_array_to_buf() twice, first just to get bufsz */
-               ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
-               ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
-               if (!ctr->buf)
-                       goto err2;
-               ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
-
-               kfree(pidarray);
-       } else {
-               ctr->buf = NULL;
-               ctr->bufsz = 0;
-       }
-       file->private_data = ctr;
-       return 0;
-
-err2:
-       kfree(pidarray);
-err1:
-       kfree(ctr);
-err0:
-       return -ENOMEM;
-}
-
-static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
-                                   struct cftype *cft,
-                                   struct file *file, char __user *buf,
-                                   size_t nbytes, loff_t *ppos)
-{
-       struct ctr_struct *ctr = file->private_data;
+       pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+       if (!pidarray)
+               return -ENOMEM;
+       npids = pid_array_load(pidarray, npids, cgrp);
+       sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
 
-       return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
-}
+       /*
+        * Store the array in the cgroup, freeing the old
+        * array if necessary
+        */
+       down_write(&cgrp->pids_mutex);
+       kfree(cgrp->tasks_pids);
+       cgrp->tasks_pids = pidarray;
+       cgrp->pids_length = npids;
+       cgrp->pids_use_count++;
+       up_write(&cgrp->pids_mutex);
 
-static int cgroup_tasks_release(struct inode *unused_inode,
-                                       struct file *file)
-{
-       struct ctr_struct *ctr;
+       file->f_op = &cgroup_tasks_operations;
 
-       if (file->f_mode & FMODE_READ) {
-               ctr = file->private_data;
-               kfree(ctr->buf);
-               kfree(ctr);
+       retval = seq_open(file, &cgroup_tasks_seq_operations);
+       if (retval) {
+               release_cgroup_pid_array(cgrp);
+               return retval;
        }
+       ((struct seq_file *)file->private_data)->private = cgrp;
        return 0;
 }
 
@@ -2208,7 +2282,6 @@ static struct cftype files[] = {
        {
                .name = "tasks",
                .open = cgroup_tasks_open,
-               .read = cgroup_tasks_read,
                .write_u64 = cgroup_tasks_write,
                .release = cgroup_tasks_release,
                .private = FILE_TASKLIST,
@@ -2260,7 +2333,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
                               struct cgroup *cgrp)
 {
        css->cgroup = cgrp;
-       atomic_set(&css->refcnt, 0);
+       atomic_set(&css->refcnt, 1);
        css->flags = 0;
        if (cgrp == dummytop)
                set_bit(CSS_ROOT, &css->flags);
@@ -2268,6 +2341,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
        cgrp->subsys[ss->subsys_id] = css;
 }
 
+static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
+{
+       /* Lock hierarchy_mutex of every subsystem on @root, in subsys[] order */
+       int i;
+
+       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+               struct cgroup_subsys *ss = subsys[i];
+               if (ss->root == root)
+                       mutex_lock_nested(&ss->hierarchy_mutex, i);
+       }
+}
+
+static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
+{
+       int i;
+
+       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+               struct cgroup_subsys *ss = subsys[i];
+               if (ss->root == root)   /* only subsystems bound to @root */
+                       mutex_unlock(&ss->hierarchy_mutex);
+       }
+}
+
 /*
  * cgroup_create - create a cgroup
  * @parent: cgroup that will be parent of the new cgroup
@@ -2298,10 +2394,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 
        mutex_lock(&cgroup_mutex);
 
-       INIT_LIST_HEAD(&cgrp->sibling);
-       INIT_LIST_HEAD(&cgrp->children);
-       INIT_LIST_HEAD(&cgrp->css_sets);
-       INIT_LIST_HEAD(&cgrp->release_list);
+       init_cgroup_housekeeping(cgrp);
 
        cgrp->parent = parent;
        cgrp->root = parent->root;
@@ -2319,7 +2412,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                init_cgroup_css(css, ss, cgrp);
        }
 
+       cgroup_lock_hierarchy(root);
        list_add(&cgrp->sibling, &cgrp->parent->children);
+       cgroup_unlock_hierarchy(root);
        root->number_of_cgroups++;
 
        err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2370,7 +2465,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
 {
        /* Check the reference count on each subsystem. Since we
         * already established that there are no tasks in the
-        * cgroup, if the css refcount is also 0, then there should
+        * cgroup, if the css refcount is also 1, then there should
         * be no outstanding references, so the subsystem is safe to
         * destroy. We scan across all subsystems rather than using
         * the per-hierarchy linked list of mounted subsystems since
@@ -2391,19 +2486,67 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
                 * matter, since it can only happen if the cgroup
                 * has been deleted and hence no longer needs the
                 * release agent to be called anyway. */
-               if (css && atomic_read(&css->refcnt))
+               if (css && (atomic_read(&css->refcnt) > 1))
                        return 1;
        }
        return 0;
 }
 
+/*
+ * Atomically mark all (or else none) of the cgroup's CSS objects as
+ * CSS_REMOVED. Returns nonzero on success, or 0 if some CSS has a
+ * refcnt > 1 (all refcnts are then restored). Call with cgroup_mutex held.
+ */
+
+static int cgroup_clear_css_refs(struct cgroup *cgrp)
+{
+       struct cgroup_subsys *ss;
+       unsigned long flags;
+       bool failed = false;
+       local_irq_save(flags);
+       for_each_subsys(cgrp->root, ss) {
+               struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+               int refcnt;
+               do {
+                       /* A CSS is removable only while refcnt == 1; > 1 is busy */
+                       refcnt = atomic_read(&css->refcnt);
+                       if (refcnt > 1) {
+                               failed = true;
+                               goto done;
+                       }
+                       BUG_ON(!refcnt);
+                       /*
+                        * Drop the refcnt to 0 while we check other
+                        * subsystems. This will cause any racing
+                        * css_tryget() to spin until we set the
+                        * CSS_REMOVED bits or abort.
+                        */
+               } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
+       }
+ done:
+       for_each_subsys(cgrp->root, ss) {
+               struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+               if (failed) {
+                       /*
+                        * Abort: put back the refcnt of every CSS that
+                        * the first pass cleared from 1 to 0.
+                        */
+                       if (!atomic_read(&css->refcnt))
+                               atomic_set(&css->refcnt, 1);
+               } else {
+                       /* Success: commit the fact that the CSS is removed */
+                       set_bit(CSS_REMOVED, &css->flags);
+               }
+       }
+       local_irq_restore(flags);
+       return !failed;
+}
+
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 {
        struct cgroup *cgrp = dentry->d_fsdata;
        struct dentry *d;
        struct cgroup *parent;
-       struct super_block *sb;
-       struct cgroupfs_root *root;
 
        /* the vfs holds both inode->i_mutex already */
 
@@ -2416,10 +2559,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
                mutex_unlock(&cgroup_mutex);
                return -EBUSY;
        }
-
-       parent = cgrp->parent;
-       root = cgrp->root;
-       sb = root->sb;
+       mutex_unlock(&cgroup_mutex);
 
        /*
         * Call pre_destroy handlers of subsys. Notify subsystems
@@ -2427,7 +2567,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
         */
        cgroup_call_pre_destroy(cgrp);
 
-       if (cgroup_has_css_refs(cgrp)) {
+       mutex_lock(&cgroup_mutex);
+       parent = cgrp->parent;
+
+       if (atomic_read(&cgrp->count)
+           || !list_empty(&cgrp->children)
+           || !cgroup_clear_css_refs(cgrp)) {
                mutex_unlock(&cgroup_mutex);
                return -EBUSY;
        }
@@ -2437,11 +2582,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
        if (!list_empty(&cgrp->release_list))
                list_del(&cgrp->release_list);
        spin_unlock(&release_list_lock);
-       /* delete my sibling from parent->children */
+
+       cgroup_lock_hierarchy(cgrp->root);
+       /* delete this cgroup from parent->children */
        list_del(&cgrp->sibling);
+       cgroup_unlock_hierarchy(cgrp->root);
+
        spin_lock(&cgrp->dentry->d_lock);
        d = dget(cgrp->dentry);
-       cgrp->dentry = NULL;
        spin_unlock(&d->d_lock);
 
        cgroup_d_remove_dir(d);
@@ -2461,6 +2609,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
        printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
 
        /* Create the top cgroup state for this subsystem */
+       list_add(&ss->sibling, &rootnode.subsys_list);
        ss->root = &rootnode;
        css = ss->create(ss, dummytop);
        /* We don't handle early failures gracefully */
@@ -2474,13 +2623,13 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
        init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
 
        need_forkexit_callback |= ss->fork || ss->exit;
-       need_mm_owner_callback |= !!ss->mm_owner_changed;
 
        /* At system boot, before all subsystems have been
         * registered, no tasks have been forked, so we don't
         * need to invoke fork callbacks here. */
        BUG_ON(!list_empty(&init_task.tasks));
 
+       mutex_init(&ss->hierarchy_mutex);
        ss->active = 1;
 }
 
@@ -2499,7 +2648,6 @@ int __init cgroup_init_early(void)
        INIT_HLIST_NODE(&init_css_set.hlist);
        css_set_count = 1;
        init_cgroup_root(&rootnode);
-       list_add(&rootnode.root_list, &roots);
        root_count = 1;
        init_task.cgroups = &init_css_set;
 
@@ -2606,15 +2754,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
 
        mutex_lock(&cgroup_mutex);
 
-       for_each_root(root) {
+       for_each_active_root(root) {
                struct cgroup_subsys *ss;
                struct cgroup *cgrp;
                int subsys_id;
                int count = 0;
 
-               /* Skip this hierarchy if it has no active subsystems */
-               if (!root->actual_subsys_bits)
-                       continue;
                seq_printf(m, "%lu:", root->subsys_bits);
                for_each_subsys(root, ss)
                        seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -2724,37 +2869,6 @@ void cgroup_fork_callbacks(struct task_struct *child)
        }
 }
 
-#ifdef CONFIG_MM_OWNER
-/**
- * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
- * @p: the new owner
- *
- * Called on every change to mm->owner. mm_init_owner() does not
- * invoke this routine, since it assigns the mm->owner the first time
- * and does not change it.
- *
- * The callbacks are invoked with mmap_sem held in read mode.
- */
-void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
-{
-       struct cgroup *oldcgrp, *newcgrp = NULL;
-
-       if (need_mm_owner_callback) {
-               int i;
-               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                       struct cgroup_subsys *ss = subsys[i];
-                       oldcgrp = task_cgroup(old, ss->subsys_id);
-                       if (new)
-                               newcgrp = task_cgroup(new, ss->subsys_id);
-                       if (oldcgrp == newcgrp)
-                               continue;
-                       if (ss->mm_owner_changed)
-                               ss->mm_owner_changed(ss, oldcgrp, newcgrp, new);
-               }
-       }
-}
-#endif /* CONFIG_MM_OWNER */
-
 /**
  * cgroup_post_fork - called on a new task after adding it to the task list
  * @child: the task in question
@@ -2768,8 +2882,10 @@ void cgroup_post_fork(struct task_struct *child)
 {
        if (use_task_css_set_links) {
                write_lock(&css_set_lock);
+               task_lock(child);
                if (list_empty(&child->cg_list))
                        list_add(&child->cg_list, &child->cgroups->tasks);
+               task_unlock(child);
                write_unlock(&css_set_lock);
        }
 }
@@ -2872,20 +2988,27 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
  again:
         root = subsys->root;
         if (root == &rootnode) {
-               printk(KERN_INFO
-                      "Not cloning cgroup for unused subsystem %s\n",
-                      subsys->name);
                 mutex_unlock(&cgroup_mutex);
                 return 0;
         }
+       task_lock(tsk);
         cg = tsk->cgroups;
         parent = task_cgroup(tsk, subsys->subsys_id);
 
         /* Pin the hierarchy */
-       atomic_inc(&parent->root->sb->s_active);
+       if (!atomic_inc_not_zero(&parent->root->sb->s_active)) {
+               /*
+                * We race with the final deactivate_super(). Release
+                * the task lock taken above before bailing out.
+                */
+               task_unlock(tsk);
+               mutex_unlock(&cgroup_mutex);
+               return 0;
+       }
 
         /* Keep the cgroup alive */
         get_css_set(cg);
+       task_unlock(tsk);
         mutex_unlock(&cgroup_mutex);
 
         /* Now do the VFS work to create a cgroup */
@@ -2904,7 +3023,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
        }
 
        /* Create the cgroup directory, which also creates the cgroup */
-       ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
+       ret = vfs_mkdir(inode, dentry, 0755);
        child = __d_cgrp(dentry);
        dput(dentry);
        if (ret) {
@@ -2914,13 +3033,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
                goto out_release;
        }
 
-       if (!child) {
-               printk(KERN_INFO
-                      "Couldn't find new cgroup %s\n", nodename);
-               ret = -ENOMEM;
-               goto out_release;
-       }
-
        /* The cgroup now exists. Retake cgroup_mutex and check
         * that we're still in the same state that we thought we
         * were. */
@@ -3016,7 +3128,8 @@ void __css_put(struct cgroup_subsys_state *css)
 {
        struct cgroup *cgrp = css->cgroup;
        rcu_read_lock();
-       if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
+       if ((atomic_dec_return(&css->refcnt) == 1) &&
+           notify_on_release(cgrp)) {
                set_bit(CGRP_RELEASABLE, &cgrp->flags);
                check_for_release(cgrp);
        }