cgroups: relax ns_can_attach checks to allow attaching to grandchild cgroups

[safe/jmp/linux-2.6] / kernel / cgroup.c
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index a0123d7..27792bc 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -84,7 +84,7 @@ struct cgroupfs_root {
         /* Tracks how many cgroups are currently defined in hierarchy.*/
         int number_of_cgroups;
  
-       /* A list running through the mounted hierarchies */
+       /* A list running through the active hierarchies */
         struct list_head root_list;
  
         /* Hierarchy-specific flags */
@@ -116,7 +116,6 @@ static int root_count;
   * be called.
   */
  static int need_forkexit_callback __read_mostly;
-static int need_mm_owner_callback __read_mostly;
  
  /* convenient tests for these bits */
  inline int cgroup_is_removed(const struct cgroup *cgrp)
@@ -149,8 +148,8 @@ static int notify_on_release(const struct cgroup *cgrp)
  #define for_each_subsys(_root, _ss) \
  list_for_each_entry(_ss, &_root->subsys_list, sibling)
  
-/* for_each_root() allows you to iterate across the active hierarchies */
-#define for_each_root(_root) \
+/* for_each_active_root() allows you to iterate across the active hierarchies */
+#define for_each_active_root(_root) \
  list_for_each_entry(_root, &roots, root_list)
  
  /* the list of cgroups eligible for automatic release. Protected by
@@ -241,7 +240,6 @@ static void unlink_css_set(struct css_set *cg)
         struct cg_cgroup_link *link;
         struct cg_cgroup_link *saved_link;
  
-       write_lock(&css_set_lock);
         hlist_del(&cg->hlist);
         css_set_count--;
  
@@ -251,20 +249,29 @@ static void unlink_css_set(struct css_set *cg)
                 list_del(&link->cgrp_link_list);
                 kfree(link);
         }
-
-       write_unlock(&css_set_lock);
  }
  
-static void __release_css_set(struct kref *k, int taskexit)
+static void __put_css_set(struct css_set *cg, int taskexit)
  {
         int i;
-       struct css_set *cg = container_of(k, struct css_set, ref);
-
+       /*
+        * Ensure that the refcount doesn't hit zero while any readers
+        * can see it. Similar to atomic_dec_and_lock(), but for an
+        * rwlock
+        */
+       if (atomic_add_unless(&cg->refcount, -1, 1))
+               return;
+       write_lock(&css_set_lock);
+       if (!atomic_dec_and_test(&cg->refcount)) {
+               write_unlock(&css_set_lock);
+               return;
+       }
         unlink_css_set(cg);
+       write_unlock(&css_set_lock);
  
         rcu_read_lock();
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-               struct cgroup *cgrp = cg->subsys[i]->cgroup;
+               struct cgroup *cgrp = rcu_dereference(cg->subsys[i]->cgroup);
                 if (atomic_dec_and_test(&cgrp->count) &&
                     notify_on_release(cgrp)) {
                         if (taskexit)
@@ -276,32 +283,22 @@ static void __release_css_set(struct kref *k, int taskexit)
         kfree(cg);
  }
  
-static void release_css_set(struct kref *k)
-{
-       __release_css_set(k, 0);
-}
-
-static void release_css_set_taskexit(struct kref *k)
-{
-       __release_css_set(k, 1);
-}
-
  /*
   * refcounted get/put for css_set objects
   */
  static inline void get_css_set(struct css_set *cg)
  {
-       kref_get(&cg->ref);
+       atomic_inc(&cg->refcount);
  }
  
  static inline void put_css_set(struct css_set *cg)
  {
-       kref_put(&cg->ref, release_css_set);
+       __put_css_set(cg, 0);
  }
  
  static inline void put_css_set_taskexit(struct css_set *cg)
  {
-       kref_put(&cg->ref, release_css_set_taskexit);
+       __put_css_set(cg, 1);
  }
  
  /*
@@ -387,6 +384,25 @@ static int allocate_cg_links(int count, struct list_head *tmp)
         return 0;
  }
  
+/**
+ * link_css_set - link a css_set to a cgroup using one pre-allocated link
+ * @tmp_cg_links: links from allocate_cg_links(); must be non-empty, first is used
+ * @cg: the css_set to be linked
+ * @cgrp: the destination cgroup
+ */
+static void link_css_set(struct list_head *tmp_cg_links,
+                        struct css_set *cg, struct cgroup *cgrp)
+{
+       struct cg_cgroup_link *link;
+
+       BUG_ON(list_empty(tmp_cg_links));       /* caller ran out of links */
+       link = list_first_entry(tmp_cg_links, struct cg_cgroup_link,
+                               cgrp_link_list);
+       link->cg = cg;
+       list_move(&link->cgrp_link_list, &cgrp->css_sets); /* off the tmp list */
+       list_add(&link->cg_link_list, &cg->cg_links);
+}
+
  /*
   * find_css_set() takes an existing cgroup group and a
   * cgroup object, and returns a css_set object that's
@@ -402,7 +418,6 @@ static struct css_set *find_css_set(
         int i;
  
         struct list_head tmp_cg_links;
-       struct cg_cgroup_link *link;
  
         struct hlist_head *hhead;
  
@@ -427,7 +442,7 @@ static struct css_set *find_css_set(
                 return NULL;
         }
  
-       kref_init(&res->ref);
+       atomic_set(&res->refcount, 1);
         INIT_LIST_HEAD(&res->cg_links);
         INIT_LIST_HEAD(&res->tasks);
         INIT_HLIST_NODE(&res->hlist);
@@ -447,26 +462,11 @@ static struct css_set *find_css_set(
                  * only do it for the first subsystem in each
                  * hierarchy
                  */
-               if (ss->root->subsys_list.next == &ss->sibling) {
-                       BUG_ON(list_empty(&tmp_cg_links));
-                       link = list_entry(tmp_cg_links.next,
-                                         struct cg_cgroup_link,
-                                         cgrp_link_list);
-                       list_del(&link->cgrp_link_list);
-                       list_add(&link->cgrp_link_list, &cgrp->css_sets);
-                       link->cg = res;
-                       list_add(&link->cg_link_list, &res->cg_links);
-               }
-       }
-       if (list_empty(&rootnode.subsys_list)) {
-               link = list_entry(tmp_cg_links.next,
-                                 struct cg_cgroup_link,
-                                 cgrp_link_list);
-               list_del(&link->cgrp_link_list);
-               list_add(&link->cgrp_link_list, &dummytop->css_sets);
-               link->cg = res;
-               list_add(&link->cg_link_list, &res->cg_links);
+               if (ss->root->subsys_list.next == &ss->sibling)
+                       link_css_set(&tmp_cg_links, res, cgrp);
         }
+       if (list_empty(&rootnode.subsys_list))
+               link_css_set(&tmp_cg_links, res, dummytop);
  
         BUG_ON(!list_empty(&tmp_cg_links));
  
@@ -573,9 +573,8 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
  
         if (inode) {
                 inode->i_mode = mode;
-               inode->i_uid = current->fsuid;
-               inode->i_gid = current->fsgid;
-               inode->i_blocks = 0;
+               inode->i_uid = current_fsuid();
+               inode->i_gid = current_fsgid();
                 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME;
                 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info;
         }
@@ -590,11 +589,18 @@ static void cgroup_call_pre_destroy(struct cgroup *cgrp)
  {
         struct cgroup_subsys *ss;
         for_each_subsys(cgrp->root, ss)
-               if (ss->pre_destroy && cgrp->subsys[ss->subsys_id])
+               if (ss->pre_destroy)
                         ss->pre_destroy(ss, cgrp);
         return;
  }
  
+static void free_cgroup_rcu(struct rcu_head *obj) /* call_rcu() callback */
+{
+       struct cgroup *cgrp = container_of(obj, struct cgroup, rcu_head);
+
+       kfree(cgrp);    /* obj is embedded in cgrp, so it is freed with it */
+}
+
  static void cgroup_diput(struct dentry *dentry, struct inode *inode)
  {
         /* is dentry a directory ? if so, kfree() associated cgroup */
@@ -614,19 +620,19 @@ static void cgroup_diput(struct dentry *dentry, struct inode *inode)
                 /*
                  * Release the subsystem state objects.
                  */
-               for_each_subsys(cgrp->root, ss) {
-                       if (cgrp->subsys[ss->subsys_id])
-                               ss->destroy(ss, cgrp);
-               }
+               for_each_subsys(cgrp->root, ss)
+                       ss->destroy(ss, cgrp);
  
                 cgrp->root->number_of_cgroups--;
                 mutex_unlock(&cgroup_mutex);
  
-               /* Drop the active superblock reference that we took when we
-                * created the cgroup */
+               /*
+                * Drop the active superblock reference that we took when we
+                * created the cgroup
+                */
                 deactivate_super(cgrp->root->sb);
  
-               kfree(cgrp);
+               call_rcu(&cgrp->rcu_head, free_cgroup_rcu);
         }
         iput(inode);
  }
@@ -704,7 +710,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
          * any child cgroups exist. This is theoretically supportable
          * but involves complex error handling, so it's being left until
          * later */
-       if (!list_empty(&cgrp->children))
+       if (root->number_of_cgroups > 1)
                 return -EBUSY;
  
         /* Process each subsystem */
@@ -716,23 +722,26 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                         BUG_ON(cgrp->subsys[i]);
                         BUG_ON(!dummytop->subsys[i]);
                         BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
+                       mutex_lock(&ss->hierarchy_mutex);
                         cgrp->subsys[i] = dummytop->subsys[i];
                         cgrp->subsys[i]->cgroup = cgrp;
-                       list_add(&ss->sibling, &root->subsys_list);
-                       rcu_assign_pointer(ss->root, root);
+                       list_move(&ss->sibling, &root->subsys_list);
+                       ss->root = root;
                         if (ss->bind)
                                 ss->bind(ss, cgrp);
-
+                       mutex_unlock(&ss->hierarchy_mutex);
                 } else if (bit & removed_bits) {
                         /* We're removing this subsystem */
                         BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
                         BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
+                       mutex_lock(&ss->hierarchy_mutex);
                         if (ss->bind)
                                 ss->bind(ss, dummytop);
                         dummytop->subsys[i]->cgroup = dummytop;
                         cgrp->subsys[i] = NULL;
-                       rcu_assign_pointer(subsys[i]->root, &rootnode);
-                       list_del(&ss->sibling);
+                       subsys[i]->root = &rootnode;
+                       list_move(&ss->sibling, &rootnode.subsys_list);
+                       mutex_unlock(&ss->hierarchy_mutex);
                 } else if (bit & final_bits) {
                         /* Subsystem state should already exist */
                         BUG_ON(!cgrp->subsys[i]);
@@ -870,6 +879,14 @@ static struct super_operations cgroup_ops = {
         .remount_fs = cgroup_remount,
  };
  
+static void init_cgroup_housekeeping(struct cgroup *cgrp) /* lists + rwsem */
+{
+       INIT_LIST_HEAD(&cgrp->sibling);
+       INIT_LIST_HEAD(&cgrp->children);
+       INIT_LIST_HEAD(&cgrp->css_sets);
+       INIT_LIST_HEAD(&cgrp->release_list);
+       init_rwsem(&cgrp->pids_mutex);  /* an rwsem, despite the name */
+}
  static void init_cgroup_root(struct cgroupfs_root *root)
  {
         struct cgroup *cgrp = &root->top_cgroup;
@@ -878,10 +895,7 @@ static void init_cgroup_root(struct cgroupfs_root *root)
         root->number_of_cgroups = 1;
         cgrp->root = root;
         cgrp->top_cgroup = cgrp;
-       INIT_LIST_HEAD(&cgrp->sibling);
-       INIT_LIST_HEAD(&cgrp->children);
-       INIT_LIST_HEAD(&cgrp->css_sets);
-       INIT_LIST_HEAD(&cgrp->release_list);
+       init_cgroup_housekeeping(cgrp);
  }
  
  static int cgroup_test_super(struct super_block *sb, void *data)
@@ -989,7 +1003,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                 root = NULL;
         } else {
                 /* New superblock */
-               struct cgroup *cgrp = &root->top_cgroup;
+               struct cgroup *root_cgrp = &root->top_cgroup;
                 struct inode *inode;
                 int i;
  
@@ -1021,7 +1035,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                 if (ret == -EBUSY) {
                         mutex_unlock(&cgroup_mutex);
                         mutex_unlock(&inode->i_mutex);
-                       goto drop_new_super;
+                       goto free_cg_links;
                 }
  
                 /* EBUSY should be the only error here */
@@ -1030,7 +1044,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                 list_add(&root->root_list, &roots);
                 root_count++;
  
-               sb->s_root->d_fsdata = &root->top_cgroup;
+               sb->s_root->d_fsdata = root_cgrp;
                 root->top_cgroup.dentry = sb->s_root;
  
                 /* Link the top cgroup in this hierarchy into all
@@ -1041,39 +1055,30 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
                         struct hlist_node *node;
                         struct css_set *cg;
  
-                       hlist_for_each_entry(cg, node, hhead, hlist) {
-                               struct cg_cgroup_link *link;
-
-                               BUG_ON(list_empty(&tmp_cg_links));
-                               link = list_entry(tmp_cg_links.next,
-                                                 struct cg_cgroup_link,
-                                                 cgrp_link_list);
-                               list_del(&link->cgrp_link_list);
-                               link->cg = cg;
-                               list_add(&link->cgrp_link_list,
-                                        &root->top_cgroup.css_sets);
-                               list_add(&link->cg_link_list, &cg->cg_links);
-                       }
+                       hlist_for_each_entry(cg, node, hhead, hlist)
+                               link_css_set(&tmp_cg_links, cg, root_cgrp);
                 }
                 write_unlock(&css_set_lock);
  
                 free_cg_links(&tmp_cg_links);
  
-               BUG_ON(!list_empty(&cgrp->sibling));
-               BUG_ON(!list_empty(&cgrp->children));
+               BUG_ON(!list_empty(&root_cgrp->sibling));
+               BUG_ON(!list_empty(&root_cgrp->children));
                 BUG_ON(root->number_of_cgroups != 1);
  
-               cgroup_populate_dir(cgrp);
+               cgroup_populate_dir(root_cgrp);
                 mutex_unlock(&inode->i_mutex);
                 mutex_unlock(&cgroup_mutex);
         }
  
-       return simple_set_mnt(mnt, sb);
+       simple_set_mnt(mnt, sb);
+       return 0;
  
+ free_cg_links:
+       free_cg_links(&tmp_cg_links);
   drop_new_super:
         up_write(&sb->s_umount);
         deactivate_super(sb);
-       free_cg_links(&tmp_cg_links);
         return ret;
  }
  
@@ -1115,10 +1120,11 @@ static void cgroup_kill_sb(struct super_block *sb) {
                 list_del(&root->root_list);
                 root_count--;
         }
+
         mutex_unlock(&cgroup_mutex);
  
-       kfree(root);
         kill_litter_super(sb);
+       kfree(root);
  }
  
  static struct file_system_type cgroup_fs_type = {
@@ -1143,14 +1149,16 @@ static inline struct cftype *__d_cft(struct dentry *dentry)
   * @buf: the buffer to write the path into
   * @buflen: the length of the buffer
   *
- * Called with cgroup_mutex held. Writes path of cgroup into buf.
- * Returns 0 on success, -errno on error.
+ * Called with cgroup_mutex held or else with an RCU-protected cgroup
+ * reference.  Writes path of cgroup into buf.  Returns 0 on success,
+ * -errno on error.
   */
  int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
  {
         char *start;
+       struct dentry *dentry = rcu_dereference(cgrp->dentry);
  
-       if (cgrp == dummytop) {
+       if (!dentry || cgrp == dummytop) {
                 /*
                  * Inactive subsystems have no dentry for their root
                  * cgroup
@@ -1163,13 +1171,14 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen)
  
         *--start = '\0';
         for (;;) {
-               int len = cgrp->dentry->d_name.len;
+               int len = dentry->d_name.len;
                 if ((start -= len) < buf)
                         return -ENAMETOOLONG;
                 memcpy(start, cgrp->dentry->d_name.name, len);
                 cgrp = cgrp->parent;
                 if (!cgrp)
                         break;
+               dentry = rcu_dereference(cgrp->dentry);
                 if (!cgrp->parent)
                         continue;
                 if (--start < buf)
@@ -1214,7 +1223,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
         int retval = 0;
         struct cgroup_subsys *ss;
         struct cgroup *oldcgrp;
-       struct css_set *cg = tsk->cgroups;
+       struct css_set *cg;
         struct css_set *newcg;
         struct cgroupfs_root *root = cgrp->root;
         int subsys_id;
@@ -1234,11 +1243,16 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
                 }
         }
  
+       task_lock(tsk);
+       cg = tsk->cgroups;
+       get_css_set(cg);
+       task_unlock(tsk);
         /*
          * Locate or allocate a new css_set for this task,
          * based on its final set of cgroups
          */
         newcg = find_css_set(cg, cgrp);
+       put_css_set(cg);
         if (!newcg)
                 return -ENOMEM;
  
@@ -1276,6 +1290,7 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
  static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
  {
         struct task_struct *tsk;
+       const struct cred *cred = current_cred(), *tcred;
         int ret;
  
         if (pid) {
@@ -1285,14 +1300,16 @@ static int attach_task_by_pid(struct cgroup *cgrp, u64 pid)
                         rcu_read_unlock();
                         return -ESRCH;
                 }
-               get_task_struct(tsk);
-               rcu_read_unlock();
  
-               if ((current->euid) && (current->euid != tsk->uid)
-                   && (current->euid != tsk->suid)) {
-                       put_task_struct(tsk);
+               tcred = __task_cred(tsk);
+               if (cred->euid &&
+                   cred->euid != tcred->uid &&
+                   cred->euid != tcred->suid) {
+                       rcu_read_unlock();
                         return -EACCES;
                 }
+               get_task_struct(tsk);
+               rcu_read_unlock();
         } else {
                 tsk = current;
                 get_task_struct(tsk);
@@ -1440,7 +1457,7 @@ static ssize_t cgroup_file_write(struct file *file, const char __user *buf,
         struct cftype *cft = __d_cft(file->f_dentry);
         struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
  
-       if (!cft || cgroup_is_removed(cgrp))
+       if (cgroup_is_removed(cgrp))
                 return -ENODEV;
         if (cft->write)
                 return cft->write(cgrp, cft, file, buf, nbytes, ppos);
@@ -1485,7 +1502,7 @@ static ssize_t cgroup_file_read(struct file *file, char __user *buf,
         struct cftype *cft = __d_cft(file->f_dentry);
         struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
  
-       if (!cft || cgroup_is_removed(cgrp))
+       if (cgroup_is_removed(cgrp))
                 return -ENODEV;
  
         if (cft->read)
@@ -1549,10 +1566,8 @@ static int cgroup_file_open(struct inode *inode, struct file *file)
         err = generic_file_open(inode, file);
         if (err)
                 return err;
-
         cft = __d_cft(file->f_dentry);
-       if (!cft)
-               return -ENODEV;
+
         if (cft->read_map || cft->read_seq_string) {
                 struct cgroup_seqfile_state *state =
                         kzalloc(sizeof(*state), GFP_USER);
@@ -1613,7 +1628,7 @@ static struct inode_operations cgroup_dir_inode_operations = {
  static int cgroup_create_file(struct dentry *dentry, int mode,
                                 struct super_block *sb)
  {
-       static struct dentry_operations cgroup_dops = {
+       static const struct dentry_operations cgroup_dops = {
                 .d_iput = cgroup_diput,
         };
  
@@ -1666,7 +1681,7 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
         if (!error) {
                 dentry->d_fsdata = cgrp;
                 inc_nlink(parent->d_inode);
-               cgrp->dentry = dentry;
+               rcu_assign_pointer(cgrp->dentry, dentry);
                 dget(dentry);
         }
         dput(dentry);
@@ -1728,7 +1743,7 @@ int cgroup_task_count(const struct cgroup *cgrp)
  
         read_lock(&css_set_lock);
         list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) {
-               count += atomic_read(&link->cg->ref.refcount);
+               count += atomic_read(&link->cg->refcount);
         }
         read_unlock(&css_set_lock);
         return count;
@@ -1807,6 +1822,7 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
  {
         struct task_struct *res;
         struct list_head *l = it->task;
+       struct cg_cgroup_link *link;
  
         /* If the iterator cg is NULL, we have no tasks */
         if (!it->cg_link)
@@ -1814,7 +1830,8 @@ struct task_struct *cgroup_iter_next(struct cgroup *cgrp,
         res = list_entry(l, struct task_struct, cg_list);
         /* Advance iterator to find next entry */
         l = l->next;
-       if (l == &res->cgroups->tasks) {
+       link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list);
+       if (l == &link->cg->tasks) {
                 /* We reached the end of this task list - move on to
                  * the next cg_cgroup_link */
                 cgroup_advance_iter(cgrp, it);
@@ -1997,16 +2014,7 @@ int cgroup_scan_tasks(struct cgroup_scanner *scan)
   * but we cannot guarantee that the information we produce is correct
   * unless we produce it entirely atomically.
   *
- * Upon tasks file open(), a struct ctr_struct is allocated, that
- * will have a pointer to an array (also allocated here).  The struct
- * ctr_struct * is stored in file->private_data.  Its resources will
- * be freed by release() when the file is closed.  The array is used
- * to sprintf the PIDs and then used by read().
   */
-struct ctr_struct {
-       char *buf;
-       int bufsz;
-};
  
  /*
   * Load into 'pidarray' up to 'npids' of the tasks using cgroup
@@ -2017,14 +2025,16 @@ struct ctr_struct {
   */
  static int pid_array_load(pid_t *pidarray, int npids, struct cgroup *cgrp)
  {
-       int n = 0;
+       int n = 0, pid;
         struct cgroup_iter it;
         struct task_struct *tsk;
         cgroup_iter_start(cgrp, &it);
         while ((tsk = cgroup_iter_next(cgrp, &it))) {
                 if (unlikely(n == npids))
                         break;
-               pidarray[n++] = task_pid_vnr(tsk);
+               pid = task_pid_vnr(tsk);
+               if (pid > 0)
+                       pidarray[n++] = pid;
         }
         cgroup_iter_end(cgrp, &it);
         return n;
@@ -2045,15 +2055,17 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
         struct cgroup *cgrp;
         struct cgroup_iter it;
         struct task_struct *tsk;
+
         /*
-        * Validate dentry by checking the superblock operations
+        * Validate dentry by checking the superblock operations,
+        * and make sure it's a directory.
          */
-       if (dentry->d_sb->s_op != &cgroup_ops)
+       if (dentry->d_sb->s_op != &cgroup_ops ||
+           !S_ISDIR(dentry->d_inode->i_mode))
                  goto err;
  
         ret = 0;
         cgrp = dentry->d_fsdata;
-       rcu_read_lock();
  
         cgroup_iter_start(cgrp, &it);
         while ((tsk = cgroup_iter_next(cgrp, &it))) {
@@ -2078,7 +2090,6 @@ int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry)
         }
         cgroup_iter_end(cgrp, &it);
  
-       rcu_read_unlock();
  err:
         return ret;
  }
@@ -2088,42 +2099,132 @@ static int cmppid(const void *a, const void *b)
         return *(pid_t *)a - *(pid_t *)b;
  }
  
+
  /*
- * Convert array 'a' of 'npids' pid_t's to a string of newline separated
- * decimal pids in 'buf'.  Don't write more than 'sz' chars, but return
- * count 'cnt' of how many chars would be written if buf were large enough.
+ * seq_file methods for the "tasks" file. The seq_file position is the
+ * next pid to display; the seq_file iterator is a pointer to the pid
+ * in the cgroup->tasks_pids array.
   */
-static int pid_array_to_buf(char *buf, int sz, pid_t *a, int npids)
+
+static void *cgroup_tasks_start(struct seq_file *s, loff_t *pos)
  {
-       int cnt = 0;
-       int i;
+       /*
+        * *pos is 0 on the first call (or after a seek to the start),
+        * otherwise one more than the last pid shown.  Binary-search
+        * tasks_pids for that value; on a miss, index ends up at the
+        * next larger pid, if any.
+        */
+       struct cgroup *cgrp = s->private;
+       int index = 0, pid = *pos;
+       int *iter;
+
+       down_read(&cgrp->pids_mutex);   /* dropped in cgroup_tasks_stop() */
+       if (pid) {
+               int end = cgrp->pids_length;
+
+               while (index < end) {
+                       int mid = (index + end) / 2;
+                       if (cgrp->tasks_pids[mid] == pid) {
+                               index = mid;
+                               break;
+                       } else if (cgrp->tasks_pids[mid] <= pid)
+                               index = mid + 1;
+                       else
+                               end = mid;
+               }
+       }
+       /* Off the end of the array: done.  pids_mutex is still read-held here */
+       if (index >= cgrp->pids_length)
+               return NULL;
+       /* Update the abstract position to be the actual pid that we found */
+       iter = cgrp->tasks_pids + index;
+       *pos = *iter;
+       return iter;
+}
+
+static void cgroup_tasks_stop(struct seq_file *s, void *v)
+{
+       struct cgroup *cgrp = s->private;
+       up_read(&cgrp->pids_mutex);     /* pairs with down_read() in ->start */
+}
+
+static void *cgroup_tasks_next(struct seq_file *s, void *v, loff_t *pos)
+{
+       struct cgroup *cgrp = s->private;
+       int *p = v;     /* current element of cgrp->tasks_pids */
+       int *end = cgrp->tasks_pids + cgrp->pids_length;
+
+       /*
+        * Advance to the next pid in the array and record it in *pos.
+        * If this goes off the end, we're done and *pos is left alone.
+        */
+       p++;
+       if (p >= end) {
+               return NULL;
+       } else {
+               *pos = *p;
+               return p;
+       }
+}
+
+static int cgroup_tasks_show(struct seq_file *s, void *v)
+{
+       return seq_printf(s, "%d\n", *(int *)v);        /* one pid per line */
+}
+
+static struct seq_operations cgroup_tasks_seq_operations = {
+       .start = cgroup_tasks_start,    /* read-locks pids_mutex */
+       .stop = cgroup_tasks_stop,      /* unlocks pids_mutex */
+       .next = cgroup_tasks_next,
+       .show = cgroup_tasks_show,      /* prints one pid per line */
+};
+
+static void release_cgroup_pid_array(struct cgroup *cgrp)
+{
+       down_write(&cgrp->pids_mutex);
+       BUG_ON(!cgrp->pids_use_count);  /* release without a matching use */
+       if (!--cgrp->pids_use_count) {  /* last user: free the pid array */
+               kfree(cgrp->tasks_pids);
+               cgrp->tasks_pids = NULL;
+               cgrp->pids_length = 0;
+       }
+       up_write(&cgrp->pids_mutex);
+}
+
+static int cgroup_tasks_release(struct inode *inode, struct file *file)
+{
+       struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
+
+       if (!(file->f_mode & FMODE_READ))
+               return 0;       /* write-only opens: nothing to release */
  
-       for (i = 0; i < npids; i++)
-               cnt += snprintf(buf + cnt, max(sz - cnt, 0), "%d\n", a[i]);
-       return cnt;
+       release_cgroup_pid_array(cgrp); /* drop this file's use of the array */
+       return seq_release(inode, file);
  }
  
+static struct file_operations cgroup_tasks_operations = {
+       .read = seq_read,
+       .llseek = seq_lseek,
+       .write = cgroup_file_write,     /* generic cgroup file write path */
+       .release = cgroup_tasks_release,
+};
+
  /*
- * Handle an open on 'tasks' file.  Prepare a buffer listing the
+ * Handle an open on 'tasks' file.  Prepare an array containing the
   * process id's of tasks currently attached to the cgroup being opened.
- *
- * Does not require any specific cgroup mutexes, and does not take any.
   */
+
  static int cgroup_tasks_open(struct inode *unused, struct file *file)
  {
         struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent);
-       struct ctr_struct *ctr;
         pid_t *pidarray;
         int npids;
-       char c;
+       int retval;
  
+       /* Nothing to do for write-only files */
         if (!(file->f_mode & FMODE_READ))
                 return 0;
  
-       ctr = kmalloc(sizeof(*ctr), GFP_KERNEL);
-       if (!ctr)
-               goto err0;
-
         /*
          * If cgroup gets more users after we read count, we won't have
          * enough space - tough.  This race is indistinguishable to the
@@ -2131,57 +2232,31 @@ static int cgroup_tasks_open(struct inode *unused, struct file *file)
          * show up until sometime later on.
          */
         npids = cgroup_task_count(cgrp);
-       if (npids) {
-               pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
-               if (!pidarray)
-                       goto err1;
-
-               npids = pid_array_load(pidarray, npids, cgrp);
-               sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
-
-               /* Call pid_array_to_buf() twice, first just to get bufsz */
-               ctr->bufsz = pid_array_to_buf(&c, sizeof(c), pidarray, npids) + 1;
-               ctr->buf = kmalloc(ctr->bufsz, GFP_KERNEL);
-               if (!ctr->buf)
-                       goto err2;
-               ctr->bufsz = pid_array_to_buf(ctr->buf, ctr->bufsz, pidarray, npids);
-
-               kfree(pidarray);
-       } else {
-               ctr->buf = NULL;
-               ctr->bufsz = 0;
-       }
-       file->private_data = ctr;
-       return 0;
-
-err2:
-       kfree(pidarray);
-err1:
-       kfree(ctr);
-err0:
-       return -ENOMEM;
-}
-
-static ssize_t cgroup_tasks_read(struct cgroup *cgrp,
-                                   struct cftype *cft,
-                                   struct file *file, char __user *buf,
-                                   size_t nbytes, loff_t *ppos)
-{
-       struct ctr_struct *ctr = file->private_data;
+       pidarray = kmalloc(npids * sizeof(pid_t), GFP_KERNEL);
+       if (!pidarray)
+               return -ENOMEM;
+       npids = pid_array_load(pidarray, npids, cgrp);
+       sort(pidarray, npids, sizeof(pid_t), cmppid, NULL);
  
-       return simple_read_from_buffer(buf, nbytes, ppos, ctr->buf, ctr->bufsz);
-}
+       /*
+        * Store the array in the cgroup, freeing the old
+        * array if necessary
+        */
+       down_write(&cgrp->pids_mutex);
+       kfree(cgrp->tasks_pids);
+       cgrp->tasks_pids = pidarray;
+       cgrp->pids_length = npids;
+       cgrp->pids_use_count++;
+       up_write(&cgrp->pids_mutex);
  
-static int cgroup_tasks_release(struct inode *unused_inode,
-                                       struct file *file)
-{
-       struct ctr_struct *ctr;
+       file->f_op = &cgroup_tasks_operations;
  
-       if (file->f_mode & FMODE_READ) {
-               ctr = file->private_data;
-               kfree(ctr->buf);
-               kfree(ctr);
+       retval = seq_open(file, &cgroup_tasks_seq_operations);
+       if (retval) {
+               release_cgroup_pid_array(cgrp);
+               return retval;
         }
+       ((struct seq_file *)file->private_data)->private = cgrp;
         return 0;
  }
  
@@ -2210,7 +2285,6 @@ static struct cftype files[] = {
         {
                 .name = "tasks",
                 .open = cgroup_tasks_open,
-               .read = cgroup_tasks_read,
                 .write_u64 = cgroup_tasks_write,
                 .release = cgroup_tasks_release,
                 .private = FILE_TASKLIST,
@@ -2262,7 +2336,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
                                struct cgroup *cgrp)
  {
         css->cgroup = cgrp;
-       atomic_set(&css->refcnt, 0);
+       atomic_set(&css->refcnt, 1);
         css->flags = 0;
         if (cgrp == dummytop)
                 set_bit(CSS_ROOT, &css->flags);
@@ -2270,6 +2344,29 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
         cgrp->subsys[ss->subsys_id] = css;
  }
  
+static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
+{
+       /* Take the hierarchy_mutex of every subsystem bound to @root, always in ascending subsys[] index order */
+       int i;
+
+       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+               struct cgroup_subsys *ss = subsys[i];
+               if (ss->root == root) /* skip subsystems attached to another hierarchy */
+                       mutex_lock(&ss->hierarchy_mutex);
+       }
+}
+
+static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
+{
+       int i;
+       /* Release the hierarchy_mutex of every subsystem bound to @root, walking subsys[] in ascending index order */
+       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+               struct cgroup_subsys *ss = subsys[i];
+               if (ss->root == root) /* skip subsystems attached to another hierarchy */
+                       mutex_unlock(&ss->hierarchy_mutex);
+       }
+}
+
  /*
   * cgroup_create - create a cgroup
   * @parent: cgroup that will be parent of the new cgroup
@@ -2300,10 +2397,7 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
  
         mutex_lock(&cgroup_mutex);
  
-       INIT_LIST_HEAD(&cgrp->sibling);
-       INIT_LIST_HEAD(&cgrp->children);
-       INIT_LIST_HEAD(&cgrp->css_sets);
-       INIT_LIST_HEAD(&cgrp->release_list);
+       init_cgroup_housekeeping(cgrp);
  
         cgrp->parent = parent;
         cgrp->root = parent->root;
@@ -2321,7 +2415,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
                 init_cgroup_css(css, ss, cgrp);
         }
  
+       cgroup_lock_hierarchy(root);
         list_add(&cgrp->sibling, &cgrp->parent->children);
+       cgroup_unlock_hierarchy(root);
         root->number_of_cgroups++;
  
         err = cgroup_create_dir(cgrp, dentry, mode);
@@ -2341,7 +2437,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
  
   err_remove:
  
+       cgroup_lock_hierarchy(root);
         list_del(&cgrp->sibling);
+       cgroup_unlock_hierarchy(root);
         root->number_of_cgroups--;
  
   err_destroy:
@@ -2372,7 +2470,7 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
  {
         /* Check the reference count on each subsystem. Since we
          * already established that there are no tasks in the
-        * cgroup, if the css refcount is also 0, then there should
+        * cgroup, if the css refcount is also 1, then there should
          * be no outstanding references, so the subsystem is safe to
          * destroy. We scan across all subsystems rather than using
          * the per-hierarchy linked list of mounted subsystems since
@@ -2393,19 +2491,70 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
                  * matter, since it can only happen if the cgroup
                  * has been deleted and hence no longer needs the
                  * release agent to be called anyway. */
-               if (css && atomic_read(&css->refcnt))
+               if (css && (atomic_read(&css->refcnt) > 1))
                         return 1;
         }
         return 0;
  }
  
+/*
+ * Atomically mark all (or else none) of the cgroup's CSS objects as
+ * CSS_REMOVED. Returns 1 on success, or 0 if the cgroup has a busy subsystem
+ * (any refcnt found at 0 is then put back to 1). Call with cgroup_mutex held.
+ */
+
+static int cgroup_clear_css_refs(struct cgroup *cgrp)
+{
+       struct cgroup_subsys *ss;
+       unsigned long flags;
+       bool failed = false;
+       local_irq_save(flags); /* local IRQs stay disabled across both passes below */
+       for_each_subsys(cgrp->root, ss) {
+               struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+               int refcnt;
+               while (1) {
+                       /* We can only remove a CSS with a refcnt==1; a higher count aborts the whole pass */
+                       refcnt = atomic_read(&css->refcnt);
+                       if (refcnt > 1) {
+                               failed = true;
+                               goto done;
+                       }
+                       BUG_ON(!refcnt); /* a refcnt of 0 is never expected at this point */
+                       /*
+                        * Drop the refcnt to 0 while we check other
+                        * subsystems. This will cause any racing
+                        * css_tryget() to spin until we set the
+                        * CSS_REMOVED bits or abort
+                        */
+                       if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
+                               break;
+                       cpu_relax(); /* refcnt changed under us: re-read it and try again */
+               }
+       }
+ done:
+       for_each_subsys(cgrp->root, ss) {
+               struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+               if (failed) {
+                       /*
+                        * Restore old refcnt if we previously managed
+                        * to clear it from 1 to 0
+                        */
+                       if (!atomic_read(&css->refcnt))
+                               atomic_set(&css->refcnt, 1);
+               } else {
+                       /* Commit the fact that the CSS is removed */
+                       set_bit(CSS_REMOVED, &css->flags);
+               }
+       }
+       local_irq_restore(flags);
+       return !failed; /* 1: every CSS was marked CSS_REMOVED; 0: a busy CSS was found */
+}
+
  static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
  {
         struct cgroup *cgrp = dentry->d_fsdata;
         struct dentry *d;
         struct cgroup *parent;
-       struct super_block *sb;
-       struct cgroupfs_root *root;
  
         /* the vfs holds both inode->i_mutex already */
  
@@ -2418,10 +2567,7 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
                 mutex_unlock(&cgroup_mutex);
                 return -EBUSY;
         }
-
-       parent = cgrp->parent;
-       root = cgrp->root;
-       sb = root->sb;
+       mutex_unlock(&cgroup_mutex);
  
         /*
          * Call pre_destroy handlers of subsys. Notify subsystems
@@ -2429,7 +2575,12 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
          */
         cgroup_call_pre_destroy(cgrp);
  
-       if (cgroup_has_css_refs(cgrp)) {
+       mutex_lock(&cgroup_mutex);
+       parent = cgrp->parent;
+
+       if (atomic_read(&cgrp->count)
+           || !list_empty(&cgrp->children)
+           || !cgroup_clear_css_refs(cgrp)) {
                 mutex_unlock(&cgroup_mutex);
                 return -EBUSY;
         }
@@ -2439,11 +2590,14 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
         if (!list_empty(&cgrp->release_list))
                 list_del(&cgrp->release_list);
         spin_unlock(&release_list_lock);
-       /* delete my sibling from parent->children */
+
+       cgroup_lock_hierarchy(cgrp->root);
+       /* delete this cgroup from parent->children */
         list_del(&cgrp->sibling);
+       cgroup_unlock_hierarchy(cgrp->root);
+
         spin_lock(&cgrp->dentry->d_lock);
         d = dget(cgrp->dentry);
-       cgrp->dentry = NULL;
         spin_unlock(&d->d_lock);
  
         cgroup_d_remove_dir(d);
@@ -2463,6 +2617,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
         printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name);
  
         /* Create the top cgroup state for this subsystem */
+       list_add(&ss->sibling, &rootnode.subsys_list);
         ss->root = &rootnode;
         css = ss->create(ss, dummytop);
         /* We don't handle early failures gracefully */
@@ -2476,13 +2631,14 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
         init_css_set.subsys[ss->subsys_id] = dummytop->subsys[ss->subsys_id];
  
         need_forkexit_callback |= ss->fork || ss->exit;
-       need_mm_owner_callback |= !!ss->mm_owner_changed;
  
         /* At system boot, before all subsystems have been
          * registered, no tasks have been forked, so we don't
          * need to invoke fork callbacks here. */
         BUG_ON(!list_empty(&init_task.tasks));
  
+       mutex_init(&ss->hierarchy_mutex);
+       lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
         ss->active = 1;
  }
  
@@ -2495,14 +2651,12 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
  int __init cgroup_init_early(void)
  {
         int i;
-       kref_init(&init_css_set.ref);
-       kref_get(&init_css_set.ref);
+       atomic_set(&init_css_set.refcount, 1);
         INIT_LIST_HEAD(&init_css_set.cg_links);
         INIT_LIST_HEAD(&init_css_set.tasks);
         INIT_HLIST_NODE(&init_css_set.hlist);
         css_set_count = 1;
         init_cgroup_root(&rootnode);
-       list_add(&rootnode.root_list, &roots);
         root_count = 1;
         init_task.cgroups = &init_css_set;
  
@@ -2609,15 +2763,12 @@ static int proc_cgroup_show(struct seq_file *m, void *v)
  
         mutex_lock(&cgroup_mutex);
  
-       for_each_root(root) {
+       for_each_active_root(root) {
                 struct cgroup_subsys *ss;
                 struct cgroup *cgrp;
                 int subsys_id;
                 int count = 0;
  
-               /* Skip this hierarchy if it has no active subsystems */
-               if (!root->actual_subsys_bits)
-                       continue;
                 seq_printf(m, "%lu:", root->subsys_bits);
                 for_each_subsys(root, ss)
                         seq_printf(m, "%s%s", count++ ? "," : "", ss->name);
@@ -2727,35 +2878,6 @@ void cgroup_fork_callbacks(struct task_struct *child)
         }
  }
  
-#ifdef CONFIG_MM_OWNER
-/**
- * cgroup_mm_owner_callbacks - run callbacks when the mm->owner changes
- * @p: the new owner
- *
- * Called on every change to mm->owner. mm_init_owner() does not
- * invoke this routine, since it assigns the mm->owner the first time
- * and does not change it.
- */
-void cgroup_mm_owner_callbacks(struct task_struct *old, struct task_struct *new)
-{
-       struct cgroup *oldcgrp, *newcgrp = NULL;
-
-       if (need_mm_owner_callback) {
-               int i;
-               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
-                       struct cgroup_subsys *ss = subsys[i];
-                       oldcgrp = task_cgroup(old, ss->subsys_id);
-                       if (new)
-                               newcgrp = task_cgroup(new, ss->subsys_id);
-                       if (oldcgrp == newcgrp)
-                               continue;
-                       if (ss->mm_owner_changed)
-                               ss->mm_owner_changed(ss, oldcgrp, newcgrp);
-               }
-       }
-}
-#endif /* CONFIG_MM_OWNER */
-
  /**
   * cgroup_post_fork - called on a new task after adding it to the task list
   * @child: the task in question
@@ -2769,8 +2891,10 @@ void cgroup_post_fork(struct task_struct *child)
  {
         if (use_task_css_set_links) {
                 write_lock(&css_set_lock);
+               task_lock(child);
                 if (list_empty(&child->cg_list))
                         list_add(&child->cg_list, &child->cgroups->tasks);
+               task_unlock(child);
                 write_unlock(&css_set_lock);
         }
  }
@@ -2873,20 +2997,24 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
   again:
         root = subsys->root;
         if (root == &rootnode) {
-               printk(KERN_INFO
-                      "Not cloning cgroup for unused subsystem %s\n",
-                      subsys->name);
                 mutex_unlock(&cgroup_mutex);
                 return 0;
         }
-       cg = tsk->cgroups;
-       parent = task_cgroup(tsk, subsys->subsys_id);
  
         /* Pin the hierarchy */
-       atomic_inc(&parent->root->sb->s_active);
+       if (!atomic_inc_not_zero(&root->sb->s_active)) {
+               /* We race with the final deactivate_super() */
+               mutex_unlock(&cgroup_mutex);
+               return 0;
+       }
  
         /* Keep the cgroup alive */
+       task_lock(tsk);
+       parent = task_cgroup(tsk, subsys->subsys_id);
+       cg = tsk->cgroups;
         get_css_set(cg);
+       task_unlock(tsk);
+
         mutex_unlock(&cgroup_mutex);
  
         /* Now do the VFS work to create a cgroup */
@@ -2905,7 +3033,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
         }
  
         /* Create the cgroup directory, which also creates the cgroup */
-       ret = vfs_mkdir(inode, dentry, S_IFDIR | 0755);
+       ret = vfs_mkdir(inode, dentry, 0755);
         child = __d_cgrp(dentry);
         dput(dentry);
         if (ret) {
@@ -2915,13 +3043,6 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
                 goto out_release;
         }
  
-       if (!child) {
-               printk(KERN_INFO
-                      "Couldn't find new cgroup %s\n", nodename);
-               ret = -ENOMEM;
-               goto out_release;
-       }
-
         /* The cgroup now exists. Retake cgroup_mutex and check
          * that we're still in the same state that we thought we
          * were. */
@@ -2932,7 +3053,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
                 mutex_unlock(&inode->i_mutex);
                 put_css_set(cg);
  
-               deactivate_super(parent->root->sb);
+               deactivate_super(root->sb);
                 /* The cgroup is still accessible in the VFS, but
                  * we're not going to try to rmdir() it at this
                  * point. */
@@ -2958,23 +3079,24 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
         mutex_lock(&cgroup_mutex);
         put_css_set(cg);
         mutex_unlock(&cgroup_mutex);
-       deactivate_super(parent->root->sb);
+       deactivate_super(root->sb);
         return ret;
  }
  
  /**
- * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
+ * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
   * @cgrp: the cgroup in question
+ * @task: the task in question
   *
- * See if @cgrp is a descendant of the current task's cgroup in
- * the appropriate hierarchy.
+ * See if @cgrp is a descendant of @task's cgroup in the appropriate
+ * hierarchy.
   *
   * If we are sending in dummytop, then presumably we are creating
   * the top cgroup in the subsystem.
   *
   * Called only by the ns (nsproxy) cgroup.
   */
-int cgroup_is_descendant(const struct cgroup *cgrp)
+int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
  {
         int ret;
         struct cgroup *target;
@@ -2984,7 +3106,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)
                 return 1;
  
         get_first_subsys(cgrp, NULL, &subsys_id);
-       target = task_cgroup(current, subsys_id);
+       target = task_cgroup(task, subsys_id);
         while (cgrp != target && cgrp!= cgrp->top_cgroup)
                 cgrp = cgrp->parent;
         ret = (cgrp == target);
@@ -3017,7 +3139,8 @@ void __css_put(struct cgroup_subsys_state *css)
  {
         struct cgroup *cgrp = css->cgroup;
         rcu_read_lock();
-       if (atomic_dec_and_test(&css->refcnt) && notify_on_release(cgrp)) {
+       if ((atomic_dec_return(&css->refcnt) == 1) &&
+           notify_on_release(cgrp)) {
                 set_bit(CGRP_RELEASABLE, &cgrp->flags);
                 check_for_release(cgrp);
         }