cgroups: revamp subsys array

author Ben Blum <bblum@andrew.cmu.edu>

Wed, 10 Mar 2010 23:22:07 +0000 (15:22 -0800)

committer Linus Torvalds <torvalds@linux-foundation.org>

Fri, 12 Mar 2010 23:52:36 +0000 (15:52 -0800)
author Ben Blum <bblum@andrew.cmu.edu>
Wed, 10 Mar 2010 23:22:07 +0000 (15:22 -0800)
committer Linus Torvalds <torvalds@linux-foundation.org>
Fri, 12 Mar 2010 23:52:36 +0000 (15:52 -0800)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h

index 14160b5..28319a9 100644 (file)
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -40,13 +40,19 @@ extern int cgroupstats_build(struct cgroupstats *stats,
  
  extern const struct file_operations proc_cgroup_operations;
  
-/* Define the enumeration of all cgroup subsystems */
+/* Define the enumeration of all builtin cgroup subsystems */
  #define SUBSYS(_x) _x ## _subsys_id,
  enum cgroup_subsys_id {
  #include <linux/cgroup_subsys.h>
-       CGROUP_SUBSYS_COUNT
+       CGROUP_BUILTIN_SUBSYS_COUNT
  };
  #undef SUBSYS
+/*
+ * This define indicates the maximum number of subsystems that can be loaded
+ * at once. We limit to this many since cgroupfs_root has subsys_bits to keep
+ * track of all of them.
+ */
+#define CGROUP_SUBSYS_COUNT (BITS_PER_BYTE*sizeof(unsigned long))
  
  /* Per-subsystem/per-cgroup state maintained by the system. */
  struct cgroup_subsys_state {
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index cace83d..c92fb95 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -57,10 +57,14 @@
  
  static DEFINE_MUTEX(cgroup_mutex);
  
-/* Generate an array of cgroup subsystem pointers */
+/*
+ * Generate an array of cgroup subsystem pointers. At boot time, this is
+ * populated up to CGROUP_BUILTIN_SUBSYS_COUNT, and modular subsystems are
+ * registered after that. The mutable section of this array is protected by
+ * cgroup_mutex.
+ */
  #define SUBSYS(_x) &_x ## _subsys,
-
-static struct cgroup_subsys *subsys[] = {
+static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = {
  #include <linux/cgroup_subsys.h>
  };
  
@@ -448,8 +452,11 @@ static struct css_set *find_existing_css_set(
         struct hlist_node *node;
         struct css_set *cg;
  
-       /* Built the set of subsystem state objects that we want to
-        * see in the new css_set */
+       /*
+        * Build the set of subsystem state objects that we want to see in the
+        * new css_set. while subsystems can change globally, the entries here
+        * won't change, so no need for locking.
+        */
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                 if (root->subsys_bits & (1UL << i)) {
                         /* Subsystem is in this hierarchy. So we want
@@ -884,7 +891,9 @@ void cgroup_release_and_wakeup_rmdir(struct cgroup_subsys_state *css)
         css_put(css);
  }
  
-
+/*
+ * Call with cgroup_mutex held.
+ */
  static int rebind_subsystems(struct cgroupfs_root *root,
                               unsigned long final_bits)
  {
@@ -892,6 +901,8 @@ static int rebind_subsystems(struct cgroupfs_root *root,
         struct cgroup *cgrp = &root->top_cgroup;
         int i;
  
+       BUG_ON(!mutex_is_locked(&cgroup_mutex));
+
         removed_bits = root->actual_subsys_bits & ~final_bits;
         added_bits = final_bits & ~root->actual_subsys_bits;
         /* Check that any added subsystems are currently free */
@@ -900,6 +911,12 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                 struct cgroup_subsys *ss = subsys[i];
                 if (!(bit & added_bits))
                         continue;
+               /*
+                * Nobody should tell us to do a subsys that doesn't exist:
+                * parse_cgroupfs_options should catch that case and refcounts
+                * ensure that subsystems won't disappear once selected.
+                */
+               BUG_ON(ss == NULL);
                 if (ss->root != &rootnode) {
                         /* Subsystem isn't free */
                         return -EBUSY;
@@ -919,6 +936,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                 unsigned long bit = 1UL << i;
                 if (bit & added_bits) {
                         /* We're binding this subsystem to this hierarchy */
+                       BUG_ON(ss == NULL);
                         BUG_ON(cgrp->subsys[i]);
                         BUG_ON(!dummytop->subsys[i]);
                         BUG_ON(dummytop->subsys[i]->cgroup != dummytop);
@@ -932,6 +950,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                         mutex_unlock(&ss->hierarchy_mutex);
                 } else if (bit & removed_bits) {
                         /* We're removing this subsystem */
+                       BUG_ON(ss == NULL);
                         BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]);
                         BUG_ON(cgrp->subsys[i]->cgroup != cgrp);
                         mutex_lock(&ss->hierarchy_mutex);
@@ -944,6 +963,7 @@ static int rebind_subsystems(struct cgroupfs_root *root,
                         mutex_unlock(&ss->hierarchy_mutex);
                 } else if (bit & final_bits) {
                         /* Subsystem state should already exist */
+                       BUG_ON(ss == NULL);
                         BUG_ON(!cgrp->subsys[i]);
                 } else {
                         /* Subsystem state shouldn't exist */
@@ -986,14 +1006,18 @@ struct cgroup_sb_opts {
  
  };
  
-/* Convert a hierarchy specifier into a bitmask of subsystems and
- * flags. */
+/*
+ * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call
+ * with cgroup_mutex held to protect the subsys[] array.
+ */
  static int parse_cgroupfs_options(char *data,
                                      struct cgroup_sb_opts *opts)
  {
         char *token, *o = data ?: "all";
         unsigned long mask = (unsigned long)-1;
  
+       BUG_ON(!mutex_is_locked(&cgroup_mutex));
+
  #ifdef CONFIG_CPUSETS
         mask = ~(1UL << cpuset_subsys_id);
  #endif
@@ -1009,6 +1033,8 @@ static int parse_cgroupfs_options(char *data,
                         opts->subsys_bits = 0;
                         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                                 struct cgroup_subsys *ss = subsys[i];
+                               if (ss == NULL)
+                                       continue;
                                 if (!ss->disabled)
                                         opts->subsys_bits |= 1ul << i;
                         }
@@ -1053,6 +1079,8 @@ static int parse_cgroupfs_options(char *data,
                         int i;
                         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                                 ss = subsys[i];
+                               if (ss == NULL)
+                                       continue;
                                 if (!strcmp(token, ss->name)) {
                                         if (!ss->disabled)
                                                 set_bit(i, &opts->subsys_bits);
@@ -1306,7 +1334,9 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
         struct cgroupfs_root *new_root;
  
         /* First find the desired set of subsystems */
+       mutex_lock(&cgroup_mutex);
         ret = parse_cgroupfs_options(data, &opts);
+       mutex_unlock(&cgroup_mutex);
         if (ret)
                 goto out_err;
  
@@ -2918,8 +2948,14 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
         /* We need to take each hierarchy_mutex in a consistent order */
         int i;
  
+       /*
+        * No worry about a race with rebind_subsystems that might mess up the
+        * locking order, since both parties are under cgroup_mutex.
+        */
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                 struct cgroup_subsys *ss = subsys[i];
+               if (ss == NULL)
+                       continue;
                 if (ss->root == root)
                         mutex_lock(&ss->hierarchy_mutex);
         }
@@ -2931,6 +2967,8 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
  
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                 struct cgroup_subsys *ss = subsys[i];
+               if (ss == NULL)
+                       continue;
                 if (ss->root == root)
                         mutex_unlock(&ss->hierarchy_mutex);
         }
@@ -3054,11 +3092,16 @@ static int cgroup_has_css_refs(struct cgroup *cgrp)
          * synchronization other than RCU, and the subsystem linked
          * list isn't RCU-safe */
         int i;
+       /*
+        * We won't need to lock the subsys array, because the subsystems
+        * we're concerned about aren't going anywhere since our cgroup root
+        * has a reference on them.
+        */
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                 struct cgroup_subsys *ss = subsys[i];
                 struct cgroup_subsys_state *css;
-               /* Skip subsystems not in this hierarchy */
-               if (ss->root != cgrp->root)
+               /* Skip subsystems not present or not in this hierarchy */
+               if (ss == NULL || ss->root != cgrp->root)
                         continue;
                 css = cgrp->subsys[ss->subsys_id];
                 /* When called from check_for_release() it's possible
@@ -3279,7 +3322,8 @@ int __init cgroup_init_early(void)
         for (i = 0; i < CSS_SET_TABLE_SIZE; i++)
                 INIT_HLIST_HEAD(&css_set_table[i]);
  
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+       /* at bootup time, we don't worry about modular subsystems */
+       for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                 struct cgroup_subsys *ss = subsys[i];
  
                 BUG_ON(!ss->name);
@@ -3314,7 +3358,8 @@ int __init cgroup_init(void)
         if (err)
                 return err;
  
-       for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+       /* at bootup time, we don't worry about modular subsystems */
+       for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                 struct cgroup_subsys *ss = subsys[i];
                 if (!ss->early_init)
                         cgroup_init_subsys(ss);
@@ -3423,9 +3468,16 @@ static int proc_cgroupstats_show(struct seq_file *m, void *v)
         int i;
  
         seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n");
+       /*
+        * ideally we don't want subsystems moving around while we do this.
+        * cgroup_mutex is also necessary to guarantee an atomic snapshot of
+        * subsys/hierarchy state.
+        */
         mutex_lock(&cgroup_mutex);
         for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
                 struct cgroup_subsys *ss = subsys[i];
+               if (ss == NULL)
+                       continue;
                 seq_printf(m, "%s\t%d\t%d\t%d\n",
                            ss->name, ss->root->hierarchy_id,
                            ss->root->number_of_cgroups, !ss->disabled);
@@ -3483,7 +3535,12 @@ void cgroup_fork_callbacks(struct task_struct *child)
  {
         if (need_forkexit_callback) {
                 int i;
-               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+               /*
+                * forkexit callbacks are only supported for builtin
+                * subsystems, and the builtin section of the subsys array is
+                * immutable, so we don't need to lock the subsys array here.
+                */
+               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                         struct cgroup_subsys *ss = subsys[i];
                         if (ss->fork)
                                 ss->fork(ss, child);
@@ -3552,7 +3609,11 @@ void cgroup_exit(struct task_struct *tsk, int run_callbacks)
         struct css_set *cg;
  
         if (run_callbacks && need_forkexit_callback) {
-               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+               /*
+                * modular subsystems can't use callbacks, so no need to lock
+                * the subsys array
+                */
+               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                         struct cgroup_subsys *ss = subsys[i];
                         if (ss->exit)
                                 ss->exit(ss, tsk);
@@ -3844,8 +3905,11 @@ static int __init cgroup_disable(char *str)
         while ((token = strsep(&str, ",")) != NULL) {
                 if (!*token)
                         continue;
-
-               for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
+               /*
+                * cgroup_disable, being at boot time, can't know about module
+                * subsystems, so we don't worry about them.
+                */
+               for (i = 0; i < CGROUP_BUILTIN_SUBSYS_COUNT; i++) {
                         struct cgroup_subsys *ss = subsys[i];
  
                         if (!strcmp(token, ss->name)) {
author	Ben Blum <bblum@andrew.cmu.edu>
	Wed, 10 Mar 2010 23:22:07 +0000 (15:22 -0800)
committer	Linus Torvalds <torvalds@linux-foundation.org>
	Fri, 12 Mar 2010 23:52:36 +0000 (15:52 -0800)
include/linux/cgroup.h		patch \| blob \| history
kernel/cgroup.c		patch \| blob \| history