arch/x86/oprofile/op_model_amd.c: fix op_amd_handle_ibs() return type

[safe/jmp/linux-2.6] / kernel / cgroup.c
diff --git a/kernel/cgroup.c b/kernel/cgroup.c

index d3c5211..3737a68 100644 (file)
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -46,6 +46,7 @@
  #include <linux/cgroupstats.h>
  #include <linux/hash.h>
  #include <linux/namei.h>
+#include <linux/smp_lock.h>
  
  #include <asm/atomic.h>
  
@@ -622,13 +623,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
   * Call subsys's pre_destroy handler.
   * This is called before css refcnt check.
   */
-static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+static int cgroup_call_pre_destroy(struct cgroup *cgrp)
  {
         struct cgroup_subsys *ss;
+       int ret = 0;
+
         for_each_subsys(cgrp->root, ss)
-               if (ss->pre_destroy)
-                       ss->pre_destroy(ss, cgrp);
-       return;
+               if (ss->pre_destroy) {
+                       ret = ss->pre_destroy(ss, cgrp);
+                       if (ret)
+                               break;
+               }
+       return ret;
  }
  
  static void free_cgroup_rcu(struct rcu_head *obj)
@@ -722,6 +728,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
         remove_dir(dentry);
  }
  
+/*
+ * A queue for waiters to do rmdir() cgroup. A task will sleep when
+ * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
+ * reference to css->refcnt. In general, this refcnt is expected to go down
+ * to zero, soon.
+ *
+ * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex.
+ */
+DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+{
+       if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+               wake_up_all(&cgroup_rmdir_waitq);
+}
+
  static int rebind_subsystems(struct cgroupfs_root *root,
                               unsigned long final_bits)
  {
@@ -821,6 +843,11 @@ static int parse_cgroupfs_options(char *data,
                                      struct cgroup_sb_opts *opts)
  {
         char *token, *o = data ?: "all";
+       unsigned long mask = (unsigned long)-1;
+
+#ifdef CONFIG_CPUSETS
+       mask = ~(1UL << cpuset_subsys_id);
+#endif
  
         opts->subsys_bits = 0;
         opts->flags = 0;
@@ -865,6 +892,15 @@ static int parse_cgroupfs_options(char *data,
                 }
         }
  
+       /*
+        * Option noprefix was introduced just for backward compatibility
+        * with the old cpuset, so we allow noprefix only if mounting just
+        * the cpuset subsystem.
+        */
+       if (test_bit(ROOT_NOPREFIX, &opts->flags) &&
+           (opts->subsys_bits & mask))
+               return -EINVAL;
+
         /* We can't have an empty hierarchy */
         if (!opts->subsys_bits)
                 return -EINVAL;
@@ -879,6 +915,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
         struct cgroup *cgrp = &root->top_cgroup;
         struct cgroup_sb_opts opts;
  
+       lock_kernel();
         mutex_lock(&cgrp->dentry->d_inode->i_mutex);
         mutex_lock(&cgroup_mutex);
  
@@ -894,18 +931,19 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
         }
  
         ret = rebind_subsystems(root, opts.subsys_bits);
+       if (ret)
+               goto out_unlock;
  
         /* (re)populate subsystem files */
-       if (!ret)
-               cgroup_populate_dir(cgrp);
+       cgroup_populate_dir(cgrp);
  
         if (opts.release_agent)
                 strcpy(root->release_agent_path, opts.release_agent);
   out_unlock:
-       if (opts.release_agent)
-               kfree(opts.release_agent);
+       kfree(opts.release_agent);
         mutex_unlock(&cgroup_mutex);
         mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
+       unlock_kernel();
         return ret;
  }
  
@@ -1006,15 +1044,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
         /* First find the desired set of subsystems */
         ret = parse_cgroupfs_options(data, &opts);
         if (ret) {
-               if (opts.release_agent)
-                       kfree(opts.release_agent);
+               kfree(opts.release_agent);
                 return ret;
         }
  
         root = kzalloc(sizeof(*root), GFP_KERNEL);
         if (!root) {
-               if (opts.release_agent)
-                       kfree(opts.release_agent);
+               kfree(opts.release_agent);
                 return -ENOMEM;
         }
  
@@ -1114,8 +1150,7 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
   free_cg_links:
         free_cg_links(&tmp_cg_links);
   drop_new_super:
-       up_write(&sb->s_umount);
-       deactivate_super(sb);
+       deactivate_locked_super(sb);
         return ret;
  }
  
@@ -1317,6 +1352,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
         set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
         synchronize_rcu();
         put_css_set(cg);
+
+       /*
+        * Wake up rmdir() waiter. The rmdir should fail since the cgroup
+        * is no longer empty.
+        */
+       cgroup_wakeup_rmdir_waiters(cgrp);
         return 0;
  }
  
@@ -1662,7 +1703,7 @@ static struct inode_operations cgroup_dir_inode_operations = {
         .rename = cgroup_rename,
  };
  
-static int cgroup_create_file(struct dentry *dentry, int mode,
+static int cgroup_create_file(struct dentry *dentry, mode_t mode,
                                 struct super_block *sb)
  {
         static const struct dentry_operations cgroup_dops = {
@@ -1708,7 +1749,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
   * @mode: mode to set on new directory.
   */
  static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
-                               int mode)
+                               mode_t mode)
  {
         struct dentry *parent;
         int error = 0;
@@ -1726,6 +1767,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
         return error;
  }
  
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write handler
+ */
+static mode_t cgroup_file_mode(const struct cftype *cft)
+{
+       mode_t mode = 0;
+
+       if (cft->mode)
+               return cft->mode;
+
+       if (cft->read || cft->read_u64 || cft->read_s64 ||
+           cft->read_map || cft->read_seq_string)
+               mode |= S_IRUGO;
+
+       if (cft->write || cft->write_u64 || cft->write_s64 ||
+           cft->write_string || cft->trigger)
+               mode |= S_IWUSR;
+
+       return mode;
+}
+
  int cgroup_add_file(struct cgroup *cgrp,
                        struct cgroup_subsys *subsys,
                        const struct cftype *cft)
@@ -1733,6 +1801,7 @@ int cgroup_add_file(struct cgroup *cgrp,
         struct dentry *dir = cgrp->dentry;
         struct dentry *dentry;
         int error;
+       mode_t mode;
  
         char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
         if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -1743,7 +1812,8 @@ int cgroup_add_file(struct cgroup *cgrp,
         BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
         dentry = lookup_one_len(name, dir, strlen(name));
         if (!IS_ERR(dentry)) {
-               error = cgroup_create_file(dentry, 0644 | S_IFREG,
+               mode = cgroup_file_mode(cft);
+               error = cgroup_create_file(dentry, mode | S_IFREG,
                                                 cgrp->root->sb);
                 if (!error)
                         dentry->d_fsdata = (void *)cft;
@@ -2325,6 +2395,7 @@ static struct cftype files[] = {
                 .write_u64 = cgroup_tasks_write,
                 .release = cgroup_tasks_release,
                 .private = FILE_TASKLIST,
+               .mode = S_IRUGO | S_IWUSR,
         },
  
         {
@@ -2425,7 +2496,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
   * Must be called with the mutex on the parent inode held
   */
  static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
-                            int mode)
+                            mode_t mode)
  {
         struct cgroup *cgrp;
         struct cgroupfs_root *root = parent->root;
@@ -2608,9 +2679,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
         struct cgroup *cgrp = dentry->d_fsdata;
         struct dentry *d;
         struct cgroup *parent;
+       DEFINE_WAIT(wait);
+       int ret;
  
         /* the vfs holds both inode->i_mutex already */
-
+again:
         mutex_lock(&cgroup_mutex);
         if (atomic_read(&cgrp->count) != 0) {
                 mutex_unlock(&cgroup_mutex);
@@ -2626,17 +2699,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
          * Call pre_destroy handlers of subsys. Notify subsystems
          * that rmdir() request comes.
          */
-       cgroup_call_pre_destroy(cgrp);
+       ret = cgroup_call_pre_destroy(cgrp);
+       if (ret)
+               return ret;
  
         mutex_lock(&cgroup_mutex);
         parent = cgrp->parent;
-
-       if (atomic_read(&cgrp->count)
-           || !list_empty(&cgrp->children)
-           || !cgroup_clear_css_refs(cgrp)) {
+       if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
                 mutex_unlock(&cgroup_mutex);
                 return -EBUSY;
         }
+       /*
+        * css_put/get is provided for subsys to grab refcnt to css. In typical
+        * case, subsystem has no reference after pre_destroy(). But, under
+        * hierarchy management, some *temporary* refcnt can be held.
+        * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
+        * is really busy, it should return -EBUSY at pre_destroy(). wake_up
+        * is called when css_put() is called and refcnt goes down to 0.
+        */
+       set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+       prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
+
+       if (!cgroup_clear_css_refs(cgrp)) {
+               mutex_unlock(&cgroup_mutex);
+               schedule();
+               finish_wait(&cgroup_rmdir_waitq, &wait);
+               clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+               if (signal_pending(current))
+                       return -EINTR;
+               goto again;
+       }
+       /* No css_tryget() can succeed after this point. */
+       finish_wait(&cgroup_rmdir_waitq, &wait);
+       clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
  
         spin_lock(&release_list_lock);
         set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -3194,10 +3289,12 @@ void __css_put(struct cgroup_subsys_state *css)
  {
         struct cgroup *cgrp = css->cgroup;
         rcu_read_lock();
-       if ((atomic_dec_return(&css->refcnt) == 1) &&
-           notify_on_release(cgrp)) {
-               set_bit(CGRP_RELEASABLE, &cgrp->flags);
-               check_for_release(cgrp);
+       if (atomic_dec_return(&css->refcnt) == 1) {
+               if (notify_on_release(cgrp)) {
+                       set_bit(CGRP_RELEASABLE, &cgrp->flags);
+                       check_for_release(cgrp);
+               }
+               cgroup_wakeup_rmdir_waiters(cgrp);
         }
         rcu_read_unlock();
  }
@@ -3324,7 +3421,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
  }
  
  bool css_is_ancestor(struct cgroup_subsys_state *child,
-                   struct cgroup_subsys_state *root)
+                   const struct cgroup_subsys_state *root)
  {
         struct css_id *child_id = rcu_dereference(child->id);
         struct css_id *root_id = rcu_dereference(root->id);