sched: group scheduling, sysfs tunables

author Dhaval Giani <dhaval@linux.vnet.ibm.com>

Mon, 15 Oct 2007 15:00:14 +0000 (17:00 +0200)

committer Ingo Molnar <mingo@elte.hu>

Mon, 15 Oct 2007 15:00:14 +0000 (17:00 +0200)
author Dhaval Giani <dhaval@linux.vnet.ibm.com>
Mon, 15 Oct 2007 15:00:14 +0000 (17:00 +0200)
committer Ingo Molnar <mingo@elte.hu>
Mon, 15 Oct 2007 15:00:14 +0000 (17:00 +0200)
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt

index 84901e7..88bcb87 100644 (file)
--- a/Documentation/sched-design-CFS.txt
+++ b/Documentation/sched-design-CFS.txt
@@ -117,3 +117,70 @@ Some implementation details:
     iterators of the scheduling modules are used. The balancing code got
     quite a bit simpler as a result.
  
+
+Group scheduler extension to CFS
+================================
+
+Normally the scheduler operates on individual tasks and strives to provide
+fair CPU time to each task. Sometimes, it may be desirable to group tasks
+and provide fair CPU time to each such task group. For example, it may
+be desirable to first provide fair CPU time to each user on the system
+and then to each task belonging to a user.
+
+CONFIG_FAIR_GROUP_SCHED strives to achieve exactly that. It lets
+SCHED_NORMAL/BATCH tasks be grouped and divides CPU time fairly among such
+groups. At present, there are two (mutually exclusive) mechanisms to group
+tasks for CPU bandwidth control purposes:
+
+       - Based on user id (CONFIG_FAIR_USER_SCHED)
+               In this option, tasks are grouped according to their user id.
+       - Based on "cgroup" pseudo filesystem (CONFIG_FAIR_CGROUP_SCHED)
+               This option lets the administrator create arbitrary groups
+               of tasks, using the "cgroup" pseudo filesystem. See
+               Documentation/cgroups.txt for more information about this
+               filesystem.
+
+Only one of these options to group tasks can be chosen and not both.
+
+Group scheduler tunables:
+
+When CONFIG_FAIR_USER_SCHED is defined, a directory is created in sysfs for
+each new user and a "cpu_share" file is added in that directory.
+
+       # cd /sys/kernel/uids
+       # cat 512/cpu_share             # Display user 512's CPU share
+       1024
+       # echo 2048 > 512/cpu_share     # Modify user 512's CPU share
+       # cat 512/cpu_share             # Display user 512's CPU share
+       2048
+       #
+
+CPU bandwidth between two users is divided in the ratio of their CPU shares.
+For example, if you would like user "root" to get twice the bandwidth of user
+"guest", then set the cpu_share for both the users such that "root"'s
+cpu_share is twice "guest"'s cpu_share.
+
+
+When CONFIG_FAIR_CGROUP_SCHED is defined, a "cpu.shares" file is created
+for each group created using the pseudo filesystem. See the example steps
+below to create task groups and modify their CPU share using the "cgroup"
+pseudo filesystem:
+
+       # mkdir /dev/cpuctl
+       # mount -t cgroup -ocpu none /dev/cpuctl
+       # cd /dev/cpuctl
+
+       # mkdir multimedia      # create "multimedia" group of tasks
+       # mkdir browser         # create "browser" group of tasks
+
+       # #Configure the multimedia group to receive twice the CPU bandwidth
+       # #of the browser group
+
+       # echo 2048 > multimedia/cpu.shares
+       # echo 1024 > browser/cpu.shares
+
+       # firefox &     # Launch firefox and move it to "browser" group
+       # echo <firefox_pid> > browser/tasks
+
+       # #Launch gmplayer (or your favourite movie player)
+       # echo <movie_player_pid> > multimedia/tasks
diff --git a/include/linux/sched.h b/include/linux/sched.h

index 3cddbfc..04233c8 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -87,6 +87,7 @@ struct sched_param {
  #include <linux/timer.h>
  #include <linux/hrtimer.h>
  #include <linux/task_io_accounting.h>
+#include <linux/kobject.h>
  
  #include <asm/processor.h>
  
@@ -599,9 +600,18 @@ struct user_struct {
  
  #ifdef CONFIG_FAIR_USER_SCHED
         struct task_group *tg;
+       struct kset kset;
+       struct subsys_attribute user_attr;
+       struct work_struct work;
  #endif
  };
  
+#ifdef CONFIG_FAIR_USER_SCHED
+extern int uids_kobject_init(void);
+#else
+static inline int uids_kobject_init(void) { return 0; }
+#endif
+
  extern struct user_struct *find_user(uid_t);
  
  extern struct user_struct root_user;
@@ -1848,6 +1858,7 @@ extern struct task_group *sched_create_group(void);
  extern void sched_destroy_group(struct task_group *tg);
  extern void sched_move_task(struct task_struct *tsk);
  extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
+extern unsigned long sched_group_shares(struct task_group *tg);
  
  #endif
  
diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c

index d0e5c48..6046939 100644 (file)
--- a/kernel/ksysfs.c
+++ b/kernel/ksysfs.c
@@ -14,6 +14,7 @@
  #include <linux/module.h>
  #include <linux/init.h>
  #include <linux/kexec.h>
+#include <linux/sched.h>
  
  #define KERNEL_ATTR_RO(_name) \
  static struct subsys_attribute _name##_attr = __ATTR_RO(_name)
@@ -116,6 +117,13 @@ static int __init ksysfs_init(void)
                                               &notes_attr);
         }
  
+       /*
+        * Create "/sys/kernel/uids" directory and corresponding root user's
+        * directory under it.
+        */
+       if (!error)
+               error = uids_kobject_init();
+
         return error;
  }
  
diff --git a/kernel/sched.c b/kernel/sched.c

index a3c3ec8..9ac9989 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -162,6 +162,8 @@ struct task_group {
         /* runqueue "owned" by this group on each cpu */
         struct cfs_rq **cfs_rq;
         unsigned long shares;
+       /* spinlock to serialize modification to shares */
+       spinlock_t lock;
  };
  
  /* Default task group's sched entity on each cpu */
@@ -6533,6 +6535,7 @@ void __init sched_init(void)
                         se->parent = NULL;
                 }
                 init_task_group.shares = init_task_group_load;
+               spin_lock_init(&init_task_group.lock);
  #endif
  
                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
@@ -6777,6 +6780,7 @@ struct task_group *sched_create_group(void)
         }
  
         tg->shares = NICE_0_LOAD;
+       spin_lock_init(&tg->lock);
  
         return tg;
  
@@ -6897,8 +6901,9 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
  {
         int i;
  
+       spin_lock(&tg->lock);
         if (tg->shares == shares)
-               return 0;
+               goto done;
  
         /* return -EINVAL if the new value is not sane */
  
@@ -6906,7 +6911,14 @@ int sched_group_set_shares(struct task_group *tg, unsigned long shares)
         for_each_possible_cpu(i)
                 set_se_shares(tg->se[i], shares);
  
+done:
+       spin_unlock(&tg->lock);
         return 0;
  }
  
+unsigned long sched_group_shares(struct task_group *tg)
+{
+       return tg->shares;
+}
+
  #endif /* CONFIG_FAIR_GROUP_SCHED */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c

index 6f87b31..0aab455 100644 (file)
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -231,45 +231,6 @@ static void sysrq_sched_debug_show(void)
         sched_debug_show(NULL, NULL);
  }
  
-#ifdef CONFIG_FAIR_USER_SCHED
-
-static DEFINE_MUTEX(root_user_share_mutex);
-
-static int
-root_user_share_read_proc(char *page, char **start, off_t off, int count,
-                                int *eof, void *data)
-{
-       return sprintf(page, "%d\n", init_task_group_load);
-}
-
-static int
-root_user_share_write_proc(struct file *file, const char __user *buffer,
-                                unsigned long count, void *data)
-{
-       unsigned long shares;
-       char kbuf[sizeof(unsigned long)+1];
-       int rc = 0;
-
-       if (copy_from_user(kbuf, buffer, sizeof(kbuf)))
-               return -EFAULT;
-
-       shares = simple_strtoul(kbuf, NULL, 0);
-
-       if (!shares)
-               shares = NICE_0_LOAD;
-
-       mutex_lock(&root_user_share_mutex);
-
-       init_task_group_load = shares;
-       rc = sched_group_set_shares(&init_task_group, shares);
-
-       mutex_unlock(&root_user_share_mutex);
-
-       return (rc < 0 ? rc : count);
-}
-
-#endif /* CONFIG_FAIR_USER_SCHED */
-
  static int sched_debug_open(struct inode *inode, struct file *filp)
  {
         return single_open(filp, sched_debug_show, NULL);
@@ -292,15 +253,6 @@ static int __init init_sched_debug_procfs(void)
  
         pe->proc_fops = &sched_debug_fops;
  
-#ifdef CONFIG_FAIR_USER_SCHED
-       pe = create_proc_entry("root_user_cpu_share", 0644, NULL);
-       if (!pe)
-               return -ENOMEM;
-
-       pe->read_proc = root_user_share_read_proc;
-       pe->write_proc = root_user_share_write_proc;
-#endif
-
         return 0;
  }
  
diff --git a/kernel/user.c b/kernel/user.c

index 0c9a787..74cadea 100644 (file)
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -55,7 +55,41 @@ struct user_struct root_user = {
  #endif
  };
  
+/*
+ * These routines must be called with the uidhash spinlock held!
+ */
+static inline void uid_hash_insert(struct user_struct *up,
+                                               struct hlist_head *hashent)
+{
+       hlist_add_head(&up->uidhash_node, hashent);
+}
+
+static inline void uid_hash_remove(struct user_struct *up)
+{
+       hlist_del_init(&up->uidhash_node);
+}
+
+static inline struct user_struct *uid_hash_find(uid_t uid,
+                                               struct hlist_head *hashent)
+{
+       struct user_struct *user;
+       struct hlist_node *h;
+
+       hlist_for_each_entry(user, h, hashent, uidhash_node) {
+               if (user->uid == uid) {
+                       atomic_inc(&user->__count);
+                       return user;
+               }
+       }
+
+       return NULL;
+}
+
  #ifdef CONFIG_FAIR_USER_SCHED
+
+static struct kobject uids_kobject; /* represents /sys/kernel/uids directory */
+static DEFINE_MUTEX(uids_mutex);
+
  static void sched_destroy_user(struct user_struct *up)
  {
         sched_destroy_group(up->tg);
@@ -77,42 +111,173 @@ static void sched_switch_user(struct task_struct *p)
         sched_move_task(p);
  }
  
-#else  /* CONFIG_FAIR_USER_SCHED */
+static inline void uids_mutex_lock(void)
+{
+       mutex_lock(&uids_mutex);
+}
  
-static void sched_destroy_user(struct user_struct *up) { }
-static int sched_create_user(struct user_struct *up) { return 0; }
-static void sched_switch_user(struct task_struct *p) { }
+static inline void uids_mutex_unlock(void)
+{
+       mutex_unlock(&uids_mutex);
+}
  
-#endif /* CONFIG_FAIR_USER_SCHED */
+/* return cpu shares held by the user */
+ssize_t cpu_shares_show(struct kset *kset, char *buffer)
+{
+       struct user_struct *up = container_of(kset, struct user_struct, kset);
  
-/*
- * These routines must be called with the uidhash spinlock held!
+       return sprintf(buffer, "%lu\n", sched_group_shares(up->tg));
+}
+
+/* modify cpu shares held by the user */
+ssize_t cpu_shares_store(struct kset *kset, const char *buffer, size_t size)
+{
+       struct user_struct *up = container_of(kset, struct user_struct, kset);
+       unsigned long shares;
+       int rc;
+
+       sscanf(buffer, "%lu", &shares);
+
+       rc = sched_group_set_shares(up->tg, shares);
+
+       return (rc ? rc : size);
+}
+
+static void user_attr_init(struct subsys_attribute *sa, char *name, int mode)
+{
+       sa->attr.name = name;
+       sa->attr.mode = mode;
+       sa->show = cpu_shares_show;
+       sa->store = cpu_shares_store;
+}
+
+/* Create "/sys/kernel/uids/<uid>" directory and
+ *  "/sys/kernel/uids/<uid>/cpu_share" file for this user.
   */
-static inline void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
+static int user_kobject_create(struct user_struct *up)
  {
-       hlist_add_head(&up->uidhash_node, hashent);
+       struct kset *kset = &up->kset;
+       struct kobject *kobj = &kset->kobj;
+       int error;
+
+       memset(kset, 0, sizeof(struct kset));
+       kobj->parent = &uids_kobject;   /* create under /sys/kernel/uids dir */
+       kobject_set_name(kobj, "%d", up->uid);
+       kset_init(kset);
+       user_attr_init(&up->user_attr, "cpu_share", 0644);
+
+       error = kobject_add(kobj);
+       if (error)
+               goto done;
+
+       error = sysfs_create_file(kobj, &up->user_attr.attr);
+       if (error)
+               kobject_del(kobj);
+
+done:
+       return error;
  }
  
-static inline void uid_hash_remove(struct user_struct *up)
+/* create these in sysfs filesystem:
+ *     "/sys/kernel/uids" directory
+ *     "/sys/kernel/uids/0" directory (for root user)
+ *     "/sys/kernel/uids/0/cpu_share" file (for root user)
+ */
+int __init uids_kobject_init(void)
  {
-       hlist_del_init(&up->uidhash_node);
+       int error;
+
+       /* create under /sys/kernel dir */
+       uids_kobject.parent = &kernel_subsys.kobj;
+       kobject_set_name(&uids_kobject, "uids");
+       kobject_init(&uids_kobject);
+
+       error = kobject_add(&uids_kobject);
+       if (!error)
+               error = user_kobject_create(&root_user);
+
+       return error;
  }
  
-static inline struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
+/* work function to remove sysfs directory for a user and free up
+ * corresponding structures.
+ */
+static void remove_user_sysfs_dir(struct work_struct *w)
  {
-       struct user_struct *user;
-       struct hlist_node *h;
+       struct user_struct *up = container_of(w, struct user_struct, work);
+       struct kobject *kobj = &up->kset.kobj;
+       unsigned long flags;
+       int remove_user = 0;
  
-       hlist_for_each_entry(user, h, hashent, uidhash_node) {
-               if(user->uid == uid) {
-                       atomic_inc(&user->__count);
-                       return user;
-               }
+       /* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
+        * atomic.
+        */
+       uids_mutex_lock();
+
+       local_irq_save(flags);
+
+       if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
+               uid_hash_remove(up);
+               remove_user = 1;
+               spin_unlock_irqrestore(&uidhash_lock, flags);
+       } else {
+               local_irq_restore(flags);
         }
  
-       return NULL;
+       if (!remove_user)
+               goto done;
+
+       sysfs_remove_file(kobj, &up->user_attr.attr);
+       kobject_del(kobj);
+
+       sched_destroy_user(up);
+       key_put(up->uid_keyring);
+       key_put(up->session_keyring);
+       kmem_cache_free(uid_cachep, up);
+
+done:
+       uids_mutex_unlock();
+}
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static inline void free_user(struct user_struct *up, unsigned long flags)
+{
+       /* restore back the count */
+       atomic_inc(&up->__count);
+       spin_unlock_irqrestore(&uidhash_lock, flags);
+
+       INIT_WORK(&up->work, remove_user_sysfs_dir);
+       schedule_work(&up->work);
  }
  
+#else  /* CONFIG_FAIR_USER_SCHED */
+
+static void sched_destroy_user(struct user_struct *up) { }
+static int sched_create_user(struct user_struct *up) { return 0; }
+static void sched_switch_user(struct task_struct *p) { }
+static inline int user_kobject_create(struct user_struct *up) { return 0; }
+static inline void uids_mutex_lock(void) { }
+static inline void uids_mutex_unlock(void) { }
+
+/* IRQs are disabled and uidhash_lock is held upon function entry.
+ * IRQ state (as stored in flags) is restored and uidhash_lock released
+ * upon function exit.
+ */
+static inline void free_user(struct user_struct *up, unsigned long flags)
+{
+       uid_hash_remove(up);
+       spin_unlock_irqrestore(&uidhash_lock, flags);
+       sched_destroy_user(up);
+       key_put(up->uid_keyring);
+       key_put(up->session_keyring);
+       kmem_cache_free(uid_cachep, up);
+}
+
+#endif /* CONFIG_FAIR_USER_SCHED */
+
  /*
   * Locate the user_struct for the passed UID.  If found, take a ref on it.  The
   * caller must undo that ref with free_uid().
@@ -139,16 +304,10 @@ void free_uid(struct user_struct *up)
                 return;
  
         local_irq_save(flags);
-       if (atomic_dec_and_lock(&up->__count, &uidhash_lock)) {
-               uid_hash_remove(up);
-               spin_unlock_irqrestore(&uidhash_lock, flags);
-               sched_destroy_user(up);
-               key_put(up->uid_keyring);
-               key_put(up->session_keyring);
-               kmem_cache_free(uid_cachep, up);
-       } else {
+       if (atomic_dec_and_lock(&up->__count, &uidhash_lock))
+               free_user(up, flags);
+       else
                 local_irq_restore(flags);
-       }
  }
  
  struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
@@ -156,6 +315,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
         struct hlist_head *hashent = uidhashentry(ns, uid);
         struct user_struct *up;
  
+       /* Make uid_hash_find() + user_kobject_create() + uid_hash_insert()
+        * atomic.
+        */
+       uids_mutex_lock();
+
         spin_lock_irq(&uidhash_lock);
         up = uid_hash_find(uid, hashent);
         spin_unlock_irq(&uidhash_lock);
@@ -191,6 +355,15 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
                         return NULL;
                 }
  
+               if (user_kobject_create(new)) {
+                       sched_destroy_user(new);
+                       key_put(new->uid_keyring);
+                       key_put(new->session_keyring);
+                       kmem_cache_free(uid_cachep, new);
+                       uids_mutex_unlock();
+                       return NULL;
+               }
+
                 /*
                  * Before adding this, check whether we raced
                  * on adding the same user already..
@@ -198,7 +371,11 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
                 spin_lock_irq(&uidhash_lock);
                 up = uid_hash_find(uid, hashent);
                 if (up) {
-                       sched_destroy_user(new);
+                       /* This case is not possible when CONFIG_FAIR_USER_SCHED
+                        * is defined, since we serialize alloc_uid() using
+                        * uids_mutex. Hence no need to call
+                        * sched_destroy_user() or remove_user_sysfs_dir().
+                        */
                         key_put(new->uid_keyring);
                         key_put(new->session_keyring);
                         kmem_cache_free(uid_cachep, new);
@@ -209,6 +386,9 @@ struct user_struct * alloc_uid(struct user_namespace *ns, uid_t uid)
                 spin_unlock_irq(&uidhash_lock);
  
         }
+
+       uids_mutex_unlock();
+
         return up;
  }
author	Dhaval Giani <dhaval@linux.vnet.ibm.com>
	Mon, 15 Oct 2007 15:00:14 +0000 (17:00 +0200)
committer	Ingo Molnar <mingo@elte.hu>
	Mon, 15 Oct 2007 15:00:14 +0000 (17:00 +0200)
Documentation/sched-design-CFS.txt		patch \| blob \| history
include/linux/sched.h		patch \| blob \| history
kernel/ksysfs.c		patch \| blob \| history
kernel/sched.c		patch \| blob \| history
kernel/sched_debug.c		patch \| blob \| history
kernel/user.c		patch \| blob \| history