sched: rt-group: interface
author Peter Zijlstra <a.p.zijlstra@chello.nl>
Wed, 13 Feb 2008 14:45:39 +0000 (15:45 +0100)
committer Ingo Molnar <mingo@elte.hu>
Wed, 13 Feb 2008 14:45:39 +0000 (15:45 +0100)
Change the rt_ratio interface to rt_runtime_us, to match rt_period_us.
This avoids picking a granularity for the ratio.

Extend the /sys/kernel/uids/<uid>/ interface to allow setting
the group's rt_runtime.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Documentation/sched-rt-group.txt [new file with mode: 0644]
include/linux/sched.h
kernel/sched.c
kernel/sched_rt.c
kernel/sysctl.c
kernel/user.c

diff --git a/Documentation/sched-rt-group.txt b/Documentation/sched-rt-group.txt
new file mode 100644
index 0000000..1c6332f
--- /dev/null
+++ b/Documentation/sched-rt-group.txt
@@ -0,0 +1,59 @@
+
+
+Real-Time group scheduling.
+
+The problem space:
+
+In order to schedule multiple groups of realtime tasks, each group must
+be assigned a fixed portion of the available CPU time. Without a minimum
+guarantee a realtime group can obviously fall short. A fuzzy upper limit
+is of no use since it cannot be relied upon, which leaves us with just
+the single fixed portion.
+
+CPU time is divided by means of specifying how much time can be spent
+running in a given period. Say a fixed-frame-rate realtime renderer must
+deliver 25 frames a second, which yields a period of 0.04s. Now say
+it will also have to play some music and respond to input, leaving it
+with around 80% for the graphics. We can then give this group a runtime
+of 0.8 * 0.04s = 0.032s.
+
+This way the graphics group will have a 0.04s period with a 0.032s runtime
+limit.
+
+Now if the audio thread needs to refill the DMA buffer every 0.005s, but
+needs only about 3% CPU time to do so, it can do with a runtime of
+0.03 * 0.005s = 0.00015s.
+
+
+The Interface:
+
+system wide:
+
+/proc/sys/kernel/sched_rt_period_us
+/proc/sys/kernel/sched_rt_runtime_us
+
+CONFIG_FAIR_USER_SCHED
+
+/sys/kernel/uids/<uid>/cpu_rt_runtime_us
+
+or
+
+CONFIG_FAIR_CGROUP_SCHED
+
+/cgroup/<cgroup>/cpu.rt_runtime_us
+
+[ time is specified in us because the interface is s32; this gives an
+  operating range of 1us up to ~35 minutes ]
+
+The period takes values in [ 1, INT_MAX ], runtime in [ -1, INT_MAX - 1 ].
+
+A runtime of -1 specifies runtime == period, ie. no limit.
+
+New groups get the period from /proc/sys/kernel/sched_rt_period_us and
+a runtime of 0.
+
+Settings are constrained to:
+
+   \Sum_{i} runtime_{i} / global_period <= global_runtime / global_period
+
+in order to keep the configuration schedulable.
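+
+For example, with the default period of 1s and runtime of 0.95s introduced
+below, two groups with runtimes of 0.5s and 0.4s fit, since
+0.5 + 0.4 = 0.9 <= 0.95; raising either runtime by a further 0.1s would
+fail this check and the write would return -EINVAL.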
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 00e1441..142eb29 100644
@@ -1541,8 +1541,6 @@ extern unsigned int sysctl_sched_child_runs_first;
 extern unsigned int sysctl_sched_features;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
-extern unsigned int sysctl_sched_rt_period;
-extern unsigned int sysctl_sched_rt_ratio;
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
 extern unsigned int sysctl_sched_min_bal_int_shares;
 extern unsigned int sysctl_sched_max_bal_int_shares;
@@ -1552,6 +1550,8 @@ int sched_nr_latency_handler(struct ctl_table *table, int write,
                struct file *file, void __user *buffer, size_t *length,
                loff_t *ppos);
 #endif
+extern unsigned int sysctl_sched_rt_period;
+extern int sysctl_sched_rt_runtime;
 
 extern unsigned int sysctl_sched_compat_yield;
 
@@ -2036,6 +2036,9 @@ extern void sched_destroy_group(struct task_group *tg);
 extern void sched_move_task(struct task_struct *tsk);
 extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
 extern unsigned long sched_group_shares(struct task_group *tg);
+extern int sched_group_set_rt_runtime(struct task_group *tg,
+                                     long rt_runtime_us);
+extern long sched_group_rt_runtime(struct task_group *tg);
 
 #endif
 
diff --git a/kernel/sched.c b/kernel/sched.c
index cecaea6..85a5fbf 100644
@@ -176,7 +176,7 @@ struct task_group {
        struct sched_rt_entity **rt_se;
        struct rt_rq **rt_rq;
 
-       unsigned int rt_ratio;
+       u64 rt_runtime;
 
        /*
         * shares assigned to a task group governs how much of cpu bandwidth
@@ -642,19 +642,21 @@ const_debug unsigned int sysctl_sched_features =
 const_debug unsigned int sysctl_sched_nr_migrate = 32;
 
 /*
- * period over which we measure -rt task cpu usage in ms.
+ * period over which we measure -rt task cpu usage in us.
  * default: 1s
  */
-const_debug unsigned int sysctl_sched_rt_period = 1000;
+unsigned int sysctl_sched_rt_period = 1000000;
 
-#define SCHED_RT_FRAC_SHIFT    16
-#define SCHED_RT_FRAC          (1UL << SCHED_RT_FRAC_SHIFT)
+/*
+ * part of the period that we allow rt tasks to run in us.
+ * default: 0.95s
+ */
+int sysctl_sched_rt_runtime = 950000;
 
 /*
- * ratio of time -rt tasks may consume.
- * default: 95%
+ * single value that denotes runtime == period, ie unlimited time.
  */
-const_debug unsigned int sysctl_sched_rt_ratio = 62259;
+#define RUNTIME_INF    ((u64)~0ULL)
 
 /*
  * For kernel-internal use: high-speed (but slightly incorrect) per-cpu
@@ -7187,7 +7189,8 @@ void __init sched_init(void)
                                &per_cpu(init_cfs_rq, i),
                                &per_cpu(init_sched_entity, i), i, 1);
 
-               init_task_group.rt_ratio = sysctl_sched_rt_ratio; /* XXX */
+               init_task_group.rt_runtime =
+                       sysctl_sched_rt_runtime * NSEC_PER_USEC;
                INIT_LIST_HEAD(&rq->leaf_rt_rq_list);
                init_tg_rt_entry(rq, &init_task_group,
                                &per_cpu(init_rt_rq, i),
@@ -7583,7 +7586,7 @@ struct task_group *sched_create_group(void)
                goto err;
 
        tg->shares = NICE_0_LOAD;
-       tg->rt_ratio = 0; /* XXX */
+       tg->rt_runtime = 0;
 
        for_each_possible_cpu(i) {
                rq = cpu_rq(i);
@@ -7785,30 +7788,76 @@ unsigned long sched_group_shares(struct task_group *tg)
 }
 
 /*
- * Ensure the total rt_ratio <= sysctl_sched_rt_ratio
+ * Ensure that the real time constraints are schedulable.
  */
-int sched_group_set_rt_ratio(struct task_group *tg, unsigned long rt_ratio)
+static DEFINE_MUTEX(rt_constraints_mutex);
+
+static unsigned long to_ratio(u64 period, u64 runtime)
+{
+       if (runtime == RUNTIME_INF)
+               return 1ULL << 16;
+
+       runtime *= (1ULL << 16);
+       return div64_64(runtime, period);
+}
+
+static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
 {
        struct task_group *tgi;
        unsigned long total = 0;
+       unsigned long global_ratio =
+               to_ratio(sysctl_sched_rt_period,
+                        sysctl_sched_rt_runtime < 0 ?
+                               RUNTIME_INF : sysctl_sched_rt_runtime);
 
        rcu_read_lock();
-       list_for_each_entry_rcu(tgi, &task_groups, list)
-               total += tgi->rt_ratio;
-       rcu_read_unlock();
+       list_for_each_entry_rcu(tgi, &task_groups, list) {
+               if (tgi == tg)
+                       continue;
 
-       if (total + rt_ratio - tg->rt_ratio > sysctl_sched_rt_ratio)
-               return -EINVAL;
+               total += to_ratio(period, tgi->rt_runtime);
+       }
+       rcu_read_unlock();
 
-       tg->rt_ratio = rt_ratio;
-       return 0;
+       return total + to_ratio(period, runtime) < global_ratio;
 }
 
-unsigned long sched_group_rt_ratio(struct task_group *tg)
+int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
 {
-       return tg->rt_ratio;
+       u64 rt_runtime, rt_period;
+       int err = 0;
+
+       rt_period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
+       rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
+       if (rt_runtime_us == -1)
+               rt_runtime = rt_period;
+
+       mutex_lock(&rt_constraints_mutex);
+       if (!__rt_schedulable(tg, rt_period, rt_runtime)) {
+               err = -EINVAL;
+               goto unlock;
+       }
+       if (rt_runtime_us == -1)
+               rt_runtime = RUNTIME_INF;
+       tg->rt_runtime = rt_runtime;
+ unlock:
+       mutex_unlock(&rt_constraints_mutex);
+
+       return err;
 }
 
+long sched_group_rt_runtime(struct task_group *tg)
+{
+       u64 rt_runtime_us;
+
+       if (tg->rt_runtime == RUNTIME_INF)
+               return -1;
+
+       rt_runtime_us = tg->rt_runtime;
+       do_div(rt_runtime_us, NSEC_PER_USEC);
+       return rt_runtime_us;
+}
 #endif /* CONFIG_FAIR_GROUP_SCHED */
 
 #ifdef CONFIG_FAIR_CGROUP_SCHED
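[ to_ratio() above scales a runtime/period pair into 16-bit fixed point;
  the result is a unitless ratio, so microseconds work as well as the
  nanoseconds the kernel feeds it. A minimal userspace sketch of the same
  computation, with div64_64() replaced by plain 64-bit division: ]

    #include <stdint.h>
    #include <stdio.h>

    /* Same scaling as to_ratio(): runtime/period in 16-bit fixed point. */
    static unsigned long to_ratio(uint64_t period, uint64_t runtime)
    {
            return (unsigned long)((runtime << 16) / period);
    }

    int main(void)
    {
            /* The new defaults: period 1000000us, runtime 950000us. */
            printf("%lu\n", to_ratio(1000000, 950000)); /* prints 62259 */
            return 0;
    }

[ 62259 is precisely the old sysctl_sched_rt_ratio default removed by this
  patch: 0.95 in 16-bit fixed point, i.e. 0.95 * 65536 rounded down. ]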
@@ -7884,17 +7933,49 @@ static u64 cpu_shares_read_uint(struct cgroup *cgrp, struct cftype *cft)
        return (u64) tg->shares;
 }
 
-static int cpu_rt_ratio_write_uint(struct cgroup *cgrp, struct cftype *cftype,
-               u64 rt_ratio_val)
+static int cpu_rt_runtime_write(struct cgroup *cgrp, struct cftype *cft,
+                               struct file *file,
+                               const char __user *userbuf,
+                               size_t nbytes, loff_t *unused_ppos)
 {
-       return sched_group_set_rt_ratio(cgroup_tg(cgrp), rt_ratio_val);
+       char buffer[64];
+       int retval = 0;
+       s64 val;
+       char *end;
+
+       if (!nbytes)
+               return -EINVAL;
+       if (nbytes >= sizeof(buffer))
+               return -E2BIG;
+       if (copy_from_user(buffer, userbuf, nbytes))
+               return -EFAULT;
+
+       buffer[nbytes] = 0;     /* nul-terminate */
+
+       /* strip newline if necessary */
+       if (nbytes && (buffer[nbytes-1] == '\n'))
+               buffer[nbytes-1] = 0;
+       val = simple_strtoll(buffer, &end, 0);
+       if (*end)
+               return -EINVAL;
+
+       /* Pass to subsystem */
+       retval = sched_group_set_rt_runtime(cgroup_tg(cgrp), val);
+       if (!retval)
+               retval = nbytes;
+       return retval;
 }
 
-static u64 cpu_rt_ratio_read_uint(struct cgroup *cgrp, struct cftype *cft)
+static ssize_t cpu_rt_runtime_read(struct cgroup *cgrp, struct cftype *cft,
+                                  struct file *file,
+                                  char __user *buf, size_t nbytes,
+                                  loff_t *ppos)
 {
-       struct task_group *tg = cgroup_tg(cgrp);
+       char tmp[64];
+       long val = sched_group_rt_runtime(cgroup_tg(cgrp));
+       int len = sprintf(tmp, "%ld\n", val);
 
-       return (u64) tg->rt_ratio;
+       return simple_read_from_buffer(buf, nbytes, ppos, tmp, len);
 }
 
 static struct cftype cpu_files[] = {
@@ -7904,9 +7985,9 @@ static struct cftype cpu_files[] = {
                .write_uint = cpu_shares_write_uint,
        },
        {
-               .name = "rt_ratio",
-               .read_uint = cpu_rt_ratio_read_uint,
-               .write_uint = cpu_rt_ratio_write_uint,
+               .name = "rt_runtime_us",
+               .read = cpu_rt_runtime_read,
+               .write = cpu_rt_runtime_write,
        },
 };
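[ The new cgroup file accepts a decimal value with an optional trailing
  newline, and -1 for "no limit"; a write that would make the system
  unschedulable fails with EINVAL. A hedged usage sketch -- the /cgroup
  mount point and the "graphics" group name are assumptions borrowed from
  the documentation above: ]

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    /* Set a group's rt runtime in us; 0 on success, -1 on error. */
    static int set_rt_runtime_us(const char *cgroup, long runtime_us)
    {
            char path[128], val[32];
            ssize_t ret;
            int fd;

            snprintf(path, sizeof(path), "/cgroup/%s/cpu.rt_runtime_us", cgroup);
            snprintf(val, sizeof(val), "%ld\n", runtime_us);

            fd = open(path, O_WRONLY);
            if (fd < 0)
                    return -1;
            ret = write(fd, val, strlen(val)); /* EINVAL if unschedulable */
            close(fd);
            return ret < 0 ? -1 : 0;
    }

    int main(void)
    {
            /* 0.032s, as in the renderer example above. */
            return set_rt_runtime_us("graphics", 32000) ? 1 : 0;
    }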
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8d42693..35825b2 100644
@@ -57,12 +57,12 @@ static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
        if (!rt_rq->tg)
-               return SCHED_RT_FRAC;
+               return RUNTIME_INF;
 
-       return rt_rq->tg->rt_ratio;
+       return rt_rq->tg->rt_runtime;
 }
 
 #define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -89,7 +89,7 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 static void enqueue_rt_entity(struct sched_rt_entity *rt_se);
 static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
 
-static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
        struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
@@ -102,7 +102,7 @@ static void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
        }
 }
 
-static void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
        struct sched_rt_entity *rt_se = rt_rq->rt_se;
 
@@ -129,9 +129,12 @@ static int rt_se_boosted(struct sched_rt_entity *rt_se)
 
 #else
 
-static inline unsigned int sched_rt_ratio(struct rt_rq *rt_rq)
+static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 {
-       return sysctl_sched_rt_ratio;
+       if (sysctl_sched_rt_runtime == -1)
+               return RUNTIME_INF;
+
+       return (u64)sysctl_sched_rt_runtime * NSEC_PER_USEC;
 }
 
 #define for_each_leaf_rt_rq(rt_rq, rq) \
@@ -158,11 +161,11 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
        return NULL;
 }
 
-static inline void sched_rt_ratio_enqueue(struct rt_rq *rt_rq)
+static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
 }
 
-static inline void sched_rt_ratio_dequeue(struct rt_rq *rt_rq)
+static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 {
 }
 
@@ -184,28 +187,24 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
        return rt_task_of(rt_se)->prio;
 }
 
-static int sched_rt_ratio_exceeded(struct rt_rq *rt_rq)
+static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
 {
-       unsigned int rt_ratio = sched_rt_ratio(rt_rq);
-       u64 period, ratio;
+       u64 runtime = sched_rt_runtime(rt_rq);
 
-       if (rt_ratio == SCHED_RT_FRAC)
+       if (runtime == RUNTIME_INF)
                return 0;
 
        if (rt_rq->rt_throttled)
                return rt_rq_throttled(rt_rq);
 
-       period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
-       ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
-
-       if (rt_rq->rt_time > ratio) {
+       if (rt_rq->rt_time > runtime) {
                struct rq *rq = rq_of_rt_rq(rt_rq);
 
                rq->rt_throttled = 1;
                rt_rq->rt_throttled = 1;
 
                if (rt_rq_throttled(rt_rq)) {
-                       sched_rt_ratio_dequeue(rt_rq);
+                       sched_rt_rq_dequeue(rt_rq);
                        return 1;
                }
        }
@@ -219,17 +218,16 @@ static void update_sched_rt_period(struct rq *rq)
        u64 period;
 
        while (rq->clock > rq->rt_period_expire) {
-               period = (u64)sysctl_sched_rt_period * NSEC_PER_MSEC;
+               period = (u64)sysctl_sched_rt_period * NSEC_PER_USEC;
                rq->rt_period_expire += period;
 
                for_each_leaf_rt_rq(rt_rq, rq) {
-                       unsigned long rt_ratio = sched_rt_ratio(rt_rq);
-                       u64 ratio = (period * rt_ratio) >> SCHED_RT_FRAC_SHIFT;
+                       u64 runtime = sched_rt_runtime(rt_rq);
 
-                       rt_rq->rt_time -= min(rt_rq->rt_time, ratio);
-                       if (rt_rq->rt_throttled) {
+                       rt_rq->rt_time -= min(rt_rq->rt_time, runtime);
+                       if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
                                rt_rq->rt_throttled = 0;
-                               sched_rt_ratio_enqueue(rt_rq);
+                               sched_rt_rq_enqueue(rt_rq);
                        }
                }
 
@@ -262,12 +260,7 @@ static void update_curr_rt(struct rq *rq)
        cpuacct_charge(curr, delta_exec);
 
        rt_rq->rt_time += delta_exec;
-       /*
-        * might make it a tad more accurate:
-        *
-        * update_sched_rt_period(rq);
-        */
-       if (sched_rt_ratio_exceeded(rt_rq))
+       if (sched_rt_runtime_exceeded(rt_rq))
                resched_task(curr);
 }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index d41ef6b..924c674 100644
@@ -311,22 +311,6 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = &proc_dointvec,
        },
-       {
-               .ctl_name       = CTL_UNNUMBERED,
-               .procname       = "sched_rt_period_ms",
-               .data           = &sysctl_sched_rt_period,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
-       },
-       {
-               .ctl_name       = CTL_UNNUMBERED,
-               .procname       = "sched_rt_ratio",
-               .data           = &sysctl_sched_rt_ratio,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = &proc_dointvec,
-       },
 #if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
        {
                .ctl_name       = CTL_UNNUMBERED,
@@ -348,6 +332,22 @@ static struct ctl_table kern_table[] = {
 #endif
        {
                .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "sched_rt_period_us",
+               .data           = &sysctl_sched_rt_period,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "sched_rt_runtime_us",
+               .data           = &sysctl_sched_rt_runtime,
+               .maxlen         = sizeof(int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
                .procname       = "sched_compat_yield",
                .data           = &sysctl_sched_compat_yield,
                .maxlen         = sizeof(unsigned int),
diff --git a/kernel/user.c b/kernel/user.c
index 7d7900c..9f6d471 100644
@@ -164,9 +164,37 @@ static ssize_t cpu_shares_store(struct kobject *kobj,
 static struct kobj_attribute cpu_share_attr =
        __ATTR(cpu_share, 0644, cpu_shares_show, cpu_shares_store);
 
+static ssize_t cpu_rt_runtime_show(struct kobject *kobj,
+                                  struct kobj_attribute *attr,
+                                  char *buf)
+{
+       struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+
+       return sprintf(buf, "%ld\n", sched_group_rt_runtime(up->tg));
+}
+
+static ssize_t cpu_rt_runtime_store(struct kobject *kobj,
+                                   struct kobj_attribute *attr,
+                                   const char *buf, size_t size)
+{
+       struct user_struct *up = container_of(kobj, struct user_struct, kobj);
+       long rt_runtime;
+       int rc;
+
+       if (sscanf(buf, "%ld", &rt_runtime) != 1)
+               return -EINVAL;
+
+       rc = sched_group_set_rt_runtime(up->tg, rt_runtime);
+
+       return (rc ? rc : size);
+}
+
+static struct kobj_attribute cpu_rt_runtime_attr =
+       __ATTR(cpu_rt_runtime_us, 0644, cpu_rt_runtime_show, cpu_rt_runtime_store);
+
 /* default attributes per uid directory */
 static struct attribute *uids_attributes[] = {
        &cpu_share_attr.attr,
+       &cpu_rt_runtime_attr.attr,
        NULL
 };
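
[ With CONFIG_FAIR_USER_SCHED the same knob appears per uid; a sketch of
  the -1 (runtime == period, no limit) round-trip, assuming uid 1000 and
  the _us-suffixed name from the documentation above: ]

    #include <stdio.h>

    int main(void)
    {
            const char *f = "/sys/kernel/uids/1000/cpu_rt_runtime_us";
            long val;
            FILE *fp;

            fp = fopen(f, "w");
            if (!fp)
                    return 1;
            fprintf(fp, "-1\n");            /* runtime == period */
            fclose(fp);

            fp = fopen(f, "r");
            if (!fp)
                    return 1;
            if (fscanf(fp, "%ld", &val) == 1)
                    printf("%ld\n", val);   /* prints -1 */
            fclose(fp);
            return 0;
    }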