cgroup: CSS ID support

[safe/jmp/linux-2.6] / include / linux / sched.h
diff --git a/include/linux/sched.h b/include/linux/sched.h

index b475d4d..9186f8c 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -250,7 +250,7 @@ extern void init_idle_bootup_task(struct task_struct *idle);
  extern int runqueue_is_locked(void);
  extern void task_rq_unlock_wait(struct task_struct *p);
  
-extern cpumask_t nohz_cpu_mask;
+extern cpumask_var_t nohz_cpu_mask;
  #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
  extern int select_nohz_load_balancer(int cpu);
  #else
@@ -293,6 +293,9 @@ extern void sched_show_task(struct task_struct *p);
  extern void softlockup_tick(void);
  extern void touch_softlockup_watchdog(void);
  extern void touch_all_softlockup_watchdogs(void);
+extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
+                                   struct file *filp, void __user *buffer,
+                                   size_t *lenp, loff_t *ppos);
  extern unsigned int  softlockup_panic;
  extern unsigned long sysctl_hung_task_check_count;
  extern unsigned long sysctl_hung_task_timeout_secs;
@@ -328,7 +331,9 @@ extern signed long schedule_timeout(signed long timeout);
  extern signed long schedule_timeout_interruptible(signed long timeout);
  extern signed long schedule_timeout_killable(signed long timeout);
  extern signed long schedule_timeout_uninterruptible(signed long timeout);
+asmlinkage void __schedule(void);
  asmlinkage void schedule(void);
+extern int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner);
  
  struct nsproxy;
  struct user_namespace;
@@ -386,6 +391,16 @@ extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
                 (mm)->hiwater_vm = (mm)->total_vm;      \
  } while (0)
  
+static inline unsigned long get_mm_hiwater_rss(struct mm_struct *mm)
+{
+       return max(mm->hiwater_rss, get_mm_rss(mm));
+}
+
+static inline unsigned long get_mm_hiwater_vm(struct mm_struct *mm)
+{
+       return max(mm->hiwater_vm, mm->total_vm);
+}
+
  extern void set_dumpable(struct mm_struct *mm, int value);
  extern int get_dumpable(struct mm_struct *mm);
  
@@ -453,16 +468,27 @@ struct task_cputime {
  #define virt_exp       utime
  #define sched_exp      sum_exec_runtime
  
+#define INIT_CPUTIME   \
+       (struct task_cputime) {                                 \
+               .utime = cputime_zero,                          \
+               .stime = cputime_zero,                          \
+               .sum_exec_runtime = 0,                          \
+       }
+
  /**
- * struct thread_group_cputime - thread group interval timer counts
- * @totals:            thread group interval timers; substructure for
- *                     uniprocessor kernel, per-cpu for SMP kernel.
+ * struct thread_group_cputimer - thread group interval timer counts
+ * @cputime:           thread group interval timers.
+ * @running:           non-zero when there are timers running and
+ *                     @cputime receives updates.
+ * @lock:              lock for fields in this struct.
   *
   * This structure contains the version of task_cputime, above, that is
- * used for thread group CPU clock calculations.
+ * used for thread group CPU timer calculations.
   */
-struct thread_group_cputime {
-       struct task_cputime *totals;
+struct thread_group_cputimer {
+       struct task_cputime cputime;
+       int running;
+       spinlock_t lock;
  };
  
  /*
@@ -511,10 +537,10 @@ struct signal_struct {
         cputime_t it_prof_incr, it_virt_incr;
  
         /*
-        * Thread group totals for process CPU clocks.
-        * See thread_group_cputime(), et al, for details.
+        * Thread group totals for process CPU timers.
+        * See thread_group_cputimer(), et al, for details.
          */
-       struct thread_group_cputime cputime;
+       struct thread_group_cputimer cputimer;
  
         /* Earliest-expiration cache. */
         struct task_cputime cputime_expires;
@@ -551,7 +577,7 @@ struct signal_struct {
          * Live threads maintain their own counters and add to these
          * in __exit_signal, except for the group leader.
          */
-       cputime_t cutime, cstime;
+       cputime_t utime, stime, cutime, cstime;
         cputime_t gtime;
         cputime_t cgtime;
         unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
@@ -560,6 +586,14 @@ struct signal_struct {
         struct task_io_accounting ioac;
  
         /*
+        * Cumulative ns of scheduled CPU time for dead threads in the
+        * group, not including a zombie group leader. (This only differs
+        * from jiffies_to_ns(utime + stime) if sched_clock uses something
+        * other than jiffies.)
+        */
+       unsigned long long sum_sched_runtime;
+
+       /*
          * We don't bother to synchronize most readers of this at all,
          * because there is no reader checking a limit that actually needs
          * to get both rlim_cur and rlim_max atomically, and either one
@@ -623,7 +657,6 @@ struct user_struct {
         atomic_t inotify_devs;  /* How many inotify devs does this user have opened? */
  #endif
  #ifdef CONFIG_EPOLL
-       atomic_t epoll_devs;    /* The number of epoll descriptors currently open */
         atomic_t epoll_watches; /* The number of file descriptors currently watched */
  #endif
  #ifdef CONFIG_POSIX_MQUEUE
@@ -757,20 +790,51 @@ enum cpu_idle_type {
  #define SD_SERIALIZE           1024    /* Only a single load balancing instance */
  #define SD_WAKE_IDLE_FAR       2048    /* Gain latency sacrificing cache hit */
  
-#define BALANCE_FOR_MC_POWER   \
-       (sched_smt_power_savings ? SD_POWERSAVINGS_BALANCE : 0)
+enum powersavings_balance_level {
+       POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
+       POWERSAVINGS_BALANCE_BASIC,     /* Fill one thread/core/package
+                                        * first for long running threads
+                                        */
+       POWERSAVINGS_BALANCE_WAKEUP,    /* Also bias task wakeups to semi-idle
+                                        * cpu package for power savings
+                                        */
+       MAX_POWERSAVINGS_BALANCE_LEVELS
+};
+
+extern int sched_mc_power_savings, sched_smt_power_savings;
+
+static inline int sd_balance_for_mc_power(void)
+{
+       if (sched_smt_power_savings)
+               return SD_POWERSAVINGS_BALANCE;
+
+       return 0;
+}
+
+static inline int sd_balance_for_package_power(void)
+{
+       if (sched_mc_power_savings | sched_smt_power_savings)
+               return SD_POWERSAVINGS_BALANCE;
  
-#define BALANCE_FOR_PKG_POWER  \
-       ((sched_mc_power_savings || sched_smt_power_savings) ?  \
-        SD_POWERSAVINGS_BALANCE : 0)
+       return 0;
+}
  
-#define test_sd_parent(sd, flag)       ((sd->parent &&         \
-                                        (sd->parent->flags & flag)) ? 1 : 0)
+/*
+ * Optimise SD flags for power savings:
+ * SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
+ * Keep default SD flags if sched_{smt,mc}_power_savings == 0.
+ */
+
+static inline int sd_power_saving_flags(void)
+{
+       if (sched_mc_power_savings | sched_smt_power_savings)
+               return SD_BALANCE_NEWIDLE;
  
+       return 0;
+}
  
  struct sched_group {
         struct sched_group *next;       /* Must be a circular list */
-       cpumask_t cpumask;
  
         /*
          * CPU power of this group, SCHED_LOAD_SCALE being max power for a
@@ -783,8 +847,15 @@ struct sched_group {
          * (see include/linux/reciprocal_div.h)
          */
         u32 reciprocal_cpu_power;
+
+       unsigned long cpumask[];
  };
  
+static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
+{
+       return to_cpumask(sg->cpumask);
+}
+
  enum sched_domain_level {
         SD_LV_NONE = 0,
         SD_LV_SIBLING,
@@ -808,7 +879,6 @@ struct sched_domain {
         struct sched_domain *parent;    /* top domain must be null terminated */
         struct sched_domain *child;     /* bottom domain must be null terminated */
         struct sched_group *groups;     /* the balancing groups of the domain */
-       cpumask_t span;                 /* span of all CPUs in this domain */
         unsigned long min_interval;     /* Minimum balance interval ms */
         unsigned long max_interval;     /* Maximum balance interval ms */
         unsigned int busy_factor;       /* less balancing by factor if busy */
@@ -863,18 +933,34 @@ struct sched_domain {
  #ifdef CONFIG_SCHED_DEBUG
         char *name;
  #endif
+
+       /* span of all CPUs in this domain */
+       unsigned long span[];
  };
  
-extern void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
+{
+       return to_cpumask(sd->span);
+}
+
+extern void partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                                     struct sched_domain_attr *dattr_new);
-extern int arch_reinit_sched_domains(void);
+
+/* Test a flag in parent sched domain */
+static inline int test_sd_parent(struct sched_domain *sd, int flag)
+{
+       if (sd->parent && (sd->parent->flags & flag))
+               return 1;
+
+       return 0;
+}
  
  #else /* CONFIG_SMP */
  
  struct sched_domain_attr;
  
  static inline void
-partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
+partition_sched_domains(int ndoms_new, struct cpumask *doms_new,
                         struct sched_domain_attr *dattr_new)
  {
  }
@@ -921,11 +1007,12 @@ struct sched_class {
                               struct rq *busiest, struct sched_domain *sd,
                               enum cpu_idle_type idle);
         void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+       int (*needs_post_schedule) (struct rq *this_rq);
         void (*post_schedule) (struct rq *this_rq);
         void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
  
         void (*set_cpus_allowed)(struct task_struct *p,
-                                const cpumask_t *newmask);
+                                const struct cpumask *newmask);
  
         void (*rq_online)(struct rq *rq);
         void (*rq_offline)(struct rq *rq);
@@ -975,6 +1062,10 @@ struct sched_entity {
         u64                     last_wakeup;
         u64                     avg_overlap;
  
+       u64                     start_runtime;
+       u64                     avg_wakeup;
+       u64                     nr_migrations;
+
  #ifdef CONFIG_SCHEDSTATS
         u64                     wait_start;
         u64                     wait_max;
@@ -990,7 +1081,6 @@ struct sched_entity {
         u64                     exec_max;
         u64                     slice_max;
  
-       u64                     nr_migrations;
         u64                     nr_migrations_cold;
         u64                     nr_failed_migrations_affine;
         u64                     nr_failed_migrations_running;
@@ -1087,6 +1177,7 @@ struct task_struct {
  #endif
  
         struct list_head tasks;
+       struct plist_node pushable_tasks;
  
         struct mm_struct *mm, *active_mm;
  
@@ -1098,13 +1189,14 @@ struct task_struct {
         /* ??? */
         unsigned int personality;
         unsigned did_exec:1;
+       unsigned in_execve:1;   /* Tell the LSMs that the process is doing an
+                                * execve */
         pid_t pid;
         pid_t tgid;
  
-#ifdef CONFIG_CC_STACKPROTECTOR
         /* Canary value for the -fstack-protector gcc feature */
         unsigned long stack_canary;
-#endif
+
         /* 
          * pointers to (original) parent process, youngest child, younger sibling,
          * older sibling, respectively.  (p->father can be replaced with 
@@ -1251,6 +1343,7 @@ struct task_struct {
         int lockdep_depth;
         unsigned int lockdep_recursion;
         struct held_lock held_locks[MAX_LOCK_DEPTH];
+       gfp_t lockdep_reclaim_gfp;
  #endif
  
  /* journalling filesystem info */
@@ -1342,6 +1435,9 @@ struct task_struct {
  #endif
  };
  
+/* Future-safe accessor for struct task_struct's cpus_allowed. */
+#define tsk_cpumask(tsk) (&(tsk)->cpus_allowed)
+
  /*
   * Priority of a process goes from 0..MAX_PRIO-1, valid RT
   * priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
@@ -1578,12 +1674,12 @@ extern cputime_t task_gtime(struct task_struct *p);
  
  #ifdef CONFIG_SMP
  extern int set_cpus_allowed_ptr(struct task_struct *p,
-                               const cpumask_t *new_mask);
+                               const struct cpumask *new_mask);
  #else
  static inline int set_cpus_allowed_ptr(struct task_struct *p,
-                                      const cpumask_t *new_mask)
+                                      const struct cpumask *new_mask)
  {
-       if (!cpu_isset(0, *new_mask))
+       if (!cpumask_test_cpu(0, new_mask))
                 return -EINVAL;
         return 0;
  }
@@ -1593,6 +1689,16 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
         return set_cpus_allowed_ptr(p, &new_mask);
  }
  
+/*
+ * Architectures can set this to 1 if they have specified
+ * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ * but then during bootup it turns out that sched_clock()
+ * is reliable after all:
+ */
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+extern int sched_clock_stable;
+#endif
+
  extern unsigned long long sched_clock(void);
  
  extern void sched_clock_init(void);
@@ -1650,16 +1756,16 @@ extern void wake_up_idle_cpu(int cpu);
  static inline void wake_up_idle_cpu(int cpu) { }
  #endif
  
-#ifdef CONFIG_SCHED_DEBUG
  extern unsigned int sysctl_sched_latency;
  extern unsigned int sysctl_sched_min_granularity;
  extern unsigned int sysctl_sched_wakeup_granularity;
+extern unsigned int sysctl_sched_shares_ratelimit;
+extern unsigned int sysctl_sched_shares_thresh;
+#ifdef CONFIG_SCHED_DEBUG
  extern unsigned int sysctl_sched_child_runs_first;
  extern unsigned int sysctl_sched_features;
  extern unsigned int sysctl_sched_migration_cost;
  extern unsigned int sysctl_sched_nr_migrate;
-extern unsigned int sysctl_sched_shares_ratelimit;
-extern unsigned int sysctl_sched_shares_thresh;
  
  int sched_nr_latency_handler(struct ctl_table *table, int write,
                 struct file *file, void __user *buffer, size_t *length,
@@ -1869,7 +1975,8 @@ extern void mm_release(struct task_struct *, struct mm_struct *);
  /* Allocate a new mm structure and copy contents from tsk->mm */
  extern struct mm_struct *dup_mm(struct task_struct *tsk);
  
-extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+extern int copy_thread(unsigned long, unsigned long, unsigned long,
+                       struct task_struct *, struct pt_regs *);
  extern void flush_thread(void);
  extern void exit_thread(void);
  
@@ -2010,6 +2117,19 @@ static inline int object_is_on_stack(void *obj)
  
  extern void thread_info_cache_init(void);
  
+#ifdef CONFIG_DEBUG_STACK_USAGE
+static inline unsigned long stack_not_used(struct task_struct *p)
+{
+       unsigned long *n = end_of_stack(p);
+
+       do {    /* Skip over canary */
+               n++;
+       } while (!*n);
+
+       return (unsigned long)n - (unsigned long)end_of_stack(p);
+}
+#endif
+
  /* set thread flags in other task's structures
   * - see asm/thread_info.h for TIF_xxxx flags available
   */
@@ -2123,25 +2243,18 @@ static inline int spin_needbreak(spinlock_t *lock)
  /*
   * Thread group CPU time accounting.
   */
-
-extern int thread_group_cputime_alloc(struct task_struct *);
-extern void thread_group_cputime(struct task_struct *, struct task_cputime *);
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
  
  static inline void thread_group_cputime_init(struct signal_struct *sig)
  {
-       sig->cputime.totals = NULL;
-}
-
-static inline int thread_group_cputime_clone_thread(struct task_struct *curr)
-{
-       if (curr->signal->cputime.totals)
-               return 0;
-       return thread_group_cputime_alloc(curr);
+       sig->cputimer.cputime = INIT_CPUTIME;
+       spin_lock_init(&sig->cputimer.lock);
+       sig->cputimer.running = 0;
  }
  
  static inline void thread_group_cputime_free(struct signal_struct *sig)
  {
-       free_percpu(sig->cputime.totals);
  }
  
  /*
@@ -2194,10 +2307,8 @@ __trace_special(void *__tr, void *__data,
  }
  #endif
  
-extern long sched_setaffinity(pid_t pid, const cpumask_t *new_mask);
-extern long sched_getaffinity(pid_t pid, cpumask_t *mask);
-
-extern int sched_mc_power_savings, sched_smt_power_savings;
+extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
+extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
  
  extern void normalize_rt_tasks(void);
  
@@ -2223,9 +2334,13 @@ extern long sched_group_rt_runtime(struct task_group *tg);
  extern int sched_group_set_rt_period(struct task_group *tg,
                                       long rt_period_us);
  extern long sched_group_rt_period(struct task_group *tg);
+extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
  #endif
  #endif
  
+extern int task_can_switch_user(struct user_struct *up,
+                                       struct task_struct *tsk);
+
  #ifdef CONFIG_TASK_XACCT
  static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
  {