x86: Do not free zero sized per cpu areas

[safe/jmp/linux-2.6] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index f47560f..49d2fa7 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -436,7 +436,6 @@ struct rt_rq {
         struct rq *rq;
         struct list_head leaf_rt_rq_list;
         struct task_group *tg;
-       struct sched_rt_entity *rt_se;
  #endif
  };
  
@@ -603,6 +602,11 @@ static inline int cpu_of(struct rq *rq)
  #endif
  }
  
+#define rcu_dereference_check_sched_domain(p) \
+       rcu_dereference_check((p), \
+                             rcu_read_lock_sched_held() || \
+                             lockdep_is_held(&sched_domains_mutex))
+
  /*
   * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
   * See detach_destroy_domains: synchronize_sched for details.
@@ -611,7 +615,7 @@ static inline int cpu_of(struct rq *rq)
   * preempt-disabled sections.
   */
  #define for_each_domain(cpu, __sd) \
-       for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+       for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
  
  #define cpu_rq(cpu)            (&per_cpu(runqueues, (cpu)))
  #define this_rq()              (&__get_cpu_var(runqueues))
@@ -899,16 +903,33 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
  #endif /* __ARCH_WANT_UNLOCKED_CTXSW */
  
  /*
+ * Check whether the task is waking; we use this to synchronize against
+ * ttwu() so that task_cpu() reports a stable number.
+ *
+ * We need to make an exception for PF_STARTING tasks because the fork
+ * path might require task_rq_lock() to work, e.g. it can call
+ * set_cpus_allowed_ptr() from the cpuset clone_ns code.
+ */
+static inline int task_is_waking(struct task_struct *p)
+{
+       return unlikely((p->state == TASK_WAKING) && !(p->flags & PF_STARTING));
+}
+
+/*
   * __task_rq_lock - lock the runqueue a given task resides on.
   * Must be called interrupts disabled.
   */
  static inline struct rq *__task_rq_lock(struct task_struct *p)
         __acquires(rq->lock)
  {
+       struct rq *rq;
+
         for (;;) {
-               struct rq *rq = task_rq(p);
+               while (task_is_waking(p))
+                       cpu_relax();
+               rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p)))
+               if (likely(rq == task_rq(p) && !task_is_waking(p)))
                         return rq;
                 raw_spin_unlock(&rq->lock);
         }
@@ -925,10 +946,12 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
         struct rq *rq;
  
         for (;;) {
+               while (task_is_waking(p))
+                       cpu_relax();
                 local_irq_save(*flags);
                 rq = task_rq(p);
                 raw_spin_lock(&rq->lock);
-               if (likely(rq == task_rq(p)))
+               if (likely(rq == task_rq(p) && !task_is_waking(p)))
                         return rq;
                 raw_spin_unlock_irqrestore(&rq->lock, *flags);
         }
@@ -1463,7 +1486,7 @@ static unsigned long target_load(int cpu, int type)
  
  static struct sched_group *group_of(int cpu)
  {
-       struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+       struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
  
         if (!sd)
                 return NULL;
@@ -1498,7 +1521,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
-static __read_mostly unsigned long *update_shares_data;
+static __read_mostly unsigned long __percpu *update_shares_data;
  
  static void __set_se_shares(struct sched_entity *se, unsigned long shares);
  
@@ -1633,16 +1656,6 @@ static void update_shares(struct sched_domain *sd)
         }
  }
  
-static void update_shares_locked(struct rq *rq, struct sched_domain *sd)
-{
-       if (root_task_group_empty())
-               return;
-
-       raw_spin_unlock(&rq->lock);
-       update_shares(sd);
-       raw_spin_lock(&rq->lock);
-}
-
  static void update_h_load(long cpu)
  {
         if (root_task_group_empty())
@@ -1657,10 +1670,6 @@ static inline void update_shares(struct sched_domain *sd)
  {
  }
  
-static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
-{
-}
-
  #endif
  
  #ifdef CONFIG_PREEMPT
@@ -2301,14 +2310,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p)
  }
  
  /*
- * Called from:
- *
- *  - fork, @p is stable because it isn't on the tasklist yet
+ * Gets called from 3 sites (exec, fork, wakeup). Since it is called without
+ * holding rq->lock, we need to ensure ->cpus_allowed is stable; this is done
+ * by:
   *
- *  - exec, @p is unstable, retry loop
- *
- *  - wake-up, we serialize ->cpus_allowed against TASK_WAKING so
- *             we should be good.
+ *  exec:           is unstable, retry loop
+ *  fork & wake-up: serialize ->cpus_allowed against TASK_WAKING
   */
  static inline
  int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags)
@@ -2352,7 +2359,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
  {
         int cpu, orig_cpu, this_cpu, success = 0;
         unsigned long flags;
-       struct rq *rq, *orig_rq;
+       struct rq *rq;
  
         if (!sched_feat(SYNC_WAKEUPS))
                 wake_flags &= ~WF_SYNC;
@@ -2360,7 +2367,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
         this_cpu = get_cpu();
  
         smp_wmb();
-       rq = orig_rq = task_rq_lock(p, &flags);
+       rq = task_rq_lock(p, &flags);
         update_rq_clock(rq);
         if (!(p->state & state))
                 goto out;
@@ -2391,14 +2398,27 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
         __task_rq_unlock(rq);
  
         cpu = select_task_rq(p, SD_BALANCE_WAKE, wake_flags);
-       if (cpu != orig_cpu)
+       if (cpu != orig_cpu) {
+               /*
+                * Since we migrate the task without holding any rq->lock,
+                * we need to be careful with task_rq_lock(), since that
+                * might end up locking an invalid rq.
+                */
                 set_task_cpu(p, cpu);
+       }
  
-       rq = __task_rq_lock(p);
+       rq = cpu_rq(cpu);
+       raw_spin_lock(&rq->lock);
         update_rq_clock(rq);
  
+       /*
+        * We migrated the task without holding either rq->lock. However,
+        * since the task is not on the task list itself, nobody else
+        * will try to migrate the task; hence the rq should match the
+        * cpu we just moved it to.
+        */
+       WARN_ON(task_cpu(p) != cpu);
         WARN_ON(p->state != TASK_WAKING);
-       cpu = task_cpu(p);
  
  #ifdef CONFIG_SCHEDSTATS
         schedstat_inc(rq, ttwu_count);
@@ -2601,9 +2621,6 @@ void sched_fork(struct task_struct *p, int clone_flags)
         if (p->sched_class->task_fork)
                 p->sched_class->task_fork(p);
  
-#ifdef CONFIG_SMP
-       cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
-#endif
         set_task_cpu(p, cpu);
  
  #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
@@ -2633,8 +2650,29 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
  {
         unsigned long flags;
         struct rq *rq;
+       int cpu __maybe_unused = get_cpu();
+
+#ifdef CONFIG_SMP
+       /*
+        * Fork balancing, do it here and not earlier because:
+        *  - cpus_allowed can change in the fork path
+        *  - any previously selected cpu might disappear through hotplug
+        *
+        * We still have TASK_WAKING but PF_STARTING is gone now, meaning
+        * ->cpus_allowed is stable; we have preemption disabled, meaning
+        * cpu_online_mask is stable.
+        */
+       cpu = select_task_rq(p, SD_BALANCE_FORK, 0);
+       set_task_cpu(p, cpu);
+#endif
+
+       /*
+        * Since the task is not on the rq and we still have TASK_WAKING set,
+        * nobody else will migrate this task.
+        */
+       rq = cpu_rq(cpu);
+       raw_spin_lock_irqsave(&rq->lock, flags);
  
-       rq = task_rq_lock(p, &flags);
         BUG_ON(p->state != TASK_WAKING);
         p->state = TASK_RUNNING;
         update_rq_clock(rq);
@@ -2646,6 +2684,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 p->sched_class->task_woken(rq, p);
  #endif
         task_rq_unlock(rq, &flags);
+       put_cpu();
  }
  
  #ifdef CONFIG_PREEMPT_NOTIFIERS
@@ -2764,7 +2803,13 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
          */
         prev_state = prev->state;
         finish_arch_switch(prev);
-       perf_event_task_sched_in(current, cpu_of(rq));
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+       local_irq_disable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+       perf_event_task_sched_in(current);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+       local_irq_enable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
         finish_lock_switch(rq, prev);
  
         fire_sched_in_preempt_notifiers(current);
@@ -3470,7 +3515,7 @@ void scheduler_tick(void)
         curr->sched_class->task_tick(rq, curr, 0);
         raw_spin_unlock(&rq->lock);
  
-       perf_event_task_tick(curr, cpu);
+       perf_event_task_tick(curr);
  
  #ifdef CONFIG_SMP
         rq->idle_at_tick = idle_cpu(cpu);
@@ -3684,7 +3729,7 @@ need_resched_nonpreemptible:
  
         if (likely(prev != next)) {
                 sched_info_switch(prev, next);
-               perf_event_task_sched_out(prev, next, cpu);
+               perf_event_task_sched_out(prev, next);
  
                 rq->nr_switches++;
                 rq->curr = next;
@@ -3702,8 +3747,11 @@ need_resched_nonpreemptible:
  
         post_schedule(rq);
  
-       if (unlikely(reacquire_kernel_lock(current) < 0))
+       if (unlikely(reacquire_kernel_lock(current) < 0)) {
+               prev = rq->curr;
+               switch_count = &prev->nivcsw;
                 goto need_resched_nonpreemptible;
+       }
  
         preempt_enable_no_resched();
         if (need_resched())
@@ -4212,7 +4260,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         unsigned long flags;
         int oldprio, on_rq, running;
         struct rq *rq;
-       const struct sched_class *prev_class = p->sched_class;
+       const struct sched_class *prev_class;
  
         BUG_ON(prio < 0 || prio > MAX_PRIO);
  
@@ -4220,6 +4268,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         update_rq_clock(rq);
  
         oldprio = p->prio;
+       prev_class = p->sched_class;
         on_rq = p->se.on_rq;
         running = task_current(rq, p);
         if (on_rq)
@@ -4237,7 +4286,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
         if (running)
                 p->sched_class->set_curr_task(rq);
         if (on_rq) {
-               enqueue_task(rq, p, 0, false);
+               enqueue_task(rq, p, 0, oldprio < prio);
  
                 check_class_changed(rq, p, prev_class, oldprio, running);
         }
@@ -4304,7 +4353,7 @@ int can_nice(const struct task_struct *p, const int nice)
         /* convert nice value [19,-20] to rlimit style value [1,40] */
         int nice_rlim = 20 - nice;
  
-       return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+       return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
                 capable(CAP_SYS_NICE));
  }
  
@@ -4439,7 +4488,7 @@ static int __sched_setscheduler(struct task_struct *p, int policy,
  {
         int retval, oldprio, oldpolicy = -1, on_rq, running;
         unsigned long flags;
-       const struct sched_class *prev_class = p->sched_class;
+       const struct sched_class *prev_class;
         struct rq *rq;
         int reset_on_fork;
  
@@ -4481,7 +4530,7 @@ recheck:
  
                         if (!lock_task_sighand(p, &flags))
                                 return -ESRCH;
-                       rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+                       rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
                         unlock_task_sighand(p, &flags);
  
                         /* can't set/change the rt policy */
@@ -4553,6 +4602,7 @@ recheck:
         p->sched_reset_on_fork = reset_on_fork;
  
         oldprio = p->prio;
+       prev_class = p->sched_class;
         __setscheduler(rq, p, policy, param->sched_priority);
  
         if (running)
@@ -4852,7 +4902,9 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
         int ret;
         cpumask_var_t mask;
  
-       if (len < cpumask_size())
+       if (len < nr_cpu_ids)
+               return -EINVAL;
+       if (len & (sizeof(unsigned long)-1))
                 return -EINVAL;
  
         if (!alloc_cpumask_var(&mask, GFP_KERNEL))
@@ -4860,10 +4912,12 @@ SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
  
         ret = sched_getaffinity(pid, mask);
         if (ret == 0) {
-               if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+               size_t retlen = min_t(size_t, len, cpumask_size());
+
+               if (copy_to_user(user_mask_ptr, mask, retlen))
                         ret = -EFAULT;
                 else
-                       ret = cpumask_size();
+                       ret = retlen;
         }
         free_cpumask_var(mask);
  
@@ -5303,23 +5357,8 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
         struct rq *rq;
         int ret = 0;
  
-       /*
-        * Since we rely on wake-ups to migrate sleeping tasks, don't change
-        * the ->cpus_allowed mask from under waking tasks, which would be
-        * possible when we change rq->lock in ttwu(), so synchronize against
-        * TASK_WAKING to avoid that.
-        */
-again:
-       while (p->state == TASK_WAKING)
-               cpu_relax();
-
         rq = task_rq_lock(p, &flags);
  
-       if (p->state == TASK_WAKING) {
-               task_rq_unlock(rq, &flags);
-               goto again;
-       }
-
         if (!cpumask_intersects(new_mask, cpu_active_mask)) {
                 ret = -EINVAL;
                 goto out;
@@ -7371,11 +7410,13 @@ static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
  
  #ifdef CONFIG_SCHED_MC
  static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+                                          struct sysdev_class_attribute *attr,
                                            char *page)
  {
         return sprintf(page, "%u\n", sched_mc_power_savings);
  }
  static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
+                                           struct sysdev_class_attribute *attr,
                                             const char *buf, size_t count)
  {
         return sched_power_savings_store(buf, count, 0);
@@ -7387,11 +7428,13 @@ static SYSDEV_CLASS_ATTR(sched_mc_power_savings, 0644,
  
  #ifdef CONFIG_SCHED_SMT
  static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+                                           struct sysdev_class_attribute *attr,
                                             char *page)
  {
         return sprintf(page, "%u\n", sched_smt_power_savings);
  }
  static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
+                                            struct sysdev_class_attribute *attr,
                                              const char *buf, size_t count)
  {
         return sched_power_savings_store(buf, count, 1);
@@ -7606,7 +7649,6 @@ static void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
         tg->rt_rq[cpu] = rt_rq;
         init_rt_rq(rt_rq, rq);
         rt_rq->tg = tg;
-       rt_rq->rt_se = rt_se;
         rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
         if (add)
                 list_add(&rt_rq->leaf_rt_rq_list, &rq->leaf_rt_rq_list);
@@ -8779,7 +8821,7 @@ struct cgroup_subsys cpu_cgroup_subsys = {
  struct cpuacct {
         struct cgroup_subsys_state css;
         /* cpuusage holds pointer to a u64-type object on every cpu */
-       u64 *cpuusage;
+       u64 __percpu *cpuusage;
         struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
         struct cpuacct *parent;
  };
@@ -8996,12 +9038,30 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
  }
  
  /*
+ * When CONFIG_VIRT_CPU_ACCOUNTING is enabled one jiffy can be very large
+ * in cputime_t units. As a result, cpuacct_update_stats calls
+ * percpu_counter_add with values large enough to always overflow the
+ * per cpu batch limit causing bad SMP scalability.
+ *
+ * To fix this we scale percpu_counter_batch by cputime_one_jiffy so we
+ * batch the same amount of time with CONFIG_VIRT_CPU_ACCOUNTING disabled
+ * and enabled. We cap it at INT_MAX which is the largest allowed batch value.
+ */
+#ifdef CONFIG_SMP
+#define CPUACCT_BATCH  \
+       min_t(long, percpu_counter_batch * cputime_one_jiffy, INT_MAX)
+#else
+#define CPUACCT_BATCH  0
+#endif
+
+/*
   * Charge the system/user time to the task's accounting group.
   */
  static void cpuacct_update_stats(struct task_struct *tsk,
                 enum cpuacct_stat_index idx, cputime_t val)
  {
         struct cpuacct *ca;
+       int batch = CPUACCT_BATCH;
  
         if (unlikely(!cpuacct_subsys.active))
                 return;
@@ -9010,7 +9070,7 @@ static void cpuacct_update_stats(struct task_struct *tsk,
         ca = task_ca(tsk);
  
         do {
-               percpu_counter_add(&ca->cpustat[idx], val);
+               __percpu_counter_add(&ca->cpustat[idx], val, batch);
                 ca = ca->parent;
         } while (ca);
         rcu_read_unlock();