#include <linux/debugfs.h>
#include <linux/ctype.h>
#include <linux/ftrace.h>
+#include <linux/slab.h>
#include <asm/tlb.h>
#include <asm/irq_regs.h>
/* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */
static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
{
+ /*
+ * Strictly speaking this rcu_read_lock() is not needed since the
+ * task_group is tied to the cgroup, which in turn can never go away
+ * as long as there are tasks attached to it.
+ *
+ * However, since task_group() uses task_subsys_state(), which is an
+ * rcu_dereference() user, taking rcu_read_lock() here quiets
+ * CONFIG_PROVE_RCU.
+ */
+ rcu_read_lock();
#ifdef CONFIG_FAIR_GROUP_SCHED
p->se.cfs_rq = task_group(p)->cfs_rq[cpu];
p->se.parent = task_group(p)->se[cpu];
#endif

#ifdef CONFIG_RT_GROUP_SCHED
p->rt.rt_rq = task_group(p)->rt_rq[cpu];
p->rt.parent = task_group(p)->rt_se[cpu];
#endif
+ rcu_read_unlock();
}
#else
static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
#endif	/* CONFIG_CGROUP_SCHED */
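For reference, task_group() in this tree resolves the group through task_subsys_state(); a rough sketch of that shape (illustrative only, the exact definition varies with the group-scheduling config):

static inline struct task_group *task_group(struct task_struct *p)
{
	/* task_subsys_state() is an rcu_dereference() user, which is why
	 * set_task_rq() above takes rcu_read_lock() under CONFIG_PROVE_RCU */
	return container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
			    struct task_group, css);
}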
+#define rcu_dereference_check_sched_domain(p) \
+ rcu_dereference_check((p), \
+ rcu_read_lock_sched_held() || \
+ lockdep_is_held(&sched_domains_mutex))
+
/*
 * The domain tree (rq->sd) is protected by RCU's quiescent state transition.
 * See detach_destroy_domains: synchronize_sched for details.
 *
 * The domain tree of any CPU may only be accessed from within
 * preempt-disabled sections.
 */
#define for_each_domain(cpu, __sd) \
- for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
+ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent)
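A minimal caller sketch (not part of the patch) of the rule this macro now asserts: the domain tree may only be walked with preemption disabled, under rcu_read_lock_sched(), or while holding sched_domains_mutex, which is exactly the condition handed to lockdep above:

static int example_count_domain_levels(int cpu)
{
	struct sched_domain *sd;
	int levels = 0;

	preempt_disable();	/* satisfies rcu_read_lock_sched_held() */
	for_each_domain(cpu, sd)
		levels++;
	preempt_enable();

	return levels;
}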
#define cpu_rq(cpu) (&per_cpu(runqueues, (cpu)))
#define this_rq() (&__get_cpu_var(runqueues))
static struct sched_group *group_of(int cpu)
{
- struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd);
+ struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd);
if (!sd)
return NULL;
#ifdef CONFIG_FAIR_GROUP_SCHED
-static __read_mostly unsigned long *update_shares_data;
+static __read_mostly unsigned long __percpu *update_shares_data;
static void __set_se_shares(struct sched_entity *se, unsigned long shares);
{
int cpu, orig_cpu, this_cpu, success = 0;
unsigned long flags;
- struct rq *rq, *orig_rq;
+ struct rq *rq;
if (!sched_feat(SYNC_WAKEUPS))
wake_flags &= ~WF_SYNC;
this_cpu = get_cpu();
smp_wmb();
- rq = orig_rq = task_rq_lock(p, &flags);
+ rq = task_rq_lock(p, &flags);
update_rq_clock(rq);
if (!(p->state & state))
goto out;
{
unsigned long flags;
struct rq *rq;
- int cpu = get_cpu();
+ int cpu __maybe_unused = get_cpu();
#ifdef CONFIG_SMP
/*
*/
prev_state = prev->state;
finish_arch_switch(prev);
- perf_event_task_sched_in(current, cpu_of(rq));
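+ /*
+ * perf_event_task_sched_in() expects to run with interrupts disabled;
+ * architectures that context-switch with interrupts enabled
+ * (__ARCH_WANT_INTERRUPTS_ON_CTXSW) mask them around the call.
+ */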
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ local_irq_disable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
+ perf_event_task_sched_in(current);
+#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
+ local_irq_enable();
+#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
finish_lock_switch(rq, prev);
fire_sched_in_preempt_notifiers(current);
curr->sched_class->task_tick(rq, curr, 0);
raw_spin_unlock(&rq->lock);
- perf_event_task_tick(curr, cpu);
+ perf_event_task_tick(curr);
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_cpu(cpu);
if (likely(prev != next)) {
sched_info_switch(prev, next);
- perf_event_task_sched_out(prev, next, cpu);
+ perf_event_task_sched_out(prev, next);
rq->nr_switches++;
rq->curr = next;
* the mutex owner just released it and exited.
*/
if (probe_kernel_address(&owner->cpu, cpu))
- goto out;
+ return 0;
#else
cpu = owner->cpu;
#endif
* the cpu field may no longer be valid.
*/
if (cpu >= nr_cpumask_bits)
- goto out;
+ return 0;
/*
* We need to validate that we can do a
* get_cpu() and that we have the percpu area.
*/
if (!cpu_online(cpu))
- goto out;
+ return 0;
rq = cpu_rq(cpu);
cpu_relax();
}
-out:
+
return 1;
}
#endif
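The return value keeps its old meaning, only the exit path changes: 0 tells the caller to stop optimistic spinning (owner not running on a CPU, CPU offline, or the owner data no longer valid), 1 means the owner changed and the lock is worth re-checking. Roughly how the spin loop in __mutex_lock_common() consumes it (sketch, assuming the mainline caller of this era):

	owner = ACCESS_ONCE(lock->owner);
	if (owner && !mutex_spin_on_owner(lock, owner))
		break;	/* stop spinning and fall back to blocking */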
/* convert nice value [19,-20] to rlimit style value [1,40] */
int nice_rlim = 20 - nice;
- return (nice_rlim <= p->signal->rlim[RLIMIT_NICE].rlim_cur ||
+ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
capable(CAP_SYS_NICE));
}
if (!lock_task_sighand(p, &flags))
return -ESRCH;
- rlim_rtprio = p->signal->rlim[RLIMIT_RTPRIO].rlim_cur;
+ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO);
unlock_task_sighand(p, &flags);
/* can't set/change the rt policy */
int ret;
cpumask_var_t mask;
- if (len < cpumask_size())
+ if ((len * BITS_PER_BYTE) < nr_cpu_ids)
+ return -EINVAL;
+ if (len & (sizeof(unsigned long)-1))
return -EINVAL;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
ret = sched_getaffinity(pid, mask);
if (ret == 0) {
- if (copy_to_user(user_mask_ptr, mask, cpumask_size()))
+ size_t retlen = min_t(size_t, len, cpumask_size());
+
+ if (copy_to_user(user_mask_ptr, mask, retlen))
ret = -EFAULT;
else
- ret = cpumask_size();
+ ret = retlen;
}
free_cpumask_var(mask);
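With the relaxed check, user space only needs a buffer large enough for nr_cpu_ids bits (in whole unsigned longs) rather than the full compile-time NR_CPUS mask, and the syscall returns the number of bytes actually copied instead of cpumask_size(). A hedged userspace sketch using the glibc wrapper:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

int main(void)
{
	cpu_set_t mask;

	/* glibc passes sizeof(cpu_set_t) as len to the raw syscall */
	if (sched_getaffinity(0, sizeof(mask), &mask) != 0) {
		perror("sched_getaffinity");
		return 1;
	}
	printf("CPUs in affinity mask: %d\n", CPU_COUNT(&mask));
	return 0;
}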
get_task_struct(mt);
task_rq_unlock(rq, &flags);
- wake_up_process(rq->migration_thread);
+ wake_up_process(mt);
put_task_struct(mt);
wait_for_completion(&req.done);
tlb_migrate_finish(p->mm);
#ifdef CONFIG_SCHED_MC
static ssize_t sched_mc_power_savings_show(struct sysdev_class *class,
+ struct sysdev_class_attribute *attr,
char *page)
{
return sprintf(page, "%u\n", sched_mc_power_savings);
}
static ssize_t sched_mc_power_savings_store(struct sysdev_class *class,
+ struct sysdev_class_attribute *attr,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 0);
#ifdef CONFIG_SCHED_SMT
static ssize_t sched_smt_power_savings_show(struct sysdev_class *dev,
+ struct sysdev_class_attribute *attr,
char *page)
{
return sprintf(page, "%u\n", sched_smt_power_savings);
}
static ssize_t sched_smt_power_savings_store(struct sysdev_class *dev,
+ struct sysdev_class_attribute *attr,
const char *buf, size_t count)
{
return sched_power_savings_store(buf, count, 1);
struct cpuacct {
struct cgroup_subsys_state css;
/* cpuusage holds pointer to a u64-type object on every cpu */
- u64 *cpuusage;
+ u64 __percpu *cpuusage;
struct percpu_counter cpustat[CPUACCT_STAT_NSTATS];
struct cpuacct *parent;
};
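The __percpu annotation marks cpuusage as a per-cpu cookie rather than a plain kernel pointer, so sparse can flag direct dereferences. A short sketch of the assumed allocation and read pattern (helper names here are illustrative):

static int example_cpuacct_alloc(struct cpuacct *ca)
{
	/* one u64 usage counter per possible CPU */
	ca->cpuusage = alloc_percpu(u64);
	return ca->cpuusage ? 0 : -ENOMEM;
}

static u64 example_cpuacct_read(struct cpuacct *ca, int cpu)
{
	/* per_cpu_ptr() converts the __percpu cookie into a real pointer */
	return *per_cpu_ptr(ca->cpuusage, cpu);
}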