[SCSI] zfcp: Remove busid macro

[safe/jmp/linux-2.6] / kernel / sched.c
diff --git a/kernel/sched.c b/kernel/sched.c

index ebaf432..748ff92 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -118,6 +118,12 @@
   */
  #define RUNTIME_INF    ((u64)~0ULL)
  
+DEFINE_TRACE(sched_wait_task);
+DEFINE_TRACE(sched_wakeup);
+DEFINE_TRACE(sched_wakeup_new);
+DEFINE_TRACE(sched_switch);
+DEFINE_TRACE(sched_migrate_task);
+
  #ifdef CONFIG_SMP
  /*
   * Divide a load by a sched group cpu_power : (load / sg->__cpu_power)
@@ -261,6 +267,10 @@ struct task_group {
         struct cgroup_subsys_state css;
  #endif
  
+#ifdef CONFIG_USER_SCHED
+       uid_t uid;
+#endif
+
  #ifdef CONFIG_FAIR_GROUP_SCHED
         /* schedulable entities of this group on each cpu */
         struct sched_entity **se;
@@ -286,6 +296,12 @@ struct task_group {
  
  #ifdef CONFIG_USER_SCHED
  
+/* Helper to pass uid information to create_sched_user(): sets user->tg->uid */
+void set_tg_uid(struct user_struct *user)
+{
+       user->tg->uid = user->uid;
+}
+
  /*
   * Root task group.
   *     Every UID task group (including init_task_group aka UID-0) will
@@ -345,7 +361,9 @@ static inline struct task_group *task_group(struct task_struct *p)
         struct task_group *tg;
  
  #ifdef CONFIG_USER_SCHED
-       tg = p->user->tg;
+       rcu_read_lock();
+       tg = __task_cred(p)->user->tg;
+       rcu_read_unlock();
  #elif defined(CONFIG_CGROUP_SCHED)
         tg = container_of(task_subsys_state(p, cpu_cgroup_subsys_id),
                                 struct task_group, css);
@@ -399,7 +417,7 @@ struct cfs_rq {
          */
         struct sched_entity *curr, *next, *last;
  
-       unsigned long nr_spread_over;
+       unsigned int nr_spread_over;
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
         struct rq *rq;  /* cpu runqueue to which this cfs_rq is attached */
@@ -586,6 +604,8 @@ struct rq {
  #ifdef CONFIG_SCHEDSTATS
         /* latency stats */
         struct sched_info rq_sched_info;
+       unsigned long long rq_cpu_time;
+       /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
  
         /* sys_sched_yield() stats */
         unsigned int yld_exp_empty;
@@ -949,6 +969,14 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
         }
  }
  
+void task_rq_unlock_wait(struct task_struct *p)
+{
+       struct rq *rq = task_rq(p);
+
+       smp_mb(); /* spin_unlock_wait() is not a full memory barrier */
+       spin_unlock_wait(&rq->lock);
+}
+
  static void __task_rq_unlock(struct rq *rq)
         __releases(rq->lock)
  {
@@ -1425,9 +1453,12 @@ static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
  static unsigned long cpu_avg_load_per_task(int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
+       unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
  
-       if (rq->nr_running)
-               rq->avg_load_per_task = rq->load.weight / rq->nr_running;
+       if (nr_running)
+               rq->avg_load_per_task = rq->load.weight / nr_running;
+       else
+               rq->avg_load_per_task = 0;
  
         return rq->avg_load_per_task;
  }
@@ -1443,27 +1474,13 @@ static void
  update_group_shares_cpu(struct task_group *tg, int cpu,
                         unsigned long sd_shares, unsigned long sd_rq_weight)
  {
-       int boost = 0;
         unsigned long shares;
         unsigned long rq_weight;
  
         if (!tg->se[cpu])
                 return;
  
-       rq_weight = tg->cfs_rq[cpu]->load.weight;
-
-       /*
-        * If there are currently no tasks on the cpu pretend there is one of
-        * average load so that when a new task gets to run here it will not
-        * get delayed by group starvation.
-        */
-       if (!rq_weight) {
-               boost = 1;
-               rq_weight = NICE_0_LOAD;
-       }
-
-       if (unlikely(rq_weight > sd_rq_weight))
-               rq_weight = sd_rq_weight;
+       rq_weight = tg->cfs_rq[cpu]->rq_weight;
  
         /*
          *           \Sum shares * rq_weight
@@ -1471,7 +1488,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
          *               \Sum rq_weight
          *
          */
-       shares = (sd_shares * rq_weight) / (sd_rq_weight + 1);
+       shares = (sd_shares * rq_weight) / sd_rq_weight;
         shares = clamp_t(unsigned long, shares, MIN_SHARES, MAX_SHARES);
  
         if (abs(shares - tg->se[cpu]->load.weight) >
@@ -1480,11 +1497,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                 unsigned long flags;
  
                 spin_lock_irqsave(&rq->lock, flags);
-               /*
-                * record the actual number of shares, not the boosted amount.
-                */
-               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
-               tg->cfs_rq[cpu]->rq_weight = rq_weight;
+               tg->cfs_rq[cpu]->shares = shares;
  
                 __set_se_shares(tg->se[cpu], shares);
                 spin_unlock_irqrestore(&rq->lock, flags);
@@ -1498,13 +1511,23 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
   */
  static int tg_shares_up(struct task_group *tg, void *data)
  {
-       unsigned long rq_weight = 0;
+       unsigned long weight, rq_weight = 0;
         unsigned long shares = 0;
         struct sched_domain *sd = data;
         int i;
  
         for_each_cpu_mask(i, sd->span) {
-               rq_weight += tg->cfs_rq[i]->load.weight;
+               /*
+                * If there are currently no tasks on the cpu pretend there
+                * is one of average load so that when a new task gets to
+                * run here it will not get delayed by group starvation.
+                */
+               weight = tg->cfs_rq[i]->load.weight;
+               if (!weight)
+                       weight = NICE_0_LOAD;
+
+               tg->cfs_rq[i]->rq_weight = weight;
+               rq_weight += weight;
                 shares += tg->cfs_rq[i]->shares;
         }
  
@@ -1514,9 +1537,6 @@ static int tg_shares_up(struct task_group *tg, void *data)
         if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                 shares = tg->shares;
  
-       if (!rq_weight)
-               rq_weight = cpus_weight(sd->span) * NICE_0_LOAD;
-
         for_each_cpu_mask(i, sd->span)
                 update_group_shares_cpu(tg, i, shares, rq_weight);
  
@@ -1581,6 +1601,39 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
  
  #endif
  
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(this_rq->lock)
+       __acquires(busiest->lock)
+       __acquires(this_rq->lock)
+{
+       int ret = 0;    /* set to 1 if this_rq->lock was dropped and re-taken */
+
+       if (unlikely(!irqs_disabled())) {
+               /* printk() doesn't work well under rq->lock */
+               spin_unlock(&this_rq->lock);
+               BUG_ON(1);
+       }
+       if (unlikely(!spin_trylock(&busiest->lock))) {
+               if (busiest < this_rq) { /* lower-addressed lock goes first */
+                       spin_unlock(&this_rq->lock);
+                       spin_lock(&busiest->lock);
+                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
+                       ret = 1;
+               } else
+                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
+       }
+       return ret;
+}
+
+static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
+       __releases(busiest->lock)
+{
+       spin_unlock(&busiest->lock);
+       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
+}
  #endif
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
@@ -1814,6 +1867,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
  
         clock_offset = old_rq->clock - new_rq->clock;
  
+       trace_sched_migrate_task(p, task_cpu(p), new_cpu);
+
  #ifdef CONFIG_SCHEDSTATS
         if (p->se.wait_start)
                 p->se.wait_start -= clock_offset;
@@ -2223,6 +2278,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
  
         smp_wmb();
         rq = task_rq_lock(p, &flags);
+       update_rq_clock(rq);
         old_state = p->state;
         if (!(old_state & state))
                 goto out;
@@ -2280,12 +2336,11 @@ out_activate:
                 schedstat_inc(p, se.nr_wakeups_local);
         else
                 schedstat_inc(p, se.nr_wakeups_remote);
-       update_rq_clock(rq);
         activate_task(rq, p, 1);
         success = 1;
  
  out_running:
-       trace_sched_wakeup(rq, p);
+       trace_sched_wakeup(rq, p, success);
         check_preempt_curr(rq, p, sync);
  
         p->state = TASK_RUNNING;
@@ -2418,7 +2473,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
                 p->sched_class->task_new(rq, p);
                 inc_nr_running(rq);
         }
-       trace_sched_wakeup_new(rq, p);
+       trace_sched_wakeup_new(rq, p, 1);
         check_preempt_curr(rq, p, 0);
  #ifdef CONFIG_SMP
         if (p->sched_class->task_wake_up)
@@ -2781,40 +2836,6 @@ static void double_rq_unlock(struct rq *rq1, struct rq *rq2)
  }
  
  /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
- */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(this_rq->lock)
-       __acquires(busiest->lock)
-       __acquires(this_rq->lock)
-{
-       int ret = 0;
-
-       if (unlikely(!irqs_disabled())) {
-               /* printk() doesn't work good under rq->lock */
-               spin_unlock(&this_rq->lock);
-               BUG_ON(1);
-       }
-       if (unlikely(!spin_trylock(&busiest->lock))) {
-               if (busiest < this_rq) {
-                       spin_unlock(&this_rq->lock);
-                       spin_lock(&busiest->lock);
-                       spin_lock_nested(&this_rq->lock, SINGLE_DEPTH_NESTING);
-                       ret = 1;
-               } else
-                       spin_lock_nested(&busiest->lock, SINGLE_DEPTH_NESTING);
-       }
-       return ret;
-}
-
-static void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
-       __releases(busiest->lock)
-{
-       spin_unlock(&busiest->lock);
-       lock_set_subclass(&this_rq->lock.dep_map, 0, _RET_IP_);
-}
-
-/*
   * If dest_cpu is allowed for this process, migrate the task to it.
   * This is accomplished by forcing the cpu_allowed mask to only
   * allow dest_cpu, which will force the cpu onto dest_cpu. Then
@@ -2831,7 +2852,6 @@ static void sched_migrate_task(struct task_struct *p, int dest_cpu)
             || unlikely(!cpu_active(dest_cpu)))
                 goto out;
  
-       trace_sched_migrate_task(rq, p, dest_cpu);
         /* force the process onto the specified CPU */
         if (migrate_task(p, dest_cpu, &req)) {
                 /* Need to wait for migration thread (might exit: take ref). */
@@ -3676,7 +3696,7 @@ out_balanced:
  static void idle_balance(int this_cpu, struct rq *this_rq)
  {
         struct sched_domain *sd;
-       int pulled_task = -1;
+       int pulled_task = 0;
         unsigned long next_balance = jiffies + HZ;
         cpumask_t tmpmask;
  
@@ -5103,6 +5123,22 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
         set_load_weight(p);
  }
  
+/*
+ * Return true if current's euid matches the target task's euid or uid.
+ */
+static bool check_same_owner(struct task_struct *p)
+{
+       const struct cred *cred = current_cred(), *pcred;
+       bool match;
+
+       rcu_read_lock();
+       pcred = __task_cred(p);
+       match = (cred->euid == pcred->euid ||
+                cred->euid == pcred->uid);
+       rcu_read_unlock();
+       return match;
+}
+
  static int __sched_setscheduler(struct task_struct *p, int policy,
                                 struct sched_param *param, bool user)
  {
@@ -5162,8 +5198,7 @@ recheck:
                         return -EPERM;
  
                 /* can't change other user's priorities */
-               if ((current->euid != p->euid) &&
-                   (current->euid != p->uid))
+               if (!check_same_owner(p))
                         return -EPERM;
         }
  
@@ -5395,8 +5430,7 @@ long sched_setaffinity(pid_t pid, const cpumask_t *in_mask)
         read_unlock(&tasklist_lock);
  
         retval = -EPERM;
-       if ((current->euid != p->euid) && (current->euid != p->uid) &&
-                       !capable(CAP_SYS_NICE))
+       if (!check_same_owner(p) && !capable(CAP_SYS_NICE))
                 goto out_unlock;
  
         retval = security_task_setscheduler(p, 0, NULL);
@@ -5840,6 +5874,8 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         struct rq *rq = cpu_rq(cpu);
         unsigned long flags;
  
+       spin_lock_irqsave(&rq->lock, flags);
+
         __sched_fork(idle);
         idle->se.exec_start = sched_clock();
  
@@ -5847,7 +5883,6 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
         idle->cpus_allowed = cpumask_of_cpu(cpu);
         __set_task_cpu(idle, cpu);
  
-       spin_lock_irqsave(&rq->lock, flags);
         rq->curr = rq->idle = idle;
  #if defined(CONFIG_SMP) && defined(__ARCH_WANT_UNLOCKED_CTXSW)
         idle->oncpu = 1;
@@ -5864,6 +5899,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu)
          * The idle tasks have their own, simple scheduling class:
          */
         idle->sched_class = &idle_sched_class;
+       ftrace_graph_init_task(idle);
  }
  
  /*
@@ -6094,7 +6130,6 @@ static int __migrate_task_irq(struct task_struct *p, int src_cpu, int dest_cpu)
  
  /*
   * Figure out where task on dead CPU should go, use force if necessary.
- * NOTE: interrupts should be disabled by the caller
   */
  static void move_task_off_dead_cpu(int dead_cpu, struct task_struct *p)
  {
@@ -6555,7 +6590,9 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu)
                         req = list_entry(rq->migration_queue.next,
                                          struct migration_req, list);
                         list_del_init(&req->list);
+                       spin_unlock_irq(&rq->lock);
                         complete(&req->done);
+                       spin_lock_irq(&rq->lock);
                 }
                 spin_unlock_irq(&rq->lock);
                 break;
@@ -6759,6 +6796,8 @@ sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent)
                                 SD_BALANCE_EXEC |
                                 SD_SHARE_CPUPOWER |
                                 SD_SHARE_PKG_RESOURCES);
+               if (nr_node_ids == 1)
+                       pflags &= ~SD_SERIALIZE;
         }
         if (~cflags & pflags)
                 return 0;
@@ -7662,8 +7701,14 @@ static struct sched_domain_attr *dattr_cur;
   */
  static cpumask_t fallback_doms;
  
-void __attribute__((weak)) arch_update_cpu_topology(void)
+/*
+ * arch_update_cpu_topology lets virtualized architectures update the
+ * cpu core maps. It is supposed to return 1 if the topology changed
+ * or 0 if it stayed the same.
+ */
+int __attribute__((weak)) arch_update_cpu_topology(void)
  {
+       return 0;
  }
  
  /*
@@ -7740,13 +7785,14 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur,
   *
   * The passed in 'doms_new' should be kmalloc'd. This routine takes
   * ownership of it and will kfree it when done with it. If the caller
- * failed the kmalloc call, then it can pass in doms_new == NULL,
- * and partition_sched_domains() will fallback to the single partition
- * 'fallback_doms', it also forces the domains to be rebuilt.
+ * failed the kmalloc call, then it can pass in doms_new == NULL &&
+ * ndoms_new == 1, and partition_sched_domains() will fallback to
+ * the single partition 'fallback_doms', it also forces the domains
+ * to be rebuilt.
   *
- * If doms_new==NULL it will be replaced with cpu_online_map.
- * ndoms_new==0 is a special case for destroying existing domains.
- * It will not create the default domain.
+ * If doms_new == NULL it will be replaced with cpu_online_map.
+ * ndoms_new == 0 is a special case for destroying existing domains,
+ * and it will not create the default domain.
   *
   * Call with hotplug lock held
   */
@@ -7754,17 +7800,21 @@ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new,
                              struct sched_domain_attr *dattr_new)
  {
         int i, j, n;
+       int new_topology;
  
         mutex_lock(&sched_domains_mutex);
  
         /* always unregister in case we don't destroy any domains */
         unregister_sched_domain_sysctl();
  
+       /* Let architecture update cpu core mappings. */
+       new_topology = arch_update_cpu_topology();
+
         n = doms_new ? ndoms_new : 0;
  
         /* Destroy deleted domains */
         for (i = 0; i < ndoms_cur; i++) {
-               for (j = 0; j < n; j++) {
+               for (j = 0; j < n && !new_topology; j++) {
                         if (cpus_equal(doms_cur[i], doms_new[j])
                             && dattrs_equal(dattr_cur, i, dattr_new, j))
                                 goto match1;
@@ -7784,7 +7834,7 @@ match1:
  
         /* Build new domains */
         for (i = 0; i < ndoms_new; i++) {
-               for (j = 0; j < ndoms_cur; j++) {
+               for (j = 0; j < ndoms_cur && !new_topology; j++) {
                         if (cpus_equal(doms_new[i], doms_cur[j])
                             && dattrs_equal(dattr_new, i, dattr_cur, j))
                                 goto match2;
@@ -9251,6 +9301,41 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
         kfree(ca);
  }
  
+static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
+{
+       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+       u64 data;
+
+#ifndef CONFIG_64BIT
+       /*
+        * Take rq->lock to make 64-bit read safe on 32-bit platforms.
+        */
+       spin_lock_irq(&cpu_rq(cpu)->lock);
+       data = *cpuusage;
+       spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+       data = *cpuusage;
+#endif
+
+       return data;
+}
+
+static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
+{
+       u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+
+#ifndef CONFIG_64BIT
+       /*
+        * Take rq->lock to make 64-bit write safe on 32-bit platforms.
+        */
+       spin_lock_irq(&cpu_rq(cpu)->lock);
+       *cpuusage = val;
+       spin_unlock_irq(&cpu_rq(cpu)->lock);
+#else
+       *cpuusage = val;
+#endif
+}
+
  /* return total cpu usage (in nanoseconds) of a group */
  static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
  {
@@ -9258,17 +9343,8 @@ static u64 cpuusage_read(struct cgroup *cgrp, struct cftype *cft)
         u64 totalcpuusage = 0;
         int i;
  
-       for_each_possible_cpu(i) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
-
-               /*
-                * Take rq->lock to make 64-bit addition safe on 32-bit
-                * platforms.
-                */
-               spin_lock_irq(&cpu_rq(i)->lock);
-               totalcpuusage += *cpuusage;
-               spin_unlock_irq(&cpu_rq(i)->lock);
-       }
+       for_each_present_cpu(i)
+               totalcpuusage += cpuacct_cpuusage_read(ca, i);
  
         return totalcpuusage;
  }
@@ -9285,23 +9361,39 @@ static int cpuusage_write(struct cgroup *cgrp, struct cftype *cftype,
                 goto out;
         }
  
-       for_each_possible_cpu(i) {
-               u64 *cpuusage = percpu_ptr(ca->cpuusage, i);
+       for_each_present_cpu(i)
+               cpuacct_cpuusage_write(ca, i, 0);
  
-               spin_lock_irq(&cpu_rq(i)->lock);
-               *cpuusage = 0;
-               spin_unlock_irq(&cpu_rq(i)->lock);
-       }
  out:
         return err;
  }
  
+static int cpuacct_percpu_seq_read(struct cgroup *cgroup, struct cftype *cft,
+                                  struct seq_file *m)
+{
+       struct cpuacct *ca = cgroup_ca(cgroup);
+       u64 percpu;
+       int i;
+
+       for_each_present_cpu(i) {
+               percpu = cpuacct_cpuusage_read(ca, i);
+               seq_printf(m, "%llu ", (unsigned long long) percpu);
+       }
+       seq_printf(m, "\n");
+       return 0;
+}
+
  static struct cftype files[] = {
         {
                 .name = "usage",
                 .read_u64 = cpuusage_read,
                 .write_u64 = cpuusage_write,
         },
+       {
+               .name = "usage_percpu",
+               .read_seq_string = cpuacct_percpu_seq_read,
+       },
+
  };
  
  static int cpuacct_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)