sched: fix mysql+oltp regression
[safe/jmp/linux-2.6] / kernel / posix-cpu-timers.c
index 7a51a55..c42a03a 100644 (file)
@@ -4,10 +4,11 @@
 
 #include <linux/sched.h>
 #include <linux/posix-timers.h>
-#include <asm/uaccess.h>
 #include <linux/errno.h>
+#include <linux/math64.h>
+#include <asm/uaccess.h>
 
-static int check_clock(clockid_t which_clock)
+static int check_clock(const clockid_t which_clock)
 {
        int error = 0;
        struct task_struct *p;
@@ -20,9 +21,9 @@ static int check_clock(clockid_t which_clock)
                return 0;
 
        read_lock(&tasklist_lock);
-       p = find_task_by_pid(pid);
-       if (!p || (CPUCLOCK_PERTHREAD(which_clock) ?
-                  p->tgid != current->tgid : p->tgid != pid)) {
+       p = find_task_by_vpid(pid);
+       if (!p || !(CPUCLOCK_PERTHREAD(which_clock) ?
+                  same_thread_group(p, current) : thread_group_leader(p))) {
                error = -EINVAL;
        }
        read_unlock(&tasklist_lock);
@@ -31,31 +32,29 @@ static int check_clock(clockid_t which_clock)
 }
 
 static inline union cpu_time_count
-timespec_to_sample(clockid_t which_clock, const struct timespec *tp)
+timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
 {
        union cpu_time_count ret;
        ret.sched = 0;          /* high half always zero when .cpu used */
        if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-               ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
+               ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
        } else {
                ret.cpu = timespec_to_cputime(tp);
        }
        return ret;
 }
 
-static void sample_to_timespec(clockid_t which_clock,
+static void sample_to_timespec(const clockid_t which_clock,
                               union cpu_time_count cpu,
                               struct timespec *tp)
 {
-       if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-               tp->tv_sec = div_long_long_rem(cpu.sched,
-                                              NSEC_PER_SEC, &tp->tv_nsec);
-       } else {
+       if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED)
+               *tp = ns_to_timespec(cpu.sched);
+       else
                cputime_to_timespec(cpu.cpu, tp);
-       }
 }
 
-static inline int cpu_time_before(clockid_t which_clock,
+static inline int cpu_time_before(const clockid_t which_clock,
                                  union cpu_time_count now,
                                  union cpu_time_count then)
 {
@@ -65,7 +64,7 @@ static inline int cpu_time_before(clockid_t which_clock,
                return cputime_lt(now.cpu, then.cpu);
        }
 }
-static inline void cpu_time_add(clockid_t which_clock,
+static inline void cpu_time_add(const clockid_t which_clock,
                                union cpu_time_count *acc,
                                union cpu_time_count val)
 {
@@ -75,7 +74,7 @@ static inline void cpu_time_add(clockid_t which_clock,
                acc->cpu = cputime_add(acc->cpu, val.cpu);
        }
 }
-static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
+static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
                                                union cpu_time_count a,
                                                union cpu_time_count b)
 {
@@ -88,10 +87,23 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
 }
 
 /*
+ * Divide and limit the result to res >= 1
+ *
+ * This is necessary to prevent signal delivery starvation, when the result of
+ * the division would be rounded down to 0.
+ */
+static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
+{
+       cputime_t res = cputime_div(time, div);
+
+       return max_t(cputime_t, res, 1);
+}
+
+/*
  * Update expiry time from increment, and increase overrun count,
  * given the current clock sample.
  */
-static inline void bump_cpu_timer(struct k_itimer *timer,
+static void bump_cpu_timer(struct k_itimer *timer,
                                  union cpu_time_count now)
 {
        int i;
@@ -110,7 +122,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
                for (i = 0; incr < delta - incr; i++)
                        incr = incr << 1;
                for (; i >= 0; incr >>= 1, i--) {
-                       if (delta <= incr)
+                       if (delta < incr)
                                continue;
                        timer->it.cpu.expires.sched += incr;
                        timer->it_overrun += 1 << i;
@@ -128,7 +140,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
                for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
                             incr = cputime_add(incr, incr);
                for (; i >= 0; incr = cputime_halve(incr), i--) {
-                       if (cputime_le(delta, incr))
+                       if (cputime_lt(delta, incr))
                                continue;
                        timer->it.cpu.expires.cpu =
                                cputime_add(timer->it.cpu.expires.cpu, incr);
@@ -148,10 +160,10 @@ static inline cputime_t virt_ticks(struct task_struct *p)
 }
 static inline unsigned long long sched_ns(struct task_struct *p)
 {
-       return (p == current) ? current_sched_time(p) : p->sched_time;
+       return task_sched_runtime(p);
 }
 
-int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
        int error = check_clock(which_clock);
        if (!error) {
@@ -169,7 +181,7 @@ int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
        return error;
 }
 
-int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp)
+int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
 {
        /*
         * You can never reset a CPU clock, but we check for other errors
@@ -186,7 +198,7 @@ int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp)
 /*
  * Sample a per-thread clock for the given task.
  */
-static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p,
+static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
                            union cpu_time_count *cpu)
 {
        switch (CPUCLOCK_WHICH(which_clock)) {
@@ -233,23 +245,12 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
                } while (t != p);
                break;
        case CPUCLOCK_SCHED:
-               cpu->sched = p->signal->sched_time;
+               cpu->sched = p->signal->sum_sched_runtime;
                /* Add in each other live thread.  */
                while ((t = next_thread(t)) != p) {
-                       cpu->sched += t->sched_time;
-               }
-               if (p->tgid == current->tgid) {
-                       /*
-                        * We're sampling ourselves, so include the
-                        * cycles not yet banked.  We still omit
-                        * other threads running on other CPUs,
-                        * so the total can always be behind as
-                        * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ).
-                        */
-                       cpu->sched += current_sched_time(current);
-               } else {
-                       cpu->sched += p->sched_time;
+                       cpu->sched += t->se.sum_exec_runtime;
                }
+               cpu->sched += sched_ns(p);
                break;
        }
        return 0;
@@ -259,7 +260,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
  * Sample a process (thread group) clock for the given group_leader task.
  * Must be called with tasklist_lock held for reading.
  */
-static int cpu_clock_sample_group(clockid_t which_clock,
+static int cpu_clock_sample_group(const clockid_t which_clock,
                                  struct task_struct *p,
                                  union cpu_time_count *cpu)
 {
@@ -273,7 +274,7 @@ static int cpu_clock_sample_group(clockid_t which_clock,
 }
 
 
-int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 {
        const pid_t pid = CPUCLOCK_PID(which_clock);
        int error = -EINVAL;
@@ -302,20 +303,25 @@ int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
                 * should be able to see it.
                 */
                struct task_struct *p;
-               read_lock(&tasklist_lock);
-               p = find_task_by_pid(pid);
+               rcu_read_lock();
+               p = find_task_by_vpid(pid);
                if (p) {
                        if (CPUCLOCK_PERTHREAD(which_clock)) {
-                               if (p->tgid == current->tgid) {
+                               if (same_thread_group(p, current)) {
                                        error = cpu_clock_sample(which_clock,
                                                                 p, &rtn);
                                }
-                       } else if (p->tgid == pid && p->signal) {
-                               error = cpu_clock_sample_group(which_clock,
-                                                              p, &rtn);
+                       } else {
+                               read_lock(&tasklist_lock);
+                               if (thread_group_leader(p) && p->signal) {
+                                       error =
+                                           cpu_clock_sample_group(which_clock,
+                                                                  p, &rtn);
+                               }
+                               read_unlock(&tasklist_lock);
                        }
                }
-               read_unlock(&tasklist_lock);
+               rcu_read_unlock();
        }
 
        if (error)
@@ -347,16 +353,16 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
                if (pid == 0) {
                        p = current;
                } else {
-                       p = find_task_by_pid(pid);
-                       if (p && p->tgid != current->tgid)
+                       p = find_task_by_vpid(pid);
+                       if (p && !same_thread_group(p, current))
                                p = NULL;
                }
        } else {
                if (pid == 0) {
                        p = current->group_leader;
                } else {
-                       p = find_task_by_pid(pid);
-                       if (p && p->tgid != pid)
+                       p = find_task_by_vpid(pid);
+                       if (p && !thread_group_leader(p))
                                p = NULL;
                }
        }
@@ -380,14 +386,9 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
 int posix_cpu_timer_del(struct k_itimer *timer)
 {
        struct task_struct *p = timer->it.cpu.task;
+       int ret = 0;
 
-       if (timer->it.cpu.firing)
-               return TIMER_RETRY;
-
-       if (unlikely(p == NULL))
-               return 0;
-
-       if (!list_empty(&timer->it.cpu.entry)) {
+       if (likely(p != NULL)) {
                read_lock(&tasklist_lock);
                if (unlikely(p->signal == NULL)) {
                        /*
@@ -396,18 +397,20 @@ int posix_cpu_timer_del(struct k_itimer *timer)
                         */
                        BUG_ON(!list_empty(&timer->it.cpu.entry));
                } else {
-                       /*
-                        * Take us off the task's timer list.
-                        */
                        spin_lock(&p->sighand->siglock);
-                       list_del(&timer->it.cpu.entry);
+                       if (timer->it.cpu.firing)
+                               ret = TIMER_RETRY;
+                       else
+                               list_del(&timer->it.cpu.entry);
                        spin_unlock(&p->sighand->siglock);
                }
                read_unlock(&tasklist_lock);
+
+               if (!ret)
+                       put_task_struct(p);
        }
-       put_task_struct(p);
 
-       return 0;
+       return ret;
 }
 
 /*
@@ -418,14 +421,12 @@ int posix_cpu_timer_del(struct k_itimer *timer)
  */
 static void cleanup_timers(struct list_head *head,
                           cputime_t utime, cputime_t stime,
-                          unsigned long long sched_time)
+                          unsigned long long sum_exec_runtime)
 {
        struct cpu_timer_list *timer, *next;
        cputime_t ptime = cputime_add(utime, stime);
 
        list_for_each_entry_safe(timer, next, head, entry) {
-               put_task_struct(timer->task);
-               timer->task = NULL;
                list_del_init(&timer->entry);
                if (cputime_lt(timer->expires.cpu, ptime)) {
                        timer->expires.cpu = cputime_zero;
@@ -437,8 +438,6 @@ static void cleanup_timers(struct list_head *head,
 
        ++head;
        list_for_each_entry_safe(timer, next, head, entry) {
-               put_task_struct(timer->task);
-               timer->task = NULL;
                list_del_init(&timer->entry);
                if (cputime_lt(timer->expires.cpu, utime)) {
                        timer->expires.cpu = cputime_zero;
@@ -450,13 +449,11 @@ static void cleanup_timers(struct list_head *head,
 
        ++head;
        list_for_each_entry_safe(timer, next, head, entry) {
-               put_task_struct(timer->task);
-               timer->task = NULL;
                list_del_init(&timer->entry);
-               if (timer->expires.sched < sched_time) {
+               if (timer->expires.sched < sum_exec_runtime) {
                        timer->expires.sched = 0;
                } else {
-                       timer->expires.sched -= sched_time;
+                       timer->expires.sched -= sum_exec_runtime;
                }
        }
 }
@@ -469,7 +466,7 @@ static void cleanup_timers(struct list_head *head,
 void posix_cpu_timers_exit(struct task_struct *tsk)
 {
        cleanup_timers(tsk->cpu_timers,
-                      tsk->utime, tsk->stime, tsk->sched_time);
+                      tsk->utime, tsk->stime, tsk->se.sum_exec_runtime);
 
 }
 void posix_cpu_timers_exit_group(struct task_struct *tsk)
@@ -477,7 +474,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
        cleanup_timers(tsk->signal->cpu_timers,
                       cputime_add(tsk->utime, tsk->signal->utime),
                       cputime_add(tsk->stime, tsk->signal->stime),
-                      tsk->sched_time + tsk->signal->sched_time);
+                    tsk->se.sum_exec_runtime + tsk->signal->sum_sched_runtime);
 }
 
 
@@ -495,15 +492,18 @@ static void process_timer_rebalance(struct task_struct *p,
        struct task_struct *t = p;
        unsigned int nthreads = atomic_read(&p->signal->live);
 
+       if (!nthreads)
+               return;
+
        switch (clock_idx) {
        default:
                BUG();
                break;
        case CPUCLOCK_PROF:
-               left = cputime_div(cputime_sub(expires.cpu, val.cpu),
-                                  nthreads);
+               left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
+                                      nthreads);
                do {
-                       if (!unlikely(t->exit_state)) {
+                       if (likely(!(t->flags & PF_EXITING))) {
                                ticks = cputime_add(prof_ticks(t), left);
                                if (cputime_eq(t->it_prof_expires,
                                               cputime_zero) ||
@@ -515,10 +515,10 @@ static void process_timer_rebalance(struct task_struct *p,
                } while (t != p);
                break;
        case CPUCLOCK_VIRT:
-               left = cputime_div(cputime_sub(expires.cpu, val.cpu),
-                                  nthreads);
+               left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
+                                      nthreads);
                do {
-                       if (!unlikely(t->exit_state)) {
+                       if (likely(!(t->flags & PF_EXITING))) {
                                ticks = cputime_add(virt_ticks(t), left);
                                if (cputime_eq(t->it_virt_expires,
                                               cputime_zero) ||
@@ -532,9 +532,10 @@ static void process_timer_rebalance(struct task_struct *p,
        case CPUCLOCK_SCHED:
                nsleft = expires.sched - val.sched;
                do_div(nsleft, nthreads);
+               nsleft = max_t(unsigned long long, nsleft, 1);
                do {
-                       if (!unlikely(t->exit_state)) {
-                               ns = t->sched_time + nsleft;
+                       if (likely(!(t->flags & PF_EXITING))) {
+                               ns = t->se.sum_exec_runtime + nsleft;
                                if (t->it_sched_expires == 0 ||
                                    t->it_sched_expires > ns) {
                                        t->it_sched_expires = ns;
@@ -582,17 +583,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
        listpos = head;
        if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
                list_for_each_entry(next, head, entry) {
-                       if (next->expires.sched > nt->expires.sched) {
-                               listpos = &next->entry;
+                       if (next->expires.sched > nt->expires.sched)
                                break;
-                       }
+                       listpos = &next->entry;
                }
        } else {
                list_for_each_entry(next, head, entry) {
-                       if (cputime_gt(next->expires.cpu, nt->expires.cpu)) {
-                               listpos = &next->entry;
+                       if (cputime_gt(next->expires.cpu, nt->expires.cpu))
                                break;
-                       }
+                       listpos = &next->entry;
                }
        }
        list_add(&nt->entry, listpos);
@@ -736,9 +735,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
         * Disarm any old timer after extracting its expiry time.
         */
        BUG_ON(!irqs_disabled());
+
+       ret = 0;
        spin_lock(&p->sighand->siglock);
        old_expires = timer->it.cpu.expires;
-       list_del_init(&timer->it.cpu.entry);
+       if (unlikely(timer->it.cpu.firing)) {
+               timer->it.cpu.firing = -1;
+               ret = TIMER_RETRY;
+       } else
+               list_del_init(&timer->it.cpu.entry);
        spin_unlock(&p->sighand->siglock);
 
        /*
@@ -786,7 +791,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
                }
        }
 
-       if (unlikely(timer->it.cpu.firing)) {
+       if (unlikely(ret)) {
                /*
                 * We are colliding with the timer actually firing.
                 * Punt after filling in the timer's old value, and
@@ -794,8 +799,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
                 * it as an overrun (thanks to bump_cpu_timer above).
                 */
                read_unlock(&tasklist_lock);
-               timer->it.cpu.firing = -1;
-               ret = TIMER_RETRY;
                goto out;
        }
 
@@ -961,14 +964,17 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 static void check_thread_timers(struct task_struct *tsk,
                                struct list_head *firing)
 {
+       int maxfire;
        struct list_head *timers = tsk->cpu_timers;
+       struct signal_struct *const sig = tsk->signal;
 
+       maxfire = 20;
        tsk->it_prof_expires = cputime_zero;
        while (!list_empty(timers)) {
-               struct cpu_timer_list *t = list_entry(timers->next,
+               struct cpu_timer_list *t = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
+               if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
                        tsk->it_prof_expires = t->expires.cpu;
                        break;
                }
@@ -977,12 +983,13 @@ static void check_thread_timers(struct task_struct *tsk,
        }
 
        ++timers;
+       maxfire = 20;
        tsk->it_virt_expires = cputime_zero;
        while (!list_empty(timers)) {
-               struct cpu_timer_list *t = list_entry(timers->next,
+               struct cpu_timer_list *t = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
+               if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
                        tsk->it_virt_expires = t->expires.cpu;
                        break;
                }
@@ -991,18 +998,51 @@ static void check_thread_timers(struct task_struct *tsk,
        }
 
        ++timers;
+       maxfire = 20;
        tsk->it_sched_expires = 0;
        while (!list_empty(timers)) {
-               struct cpu_timer_list *t = list_entry(timers->next,
+               struct cpu_timer_list *t = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (tsk->sched_time < t->expires.sched) {
+               if (!--maxfire || tsk->se.sum_exec_runtime < t->expires.sched) {
                        tsk->it_sched_expires = t->expires.sched;
                        break;
                }
                t->firing = 1;
                list_move_tail(&t->entry, firing);
        }
+
+       /*
+        * Check for the special case thread timers.
+        */
+       if (sig->rlim[RLIMIT_RTTIME].rlim_cur != RLIM_INFINITY) {
+               unsigned long hard = sig->rlim[RLIMIT_RTTIME].rlim_max;
+               unsigned long *soft = &sig->rlim[RLIMIT_RTTIME].rlim_cur;
+
+               if (hard != RLIM_INFINITY &&
+                   tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) {
+                       /*
+                        * At the hard limit, we just die.
+                        * No need to calculate anything else now.
+                        */
+                       __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk);
+                       return;
+               }
+               if (tsk->rt.timeout > DIV_ROUND_UP(*soft, USEC_PER_SEC/HZ)) {
+                       /*
+                        * At the soft limit, send a SIGXCPU every second.
+                        */
+                       if (sig->rlim[RLIMIT_RTTIME].rlim_cur
+                           < sig->rlim[RLIMIT_RTTIME].rlim_max) {
+                               sig->rlim[RLIMIT_RTTIME].rlim_cur +=
+                                                               USEC_PER_SEC;
+                       }
+                       printk(KERN_INFO
+                               "RT Watchdog Timeout: %s[%d]\n",
+                               tsk->comm, task_pid_nr(tsk));
+                       __group_send_sig_info(SIGXCPU, SEND_SIG_PRIV, tsk);
+               }
+       }
 }
 
 /*
@@ -1013,9 +1053,10 @@ static void check_thread_timers(struct task_struct *tsk,
 static void check_process_timers(struct task_struct *tsk,
                                 struct list_head *firing)
 {
+       int maxfire;
        struct signal_struct *const sig = tsk->signal;
        cputime_t utime, stime, ptime, virt_expires, prof_expires;
-       unsigned long long sched_time, sched_expires;
+       unsigned long long sum_sched_runtime, sched_expires;
        struct task_struct *t;
        struct list_head *timers = sig->cpu_timers;
 
@@ -1035,55 +1076,58 @@ static void check_process_timers(struct task_struct *tsk,
         */
        utime = sig->utime;
        stime = sig->stime;
-       sched_time = sig->sched_time;
+       sum_sched_runtime = sig->sum_sched_runtime;
        t = tsk;
        do {
                utime = cputime_add(utime, t->utime);
                stime = cputime_add(stime, t->stime);
-               sched_time += t->sched_time;
+               sum_sched_runtime += t->se.sum_exec_runtime;
                t = next_thread(t);
        } while (t != tsk);
        ptime = cputime_add(utime, stime);
 
+       maxfire = 20;
        prof_expires = cputime_zero;
        while (!list_empty(timers)) {
-               struct cpu_timer_list *t = list_entry(timers->next,
+               struct cpu_timer_list *tl = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (cputime_lt(ptime, t->expires.cpu)) {
-                       prof_expires = t->expires.cpu;
+               if (!--maxfire || cputime_lt(ptime, tl->expires.cpu)) {
+                       prof_expires = tl->expires.cpu;
                        break;
                }
-               t->firing = 1;
-               list_move_tail(&t->entry, firing);
+               tl->firing = 1;
+               list_move_tail(&tl->entry, firing);
        }
 
        ++timers;
+       maxfire = 20;
        virt_expires = cputime_zero;
        while (!list_empty(timers)) {
-               struct cpu_timer_list *t = list_entry(timers->next,
+               struct cpu_timer_list *tl = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (cputime_lt(utime, t->expires.cpu)) {
-                       virt_expires = t->expires.cpu;
+               if (!--maxfire || cputime_lt(utime, tl->expires.cpu)) {
+                       virt_expires = tl->expires.cpu;
                        break;
                }
-               t->firing = 1;
-               list_move_tail(&t->entry, firing);
+               tl->firing = 1;
+               list_move_tail(&tl->entry, firing);
        }
 
        ++timers;
+       maxfire = 20;
        sched_expires = 0;
        while (!list_empty(timers)) {
-               struct cpu_timer_list *t = list_entry(timers->next,
+               struct cpu_timer_list *tl = list_first_entry(timers,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (sched_time < t->expires.sched) {
-                       sched_expires = t->expires.sched;
+               if (!--maxfire || sum_sched_runtime < tl->expires.sched) {
+                       sched_expires = tl->expires.sched;
                        break;
                }
-               t->firing = 1;
-               list_move_tail(&t->entry, firing);
+               tl->firing = 1;
+               list_move_tail(&tl->entry, firing);
        }
 
        /*
@@ -1161,19 +1205,26 @@ static void check_process_timers(struct task_struct *tsk,
                unsigned long long sched_left, sched;
                const unsigned int nthreads = atomic_read(&sig->live);
 
+               if (!nthreads)
+                       return;
+
                prof_left = cputime_sub(prof_expires, utime);
                prof_left = cputime_sub(prof_left, stime);
-               prof_left = cputime_div(prof_left, nthreads);
+               prof_left = cputime_div_non_zero(prof_left, nthreads);
                virt_left = cputime_sub(virt_expires, utime);
-               virt_left = cputime_div(virt_left, nthreads);
+               virt_left = cputime_div_non_zero(virt_left, nthreads);
                if (sched_expires) {
-                       sched_left = sched_expires - sched_time;
+                       sched_left = sched_expires - sum_sched_runtime;
                        do_div(sched_left, nthreads);
+                       sched_left = max_t(unsigned long long, sched_left, 1);
                } else {
                        sched_left = 0;
                }
                t = tsk;
                do {
+                       if (unlikely(t->flags & PF_EXITING))
+                               continue;
+
                        ticks = cputime_add(cputime_add(t->utime, t->stime),
                                            prof_left);
                        if (!cputime_eq(prof_expires, cputime_zero) &&
@@ -1189,16 +1240,12 @@ static void check_process_timers(struct task_struct *tsk,
                                t->it_virt_expires = ticks;
                        }
 
-                       sched = t->sched_time + sched_left;
+                       sched = t->se.sum_exec_runtime + sched_left;
                        if (sched_expires && (t->it_sched_expires == 0 ||
                                              t->it_sched_expires > sched)) {
                                t->it_sched_expires = sched;
                        }
-
-                       do {
-                               t = next_thread(t);
-                       } while (unlikely(t->exit_state));
-               } while (t != tsk);
+               } while ((t = next_thread(t)) != tsk);
        }
 }
 
@@ -1215,7 +1262,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
                /*
                 * The task was cleaned up already, no future firings.
                 */
-               return;
+               goto out;
 
        /*
         * Fetch the current sample and update the timer's expiry time.
@@ -1225,7 +1272,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
                bump_cpu_timer(timer, now);
                if (unlikely(p->exit_state)) {
                        clear_dead_task(timer, now);
-                       return;
+                       goto out;
                }
                read_lock(&tasklist_lock); /* arm_timer needs it.  */
        } else {
@@ -1238,8 +1285,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
                        put_task_struct(p);
                        timer->it.cpu.task = p = NULL;
                        timer->it.cpu.expires.sched = 0;
-                       read_unlock(&tasklist_lock);
-                       return;
+                       goto out_unlock;
                } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
                        /*
                         * We've noticed that the thread is dead, but
@@ -1247,8 +1293,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
                         * drop our task ref.
                         */
                        clear_dead_task(timer, now);
-                       read_unlock(&tasklist_lock);
-                       return;
+                       goto out_unlock;
                }
                cpu_clock_sample_group(timer->it_clock, p, &now);
                bump_cpu_timer(timer, now);
@@ -1260,7 +1305,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
         */
        arm_timer(timer, now);
 
+out_unlock:
        read_unlock(&tasklist_lock);
+
+out:
+       timer->it_overrun_last = timer->it_overrun;
+       timer->it_overrun = -1;
+       ++timer->it_requeue_pending;
 }
 
 /*
@@ -1281,35 +1332,35 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 
        if (UNEXPIRED(prof) && UNEXPIRED(virt) &&
            (tsk->it_sched_expires == 0 ||
-            tsk->sched_time < tsk->it_sched_expires))
+            tsk->se.sum_exec_runtime < tsk->it_sched_expires))
                return;
 
 #undef UNEXPIRED
 
-       BUG_ON(tsk->exit_state);
-
        /*
         * Double-check with locks held.
         */
        read_lock(&tasklist_lock);
-       spin_lock(&tsk->sighand->siglock);
+       if (likely(tsk->signal != NULL)) {
+               spin_lock(&tsk->sighand->siglock);
 
-       /*
-        * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
-        * all the timers that are firing, and put them on the firing list.
-        */
-       check_thread_timers(tsk, &firing);
-       check_process_timers(tsk, &firing);
+               /*
+                * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
+                * all the timers that are firing, and put them on the firing list.
+                */
+               check_thread_timers(tsk, &firing);
+               check_process_timers(tsk, &firing);
 
-       /*
-        * We must release these locks before taking any timer's lock.
-        * There is a potential race with timer deletion here, as the
-        * siglock now protects our private firing list.  We have set
-        * the firing flag in each timer, so that a deletion attempt
-        * that gets the timer lock before we do will give it up and
-        * spin until we've taken care of that timer below.
-        */
-       spin_unlock(&tsk->sighand->siglock);
+               /*
+                * We must release these locks before taking any timer's lock.
+                * There is a potential race with timer deletion here, as the
+                * siglock now protects our private firing list.  We have set
+                * the firing flag in each timer, so that a deletion attempt
+                * that gets the timer lock before we do will give it up and
+                * spin until we've taken care of that timer below.
+                */
+               spin_unlock(&tsk->sighand->siglock);
+       }
        read_unlock(&tasklist_lock);
 
        /*
@@ -1381,7 +1432,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
         */
        head = &tsk->signal->cpu_timers[clock_idx];
        if (list_empty(head) ||
-           cputime_ge(list_entry(head->next,
+           cputime_ge(list_first_entry(head,
                                  struct cpu_timer_list, entry)->expires.cpu,
                       *newval)) {
                /*
@@ -1394,25 +1445,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
        }
 }
 
-static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
-
-int posix_cpu_nsleep(clockid_t which_clock, int flags,
-                    struct timespec *rqtp)
+static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
+                           struct timespec *rqtp, struct itimerspec *it)
 {
-       struct restart_block *restart_block =
-           &current_thread_info()->restart_block;
        struct k_itimer timer;
        int error;
 
        /*
-        * Diagnose required errors first.
-        */
-       if (CPUCLOCK_PERTHREAD(which_clock) &&
-           (CPUCLOCK_PID(which_clock) == 0 ||
-            CPUCLOCK_PID(which_clock) == current->pid))
-               return -EINVAL;
-
-       /*
         * Set up a temporary timer and then wait for it to go off.
         */
        memset(&timer, 0, sizeof timer);
@@ -1422,13 +1461,13 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags,
        error = posix_cpu_timer_create(&timer);
        timer.it_process = current;
        if (!error) {
-               struct timespec __user *rmtp;
                static struct itimerspec zero_it;
-               struct itimerspec it = { .it_value = *rqtp,
-                                        .it_interval = {} };
+
+               memset(it, 0, sizeof *it);
+               it->it_value = *rqtp;
 
                spin_lock_irq(&timer.it_lock);
-               error = posix_cpu_timer_set(&timer, flags, &it, NULL);
+               error = posix_cpu_timer_set(&timer, flags, it, NULL);
                if (error) {
                        spin_unlock_irq(&timer.it_lock);
                        return error;
@@ -1456,55 +1495,102 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags,
                 * We were interrupted by a signal.
                 */
                sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
-               posix_cpu_timer_set(&timer, 0, &zero_it, &it);
+               posix_cpu_timer_set(&timer, 0, &zero_it, it);
                spin_unlock_irq(&timer.it_lock);
 
-               if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
+               if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
                        /*
                         * It actually did fire already.
                         */
                        return 0;
                }
 
+               error = -ERESTART_RESTARTBLOCK;
+       }
+
+       return error;
+}
+
+int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+                    struct timespec *rqtp, struct timespec __user *rmtp)
+{
+       struct restart_block *restart_block =
+           &current_thread_info()->restart_block;
+       struct itimerspec it;
+       int error;
+
+       /*
+        * Diagnose required errors first.
+        */
+       if (CPUCLOCK_PERTHREAD(which_clock) &&
+           (CPUCLOCK_PID(which_clock) == 0 ||
+            CPUCLOCK_PID(which_clock) == current->pid))
+               return -EINVAL;
+
+       error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
+
+       if (error == -ERESTART_RESTARTBLOCK) {
+
+               if (flags & TIMER_ABSTIME)
+                       return -ERESTARTNOHAND;
                /*
-                * Report back to the user the time still remaining.
-                */
-               rmtp = (struct timespec __user *) restart_block->arg1;
-               if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
-                   copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+                * Report back to the user the time still remaining.
+                */
+               if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
                        return -EFAULT;
 
-               restart_block->fn = posix_cpu_clock_nanosleep_restart;
-               /* Caller already set restart_block->arg1 */
+               restart_block->fn = posix_cpu_nsleep_restart;
                restart_block->arg0 = which_clock;
+               restart_block->arg1 = (unsigned long) rmtp;
                restart_block->arg2 = rqtp->tv_sec;
                restart_block->arg3 = rqtp->tv_nsec;
-
-               error = -ERESTART_RESTARTBLOCK;
        }
-
        return error;
 }
 
-static long
-posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
+long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 {
        clockid_t which_clock = restart_block->arg0;
-       struct timespec t = { .tv_sec = restart_block->arg2,
-                             .tv_nsec = restart_block->arg3 };
+       struct timespec __user *rmtp;
+       struct timespec t;
+       struct itimerspec it;
+       int error;
+
+       rmtp = (struct timespec __user *) restart_block->arg1;
+       t.tv_sec = restart_block->arg2;
+       t.tv_nsec = restart_block->arg3;
+
        restart_block->fn = do_no_restart_syscall;
-       return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t);
+       error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
+
+       if (error == -ERESTART_RESTARTBLOCK) {
+               /*
+                * Report back to the user the time still remaining.
+                */
+               if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+                       return -EFAULT;
+
+               restart_block->fn = posix_cpu_nsleep_restart;
+               restart_block->arg0 = which_clock;
+               restart_block->arg1 = (unsigned long) rmtp;
+               restart_block->arg2 = t.tv_sec;
+               restart_block->arg3 = t.tv_nsec;
+       }
+       return error;
+
 }
 
 
 #define PROCESS_CLOCK  MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
 #define THREAD_CLOCK   MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
 
-static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+static int process_cpu_clock_getres(const clockid_t which_clock,
+                                   struct timespec *tp)
 {
        return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
 }
-static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+static int process_cpu_clock_get(const clockid_t which_clock,
+                                struct timespec *tp)
 {
        return posix_cpu_clock_get(PROCESS_CLOCK, tp);
 }
@@ -1513,16 +1599,23 @@ static int process_cpu_timer_create(struct k_itimer *timer)
        timer->it_clock = PROCESS_CLOCK;
        return posix_cpu_timer_create(timer);
 }
-static int process_cpu_nsleep(clockid_t which_clock, int flags,
-                             struct timespec *rqtp)
+static int process_cpu_nsleep(const clockid_t which_clock, int flags,
+                             struct timespec *rqtp,
+                             struct timespec __user *rmtp)
 {
-       return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
+       return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
 }
-static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+static long process_cpu_nsleep_restart(struct restart_block *restart_block)
+{
+       return -EINVAL;
+}
+static int thread_cpu_clock_getres(const clockid_t which_clock,
+                                  struct timespec *tp)
 {
        return posix_cpu_clock_getres(THREAD_CLOCK, tp);
 }
-static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+static int thread_cpu_clock_get(const clockid_t which_clock,
+                               struct timespec *tp)
 {
        return posix_cpu_clock_get(THREAD_CLOCK, tp);
 }
@@ -1531,8 +1624,12 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
        timer->it_clock = THREAD_CLOCK;
        return posix_cpu_timer_create(timer);
 }
-static int thread_cpu_nsleep(clockid_t which_clock, int flags,
-                             struct timespec *rqtp)
+static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
+                             struct timespec *rqtp, struct timespec __user *rmtp)
+{
+       return -EINVAL;
+}
+static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
 {
        return -EINVAL;
 }
@@ -1545,6 +1642,7 @@ static __init int init_posix_cpu_timers(void)
                .clock_set = do_posix_clock_nosettime,
                .timer_create = process_cpu_timer_create,
                .nsleep = process_cpu_nsleep,
+               .nsleep_restart = process_cpu_nsleep_restart,
        };
        struct k_clock thread = {
                .clock_getres = thread_cpu_clock_getres,
@@ -1552,6 +1650,7 @@ static __init int init_posix_cpu_timers(void)
                .clock_set = do_posix_clock_nosettime,
                .timer_create = thread_cpu_timer_create,
                .nsleep = thread_cpu_nsleep,
+               .nsleep_restart = thread_cpu_nsleep_restart,
        };
 
        register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);