[PATCH] posix-cpu-timers: prevent signal delivery starvation
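
A CPU timer that ends up re-armed at or before the current time fires
again on the very next tick and starves normal signal delivery.  The
diff below fixes the arithmetic that allowed that and bounds the work
done per pass:

 - bump_cpu_timer(): step the expiry with "delta < incr" instead of
   "delta <= incr", so an increment exactly equal to the remaining
   delta is still consumed and the expiry lands strictly after "now"
   instead of equal to it, which left the timer permanently overdue.

 - process_timer_rebalance() and check_process_timers(): the time left
   until expiry is divided among the live threads of the group; when
   that division rounds down to 0, no thread's expiry ever moves
   forward.  Add cputime_div_non_zero() and clamp the sched_time
   quotient to a minimum of 1.  Also bail out when signal->live is 0,
   and skip threads that are PF_EXITING rather than testing
   exit_state.

 - check_thread_timers() and check_process_timers(): cap each per-list
   scan at 20 entries (maxfire); when the cap is hit, the current
   entry is recorded as the next expiry time instead of being fired.

 - posix_cpu_timer_del() and posix_cpu_timer_set(): return TIMER_RETRY
   while the timer is firing instead of unlinking it behind the firing
   code's back, and handle the race with the reaping of the task
   (p->signal == NULL) under tasklist_lock.  cleanup_timers() no
   longer drops the task reference; posix_cpu_timer_del() does the
   put_task_struct() once the timer really is off the list.

 - arm_timer(): fix the sorted insert; the new timer is now linked
   after the last entry that expires no later than it, rather than
   after the first entry that expires later.

 - posix_cpu_timer_schedule() and run_posix_cpu_timers(): update the
   overrun and requeue-pending state on every exit path, and check
   tsk->signal under tasklist_lock instead of BUG()ing when called for
   an exiting task.

 - posix_cpu_nsleep(): split the sleep itself into do_cpu_nanosleep()
   and add a real posix_cpu_nsleep_restart(), so a sleep interrupted
   by a signal reports the time remaining to user space and restarts
   on the correct clock; absolute sleeps return -ERESTARTNOHAND, and
   the restart hooks that make no sense return -EINVAL.

 - Qualify the clockid_t parameters const and use sched_ns() in
   cpu_clock_sample_group_locked().

---
As a note for reviewers, a minimal sketch of the rounding problem
(illustrative only, not kernel code; the function name and the numbers
are made up):

	/*
	 * Splitting the time left until expiry across the group's
	 * threads rounds down to 0 once remaining < nthreads; every
	 * thread's expiry then stays in the past and the timer
	 * retriggers on the next tick.  Clamping the quotient to 1,
	 * as cputime_div_non_zero() does, keeps the expiry moving.
	 */
	static unsigned long per_thread_slice(unsigned long remaining,
					      unsigned long nthreads)
	{
		unsigned long res = remaining / nthreads; /* 5 / 8 == 0 */

		return res ? res : 1;
	}

The sched_time cases get the same clamp via
max_t(unsigned long long, ..., 1), since do_div() divides in place.
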
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index b3f3edc..7c3e1e6 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -7,7 +7,7 @@
 #include <asm/uaccess.h>
 #include <linux/errno.h>
 
-static int check_clock(clockid_t which_clock)
+static int check_clock(const clockid_t which_clock)
 {
        int error = 0;
        struct task_struct *p;
@@ -31,19 +31,19 @@ static int check_clock(clockid_t which_clock)
 }
 
 static inline union cpu_time_count
-timespec_to_sample(clockid_t which_clock, const struct timespec *tp)
+timespec_to_sample(const clockid_t which_clock, const struct timespec *tp)
 {
        union cpu_time_count ret;
        ret.sched = 0;          /* high half always zero when .cpu used */
        if (CPUCLOCK_WHICH(which_clock) == CPUCLOCK_SCHED) {
-               ret.sched = tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
+               ret.sched = (unsigned long long)tp->tv_sec * NSEC_PER_SEC + tp->tv_nsec;
        } else {
                ret.cpu = timespec_to_cputime(tp);
        }
        return ret;
 }
 
-static void sample_to_timespec(clockid_t which_clock,
+static void sample_to_timespec(const clockid_t which_clock,
                               union cpu_time_count cpu,
                               struct timespec *tp)
 {
@@ -55,7 +55,7 @@ static void sample_to_timespec(clockid_t which_clock,
        }
 }
 
-static inline int cpu_time_before(clockid_t which_clock,
+static inline int cpu_time_before(const clockid_t which_clock,
                                  union cpu_time_count now,
                                  union cpu_time_count then)
 {
@@ -65,7 +65,7 @@ static inline int cpu_time_before(clockid_t which_clock,
                return cputime_lt(now.cpu, then.cpu);
        }
 }
-static inline void cpu_time_add(clockid_t which_clock,
+static inline void cpu_time_add(const clockid_t which_clock,
                                union cpu_time_count *acc,
                                union cpu_time_count val)
 {
@@ -75,7 +75,7 @@ static inline void cpu_time_add(clockid_t which_clock,
                acc->cpu = cputime_add(acc->cpu, val.cpu);
        }
 }
-static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
+static inline union cpu_time_count cpu_time_sub(const clockid_t which_clock,
                                                union cpu_time_count a,
                                                union cpu_time_count b)
 {
@@ -88,10 +88,23 @@ static inline union cpu_time_count cpu_time_sub(clockid_t which_clock,
 }
 
 /*
+ * Divide and limit the result to res >= 1
+ *
+ * This is necessary to prevent signal delivery starvation when the result of
+ * the division would otherwise be rounded down to 0.
+ */
+static inline cputime_t cputime_div_non_zero(cputime_t time, unsigned long div)
+{
+       cputime_t res = cputime_div(time, div);
+
+       return max_t(cputime_t, res, 1);
+}
+
+/*
  * Update expiry time from increment, and increase overrun count,
  * given the current clock sample.
  */
-static inline void bump_cpu_timer(struct k_itimer *timer,
+static void bump_cpu_timer(struct k_itimer *timer,
                                  union cpu_time_count now)
 {
        int i;
@@ -110,7 +123,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
                for (i = 0; incr < delta - incr; i++)
                        incr = incr << 1;
                for (; i >= 0; incr >>= 1, i--) {
-                       if (delta <= incr)
+                       if (delta < incr)
                                continue;
                        timer->it.cpu.expires.sched += incr;
                        timer->it_overrun += 1 << i;
@@ -128,7 +141,7 @@ static inline void bump_cpu_timer(struct k_itimer *timer,
                for (i = 0; cputime_lt(incr, cputime_sub(delta, incr)); i++)
                             incr = cputime_add(incr, incr);
                for (; i >= 0; incr = cputime_halve(incr), i--) {
-                       if (cputime_le(delta, incr))
+                       if (cputime_lt(delta, incr))
                                continue;
                        timer->it.cpu.expires.cpu =
                                cputime_add(timer->it.cpu.expires.cpu, incr);
@@ -151,7 +164,7 @@ static inline unsigned long long sched_ns(struct task_struct *p)
        return (p == current) ? current_sched_time(p) : p->sched_time;
 }
 
-int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+int posix_cpu_clock_getres(const clockid_t which_clock, struct timespec *tp)
 {
        int error = check_clock(which_clock);
        if (!error) {
@@ -169,7 +182,7 @@ int posix_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
        return error;
 }
 
-int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp)
+int posix_cpu_clock_set(const clockid_t which_clock, const struct timespec *tp)
 {
        /*
         * You can never reset a CPU clock, but we check for other errors
@@ -186,7 +199,7 @@ int posix_cpu_clock_set(clockid_t which_clock, const struct timespec *tp)
 /*
  * Sample a per-thread clock for the given task.
  */
-static int cpu_clock_sample(clockid_t which_clock, struct task_struct *p,
+static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
                            union cpu_time_count *cpu)
 {
        switch (CPUCLOCK_WHICH(which_clock)) {
@@ -238,18 +251,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
                while ((t = next_thread(t)) != p) {
                        cpu->sched += t->sched_time;
                }
-               if (p->tgid == current->tgid) {
-                       /*
-                        * We're sampling ourselves, so include the
-                        * cycles not yet banked.  We still omit
-                        * other threads running on other CPUs,
-                        * so the total can always be behind as
-                        * much as max(nthreads-1,ncpus) * (NSEC_PER_SEC/HZ).
-                        */
-                       cpu->sched += current_sched_time(current);
-               } else {
-                       cpu->sched += p->sched_time;
-               }
+               cpu->sched += sched_ns(p);
                break;
        }
        return 0;
@@ -259,7 +261,7 @@ static int cpu_clock_sample_group_locked(unsigned int clock_idx,
  * Sample a process (thread group) clock for the given group_leader task.
  * Must be called with tasklist_lock held for reading.
  */
-static int cpu_clock_sample_group(clockid_t which_clock,
+static int cpu_clock_sample_group(const clockid_t which_clock,
                                  struct task_struct *p,
                                  union cpu_time_count *cpu)
 {
@@ -273,7 +275,7 @@ static int cpu_clock_sample_group(clockid_t which_clock,
 }
 
 
-int posix_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp)
 {
        const pid_t pid = CPUCLOCK_PID(which_clock);
        int error = -EINVAL;
@@ -380,28 +382,31 @@ int posix_cpu_timer_create(struct k_itimer *new_timer)
 int posix_cpu_timer_del(struct k_itimer *timer)
 {
        struct task_struct *p = timer->it.cpu.task;
+       int ret = 0;
 
-       if (timer->it.cpu.firing)
-               return TIMER_RETRY;
-
-       if (unlikely(p == NULL))
-               return 0;
+       if (likely(p != NULL)) {
+               read_lock(&tasklist_lock);
+               if (unlikely(p->signal == NULL)) {
+                       /*
+                        * We raced with the reaping of the task.
+                        * The deletion should have cleared us off the list.
+                        */
+                       BUG_ON(!list_empty(&timer->it.cpu.entry));
+               } else {
+                       spin_lock(&p->sighand->siglock);
+                       if (timer->it.cpu.firing)
+                               ret = TIMER_RETRY;
+                       else
+                               list_del(&timer->it.cpu.entry);
+                       spin_unlock(&p->sighand->siglock);
+               }
+               read_unlock(&tasklist_lock);
 
-       spin_lock(&p->sighand->siglock);
-       if (!list_empty(&timer->it.cpu.entry)) {
-               /*
-                * Take us off the task's timer list.  We don't need to
-                * take tasklist_lock and check for the task being reaped.
-                * If it was reaped, it already called posix_cpu_timers_exit
-                * and posix_cpu_timers_exit_group to clear all the timers
-                * that pointed to it.
-                */
-               list_del(&timer->it.cpu.entry);
-               put_task_struct(p);
+               if (!ret)
+                       put_task_struct(p);
        }
-       spin_unlock(&p->sighand->siglock);
 
-       return 0;
+       return ret;
 }
 
 /*
@@ -418,8 +423,6 @@ static void cleanup_timers(struct list_head *head,
        cputime_t ptime = cputime_add(utime, stime);
 
        list_for_each_entry_safe(timer, next, head, entry) {
-               put_task_struct(timer->task);
-               timer->task = NULL;
                list_del_init(&timer->entry);
                if (cputime_lt(timer->expires.cpu, ptime)) {
                        timer->expires.cpu = cputime_zero;
@@ -431,8 +434,6 @@ static void cleanup_timers(struct list_head *head,
 
        ++head;
        list_for_each_entry_safe(timer, next, head, entry) {
-               put_task_struct(timer->task);
-               timer->task = NULL;
                list_del_init(&timer->entry);
                if (cputime_lt(timer->expires.cpu, utime)) {
                        timer->expires.cpu = cputime_zero;
@@ -444,8 +445,6 @@ static void cleanup_timers(struct list_head *head,
 
        ++head;
        list_for_each_entry_safe(timer, next, head, entry) {
-               put_task_struct(timer->task);
-               timer->task = NULL;
                list_del_init(&timer->entry);
                if (timer->expires.sched < sched_time) {
                        timer->expires.sched = 0;
@@ -489,15 +488,18 @@ static void process_timer_rebalance(struct task_struct *p,
        struct task_struct *t = p;
        unsigned int nthreads = atomic_read(&p->signal->live);
 
+       if (!nthreads)
+               return;
+
        switch (clock_idx) {
        default:
                BUG();
                break;
        case CPUCLOCK_PROF:
-               left = cputime_div(cputime_sub(expires.cpu, val.cpu),
-                                  nthreads);
+               left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
+                                           nthreads);
                do {
-                       if (!unlikely(t->exit_state)) {
+                       if (likely(!(t->flags & PF_EXITING))) {
                                ticks = cputime_add(prof_ticks(t), left);
                                if (cputime_eq(t->it_prof_expires,
                                               cputime_zero) ||
@@ -509,10 +511,10 @@ static void process_timer_rebalance(struct task_struct *p,
                } while (t != p);
                break;
        case CPUCLOCK_VIRT:
-               left = cputime_div(cputime_sub(expires.cpu, val.cpu),
-                                  nthreads);
+               left = cputime_div_non_zero(cputime_sub(expires.cpu, val.cpu),
+                                           nthreads);
                do {
-                       if (!unlikely(t->exit_state)) {
+                       if (likely(!(t->flags & PF_EXITING))) {
                                ticks = cputime_add(virt_ticks(t), left);
                                if (cputime_eq(t->it_virt_expires,
                                               cputime_zero) ||
@@ -526,8 +528,9 @@ static void process_timer_rebalance(struct task_struct *p,
        case CPUCLOCK_SCHED:
                nsleft = expires.sched - val.sched;
                do_div(nsleft, nthreads);
+               nsleft = max_t(unsigned long long, nsleft, 1);
                do {
-                       if (!unlikely(t->exit_state)) {
+                       if (likely(!(t->flags & PF_EXITING))) {
                                ns = t->sched_time + nsleft;
                                if (t->it_sched_expires == 0 ||
                                    t->it_sched_expires > ns) {
@@ -576,17 +579,15 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
        listpos = head;
        if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
                list_for_each_entry(next, head, entry) {
-                       if (next->expires.sched > nt->expires.sched) {
-                               listpos = &next->entry;
+                       if (next->expires.sched > nt->expires.sched)
                                break;
-                       }
+                       listpos = &next->entry;
                }
        } else {
                list_for_each_entry(next, head, entry) {
-                       if (cputime_gt(next->expires.cpu, nt->expires.cpu)) {
-                               listpos = &next->entry;
+                       if (cputime_gt(next->expires.cpu, nt->expires.cpu))
                                break;
-                       }
+                       listpos = &next->entry;
                }
        }
        list_add(&nt->entry, listpos);
@@ -730,9 +731,15 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
         * Disarm any old timer after extracting its expiry time.
         */
        BUG_ON(!irqs_disabled());
+
+       ret = 0;
        spin_lock(&p->sighand->siglock);
        old_expires = timer->it.cpu.expires;
-       list_del_init(&timer->it.cpu.entry);
+       if (unlikely(timer->it.cpu.firing)) {
+               timer->it.cpu.firing = -1;
+               ret = TIMER_RETRY;
+       } else
+               list_del_init(&timer->it.cpu.entry);
        spin_unlock(&p->sighand->siglock);
 
        /*
@@ -780,7 +787,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
                }
        }
 
-       if (unlikely(timer->it.cpu.firing)) {
+       if (unlikely(ret)) {
                /*
                 * We are colliding with the timer actually firing.
                 * Punt after filling in the timer's old value, and
@@ -788,8 +795,6 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
                 * it as an overrun (thanks to bump_cpu_timer above).
                 */
                read_unlock(&tasklist_lock);
-               timer->it.cpu.firing = -1;
-               ret = TIMER_RETRY;
                goto out;
        }
 
@@ -955,14 +960,16 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 static void check_thread_timers(struct task_struct *tsk,
                                struct list_head *firing)
 {
+       int maxfire;
        struct list_head *timers = tsk->cpu_timers;
 
+       maxfire = 20;
        tsk->it_prof_expires = cputime_zero;
        while (!list_empty(timers)) {
                struct cpu_timer_list *t = list_entry(timers->next,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
+               if (!--maxfire || cputime_lt(prof_ticks(tsk), t->expires.cpu)) {
                        tsk->it_prof_expires = t->expires.cpu;
                        break;
                }
@@ -971,12 +978,13 @@ static void check_thread_timers(struct task_struct *tsk,
        }
 
        ++timers;
+       maxfire = 20;
        tsk->it_virt_expires = cputime_zero;
        while (!list_empty(timers)) {
                struct cpu_timer_list *t = list_entry(timers->next,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
+               if (!--maxfire || cputime_lt(virt_ticks(tsk), t->expires.cpu)) {
                        tsk->it_virt_expires = t->expires.cpu;
                        break;
                }
@@ -985,12 +993,13 @@ static void check_thread_timers(struct task_struct *tsk,
        }
 
        ++timers;
+       maxfire = 20;
        tsk->it_sched_expires = 0;
        while (!list_empty(timers)) {
                struct cpu_timer_list *t = list_entry(timers->next,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (tsk->sched_time < t->expires.sched) {
+               if (!--maxfire || tsk->sched_time < t->expires.sched) {
                        tsk->it_sched_expires = t->expires.sched;
                        break;
                }
@@ -1007,6 +1016,7 @@ static void check_thread_timers(struct task_struct *tsk,
 static void check_process_timers(struct task_struct *tsk,
                                 struct list_head *firing)
 {
+       int maxfire;
        struct signal_struct *const sig = tsk->signal;
        cputime_t utime, stime, ptime, virt_expires, prof_expires;
        unsigned long long sched_time, sched_expires;
@@ -1039,12 +1049,13 @@ static void check_process_timers(struct task_struct *tsk,
        } while (t != tsk);
        ptime = cputime_add(utime, stime);
 
+       maxfire = 20;
        prof_expires = cputime_zero;
        while (!list_empty(timers)) {
                struct cpu_timer_list *t = list_entry(timers->next,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (cputime_lt(ptime, t->expires.cpu)) {
+               if (!--maxfire || cputime_lt(ptime, t->expires.cpu)) {
                        prof_expires = t->expires.cpu;
                        break;
                }
@@ -1053,12 +1064,13 @@ static void check_process_timers(struct task_struct *tsk,
        }
 
        ++timers;
+       maxfire = 20;
        virt_expires = cputime_zero;
        while (!list_empty(timers)) {
                struct cpu_timer_list *t = list_entry(timers->next,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (cputime_lt(utime, t->expires.cpu)) {
+               if (!--maxfire || cputime_lt(utime, t->expires.cpu)) {
                        virt_expires = t->expires.cpu;
                        break;
                }
@@ -1067,12 +1079,13 @@ static void check_process_timers(struct task_struct *tsk,
        }
 
        ++timers;
+       maxfire = 20;
        sched_expires = 0;
        while (!list_empty(timers)) {
                struct cpu_timer_list *t = list_entry(timers->next,
                                                      struct cpu_timer_list,
                                                      entry);
-               if (sched_time < t->expires.sched) {
+               if (!--maxfire || sched_time < t->expires.sched) {
                        sched_expires = t->expires.sched;
                        break;
                }
@@ -1155,19 +1168,26 @@ static void check_process_timers(struct task_struct *tsk,
                unsigned long long sched_left, sched;
                const unsigned int nthreads = atomic_read(&sig->live);
 
+               if (!nthreads)
+                       return;
+
                prof_left = cputime_sub(prof_expires, utime);
                prof_left = cputime_sub(prof_left, stime);
-               prof_left = cputime_div(prof_left, nthreads);
+               prof_left = cputime_div_non_zero(prof_left, nthreads);
                virt_left = cputime_sub(virt_expires, utime);
-               virt_left = cputime_div(virt_left, nthreads);
+               virt_left = cputime_div_non_zero(virt_left, nthreads);
                if (sched_expires) {
                        sched_left = sched_expires - sched_time;
                        do_div(sched_left, nthreads);
+                       sched_left = max_t(unsigned long long, sched_left, 1);
                } else {
                        sched_left = 0;
                }
                t = tsk;
                do {
+                       if (unlikely(t->flags & PF_EXITING))
+                               continue;
+
                        ticks = cputime_add(cputime_add(t->utime, t->stime),
                                            prof_left);
                        if (!cputime_eq(prof_expires, cputime_zero) &&
@@ -1188,11 +1208,7 @@ static void check_process_timers(struct task_struct *tsk,
                                              t->it_sched_expires > sched)) {
                                t->it_sched_expires = sched;
                        }
-
-                       do {
-                               t = next_thread(t);
-                       } while (unlikely(t->exit_state));
-               } while (t != tsk);
+               } while ((t = next_thread(t)) != tsk);
        }
 }
 
@@ -1209,7 +1225,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
                /*
                 * The task was cleaned up already, no future firings.
                 */
-               return;
+               goto out;
 
        /*
         * Fetch the current sample and update the timer's expiry time.
@@ -1219,7 +1235,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
                bump_cpu_timer(timer, now);
                if (unlikely(p->exit_state)) {
                        clear_dead_task(timer, now);
-                       return;
+                       goto out;
                }
                read_lock(&tasklist_lock); /* arm_timer needs it.  */
        } else {
@@ -1232,8 +1248,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
                        put_task_struct(p);
                        timer->it.cpu.task = p = NULL;
                        timer->it.cpu.expires.sched = 0;
-                       read_unlock(&tasklist_lock);
-                       return;
+                       goto out_unlock;
                } else if (unlikely(p->exit_state) && thread_group_empty(p)) {
                        /*
                         * We've noticed that the thread is dead, but
@@ -1241,8 +1256,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
                         * drop our task ref.
                         */
                        clear_dead_task(timer, now);
-                       read_unlock(&tasklist_lock);
-                       return;
+                       goto out_unlock;
                }
                cpu_clock_sample_group(timer->it_clock, p, &now);
                bump_cpu_timer(timer, now);
@@ -1254,7 +1268,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
         */
        arm_timer(timer, now);
 
+out_unlock:
        read_unlock(&tasklist_lock);
+
+out:
+       timer->it_overrun_last = timer->it_overrun;
+       timer->it_overrun = -1;
+       ++timer->it_requeue_pending;
 }
 
 /*
@@ -1280,30 +1300,30 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 
 #undef UNEXPIRED
 
-       BUG_ON(tsk->exit_state);
-
        /*
         * Double-check with locks held.
         */
        read_lock(&tasklist_lock);
-       spin_lock(&tsk->sighand->siglock);
+       if (likely(tsk->signal != NULL)) {
+               spin_lock(&tsk->sighand->siglock);
 
-       /*
-        * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
-        * all the timers that are firing, and put them on the firing list.
-        */
-       check_thread_timers(tsk, &firing);
-       check_process_timers(tsk, &firing);
+               /*
+                * Here we take off tsk->cpu_timers[N] and tsk->signal->cpu_timers[N]
+                * all the timers that are firing, and put them on the firing list.
+                */
+               check_thread_timers(tsk, &firing);
+               check_process_timers(tsk, &firing);
 
-       /*
-        * We must release these locks before taking any timer's lock.
-        * There is a potential race with timer deletion here, as the
-        * siglock now protects our private firing list.  We have set
-        * the firing flag in each timer, so that a deletion attempt
-        * that gets the timer lock before we do will give it up and
-        * spin until we've taken care of that timer below.
-        */
-       spin_unlock(&tsk->sighand->siglock);
+               /*
+                * We must release these locks before taking any timer's lock.
+                * There is a potential race with timer deletion here, as the
+                * siglock now protects our private firing list.  We have set
+                * the firing flag in each timer, so that a deletion attempt
+                * that gets the timer lock before we do will give it up and
+                * spin until we've taken care of that timer below.
+                */
+               spin_unlock(&tsk->sighand->siglock);
+       }
        read_unlock(&tasklist_lock);
 
        /*
@@ -1388,25 +1408,13 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
        }
 }
 
-static long posix_cpu_clock_nanosleep_restart(struct restart_block *);
-
-int posix_cpu_nsleep(clockid_t which_clock, int flags,
-                    struct timespec *rqtp)
+static int do_cpu_nanosleep(const clockid_t which_clock, int flags,
+                           struct timespec *rqtp, struct itimerspec *it)
 {
-       struct restart_block *restart_block =
-           &current_thread_info()->restart_block;
        struct k_itimer timer;
        int error;
 
        /*
-        * Diagnose required errors first.
-        */
-       if (CPUCLOCK_PERTHREAD(which_clock) &&
-           (CPUCLOCK_PID(which_clock) == 0 ||
-            CPUCLOCK_PID(which_clock) == current->pid))
-               return -EINVAL;
-
-       /*
         * Set up a temporary timer and then wait for it to go off.
         */
        memset(&timer, 0, sizeof timer);
@@ -1416,13 +1424,13 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags,
        error = posix_cpu_timer_create(&timer);
        timer.it_process = current;
        if (!error) {
-               struct timespec __user *rmtp;
                static struct itimerspec zero_it;
-               struct itimerspec it = { .it_value = *rqtp,
-                                        .it_interval = {} };
+
+               memset(it, 0, sizeof *it);
+               it->it_value = *rqtp;
 
                spin_lock_irq(&timer.it_lock);
-               error = posix_cpu_timer_set(&timer, flags, &it, NULL);
+               error = posix_cpu_timer_set(&timer, flags, it, NULL);
                if (error) {
                        spin_unlock_irq(&timer.it_lock);
                        return error;
@@ -1450,55 +1458,102 @@ int posix_cpu_nsleep(clockid_t which_clock, int flags,
                 * We were interrupted by a signal.
                 */
                sample_to_timespec(which_clock, timer.it.cpu.expires, rqtp);
-               posix_cpu_timer_set(&timer, 0, &zero_it, &it);
+               posix_cpu_timer_set(&timer, 0, &zero_it, it);
                spin_unlock_irq(&timer.it_lock);
 
-               if ((it.it_value.tv_sec | it.it_value.tv_nsec) == 0) {
+               if ((it->it_value.tv_sec | it->it_value.tv_nsec) == 0) {
                        /*
                         * It actually did fire already.
                         */
                        return 0;
                }
 
+               error = -ERESTART_RESTARTBLOCK;
+       }
+
+       return error;
+}
+
+int posix_cpu_nsleep(const clockid_t which_clock, int flags,
+                    struct timespec *rqtp, struct timespec __user *rmtp)
+{
+       struct restart_block *restart_block =
+           &current_thread_info()->restart_block;
+       struct itimerspec it;
+       int error;
+
+       /*
+        * Diagnose required errors first.
+        */
+       if (CPUCLOCK_PERTHREAD(which_clock) &&
+           (CPUCLOCK_PID(which_clock) == 0 ||
+            CPUCLOCK_PID(which_clock) == current->pid))
+               return -EINVAL;
+
+       error = do_cpu_nanosleep(which_clock, flags, rqtp, &it);
+
+       if (error == -ERESTART_RESTARTBLOCK) {
+
+               if (flags & TIMER_ABSTIME)
+                       return -ERESTARTNOHAND;
                /*
-                * Report back to the user the time still remaining.
-                */
-               rmtp = (struct timespec __user *) restart_block->arg1;
-               if (rmtp != NULL && !(flags & TIMER_ABSTIME) &&
-                   copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+                * Report back to the user the time still remaining.
+                */
+               if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
                        return -EFAULT;
 
-               restart_block->fn = posix_cpu_clock_nanosleep_restart;
-               /* Caller already set restart_block->arg1 */
+               restart_block->fn = posix_cpu_nsleep_restart;
                restart_block->arg0 = which_clock;
+               restart_block->arg1 = (unsigned long) rmtp;
                restart_block->arg2 = rqtp->tv_sec;
                restart_block->arg3 = rqtp->tv_nsec;
-
-               error = -ERESTART_RESTARTBLOCK;
        }
-
        return error;
 }
 
-static long
-posix_cpu_clock_nanosleep_restart(struct restart_block *restart_block)
+long posix_cpu_nsleep_restart(struct restart_block *restart_block)
 {
        clockid_t which_clock = restart_block->arg0;
-       struct timespec t = { .tv_sec = restart_block->arg2,
-                             .tv_nsec = restart_block->arg3 };
+       struct timespec __user *rmtp;
+       struct timespec t;
+       struct itimerspec it;
+       int error;
+
+       rmtp = (struct timespec __user *) restart_block->arg1;
+       t.tv_sec = restart_block->arg2;
+       t.tv_nsec = restart_block->arg3;
+
        restart_block->fn = do_no_restart_syscall;
-       return posix_cpu_nsleep(which_clock, TIMER_ABSTIME, &t);
+       error = do_cpu_nanosleep(which_clock, TIMER_ABSTIME, &t, &it);
+
+       if (error == -ERESTART_RESTARTBLOCK) {
+               /*
+                * Report back to the user the time still remaining.
+                */
+               if (rmtp != NULL && copy_to_user(rmtp, &it.it_value, sizeof *rmtp))
+                       return -EFAULT;
+
+               restart_block->fn = posix_cpu_nsleep_restart;
+               restart_block->arg0 = which_clock;
+               restart_block->arg1 = (unsigned long) rmtp;
+               restart_block->arg2 = t.tv_sec;
+               restart_block->arg3 = t.tv_nsec;
+       }
+       return error;
+
 }
 
 
 #define PROCESS_CLOCK  MAKE_PROCESS_CPUCLOCK(0, CPUCLOCK_SCHED)
 #define THREAD_CLOCK   MAKE_THREAD_CPUCLOCK(0, CPUCLOCK_SCHED)
 
-static int process_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+static int process_cpu_clock_getres(const clockid_t which_clock,
+                                   struct timespec *tp)
 {
        return posix_cpu_clock_getres(PROCESS_CLOCK, tp);
 }
-static int process_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+static int process_cpu_clock_get(const clockid_t which_clock,
+                                struct timespec *tp)
 {
        return posix_cpu_clock_get(PROCESS_CLOCK, tp);
 }
@@ -1507,16 +1562,23 @@ static int process_cpu_timer_create(struct k_itimer *timer)
        timer->it_clock = PROCESS_CLOCK;
        return posix_cpu_timer_create(timer);
 }
-static int process_cpu_nsleep(clockid_t which_clock, int flags,
-                             struct timespec *rqtp)
+static int process_cpu_nsleep(const clockid_t which_clock, int flags,
+                             struct timespec *rqtp,
+                             struct timespec __user *rmtp)
+{
+       return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp, rmtp);
+}
+static long process_cpu_nsleep_restart(struct restart_block *restart_block)
 {
-       return posix_cpu_nsleep(PROCESS_CLOCK, flags, rqtp);
+       return -EINVAL;
 }
-static int thread_cpu_clock_getres(clockid_t which_clock, struct timespec *tp)
+static int thread_cpu_clock_getres(const clockid_t which_clock,
+                                  struct timespec *tp)
 {
        return posix_cpu_clock_getres(THREAD_CLOCK, tp);
 }
-static int thread_cpu_clock_get(clockid_t which_clock, struct timespec *tp)
+static int thread_cpu_clock_get(const clockid_t which_clock,
+                               struct timespec *tp)
 {
        return posix_cpu_clock_get(THREAD_CLOCK, tp);
 }
@@ -1525,8 +1587,12 @@ static int thread_cpu_timer_create(struct k_itimer *timer)
        timer->it_clock = THREAD_CLOCK;
        return posix_cpu_timer_create(timer);
 }
-static int thread_cpu_nsleep(clockid_t which_clock, int flags,
-                             struct timespec *rqtp)
+static int thread_cpu_nsleep(const clockid_t which_clock, int flags,
+                             struct timespec *rqtp, struct timespec __user *rmtp)
+{
+       return -EINVAL;
+}
+static long thread_cpu_nsleep_restart(struct restart_block *restart_block)
 {
        return -EINVAL;
 }
@@ -1539,6 +1605,7 @@ static __init int init_posix_cpu_timers(void)
                .clock_set = do_posix_clock_nosettime,
                .timer_create = process_cpu_timer_create,
                .nsleep = process_cpu_nsleep,
+               .nsleep_restart = process_cpu_nsleep_restart,
        };
        struct k_clock thread = {
                .clock_getres = thread_cpu_clock_getres,
@@ -1546,6 +1613,7 @@ static __init int init_posix_cpu_timers(void)
                .clock_set = do_posix_clock_nosettime,
                .timer_create = thread_cpu_timer_create,
                .nsleep = thread_cpu_nsleep,
+               .nsleep_restart = thread_cpu_nsleep_restart,
        };
 
        register_posix_clock(CLOCK_PROCESS_CPUTIME_ID, &process);