Merge branch 'timers-for-linus-clocksource' of git://git.kernel.org/pub/scm/linux...
[safe/jmp/linux-2.6] / kernel / futex.c
index 041bf3a..80b5ce7 100644 (file)
@@ -100,8 +100,8 @@ struct futex_pi_state {
  */
 struct futex_q {
        struct plist_node list;
-       /* There can only be a single waiter */
-       wait_queue_head_t waiter;
+       /* Waiter reference */
+       struct task_struct *task;
 
        /* Which hash list lock to use: */
        spinlock_t *lock_ptr;
@@ -111,7 +111,6 @@ struct futex_q {
 
        /* Optional priority inheritance state: */
        struct futex_pi_state *pi_state;
-       struct task_struct *task;
 
        /* rt_waiter storage for requeue_pi: */
        struct rt_mutex_waiter *rt_waiter;
@@ -200,6 +199,7 @@ static void drop_futex_key_refs(union futex_key *key)
  * @uaddr: virtual address of the futex
  * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
  * @key: address where result is stored.
+ * @rw: access the mapping must permit (values: VERIFY_READ, VERIFY_WRITE)
  *
  * Returns a negative error code or 0
  * The key words are stored in *key on success.
@@ -210,7 +210,8 @@ static void drop_futex_key_refs(union futex_key *key)
  *
  * lock_page() might sleep, the caller should not hold a spinlock.
  */
-static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
+static int
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
 {
        unsigned long address = (unsigned long)uaddr;
        struct mm_struct *mm = current->mm;
@@ -233,7 +234,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
         *        but access_ok() should be faster than find_vma()
         */
        if (!fshared) {
-               if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+               if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
                        return -EFAULT;
                key->private.mm = mm;
                key->private.address = address;
@@ -242,7 +243,7 @@ static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
        }
 
 again:
-       err = get_user_pages_fast(address, 1, 0, &page);
+       err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
        if (err < 0)
                return err;
 
@@ -694,22 +695,29 @@ retry:
  */
 static void wake_futex(struct futex_q *q)
 {
-       plist_del(&q->list, &q->list.plist);
+       struct task_struct *p = q->task;
+
        /*
-        * The lock in wake_up_all() is a crucial memory barrier after the
-        * plist_del() and also before assigning to q->lock_ptr.
+        * We set q->lock_ptr = NULL _before_ we wake up the task. If
+        * a non-futex wakeup happens on another CPU then the task
+        * might exit and p would point to a task struct that no
+        * longer exists. Prevent this by holding a reference on p
+        * across the wakeup.
         */
-       wake_up(&q->waiter);
+       get_task_struct(p);
+
+       plist_del(&q->list, &q->list.plist);
        /*
-        * The waiting task can free the futex_q as soon as this is written,
-        * without taking any locks.  This must come last.
-        *
-        * A memory barrier is required here to prevent the following store to
-        * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
-        * end of wake_up() does not prevent this store from moving.
+        * The waiting task can free the futex_q as soon as
+        * q->lock_ptr = NULL is written, without taking any locks. A
+        * memory barrier is required here to prevent the following
+        * store to lock_ptr from getting ahead of the plist_del.
         */
        smp_wmb();
        q->lock_ptr = NULL;
+
+       wake_up_state(p, TASK_NORMAL);
+       put_task_struct(p);
 }
 
 static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
@@ -828,7 +836,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
        if (!bitset)
                return -EINVAL;
 
-       ret = get_futex_key(uaddr, fshared, &key);
+       ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
        if (unlikely(ret != 0))
                goto out;
 
@@ -874,10 +882,10 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
        int ret, op_ret;
 
 retry:
-       ret = get_futex_key(uaddr1, fshared, &key1);
+       ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
        if (unlikely(ret != 0))
                goto out;
-       ret = get_futex_key(uaddr2, fshared, &key2);
+       ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
        if (unlikely(ret != 0))
                goto out_put_key1;
 
@@ -1003,7 +1011,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
        WARN_ON(!q->rt_waiter);
        q->rt_waiter = NULL;
 
-       wake_up(&q->waiter);
+       wake_up_state(q->task, TASK_NORMAL);
 }
 
 /**
@@ -1125,10 +1133,11 @@ retry:
                pi_state = NULL;
        }
 
-       ret = get_futex_key(uaddr1, fshared, &key1);
+       ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
        if (unlikely(ret != 0))
                goto out;
-       ret = get_futex_key(uaddr2, fshared, &key2);
+       ret = get_futex_key(uaddr2, fshared, &key2,
+                           requeue_pi ? VERIFY_WRITE : VERIFY_READ);
        if (unlikely(ret != 0))
                goto out_put_key1;
 
@@ -1261,7 +1270,12 @@ retry_private:
 out_unlock:
        double_unlock_hb(hb1, hb2);
 
-       /* drop_futex_key_refs() must be called outside the spinlocks. */
+       /*
+        * drop_futex_key_refs() must be called outside the spinlocks. During
+        * the requeue we moved futex_q's from the hash bucket at key1 to the
+        * one at key2 and updated their key pointer.  We no longer need to
+        * hold the references to key1.
+        */
        while (--drop_count >= 0)
                drop_futex_key_refs(&key1);
 
@@ -1280,8 +1294,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
 {
        struct futex_hash_bucket *hb;
 
-       init_waitqueue_head(&q->waiter);
-
        get_futex_key_refs(&q->key);
        hb = hash_futex(&q->key);
        q->lock_ptr = &hb->lock;
@@ -1495,7 +1507,6 @@ handle_fault:
 #define FLAGS_HAS_TIMEOUT      0x04
 
 static long futex_wait_restart(struct restart_block *restart);
-static long futex_lock_pi_restart(struct restart_block *restart);
 
 /**
  * fixup_owner() - Post lock pi_state and corner case management
@@ -1575,11 +1586,9 @@ out:
  * @hb:                the futex hash bucket, must be locked by the caller
  * @q:         the futex_q to queue up on
  * @timeout:   the prepared hrtimer_sleeper, or null for no timeout
- * @wait:      the wait_queue to add to the futex_q after queueing in the hb
  */
 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
-                               struct hrtimer_sleeper *timeout,
-                               wait_queue_t *wait)
+                               struct hrtimer_sleeper *timeout)
 {
        queue_me(q, hb);
 
@@ -1587,19 +1596,11 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
         * There might have been scheduling since the queue_me(), as we
         * cannot hold a spinlock across the get_user() in case it
         * faults, and we cannot just set TASK_INTERRUPTIBLE state when
-        * queueing ourselves into the futex hash.  This code thus has to
+        * queueing ourselves into the futex hash. This code thus has to
         * rely on the futex_wake() code removing us from hash when it
         * wakes us up.
         */
-
-       /* add_wait_queue is the barrier after __set_current_state. */
-       __set_current_state(TASK_INTERRUPTIBLE);
-
-       /*
-        * Add current as the futex_q waiter.  We don't remove ourselves from
-        * the wait_queue because we are the only user of it.
-        */
-       add_wait_queue(&q->waiter, wait);
+       set_current_state(TASK_INTERRUPTIBLE);
 
        /* Arm the timer */
        if (timeout) {
@@ -1666,9 +1667,9 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
         */
 retry:
        q->key = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr, fshared, &q->key);
+       ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
        if (unlikely(ret != 0))
-               goto out;
+               return ret;
 
 retry_private:
        *hb = queue_lock(q);
@@ -1704,7 +1705,6 @@ static int futex_wait(u32 __user *uaddr, int fshared,
                      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
-       DECLARE_WAITQUEUE(wait, current);
        struct restart_block *restart;
        struct futex_hash_bucket *hb;
        struct futex_q q;
@@ -1733,7 +1733,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
                goto out;
 
        /* queue_me and wait for wakeup, timeout, or a signal. */
-       futex_wait_queue_me(hb, &q, to, &wait);
+       futex_wait_queue_me(hb, &q, to);
 
        /* If we were woken (and unqueued), we succeeded, whatever. */
        ret = 0;
@@ -1826,7 +1826,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
        q.rt_waiter = NULL;
 retry:
        q.key = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr, fshared, &q.key);
+       ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
        if (unlikely(ret != 0))
                goto out;
 
@@ -1929,21 +1929,6 @@ uaddr_faulted:
        goto retry;
 }
 
-static long futex_lock_pi_restart(struct restart_block *restart)
-{
-       u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
-       ktime_t t, *tp = NULL;
-       int fshared = restart->futex.flags & FLAGS_SHARED;
-
-       if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
-               t.tv64 = restart->futex.time;
-               tp = &t;
-       }
-       restart->fn = do_no_restart_syscall;
-
-       return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0);
-}
-
 /*
  * Userspace attempted a TID -> 0 atomic transition, and failed.
  * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1967,7 +1952,7 @@ retry:
        if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
                return -EPERM;
 
-       ret = get_futex_key(uaddr, fshared, &key);
+       ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
        if (unlikely(ret != 0))
                goto out;
 
@@ -2059,7 +2044,7 @@ pi_faulted:
  *
  * Returns
  *  0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?)
+ * <0 - -ETIMEDOUT or -ERESTARTNOINTR
  */
 static inline
 int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
@@ -2086,15 +2071,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 
                if (timeout && !timeout->task)
                        ret = -ETIMEDOUT;
-               else {
-                       /*
-                        * We expect signal_pending(current), but another
-                        * thread may have handled it for us already.
-                        */
-                       /* FIXME: ERESTARTSYS or ERESTARTNOINTR?  Do we care if
-                        * the user specified SA_RESTART or not? */
-                       ret = -ERESTARTSYS;
-               }
+               else
+                       ret = -ERESTARTNOINTR;
        }
        return ret;
 }
@@ -2147,13 +2125,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
        struct hrtimer_sleeper timeout, *to = NULL;
        struct rt_mutex_waiter rt_waiter;
        struct rt_mutex *pi_mutex = NULL;
-       DECLARE_WAITQUEUE(wait, current);
-       struct restart_block *restart;
        struct futex_hash_bucket *hb;
        union futex_key key2;
        struct futex_q q;
        int res, ret;
-       u32 uval;
 
        if (!bitset)
                return -EINVAL;
@@ -2179,19 +2154,17 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
        q.rt_waiter = &rt_waiter;
 
        key2 = FUTEX_KEY_INIT;
-       ret = get_futex_key(uaddr2, fshared, &key2);
+       ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
        if (unlikely(ret != 0))
                goto out;
 
        /* Prepare to wait on uaddr. */
        ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
-       if (ret) {
-               put_futex_key(fshared, &key2);
-               goto out;
-       }
+       if (ret)
+               goto out_key2;
 
        /* Queue the futex_q, drop the hb lock, wait for wakeup. */
-       futex_wait_queue_me(hb, &q, to, &wait);
+       futex_wait_queue_me(hb, &q, to);
 
        spin_lock(&hb->lock);
        ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
@@ -2254,34 +2227,22 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
                if (rt_mutex_owner(pi_mutex) == current)
                        rt_mutex_unlock(pi_mutex);
        } else if (ret == -EINTR) {
-               ret = -EFAULT;
-               if (get_user(uval, uaddr2))
-                       goto out_put_keys;
-
                /*
-                * We've already been requeued, so restart by calling
-                * futex_lock_pi() directly, rather then returning to this
-                * function.
+                * We've already been requeued, but we have no way to
+                * restart by calling futex_lock_pi() directly. We
+                * could restart the syscall, but that will look at
+                * the user space value and return right away. So we
+                * drop back with EWOULDBLOCK to tell user space that
+                * "val" has been changed. That's the same as what the
+                * restart of the syscall would do in
+                * futex_wait_setup().
                 */
-               ret = -ERESTART_RESTARTBLOCK;
-               restart = &current_thread_info()->restart_block;
-               restart->fn = futex_lock_pi_restart;
-               restart->futex.uaddr = (u32 *)uaddr2;
-               restart->futex.val = uval;
-               restart->futex.flags = 0;
-               if (abs_time) {
-                       restart->futex.flags |= FLAGS_HAS_TIMEOUT;
-                       restart->futex.time = abs_time->tv64;
-               }
-
-               if (fshared)
-                       restart->futex.flags |= FLAGS_SHARED;
-               if (clockrt)
-                       restart->futex.flags |= FLAGS_CLOCKRT;
+               ret = -EWOULDBLOCK;
        }
 
 out_put_keys:
        put_futex_key(fshared, &q.key);
+out_key2:
        put_futex_key(fshared, &key2);
 
 out:
@@ -2555,9 +2516,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
                ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
                                            clockrt, uaddr2);
                break;
-       case FUTEX_REQUEUE_PI:
-               ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1);
-               break;
        case FUTEX_CMP_REQUEUE_PI:
                ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
                                    1);
@@ -2596,8 +2554,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val,
         * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
         */
        if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
-           cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI ||
-           cmd == FUTEX_WAKE_OP)
+           cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
                val2 = (u32) (unsigned long) utime;
 
        return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);