*/
struct futex_q {
struct plist_node list;
- /* There can only be a single waiter */
- wait_queue_head_t waiter;
+ /* Waiter reference */
+ struct task_struct *task;
/* Which hash list lock to use: */
spinlock_t *lock_ptr;
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
- struct task_struct *task;
/* rt_waiter storage for requeue_pi: */
struct rt_mutex_waiter *rt_waiter;
* @uaddr: virtual address of the futex
* @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
* @key: address where result is stored.
+ * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE)
*
* Returns a negative error code or 0
* The key words are stored in *key on success.
*
* lock_page() might sleep, the caller should not hold a spinlock.
*/
-static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
+static int
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
* but access_ok() should be faster than find_vma()
*/
if (!fshared) {
- if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
+ if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
return -EFAULT;
key->private.mm = mm;
key->private.address = address;
}
again:
- err = get_user_pages_fast(address, 1, 0, &page);
+ err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
if (err < 0)
return err;
drop_futex_key_refs(key);
}
+/*
+ * fault_in_user_writeable - fault in user address and verify RW access
+ * @uaddr: pointer to faulting user space address
+ *
+ * Slow path to fixup the fault we just took in the atomic write
+ * access to @uaddr.
+ *
+ * We have no generic implementation of a non destructive write to the
+ * user address. We know that we faulted in the atomic pagefault
+ * disabled section so we can as well avoid the #PF overhead by
+ * calling get_user_pages() right away.
+ */
+static int fault_in_user_writeable(u32 __user *uaddr)
+{
+ int ret = get_user_pages(current, current->mm, (unsigned long)uaddr,
+ 1, 1, 0, NULL, NULL);
+ return ret < 0 ? ret : 0;
+}
+
/**
* futex_top_waiter() - Return the highest priority waiter on a futex
* @hb: the hash bucket the futex_q's reside in
/**
* futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
- * @uaddr: the pi futex user address
- * @hb: the pi futex hash bucket
- * @key: the futex key associated with uaddr and hb
- * @ps: the pi_state pointer where we store the result of the lookup
- * @task: the task to perform the atomic lock work for. This will be
- * "current" except in the case of requeue pi.
+ * @uaddr: the pi futex user address
+ * @hb: the pi futex hash bucket
+ * @key: the futex key associated with uaddr and hb
+ * @ps: the pi_state pointer where we store the result of the
+ * lookup
+ * @task: the task to perform the atomic lock work for. This will
+ * be "current" except in the case of requeue pi.
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
* Returns:
* 0 - ready to wait
static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
union futex_key *key,
struct futex_pi_state **ps,
- struct task_struct *task)
+ struct task_struct *task, int set_waiters)
{
int lock_taken, ret, ownerdied = 0;
u32 uval, newval, curval;
* the locks. It will most likely not succeed.
*/
newval = task_pid_vnr(task);
+ if (set_waiters)
+ newval |= FUTEX_WAITERS;
curval = cmpxchg_futex_value_locked(uaddr, 0, newval);
*/
static void wake_futex(struct futex_q *q)
{
- plist_del(&q->list, &q->list.plist);
+ struct task_struct *p = q->task;
+
/*
- * The lock in wake_up_all() is a crucial memory barrier after the
- * plist_del() and also before assigning to q->lock_ptr.
+ * We set q->lock_ptr = NULL _before_ we wake up the task. If
+ * a non futex wake up happens on another CPU then the task
+ * might exit and p would dereference a non existing task
+ * struct. Prevent this by holding a reference on p across the
+ * wake up.
*/
- wake_up(&q->waiter);
+ get_task_struct(p);
+
+ plist_del(&q->list, &q->list.plist);
/*
- * The waiting task can free the futex_q as soon as this is written,
- * without taking any locks. This must come last.
- *
- * A memory barrier is required here to prevent the following store to
- * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
- * end of wake_up() does not prevent this store from moving.
+ * The waiting task can free the futex_q as soon as
+ * q->lock_ptr = NULL is written, without taking any locks. A
+ * memory barrier is required here to prevent the following
+ * store to lock_ptr from getting ahead of the plist_del.
*/
smp_wmb();
q->lock_ptr = NULL;
+
+ wake_up_state(p, TASK_NORMAL);
+ put_task_struct(p);
}
static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
if (!bitset)
return -EINVAL;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
int ret, op_ret;
retry:
- ret = get_futex_key(uaddr1, fshared, &key1);
+ ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out_put_key1;
retry_private:
op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
- u32 dummy;
double_unlock_hb(hb1, hb2);
goto out_put_keys;
}
- ret = get_user(dummy, uaddr2);
+ ret = fault_in_user_writeable(uaddr2);
if (ret)
goto out_put_keys;
WARN_ON(!q->rt_waiter);
q->rt_waiter = NULL;
- wake_up(&q->waiter);
+ wake_up_state(q->task, TASK_NORMAL);
}
/**
* futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter
- * @pifutex: the user address of the to futex
- * @hb1: the from futex hash bucket, must be locked by the caller
- * @hb2: the to futex hash bucket, must be locked by the caller
- * @key1: the from futex key
- * @key2: the to futex key
+ * @pifutex: the user address of the to futex
+ * @hb1: the from futex hash bucket, must be locked by the caller
+ * @hb2: the to futex hash bucket, must be locked by the caller
+ * @key1: the from futex key
+ * @key2: the to futex key
+ * @ps: address to store the pi_state pointer
+ * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0)
*
* Try and get the lock on behalf of the top waiter if we can do it atomically.
- * Wake the top waiter if we succeed. hb1 and hb2 must be held by the caller.
+ * Wake the top waiter if we succeed. If the caller specified set_waiters,
+ * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit.
+ * hb1 and hb2 must be held by the caller.
*
* Returns:
* 0 - failed to acquire the lock atomicly
struct futex_hash_bucket *hb1,
struct futex_hash_bucket *hb2,
union futex_key *key1, union futex_key *key2,
- struct futex_pi_state **ps)
+ struct futex_pi_state **ps, int set_waiters)
{
- struct futex_q *top_waiter;
+ struct futex_q *top_waiter = NULL;
u32 curval;
int ret;
if (get_futex_value_locked(&curval, pifutex))
return -EFAULT;
+ /*
+ * Find the top_waiter and determine if there are additional waiters.
+ * If the caller intends to requeue more than 1 waiter to pifutex,
+ * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now,
+ * as we have means to handle the possible fault. If not, don't set
+ * the bit unecessarily as it will force the subsequent unlock to enter
+ * the kernel.
+ */
top_waiter = futex_top_waiter(hb1, key1);
/* There are no waiters, nothing for us to do. */
return 0;
/*
- * Either take the lock for top_waiter or set the FUTEX_WAITERS bit.
- * The pi_state is returned in ps in contended cases.
+ * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in
+ * the contended case or if set_waiters is 1. The pi_state is returned
+ * in ps in contended cases.
*/
- ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task);
+ ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
+ set_waiters);
if (ret == 1)
requeue_pi_wake_futex(top_waiter, key2);
pi_state = NULL;
}
- ret = get_futex_key(uaddr1, fshared, &key1);
+ ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
if (unlikely(ret != 0))
goto out;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2,
+ requeue_pi ? VERIFY_WRITE : VERIFY_READ);
if (unlikely(ret != 0))
goto out_put_key1;
}
if (requeue_pi && (task_count - nr_wake < nr_requeue)) {
- /* Attempt to acquire uaddr2 and wake the top_waiter. */
+ /*
+ * Attempt to acquire uaddr2 and wake the top waiter. If we
+ * intend to requeue waiters, force setting the FUTEX_WAITERS
+ * bit. We force this here where we are able to easily handle
+ * faults rather in the requeue loop below.
+ */
ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1,
- &key2, &pi_state);
+ &key2, &pi_state, nr_requeue);
/*
* At this point the top_waiter has either taken uaddr2 or is
double_unlock_hb(hb1, hb2);
put_futex_key(fshared, &key2);
put_futex_key(fshared, &key1);
- ret = get_user(curval2, uaddr2);
+ ret = fault_in_user_writeable(uaddr2);
if (!ret)
goto retry;
goto out;
out_unlock:
double_unlock_hb(hb1, hb2);
- /* drop_futex_key_refs() must be called outside the spinlocks. */
+ /*
+ * drop_futex_key_refs() must be called outside the spinlocks. During
+ * the requeue we moved futex_q's from the hash bucket at key1 to the
+ * one at key2 and updated their key pointer. We no longer need to
+ * hold the references to key1.
+ */
while (--drop_count >= 0)
drop_futex_key_refs(&key1);
{
struct futex_hash_bucket *hb;
- init_waitqueue_head(&q->waiter);
-
get_futex_key_refs(&q->key);
hb = hash_futex(&q->key);
q->lock_ptr = &hb->lock;
handle_fault:
spin_unlock(q->lock_ptr);
- ret = get_user(uval, uaddr);
+ ret = fault_in_user_writeable(uaddr);
spin_lock(q->lock_ptr);
#define FLAGS_HAS_TIMEOUT 0x04
static long futex_wait_restart(struct restart_block *restart);
-static long futex_lock_pi_restart(struct restart_block *restart);
/**
* fixup_owner() - Post lock pi_state and corner case management
* @hb: the futex hash bucket, must be locked by the caller
* @q: the futex_q to queue up on
* @timeout: the prepared hrtimer_sleeper, or null for no timeout
- * @wait: the wait_queue to add to the futex_q after queueing in the hb
*/
static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
- struct hrtimer_sleeper *timeout,
- wait_queue_t *wait)
+ struct hrtimer_sleeper *timeout)
{
queue_me(q, hb);
* There might have been scheduling since the queue_me(), as we
* cannot hold a spinlock across the get_user() in case it
* faults, and we cannot just set TASK_INTERRUPTIBLE state when
- * queueing ourselves into the futex hash. This code thus has to
+ * queueing ourselves into the futex hash. This code thus has to
* rely on the futex_wake() code removing us from hash when it
* wakes us up.
*/
-
- /* add_wait_queue is the barrier after __set_current_state. */
- __set_current_state(TASK_INTERRUPTIBLE);
-
- /*
- * Add current as the futex_q waiter. We don't remove ourselves from
- * the wait_queue because we are the only user of it.
- */
- add_wait_queue(&q->waiter, wait);
+ set_current_state(TASK_INTERRUPTIBLE);
/* Arm the timer */
if (timeout) {
*/
retry:
q->key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q->key);
+ ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
if (unlikely(ret != 0))
- goto out;
+ return ret;
retry_private:
*hb = queue_lock(q);
u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
{
struct hrtimer_sleeper timeout, *to = NULL;
- DECLARE_WAITQUEUE(wait, current);
struct restart_block *restart;
struct futex_hash_bucket *hb;
struct futex_q q;
goto out;
/* queue_me and wait for wakeup, timeout, or a signal. */
- futex_wait_queue_me(hb, &q, to, &wait);
+ futex_wait_queue_me(hb, &q, to);
/* If we were woken (and unqueued), we succeeded, whatever. */
ret = 0;
{
struct hrtimer_sleeper timeout, *to = NULL;
struct futex_hash_bucket *hb;
- u32 uval;
struct futex_q q;
int res, ret;
q.rt_waiter = NULL;
retry:
q.key = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr, fshared, &q.key);
+ ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
retry_private:
hb = queue_lock(&q);
- ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current);
+ ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0);
if (unlikely(ret)) {
switch (ret) {
case 1:
return ret != -EINTR ? ret : -ERESTARTNOINTR;
uaddr_faulted:
- /*
- * We have to r/w *(int __user *)uaddr, and we have to modify it
- * atomically. Therefore, if we continue to fault after get_user()
- * below, we need to handle the fault ourselves, while still holding
- * the mmap_sem. This can occur if the uaddr is under contention as
- * we have to drop the mmap_sem in order to call get_user().
- */
queue_unlock(&q, hb);
- ret = get_user(uval, uaddr);
+ ret = fault_in_user_writeable(uaddr);
if (ret)
goto out_put_key;
goto retry;
}
-static long futex_lock_pi_restart(struct restart_block *restart)
-{
- u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
- ktime_t t, *tp = NULL;
- int fshared = restart->futex.flags & FLAGS_SHARED;
-
- if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
- t.tv64 = restart->futex.time;
- tp = &t;
- }
- restart->fn = do_no_restart_syscall;
-
- return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0);
-}
-
/*
* Userspace attempted a TID -> 0 atomic transition, and failed.
* This is the in-kernel slowpath: we look up the PI state (if any),
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
return -EPERM;
- ret = get_futex_key(uaddr, fshared, &key);
+ ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
return ret;
pi_faulted:
- /*
- * We have to r/w *(int __user *)uaddr, and we have to modify it
- * atomically. Therefore, if we continue to fault after get_user()
- * below, we need to handle the fault ourselves, while still holding
- * the mmap_sem. This can occur if the uaddr is under contention as
- * we have to drop the mmap_sem in order to call get_user().
- */
spin_unlock(&hb->lock);
put_futex_key(fshared, &key);
- ret = get_user(uval, uaddr);
+ ret = fault_in_user_writeable(uaddr);
if (!ret)
goto retry;
*
* Returns
* 0 - no early wakeup detected
- * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?)
+ * <0 - -ETIMEDOUT or -ERESTARTNOINTR
*/
static inline
int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
if (timeout && !timeout->task)
ret = -ETIMEDOUT;
- else {
- /*
- * We expect signal_pending(current), but another
- * thread may have handled it for us already.
- */
- /* FIXME: ERESTARTSYS or ERESTARTNOINTR? Do we care if
- * the user specified SA_RESTART or not? */
- ret = -ERESTARTSYS;
- }
+ else
+ ret = -ERESTARTNOINTR;
}
return ret;
}
struct hrtimer_sleeper timeout, *to = NULL;
struct rt_mutex_waiter rt_waiter;
struct rt_mutex *pi_mutex = NULL;
- DECLARE_WAITQUEUE(wait, current);
- struct restart_block *restart;
struct futex_hash_bucket *hb;
union futex_key key2;
struct futex_q q;
int res, ret;
- u32 uval;
if (!bitset)
return -EINVAL;
q.rt_waiter = &rt_waiter;
key2 = FUTEX_KEY_INIT;
- ret = get_futex_key(uaddr2, fshared, &key2);
+ ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
if (unlikely(ret != 0))
goto out;
/* Prepare to wait on uaddr. */
ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
- if (ret) {
- put_futex_key(fshared, &key2);
- goto out;
- }
+ if (ret)
+ goto out_key2;
/* Queue the futex_q, drop the hb lock, wait for wakeup. */
- futex_wait_queue_me(hb, &q, to, &wait);
+ futex_wait_queue_me(hb, &q, to);
spin_lock(&hb->lock);
ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to);
if (rt_mutex_owner(pi_mutex) == current)
rt_mutex_unlock(pi_mutex);
} else if (ret == -EINTR) {
- ret = -EFAULT;
- if (get_user(uval, uaddr2))
- goto out_put_keys;
-
/*
- * We've already been requeued, so restart by calling
- * futex_lock_pi() directly, rather then returning to this
- * function.
+ * We've already been requeued, but we have no way to
+ * restart by calling futex_lock_pi() directly. We
+ * could restart the syscall, but that will look at
+ * the user space value and return right away. So we
+ * drop back with EWOULDBLOCK to tell user space that
+ * "val" has been changed. That's the same what the
+ * restart of the syscall would do in
+ * futex_wait_setup().
*/
- ret = -ERESTART_RESTARTBLOCK;
- restart = ¤t_thread_info()->restart_block;
- restart->fn = futex_lock_pi_restart;
- restart->futex.uaddr = (u32 *)uaddr2;
- restart->futex.val = uval;
- restart->futex.flags = 0;
- if (abs_time) {
- restart->futex.flags |= FLAGS_HAS_TIMEOUT;
- restart->futex.time = abs_time->tv64;
- }
-
- if (fshared)
- restart->futex.flags |= FLAGS_SHARED;
- if (clockrt)
- restart->futex.flags |= FLAGS_CLOCKRT;
+ ret = -EWOULDBLOCK;
}
out_put_keys:
put_futex_key(fshared, &q.key);
+out_key2:
put_futex_key(fshared, &key2);
out:
ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3,
clockrt, uaddr2);
break;
- case FUTEX_REQUEUE_PI:
- ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1);
- break;
case FUTEX_CMP_REQUEUE_PI:
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3,
1);
* number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP.
*/
if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE ||
- cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI ||
- cmd == FUTEX_WAKE_OP)
+ cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP)
val2 = (u32) (unsigned long) utime;
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);