X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;f=kernel%2Ffutex.c;h=e7a35f1039e785464b318d3713557e9d2db6a591;hb=d7e81c269db899b800e0963dc4aceece1f82a680;hp=2aa216e5b594f30f0f726902b59b9ba56908dc6d;hpb=1c840c14906d4ddf66c1f4f5daea059aad951c82;p=safe%2Fjmp%2Flinux-2.6

diff --git a/kernel/futex.c b/kernel/futex.c
index 2aa216e..e7a35f1 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -89,33 +89,36 @@ struct futex_pi_state {
 	union futex_key key;
 };
 
-/*
- * We use this hashed waitqueue instead of a normal wait_queue_t, so
+/**
+ * struct futex_q - The hashed futex queue entry, one per waiting task
+ * @task:		the task waiting on the futex
+ * @lock_ptr:		the hash bucket lock
+ * @key:		the key the futex is hashed on
+ * @pi_state:		optional priority inheritance state
+ * @rt_waiter:		rt_waiter storage for use with requeue_pi
+ * @requeue_pi_key:	the requeue_pi target futex key
+ * @bitset:		bitset for the optional bitmasked wakeup
+ *
+ * We use this hashed waitqueue, instead of a normal wait_queue_t, so
  * we can wake only the relevant ones (hashed queues may be shared).
 *
 * A futex_q has a woken state, just like tasks have TASK_RUNNING.
 * It is considered woken when plist_node_empty(&q->list) || q->lock_ptr == 0.
 * The order of wakeup is always to make the first condition true, then
- * wake up q->waiter, then make the second condition true.
+ * the second.
+ *
+ * PI futexes are typically woken before they are removed from the hash list via
+ * the rt_mutex code. See unqueue_me_pi().
 */
 struct futex_q {
 	struct plist_node list;
-	/* Waiter reference */
-	struct task_struct *task;
 
-	/* Which hash list lock to use: */
+	struct task_struct *task;
 	spinlock_t *lock_ptr;
-
-	/* Key which the futex is hashed on: */
 	union futex_key key;
-
-	/* Optional priority inheritance state: */
 	struct futex_pi_state *pi_state;
-
-	/* rt_waiter storage for requeue_pi: */
 	struct rt_mutex_waiter *rt_waiter;
-
-	/* Bitset for the optional bitmasked wakeup */
+	union futex_key *requeue_pi_key;
 	u32 bitset;
 };
 
@@ -147,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
  */
 static inline int match_futex(union futex_key *key1, union futex_key *key2)
 {
-	return (key1->both.word == key2->both.word
+	return (key1 && key2
+		&& key1->both.word == key2->both.word
 		&& key1->both.ptr == key2->both.ptr
 		&& key1->both.offset == key2->both.offset);
 }
@@ -195,11 +199,10 @@ static void drop_futex_key_refs(union futex_key *key)
 }
 
 /**
- * get_futex_key - Get parameters which are the keys for a futex.
- * @uaddr: virtual address of the futex
- * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
- * @key: address where result is stored.
- * @rw: mapping needs to be read/write (values: VERIFY_READ, VERIFY_WRITE)
+ * get_futex_key() - Get parameters which are the keys for a futex
+ * @uaddr:	virtual address of the futex
+ * @fshared:	0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
+ * @key:	address where result is stored.
 *
 * Returns a negative error code or 0
 * The key words are stored in *key on success.
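
(Aside, not part of the patch: everything below is driven from user space by the futex(2) syscall. As an illustrative sketch, with wrapper names of our own choosing, the two basic operations the futex_q comment above is describing look like this:)

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>

    /* Block while *uaddr == val. Returns 0 on wakeup; fails with
     * EWOULDBLOCK if the value changed before we could sleep, or
     * EINTR on a signal. */
    static int futex_wait(uint32_t *uaddr, uint32_t val)
    {
            return syscall(SYS_futex, uaddr, FUTEX_WAIT, val,
                           NULL, NULL, 0);
    }

    /* Wake at most nr tasks queued on uaddr's key. */
    static int futex_wake(uint32_t *uaddr, int nr)
    {
            return syscall(SYS_futex, uaddr, FUTEX_WAKE, nr,
                           NULL, NULL, 0);
    }

(The value recheck inside FUTEX_WAIT, done under the hash bucket lock, is what makes the two-condition "woken" definition above race-free.)
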
@@ -211,7 +214,7 @@ static void drop_futex_key_refs(union futex_key *key)
 * lock_page() might sleep, the caller should not hold a spinlock.
 */
 static int
-get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
+get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 {
	unsigned long address = (unsigned long)uaddr;
	struct mm_struct *mm = current->mm;
@@ -234,7 +237,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
	 * but access_ok() should be faster than find_vma()
	 */
	if (!fshared) {
-		if (unlikely(!access_ok(rw, uaddr, sizeof(u32))))
+		if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))))
			return -EFAULT;
		key->private.mm = mm;
		key->private.address = address;
@@ -243,10 +246,11 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
	}
 
 again:
-	err = get_user_pages_fast(address, 1, rw == VERIFY_WRITE, &page);
+	err = get_user_pages_fast(address, 1, 1, &page);
	if (err < 0)
		return err;
 
+	page = compound_head(page);
	lock_page(page);
	if (!page->mapping) {
		unlock_page(page);
@@ -285,9 +289,34 @@ void put_futex_key(int fshared, union futex_key *key)
 }
 
 /**
+ * fault_in_user_writeable() - Fault in user address and verify RW access
+ * @uaddr:	pointer to faulting user space address
+ *
+ * Slow path to fixup the fault we just took in the atomic write
+ * access to @uaddr.
+ *
+ * We have no generic implementation of a non-destructive write to the
+ * user address. We know that we faulted in the atomic pagefault
+ * disabled section so we can as well avoid the #PF overhead by
+ * calling get_user_pages() right away.
+ */
+static int fault_in_user_writeable(u32 __user *uaddr)
+{
+	struct mm_struct *mm = current->mm;
+	int ret;
+
+	down_read(&mm->mmap_sem);
+	ret = get_user_pages(current, mm, (unsigned long)uaddr,
+			     1, 1, 0, NULL, NULL);
+	up_read(&mm->mmap_sem);
+
+	return ret < 0 ? ret : 0;
+}
+
+/**
  * futex_top_waiter() - Return the highest priority waiter on a futex
- * @hb:     the hash bucket the futex_q's reside in
- * @key:    the futex key (to distinguish it from other futex futex_q's)
+ * @hb:		the hash bucket the futex_q's reside in
+ * @key:	the futex key (to distinguish it from other futex futex_q's)
 *
 * Must be called with the hb lock held.
 */
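
(Aside: the fshared/!fshared split in get_futex_key() above is what user space selects with FUTEX_PRIVATE_FLAG; with it set the kernel keys on (current->mm, uaddr) and never pins the page. A sketch, wrapper name ours:)

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>

    /* Process-private wait: get_futex_key() only does access_ok()
     * and keys on (mm, address); no get_user_pages_fast(). */
    static int futex_wait_private(uint32_t *uaddr, uint32_t val)
    {
            return syscall(SYS_futex, uaddr,
                           FUTEX_WAIT | FUTEX_PRIVATE_FLAG,
                           val, NULL, NULL, 0);
    }

(Futexes placed in MAP_SHARED memory must omit the flag and take the page-based key path.)
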
@@ -372,9 +401,9 @@ static void free_pi_state(struct futex_pi_state *pi_state)
	 * and has cleaned up the pi_state already
	 */
	if (pi_state->owner) {
-		spin_lock_irq(&pi_state->owner->pi_lock);
+		raw_spin_lock_irq(&pi_state->owner->pi_lock);
		list_del_init(&pi_state->list);
-		spin_unlock_irq(&pi_state->owner->pi_lock);
+		raw_spin_unlock_irq(&pi_state->owner->pi_lock);
 
		rt_mutex_proxy_unlock(&pi_state->pi_mutex, pi_state->owner);
	}
@@ -439,18 +468,18 @@ void exit_pi_state_list(struct task_struct *curr)
	 * pi_state_list anymore, but we have to be careful
	 * versus waiters unqueueing themselves:
	 */
-	spin_lock_irq(&curr->pi_lock);
+	raw_spin_lock_irq(&curr->pi_lock);
	while (!list_empty(head)) {
 
		next = head->next;
		pi_state = list_entry(next, struct futex_pi_state, list);
		key = pi_state->key;
		hb = hash_futex(&key);
-		spin_unlock_irq(&curr->pi_lock);
+		raw_spin_unlock_irq(&curr->pi_lock);
 
		spin_lock(&hb->lock);
 
-		spin_lock_irq(&curr->pi_lock);
+		raw_spin_lock_irq(&curr->pi_lock);
		/*
		 * We dropped the pi-lock, so re-check whether this
		 * task still owns the PI-state:
@@ -464,15 +493,15 @@ void exit_pi_state_list(struct task_struct *curr)
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
		pi_state->owner = NULL;
-		spin_unlock_irq(&curr->pi_lock);
+		raw_spin_unlock_irq(&curr->pi_lock);
 
		rt_mutex_unlock(&pi_state->pi_mutex);
 
		spin_unlock(&hb->lock);
 
-		spin_lock_irq(&curr->pi_lock);
+		raw_spin_lock_irq(&curr->pi_lock);
	}
-	spin_unlock_irq(&curr->pi_lock);
+	raw_spin_unlock_irq(&curr->pi_lock);
 }
 
 static int
@@ -501,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
			return -EINVAL;
 
		WARN_ON(!atomic_read(&pi_state->refcount));
-		WARN_ON(pid && pi_state->owner &&
-			pi_state->owner->pid != pid);
+
+		/*
+		 * When pi_state->owner is NULL then the owner died
+		 * and another waiter is on the fly. pi_state->owner
+		 * is fixed up by the task which acquires
+		 * pi_state->rt_mutex.
+		 *
+		 * We do not check for pid == 0 which can happen when
+		 * the owner died and robust_list_exit() cleared the
+		 * TID.
+		 */
+		if (pid && pi_state->owner) {
+			/*
+			 * Bail out if user space manipulated the
+			 * futex value.
+			 */
+			if (pid != task_pid_vnr(pi_state->owner))
+				return -EINVAL;
+		}
 
		atomic_inc(&pi_state->refcount);
		*ps = pi_state;
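
(Aside: the pid that lookup_pi_state() now validates is whatever user space stored in the futex word when it took the lock; the kernel never trusts it, which is exactly why the check above exists. The user-space half of the PI protocol, as a rough sketch with names of our own:)

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>

    /* PI futex word: owner TID in FUTEX_TID_MASK; FUTEX_WAITERS and
     * FUTEX_OWNER_DIED are maintained by the kernel. */
    static int pi_lock(uint32_t *uaddr)
    {
            uint32_t tid = syscall(SYS_gettid);

            /* Uncontended fast path: 0 -> TID, no syscall. */
            if (__sync_bool_compare_and_swap(uaddr, 0, tid))
                    return 0;

            /* Contended: the kernel sets FUTEX_WAITERS, creates the
             * pi_state that lookup_pi_state() cross-checks, and
             * boosts the owner through the rt_mutex. */
            return syscall(SYS_futex, uaddr, FUTEX_LOCK_PI, 0,
                           NULL, NULL, 0);
    }
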
@@ -527,7 +573,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
	 * change of the task flags, we do this protected by
	 * p->pi_lock:
	 */
-	spin_lock_irq(&p->pi_lock);
+	raw_spin_lock_irq(&p->pi_lock);
	if (unlikely(p->flags & PF_EXITING)) {
		/*
		 * The task is on the way out. When PF_EXITPIDONE is
@@ -536,7 +582,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
		 */
		int ret = (p->flags & PF_EXITPIDONE) ?
				-ESRCH : -EAGAIN;
 
-		spin_unlock_irq(&p->pi_lock);
+		raw_spin_unlock_irq(&p->pi_lock);
		put_task_struct(p);
		return ret;
	}
@@ -555,7 +601,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &p->pi_state_list);
	pi_state->owner = p;
-	spin_unlock_irq(&p->pi_lock);
+	raw_spin_unlock_irq(&p->pi_lock);
 
	put_task_struct(p);
 
@@ -565,7 +611,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
 }
 
 /**
- * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex
+ * futex_lock_pi_atomic() - Atomic work required to acquire a pi aware futex
 * @uaddr:		the pi futex user address
 * @hb:			the pi futex hash bucket
 * @key:		the futex key associated with uaddr and hb
@@ -729,7 +775,14 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
	if (!pi_state)
		return -EINVAL;
 
-	spin_lock(&pi_state->pi_mutex.wait_lock);
+	/*
+	 * If current does not own the pi_state then the futex is
+	 * inconsistent and user space fiddled with the futex value.
+	 */
+	if (pi_state->owner != current)
+		return -EINVAL;
+
+	raw_spin_lock(&pi_state->pi_mutex.wait_lock);
	new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 
	/*
@@ -758,23 +811,23 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
		else if (curval != uval)
			ret = -EINVAL;
		if (ret) {
-			spin_unlock(&pi_state->pi_mutex.wait_lock);
+			raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
			return ret;
		}
	}
 
-	spin_lock_irq(&pi_state->owner->pi_lock);
+	raw_spin_lock_irq(&pi_state->owner->pi_lock);
	WARN_ON(list_empty(&pi_state->list));
	list_del_init(&pi_state->list);
-	spin_unlock_irq(&pi_state->owner->pi_lock);
+	raw_spin_unlock_irq(&pi_state->owner->pi_lock);
 
-	spin_lock_irq(&new_owner->pi_lock);
+	raw_spin_lock_irq(&new_owner->pi_lock);
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &new_owner->pi_state_list);
	pi_state->owner = new_owner;
-	spin_unlock_irq(&new_owner->pi_lock);
+	raw_spin_unlock_irq(&new_owner->pi_lock);
 
-	spin_unlock(&pi_state->pi_mutex.wait_lock);
+	raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
	rt_mutex_unlock(&pi_state->pi_mutex);
 
	return 0;
@@ -836,7 +889,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
	if (!bitset)
		return -EINVAL;
 
-	ret = get_futex_key(uaddr, fshared, &key, VERIFY_READ);
+	ret = get_futex_key(uaddr, fshared, &key);
	if (unlikely(ret != 0))
		goto out;
 
@@ -882,21 +935,20 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
	int ret, op_ret;
 
 retry:
-	ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
+	ret = get_futex_key(uaddr1, fshared, &key1);
	if (unlikely(ret != 0))
		goto out;
-	ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+	ret = get_futex_key(uaddr2, fshared, &key2);
	if (unlikely(ret != 0))
		goto out_put_key1;
 
	hb1 = hash_futex(&key1);
	hb2 = hash_futex(&key2);
 
-	double_lock_hb(hb1, hb2);
 retry_private:
+	double_lock_hb(hb1, hb2);
	op_ret = futex_atomic_op_inuser(op, uaddr2);
	if (unlikely(op_ret < 0)) {
-		u32 dummy;
 
		double_unlock_hb(hb1, hb2);
 
@@ -914,7 +966,7 @@ retry_private:
			goto out_put_keys;
		}
 
-		ret = get_user(dummy, uaddr2);
+		ret = fault_in_user_writeable(uaddr2);
		if (ret)
			goto out_put_keys;
 
@@ -980,7 +1032,7 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
		plist_add(&q->list, &hb2->chain);
		q->lock_ptr = &hb2->lock;
 #ifdef CONFIG_DEBUG_PI_LIST
-		q->list.plist.lock = &hb2->lock;
+		q->list.plist.spinlock = &hb2->lock;
 #endif
	}
	get_futex_key_refs(key2);
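
(Aside: to see when user space actually reaches futex_wake() above, here is the classic three-state mutex after Drepper's "Futexes Are Tricky"; a sketch, not code from this file. 0 = free, 1 = locked, 2 = locked with possible waiters:)

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>

    static void mutex_lock(uint32_t *m)
    {
            uint32_t c = __sync_val_compare_and_swap(m, 0, 1);

            if (c != 0) {
                    if (c != 2)
                            c = __sync_lock_test_and_set(m, 2); /* xchg */
                    while (c != 0) {
                            /* Sleep only while the word still reads 2. */
                            syscall(SYS_futex, m, FUTEX_WAIT, 2,
                                    NULL, NULL, 0);
                            c = __sync_lock_test_and_set(m, 2);
                    }
            }
    }

    static void mutex_unlock(uint32_t *m)
    {
            /* Dropping from 2 means waiters may be queued in the
             * kernel: wake exactly one. Dropping from 1 needs no
             * syscall at all. */
            if (__sync_fetch_and_sub(m, 1) != 1) {
                    *m = 0;
                    syscall(SYS_futex, m, FUTEX_WAKE, 1, NULL, NULL, 0);
            }
    }
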
@@ -989,19 +1041,22 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1,
 
 /**
 * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue
- * q:	the futex_q
- * key:	the key of the requeue target futex
+ * @q:	the futex_q
+ * @key:	the key of the requeue target futex
+ * @hb:	the hash_bucket of the requeue target futex
 *
 * During futex_requeue, with requeue_pi=1, it is possible to acquire the
 * target futex if it is uncontended or via a lock steal.  Set the futex_q key
 * to the requeue target futex so the waiter can detect the wakeup on the right
 * futex, but remove it from the hb and NULL the rt_waiter so it can detect
- * atomic lock acquisition.  Must be called with the q->lock_ptr held.
+ * atomic lock acquisition.  Set the q->lock_ptr to the requeue target hb->lock
+ * to protect access to the pi_state to fixup the owner later.  Must be called
+ * with both q->lock_ptr and hb->lock held.
 */
 static inline
-void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
+void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
+			   struct futex_hash_bucket *hb)
 {
-	drop_futex_key_refs(&q->key);
	get_futex_key_refs(key);
	q->key = *key;
 
@@ -1011,6 +1066,11 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key)
	WARN_ON(!q->rt_waiter);
	q->rt_waiter = NULL;
 
+	q->lock_ptr = &hb->lock;
+#ifdef CONFIG_DEBUG_PI_LIST
+	q->list.plist.spinlock = &hb->lock;
+#endif
+
	wake_up_state(q->task, TASK_NORMAL);
 }
 
@@ -1061,6 +1121,10 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
	if (!top_waiter)
		return 0;
 
+	/* Ensure we requeue to the expected futex. */
+	if (!match_futex(top_waiter->requeue_pi_key, key2))
+		return -EINVAL;
+
	/*
	 * Try to take the lock for top_waiter.  Set the FUTEX_WAITERS bit in
	 * the contended case or if set_waiters is 1.  The pi_state is returned
@@ -1069,7 +1133,7 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex,
	 */
	ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task,
				   set_waiters);
	if (ret == 1)
-		requeue_pi_wake_futex(top_waiter, key2);
+		requeue_pi_wake_futex(top_waiter, key2, hb2);
 
	return ret;
 }
@@ -1133,11 +1197,10 @@ retry:
		pi_state = NULL;
	}
 
-	ret = get_futex_key(uaddr1, fshared, &key1, VERIFY_READ);
+	ret = get_futex_key(uaddr1, fshared, &key1);
	if (unlikely(ret != 0))
		goto out;
-	ret = get_futex_key(uaddr2, fshared, &key2,
-			    requeue_pi ? VERIFY_WRITE : VERIFY_READ);
+	ret = get_futex_key(uaddr2, fshared, &key2);
	if (unlikely(ret != 0))
		goto out_put_key1;
 
@@ -1190,6 +1253,7 @@ retry_private:
		 */
		if (ret == 1) {
			WARN_ON(pi_state);
+			drop_count++;
			task_count++;
			ret = get_futex_value_locked(&curval2, uaddr2);
			if (!ret)
@@ -1204,7 +1268,7 @@ retry_private:
			double_unlock_hb(hb1, hb2);
			put_futex_key(fshared, &key2);
			put_futex_key(fshared, &key1);
-			ret = get_user(curval2, uaddr2);
+			ret = fault_in_user_writeable(uaddr2);
			if (!ret)
				goto retry;
			goto out;
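
(Aside: the requeue machinery exists for one caller pattern, condition variables. A broadcast that woke every waiter would stampede the mutex; instead it wakes one and requeues the rest. An illustrative non-PI sketch, names ours:)

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>
    #include <limits.h>

    /* Wake one waiter on cond, move up to INT_MAX others onto the
     * mutex word; fails with EAGAIN if *cond no longer equals seq. */
    static int cond_broadcast(uint32_t *cond, uint32_t seq,
                              uint32_t *mutex)
    {
            return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE, 1,
                           (unsigned long)INT_MAX, mutex, seq);
    }
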
@@ -1228,8 +1292,15 @@ retry_private:
		if (!match_futex(&this->key, &key1))
			continue;
 
-		WARN_ON(!requeue_pi && this->rt_waiter);
-		WARN_ON(requeue_pi && !this->rt_waiter);
+		/*
+		 * FUTEX_WAIT_REQUEUE_PI and FUTEX_CMP_REQUEUE_PI should always
+		 * be paired with each other and no other futex ops.
+		 */
+		if ((requeue_pi && !this->rt_waiter) ||
+		    (!requeue_pi && this->rt_waiter)) {
+			ret = -EINVAL;
+			break;
+		}
 
		/*
		 * Wake nr_wake waiters.  For requeue_pi, if we acquired the
@@ -1241,6 +1312,12 @@ retry_private:
			continue;
		}
 
+		/* Ensure we requeue to the expected futex for requeue_pi. */
+		if (requeue_pi && !match_futex(this->requeue_pi_key, &key2)) {
+			ret = -EINVAL;
+			break;
+		}
+
		/*
		 * Requeue nr_requeue waiters and possibly one more in the case
		 * of requeue_pi if we couldn't acquire the lock atomically.
@@ -1254,7 +1331,8 @@ retry_private:
							this->task, 1);
			if (ret == 1) {
				/* We got the lock. */
-				requeue_pi_wake_futex(this, &key2);
+				requeue_pi_wake_futex(this, &key2, hb2);
+				drop_count++;
				continue;
			} else if (ret) {
				/* -EDEADLK */
@@ -1302,6 +1380,25 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
	return hb;
 }
 
+static inline void
+queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
+{
+	spin_unlock(&hb->lock);
+	drop_futex_key_refs(&q->key);
+}
+
+/**
+ * queue_me() - Enqueue the futex_q on the futex_hash_bucket
+ * @q:	The futex_q to enqueue
+ * @hb:	The destination hash bucket
+ *
+ * The hb->lock must be held by the caller, and is released here. A call to
+ * queue_me() is typically paired with exactly one call to unqueue_me(). The
+ * exceptions involve the PI related operations, which may use unqueue_me_pi()
+ * or nothing if the unqueue is done as part of the wake process and the unqueue
+ * state is implicit in the state of the woken task (see futex_wait_requeue_pi()
+ * for an example).
+ */
 static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 {
	int prio;
@@ -1318,26 +1415,24 @@ static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
 
	plist_node_init(&q->list, prio);
 #ifdef CONFIG_DEBUG_PI_LIST
-	q->list.plist.lock = &hb->lock;
+	q->list.plist.spinlock = &hb->lock;
 #endif
	plist_add(&q->list, &hb->chain);
	q->task = current;
	spin_unlock(&hb->lock);
 }
 
-static inline void
-queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb)
-{
-	spin_unlock(&hb->lock);
-	drop_futex_key_refs(&q->key);
-}
-
-/*
- * queue_me and unqueue_me must be called as a pair, each
- * exactly once.  They are called with the hashed spinlock held.
+/**
+ * unqueue_me() - Remove the futex_q from its futex_hash_bucket
+ * @q:	The futex_q to unqueue
+ *
+ * The q->lock_ptr must not be held by the caller. A call to unqueue_me() must
+ * be paired with exactly one earlier call to queue_me().
+ *
+ * Returns:
+ *   1 - if the futex_q was still queued (and we unqueued it)
+ *   0 - if the futex_q was already removed by the waking thread
 */
-
-/* Return 1 if we were still queued (ie. 0 means we were woken) */
 static int unqueue_me(struct futex_q *q)
 {
	spinlock_t *lock_ptr;
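
(Aside: unqueue_me()'s 1/0 return contract has a user-space mirror. A waiter can always be woken between its last value check and the syscall, or wake spuriously, so every FUTEX_WAIT sits in a recheck loop. A sketch, names ours:)

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>

    static void wait_for_flag(volatile uint32_t *flag)
    {
            while (*flag == 0) {
                    /* EWOULDBLOCK (flag already set) and EINTR both
                     * just fall back into the recheck, much like
                     * unqueue_me() finding the futex_q already gone. */
                    syscall(SYS_futex, (uint32_t *)flag, FUTEX_WAIT, 0,
                            NULL, NULL, 0);
            }
    }

    static void set_flag(volatile uint32_t *flag)
    {
            *flag = 1;
            __sync_synchronize();   /* store visible before the wake */
            syscall(SYS_futex, (uint32_t *)flag, FUTEX_WAKE, 1,
                    NULL, NULL, 0);
    }
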
@@ -1455,18 +1550,18 @@ retry:
	 * itself.
	 */
	if (pi_state->owner != NULL) {
-		spin_lock_irq(&pi_state->owner->pi_lock);
+		raw_spin_lock_irq(&pi_state->owner->pi_lock);
		WARN_ON(list_empty(&pi_state->list));
		list_del_init(&pi_state->list);
-		spin_unlock_irq(&pi_state->owner->pi_lock);
+		raw_spin_unlock_irq(&pi_state->owner->pi_lock);
	}
 
	pi_state->owner = newowner;
 
-	spin_lock_irq(&newowner->pi_lock);
+	raw_spin_lock_irq(&newowner->pi_lock);
	WARN_ON(!list_empty(&pi_state->list));
	list_add(&pi_state->list, &newowner->pi_state_list);
-	spin_unlock_irq(&newowner->pi_lock);
+	raw_spin_unlock_irq(&newowner->pi_lock);
	return 0;
 
	/*
@@ -1482,7 +1577,7 @@ retry:
 handle_fault:
	spin_unlock(q->lock_ptr);
 
-	ret = get_user(uval, uaddr);
+	ret = fault_in_user_writeable(uaddr);
 
	spin_lock(q->lock_ptr);
 
@@ -1507,7 +1602,6 @@ handle_fault:
 #define FLAGS_HAS_TIMEOUT	0x04
 
 static long futex_wait_restart(struct restart_block *restart);
-static long futex_lock_pi_restart(struct restart_block *restart);
 
 /**
 * fixup_owner() - Post lock pi_state and corner case management
@@ -1591,17 +1685,14 @@ out:
 static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
				struct hrtimer_sleeper *timeout)
 {
-	queue_me(q, hb);
-
	/*
-	 * There might have been scheduling since the queue_me(), as we
-	 * cannot hold a spinlock across the get_user() in case it
-	 * faults, and we cannot just set TASK_INTERRUPTIBLE state when
-	 * queueing ourselves into the futex hash. This code thus has to
-	 * rely on the futex_wake() code removing us from hash when it
-	 * wakes us up.
+	 * The task state is guaranteed to be set before another task can
+	 * wake it. set_current_state() is implemented using set_mb() and
+	 * queue_me() calls spin_unlock() upon completion, both serializing
+	 * access to the hash list and forcing another memory barrier.
	 */
	set_current_state(TASK_INTERRUPTIBLE);
+	queue_me(q, hb);
 
	/* Arm the timer */
	if (timeout) {
@@ -1611,8 +1702,8 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q,
	}
 
	/*
-	 * !plist_node_empty() is safe here without any lock.
-	 * q.lock_ptr != 0 is not safe, because of ordering against wakeup.
+	 * If we have been removed from the hash list, then another task
+	 * has tried to wake us, and we can skip the call to schedule().
	 */
	if (likely(!plist_node_empty(&q->list))) {
		/*
@@ -1668,7 +1759,7 @@ static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared,
	 */
 retry:
	q->key = FUTEX_KEY_INIT;
-	ret = get_futex_key(uaddr, fshared, &q->key, VERIFY_READ);
+	ret = get_futex_key(uaddr, fshared, &q->key);
	if (unlikely(ret != 0))
		return ret;
 
@@ -1717,6 +1808,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
	q.pi_state = NULL;
	q.bitset = bitset;
	q.rt_waiter = NULL;
+	q.requeue_pi_key = NULL;
 
	if (abs_time) {
		to = &timeout;
@@ -1728,6 +1820,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
					     current->timer_slack_ns);
	}
 
+retry:
	/* Prepare to wait on uaddr. */
	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
	if (ret)
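
(Aside: futex_wait() arms an hrtimer sleeper before schedule(); from user space that is simply the timeout argument. A sketch, name ours; note FUTEX_WAIT takes a relative timespec:)

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>
    #include <time.h>

    /* Returns 0 on wakeup, or -1 with errno set to ETIMEDOUT,
     * EWOULDBLOCK or EINTR: the same trio that futex_wait() sorts
     * out after futex_wait_queue_me() returns. */
    static int futex_wait_timed(uint32_t *uaddr, uint32_t val,
                                const struct timespec *rel)
    {
            return syscall(SYS_futex, uaddr, FUTEX_WAIT, val,
                           rel, NULL, 0);
    }
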
@@ -1745,9 +1838,14 @@ static int futex_wait(u32 __user *uaddr, int fshared,
		goto out_put_key;
 
	/*
-	 * We expect signal_pending(current), but another thread may
-	 * have handled it for us already.
+	 * We expect signal_pending(current), but we might be the
+	 * victim of a spurious wakeup as well.
	 */
+	if (!signal_pending(current)) {
+		put_futex_key(fshared, &q.key);
+		goto retry;
+	}
+
	ret = -ERESTARTSYS;
	if (!abs_time)
		goto out_put_key;
@@ -1808,7 +1906,6 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 {
	struct hrtimer_sleeper timeout, *to = NULL;
	struct futex_hash_bucket *hb;
-	u32 uval;
	struct futex_q q;
	int res, ret;
 
@@ -1825,9 +1922,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 
	q.pi_state = NULL;
	q.rt_waiter = NULL;
+	q.requeue_pi_key = NULL;
 retry:
	q.key = FUTEX_KEY_INIT;
-	ret = get_futex_key(uaddr, fshared, &q.key, VERIFY_WRITE);
+	ret = get_futex_key(uaddr, fshared, &q.key);
	if (unlikely(ret != 0))
		goto out;
 
@@ -1897,7 +1995,7 @@ retry_private:
	/* Unqueue and drop the lock */
	unqueue_me_pi(&q);
 
-	goto out;
+	goto out_put_key;
 
 out_unlock_put_key:
	queue_unlock(&q, hb);
@@ -1910,16 +2008,9 @@ out:
	return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
 uaddr_faulted:
-	/*
-	 * We have to r/w  *(int __user *)uaddr, and we have to modify it
-	 * atomically. Therefore, if we continue to fault after get_user()
-	 * below, we need to handle the fault ourselves, while still holding
-	 * the mmap_sem. This can occur if the uaddr is under contention as
-	 * we have to drop the mmap_sem in order to call get_user().
-	 */
	queue_unlock(&q, hb);
 
-	ret = get_user(uval, uaddr);
+	ret = fault_in_user_writeable(uaddr);
	if (ret)
		goto out_put_key;
 
@@ -1930,21 +2021,6 @@ uaddr_faulted:
	goto retry;
 }
 
-static long futex_lock_pi_restart(struct restart_block *restart)
-{
-	u32 __user *uaddr = (u32 __user *)restart->futex.uaddr;
-	ktime_t t, *tp = NULL;
-	int fshared = restart->futex.flags & FLAGS_SHARED;
-
-	if (restart->futex.flags & FLAGS_HAS_TIMEOUT) {
-		t.tv64 = restart->futex.time;
-		tp = &t;
-	}
-	restart->fn = do_no_restart_syscall;
-
-	return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0);
-}
-
 /*
 * Userspace attempted a TID -> 0 atomic transition, and failed.
 * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1968,7 +2044,7 @@ retry:
	if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
		return -EPERM;
 
-	ret = get_futex_key(uaddr, fshared, &key, VERIFY_WRITE);
+	ret = get_futex_key(uaddr, fshared, &key);
	if (unlikely(ret != 0))
		goto out;
 
@@ -2029,17 +2105,10 @@ out:
	return ret;
 
 pi_faulted:
-	/*
-	 * We have to r/w  *(int __user *)uaddr, and we have to modify it
-	 * atomically. Therefore, if we continue to fault after get_user()
-	 * below, we need to handle the fault ourselves, while still holding
-	 * the mmap_sem. This can occur if the uaddr is under contention as
-	 * we have to drop the mmap_sem in order to call get_user().
-	 */
	spin_unlock(&hb->lock);
	put_futex_key(fshared, &key);
 
-	ret = get_user(uval, uaddr);
+	ret = fault_in_user_writeable(uaddr);
	if (!ret)
		goto retry;
 
@@ -2083,11 +2152,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
		 * Unqueue the futex_q and determine which it was.
		 */
		plist_del(&q->list, &q->list.plist);
-		drop_futex_key_refs(&q->key);
 
+		/* Handle spurious wakeups gracefully */
+		ret = -EWOULDBLOCK;
		if (timeout && !timeout->task)
			ret = -ETIMEDOUT;
-		else
+		else if (signal_pending(current))
			ret = -ERESTARTNOINTR;
	}
	return ret;
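
(Aside: the "TID -> 0 atomic transition" the futex_unlock_pi() comment above refers to is the user-space fast path. A sketch, names ours:)

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>

    static int pi_unlock(uint32_t *uaddr)
    {
            uint32_t tid = syscall(SYS_gettid);

            /* Uncontended: TID -> 0 in user space, no syscall. */
            if (__sync_bool_compare_and_swap(uaddr, tid, 0))
                    return 0;

            /* FUTEX_WAITERS (or FUTEX_OWNER_DIED) is set: let the
             * in-kernel slowpath hand the rt_mutex and the futex
             * word over to the top waiter. */
            return syscall(SYS_futex, uaddr, FUTEX_UNLOCK_PI, 0,
                           NULL, NULL, 0);
    }
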
@@ -2095,12 +2165,12 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 
 /**
 * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2
- * @uaddr:	the futex we initialyl wait on (non-pi)
+ * @uaddr:	the futex we initially wait on (non-pi)
 * @fshared:	whether the futexes are shared (1) or not (0).  They must be
 *		the same type, no requeueing from private to shared, etc.
 * @val:	the expected value of uaddr
 * @abs_time:	absolute timeout
- * @bitset:	32 bit wakeup bitset set by userspace, defaults to all.
+ * @bitset:	32 bit wakeup bitset set by userspace, defaults to all
 * @clockrt:	whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0)
 * @uaddr2:	the pi futex we will take prior to returning to user-space
 *
@@ -2114,11 +2184,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 * We call schedule in futex_wait_queue_me() when we enqueue and return there
 * via the following:
 * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue()
- * 2) wakeup on uaddr2 after a requeue and subsequent unlock
- * 3) signal (before or after requeue)
- * 4) timeout (before or after requeue)
+ * 2) wakeup on uaddr2 after a requeue
+ * 3) signal
+ * 4) timeout
 *
- * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function.
+ * If 3, cleanup and return -ERESTARTNOINTR.
 *
 * If 2, we may then block on trying to take the rt_mutex and return via:
 * 5) successful lock
@@ -2126,7 +2196,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 * 6) signal
 * 7) timeout
 * 8) other lock acquisition failure
 *
- * If 6, we setup a restart_block with futex_lock_pi() as the function.
+ * If 6, return -EWOULDBLOCK (restarting the syscall would do the same).
 *
 * If 4 or 7, we cleanup and return with -ETIMEDOUT.
 *
@@ -2141,12 +2211,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
	struct hrtimer_sleeper timeout, *to = NULL;
	struct rt_mutex_waiter rt_waiter;
	struct rt_mutex *pi_mutex = NULL;
-	struct restart_block *restart;
	struct futex_hash_bucket *hb;
	union futex_key key2;
	struct futex_q q;
	int res, ret;
-	u32 uval;
 
	if (!bitset)
		return -EINVAL;
@@ -2167,15 +2235,16 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
	debug_rt_mutex_init_waiter(&rt_waiter);
	rt_waiter.task = NULL;
 
-	q.pi_state = NULL;
-	q.bitset = bitset;
-	q.rt_waiter = &rt_waiter;
-
	key2 = FUTEX_KEY_INIT;
-	ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
+	ret = get_futex_key(uaddr2, fshared, &key2);
	if (unlikely(ret != 0))
		goto out;
 
+	q.pi_state = NULL;
+	q.bitset = bitset;
+	q.rt_waiter = &rt_waiter;
+	q.requeue_pi_key = &key2;
+
	/* Prepare to wait on uaddr. */
	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
	if (ret)
@@ -2228,7 +2297,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
		res = fixup_owner(uaddr2, fshared, &q, !ret);
		/*
		 * If fixup_owner() returned an error, propagate that.  If it
-		 * acquired the lock, clear our -ETIMEDOUT or -EINTR.
+		 * acquired the lock, clear -ETIMEDOUT or -EINTR.
		 */
		if (res)
			ret = (res < 0) ? res : 0;
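
(Aside: for orientation, the user-space pairing this function and the requeue_pi checks above expect is a PI-aware condition variable. A sketch, names ours; nr_wake is required to be 1 for the PI requeue variant:)

    #include <linux/futex.h>
    #include <sys/syscall.h>
    #include <unistd.h>
    #include <stdint.h>

    /* Wait while *cond == seq; on success we return already holding
     * the PI mutex at pi_mutex (cases 1, 2 and 5 above). */
    static int cond_wait_pi(uint32_t *cond, uint32_t seq,
                            uint32_t *pi_mutex)
    {
            return syscall(SYS_futex, cond, FUTEX_WAIT_REQUEUE_PI,
                           seq, NULL, pi_mutex, 0);
    }

    /* Wake one waiter, requeue the rest onto the PI mutex's
     * rt_mutex; fails with EAGAIN if *cond != seq. */
    static int cond_broadcast_pi(uint32_t *cond, uint32_t seq,
                                 uint32_t *pi_mutex, int nr_requeue)
    {
            return syscall(SYS_futex, cond, FUTEX_CMP_REQUEUE_PI, 1,
                           (unsigned long)nr_requeue, pi_mutex, seq);
    }
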
@@ -2245,30 +2314,14 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
		if (rt_mutex_owner(pi_mutex) == current)
			rt_mutex_unlock(pi_mutex);
	} else if (ret == -EINTR) {
-		ret = -EFAULT;
-		if (get_user(uval, uaddr2))
-			goto out_put_keys;
-
		/*
-		 * We've already been requeued, so restart by calling
-		 * futex_lock_pi() directly, rather then returning to this
-		 * function.
+		 * We've already been requeued, but cannot restart by calling
+		 * futex_lock_pi() directly. We could restart this syscall, but
+		 * it would detect that the user space "val" changed and return
+		 * -EWOULDBLOCK.  Save the overhead of the restart and return
+		 * -EWOULDBLOCK directly.
		 */
-		ret = -ERESTART_RESTARTBLOCK;
-		restart = &current_thread_info()->restart_block;
-		restart->fn = futex_lock_pi_restart;
-		restart->futex.uaddr = (u32 *)uaddr2;
-		restart->futex.val = uval;
-		restart->futex.flags = 0;
-		if (abs_time) {
-			restart->futex.flags |= FLAGS_HAS_TIMEOUT;
-			restart->futex.time = abs_time->tv64;
-		}
-
-		if (fshared)
-			restart->futex.flags |= FLAGS_SHARED;
-		if (clockrt)
-			restart->futex.flags |= FLAGS_CLOCKRT;
+		ret = -EWOULDBLOCK;
	}
 
 out_put_keys:
@@ -2300,9 +2353,9 @@ out:
 */
 
 /**
- * sys_set_robust_list - set the robust-futex list head of a task
- * @head: pointer to the list-head
- * @len: length of the list-head, as userspace expects
+ * sys_set_robust_list() - Set the robust-futex list head of a task
+ * @head:	pointer to the list-head
+ * @len:	length of the list-head, as userspace expects
 */
 SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
		size_t, len)
@@ -2321,10 +2374,10 @@ SYSCALL_DEFINE2(set_robust_list, struct robust_list_head __user *, head,
 }
 
 /**
- * sys_get_robust_list - get the robust-futex list head of a task
- * @pid: pid of the process [zero for current task]
- * @head_ptr: pointer to a list-head pointer, the kernel fills it in
- * @len_ptr: pointer to a length field, the kernel fills in the header size
+ * sys_get_robust_list() - Get the robust-futex list head of a task
+ * @pid:	pid of the process [zero for current task]
+ * @head_ptr:	pointer to a list-head pointer, the kernel fills it in
+ * @len_ptr:	pointer to a length field, the kernel fills in the header size
 */
 SYSCALL_DEFINE3(get_robust_list, int, pid,
		struct robust_list_head __user * __user *, head_ptr,