X-Git-Url: http://ftp.safe.ca/?a=blobdiff_plain;ds=sidebyside;f=kernel%2Ffutex.c;h=4fe790e89d0f34af1cc24359d32bca3b58e970ca;hb=ac40b1fae5e8ac831298351ff8291944d1ff0354;hp=a12425051ee98f8f51944905ec82244631fffc37;hpb=83c54070ee1a2d05c89793884bea1a03f2851ed4;p=safe%2Fjmp%2Flinux-2.6 diff --git a/kernel/futex.c b/kernel/futex.c index a124250..4fe790e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -52,10 +52,16 @@ #include #include #include +#include +#include +#include + #include #include "rtmutex_common.h" +int __read_mostly futex_cmpxchg_enabled; + #define FUTEX_HASHBITS (CONFIG_BASE_SMALL ? 4 : 8) /* @@ -98,13 +104,12 @@ struct futex_q { /* Key which the futex is hashed on: */ union futex_key key; - /* For fd, sigio sent using these: */ - int fd; - struct file *filp; - /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; struct task_struct *task; + + /* Bitset for the optional bitmasked wakeup */ + u32 bitset; }; /* @@ -117,9 +122,6 @@ struct futex_hash_bucket { static struct futex_hash_bucket futex_queues[1<mmap_sem, when futex is shared */ @@ -177,8 +179,8 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2) * For other futexes, it points to ¤t->mm->mmap_sem and * caller must have taken the reader lock. but NOT any spinlocks. */ -int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, - union futex_key *key) +static int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, + union futex_key *key) { unsigned long address = (unsigned long)uaddr; struct mm_struct *mm = current->mm; @@ -264,16 +266,15 @@ int get_futex_key(u32 __user *uaddr, struct rw_semaphore *fshared, } return err; } -EXPORT_SYMBOL_GPL(get_futex_key); /* * Take a reference to the resource addressed by a key. * Can be called while holding spinlocks. * */ -inline void get_futex_key_refs(union futex_key *key) +static void get_futex_key_refs(union futex_key *key) { - if (key->both.ptr == 0) + if (key->both.ptr == NULL) return; switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: @@ -284,15 +285,14 @@ inline void get_futex_key_refs(union futex_key *key) break; } } -EXPORT_SYMBOL_GPL(get_futex_key_refs); /* * Drop a reference to the resource addressed by a key. * The hash bucket spinlock must not be held. */ -void drop_futex_key_refs(union futex_key *key) +static void drop_futex_key_refs(union futex_key *key) { - if (key->both.ptr == 0) + if (!key->both.ptr) return; switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) { case FUT_OFF_INODE: @@ -303,7 +303,6 @@ void drop_futex_key_refs(union futex_key *key) break; } } -EXPORT_SYMBOL_GPL(drop_futex_key_refs); static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) { @@ -440,14 +439,20 @@ static void free_pi_state(struct futex_pi_state *pi_state) static struct task_struct * futex_find_get_task(pid_t pid) { struct task_struct *p; + const struct cred *cred = current_cred(), *pcred; rcu_read_lock(); - p = find_task_by_pid(pid); - - if (!p || ((current->euid != p->euid) && (current->euid != p->uid))) + p = find_task_by_vpid(pid); + if (!p) { p = ERR_PTR(-ESRCH); - else - get_task_struct(p); + } else { + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid) + p = ERR_PTR(-ESRCH); + else + get_task_struct(p); + } rcu_read_unlock(); @@ -466,6 +471,8 @@ void exit_pi_state_list(struct task_struct *curr) struct futex_hash_bucket *hb; union futex_key key; + if (!futex_cmpxchg_enabled) + return; /* * We are a ZOMBIE and nobody can enqueue itself on * pi_state_list anymore, but we have to be careful @@ -603,8 +610,6 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, static void wake_futex(struct futex_q *q) { plist_del(&q->list, &q->list.plist); - if (q->filp) - send_sigio(&q->filp->f_owner, q->fd, POLL_IN); /* * The lock in wake_up_all() is a crucial memory barrier after the * plist_del() and also before assigning to q->lock_ptr. @@ -652,13 +657,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!(uval & FUTEX_OWNER_DIED)) { int ret = 0; - newval = FUTEX_WAITERS | new_owner->pid; + newval = FUTEX_WAITERS | task_pid_vnr(new_owner); curval = cmpxchg_futex_value_locked(uaddr, uval, newval); if (curval == -EFAULT) ret = -EFAULT; - if (curval != uval) + else if (curval != uval) ret = -EINVAL; if (ret) { spin_unlock(&pi_state->pi_mutex.wait_lock); @@ -722,7 +727,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2) * to this virtual address: */ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, - int nr_wake) + int nr_wake, u32 bitset) { struct futex_hash_bucket *hb; struct futex_q *this, *next; @@ -730,6 +735,9 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, union futex_key key; int ret; + if (!bitset) + return -EINVAL; + futex_lock_mm(fshared); ret = get_futex_key(uaddr, fshared, &key); @@ -746,6 +754,11 @@ static int futex_wake(u32 __user *uaddr, struct rw_semaphore *fshared, ret = -EINVAL; break; } + + /* Check if one of the bits is set in both bitsets */ + if (!(this->bitset & bitset)) + continue; + wake_futex(this); if (++ret >= nr_wake) break; @@ -973,14 +986,10 @@ out: } /* The key must be already stored in q->key. */ -static inline struct futex_hash_bucket * -queue_lock(struct futex_q *q, int fd, struct file *filp) +static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - q->fd = fd; - q->filp = filp; - init_waitqueue_head(&q->waiters); get_futex_key_refs(&q->key); @@ -991,7 +1000,7 @@ queue_lock(struct futex_q *q, int fd, struct file *filp) return hb; } -static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb) +static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb) { int prio; @@ -1026,15 +1035,6 @@ queue_unlock(struct futex_q *q, struct futex_hash_bucket *hb) * exactly once. They are called with the hashed spinlock held. */ -/* The key must be already stored in q->key. */ -static void queue_me(struct futex_q *q, int fd, struct file *filp) -{ - struct futex_hash_bucket *hb; - - hb = queue_lock(q, fd, filp); - __queue_me(q, hb); -} - /* Return 1 if we were still queued (ie. 0 means we were woken) */ static int unqueue_me(struct futex_q *q) { @@ -1045,7 +1045,7 @@ static int unqueue_me(struct futex_q *q) retry: lock_ptr = q->lock_ptr; barrier(); - if (lock_ptr != 0) { + if (lock_ptr != NULL) { spin_lock(lock_ptr); /* * q->lock_ptr can change between reading it and @@ -1097,66 +1097,118 @@ static void unqueue_me_pi(struct futex_q *q) } /* - * Fixup the pi_state owner with current. + * Fixup the pi_state owner with the new owner. * * Must be called with hash bucket lock held and mm->sem held for non * private futexes. */ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q, - struct task_struct *curr) + struct task_struct *newowner, + struct rw_semaphore *fshared) { - u32 newtid = curr->pid | FUTEX_WAITERS; + u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS; struct futex_pi_state *pi_state = q->pi_state; + struct task_struct *oldowner = pi_state->owner; u32 uval, curval, newval; - int ret; + int ret, attempt = 0; /* Owner died? */ - if (pi_state->owner != NULL) { - spin_lock_irq(&pi_state->owner->pi_lock); - WARN_ON(list_empty(&pi_state->list)); - list_del_init(&pi_state->list); - spin_unlock_irq(&pi_state->owner->pi_lock); - } else + if (!pi_state->owner) newtid |= FUTEX_OWNER_DIED; - pi_state->owner = curr; - - spin_lock_irq(&curr->pi_lock); - WARN_ON(!list_empty(&pi_state->list)); - list_add(&pi_state->list, &curr->pi_state_list); - spin_unlock_irq(&curr->pi_lock); - /* - * We own it, so we have to replace the pending owner - * TID. This must be atomic as we have preserve the - * owner died bit here. + * We are here either because we stole the rtmutex from the + * pending owner or we are the pending owner which failed to + * get the rtmutex. We have to replace the pending owner TID + * in the user space variable. This must be atomic as we have + * to preserve the owner died bit here. + * + * Note: We write the user space value _before_ changing the + * pi_state because we can fault here. Imagine swapped out + * pages or a fork, which was running right before we acquired + * mmap_sem, that marked all the anonymous memory readonly for + * cow. + * + * Modifying pi_state _before_ the user space value would + * leave the pi_state in an inconsistent state when we fault + * here, because we need to drop the hash bucket lock to + * handle the fault. This might be observed in the PID check + * in lookup_pi_state. */ - ret = get_futex_value_locked(&uval, uaddr); +retry: + if (get_futex_value_locked(&uval, uaddr)) + goto handle_fault; - while (!ret) { + while (1) { newval = (uval & FUTEX_OWNER_DIED) | newtid; curval = cmpxchg_futex_value_locked(uaddr, uval, newval); if (curval == -EFAULT) - ret = -EFAULT; + goto handle_fault; if (curval == uval) break; uval = curval; } - return ret; + + /* + * We fixed up user space. Now we need to fix the pi_state + * itself. + */ + if (pi_state->owner != NULL) { + spin_lock_irq(&pi_state->owner->pi_lock); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + spin_unlock_irq(&pi_state->owner->pi_lock); + } + + pi_state->owner = newowner; + + spin_lock_irq(&newowner->pi_lock); + WARN_ON(!list_empty(&pi_state->list)); + list_add(&pi_state->list, &newowner->pi_state_list); + spin_unlock_irq(&newowner->pi_lock); + return 0; + + /* + * To handle the page fault we need to drop the hash bucket + * lock here. That gives the other task (either the pending + * owner itself or the task which stole the rtmutex) the + * chance to try the fixup of the pi_state. So once we are + * back from handling the fault we need to check the pi_state + * after reacquiring the hash bucket lock and before trying to + * do another fixup. When the fixup has been done already we + * simply return. + */ +handle_fault: + spin_unlock(q->lock_ptr); + + ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++); + + spin_lock(q->lock_ptr); + + /* + * Check if someone else fixed it for us: + */ + if (pi_state->owner != oldowner) + return 0; + + if (ret) + return ret; + + goto retry; } /* * In case we must use restart_block to restart a futex_wait, - * we encode in the 'arg3' shared capability + * we encode in the 'flags' shared capability */ -#define ARG3_SHARED 1 +#define FLAGS_SHARED 1 static long futex_wait_restart(struct restart_block *restart); static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, - u32 val, ktime_t *abs_time) + u32 val, ktime_t *abs_time, u32 bitset) { struct task_struct *curr = current; DECLARE_WAITQUEUE(wait, curr); @@ -1167,7 +1219,11 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, struct hrtimer_sleeper t; int rem = 0; + if (!bitset) + return -EINVAL; + q.pi_state = NULL; + q.bitset = bitset; retry: futex_lock_mm(fshared); @@ -1175,7 +1231,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (unlikely(ret != 0)) goto out_release_sem; - hb = queue_lock(&q, -1, NULL); + hb = queue_lock(&q); /* * Access the page AFTER the futex is queued. @@ -1219,7 +1275,7 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, goto out_unlock_release_sem; /* Only actually queue if *uaddr contained val. */ - __queue_me(&q, hb); + queue_me(&q, hb); /* * Now the futex is queued and we have checked the data, we @@ -1247,11 +1303,18 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, if (!abs_time) schedule(); else { - hrtimer_init(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + unsigned long slack; + slack = current->timer_slack_ns; + if (rt_task(current)) + slack = 0; + hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(&t, current); - t.timer.expires = *abs_time; + hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack); - hrtimer_start(&t.timer, t.timer.expires, HRTIMER_MODE_ABS); + hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&t.timer)) + t.task = NULL; /* * the timer could have already expired, in which @@ -1265,6 +1328,8 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, /* Flag if a timeout occured */ rem = (t.task == NULL); + + destroy_hrtimer_on_stack(&t.timer); } } __set_current_state(TASK_RUNNING); @@ -1290,12 +1355,14 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, struct restart_block *restart; restart = ¤t_thread_info()->restart_block; restart->fn = futex_wait_restart; - restart->arg0 = (unsigned long)uaddr; - restart->arg1 = (unsigned long)val; - restart->arg2 = (unsigned long)abs_time; - restart->arg3 = 0; + restart->futex.uaddr = (u32 *)uaddr; + restart->futex.val = val; + restart->futex.time = abs_time->tv64; + restart->futex.bitset = bitset; + restart->futex.flags = 0; + if (fshared) - restart->arg3 |= ARG3_SHARED; + restart->futex.flags |= FLAGS_SHARED; return -ERESTART_RESTARTBLOCK; } @@ -1310,15 +1377,16 @@ static int futex_wait(u32 __user *uaddr, struct rw_semaphore *fshared, static long futex_wait_restart(struct restart_block *restart) { - u32 __user *uaddr = (u32 __user *)restart->arg0; - u32 val = (u32)restart->arg1; - ktime_t *abs_time = (ktime_t *)restart->arg2; + u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; struct rw_semaphore *fshared = NULL; + ktime_t t; + t.tv64 = restart->futex.time; restart->fn = do_no_restart_syscall; - if (restart->arg3 & ARG3_SHARED) + if (restart->futex.flags & FLAGS_SHARED) fshared = ¤t->mm->mmap_sem; - return (long)futex_wait(uaddr, fshared, val, abs_time); + return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, + restart->futex.bitset); } @@ -1343,9 +1411,10 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, if (time) { to = &timeout; - hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS); + hrtimer_init_on_stack(&to->timer, CLOCK_REALTIME, + HRTIMER_MODE_ABS); hrtimer_init_sleeper(to, current); - to->timer.expires = *time; + hrtimer_set_expires(&to->timer, *time); } q.pi_state = NULL; @@ -1357,7 +1426,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, goto out_release_sem; retry_unlocked: - hb = queue_lock(&q, -1, NULL); + hb = queue_lock(&q); retry_locked: ret = lock_taken = 0; @@ -1367,7 +1436,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * (by doing a 0 -> TID atomic cmpxchg), while holding all * the locks. It will most likely not succeed. */ - newval = current->pid; + newval = task_pid_vnr(current); curval = cmpxchg_futex_value_locked(uaddr, 0, newval); @@ -1378,7 +1447,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * Detect deadlocks. In case of REQUEUE_PI this is a valid * situation and we return success to user space. */ - if (unlikely((curval & FUTEX_TID_MASK) == current->pid)) { + if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { ret = -EDEADLK; goto out_unlock_release_sem; } @@ -1407,7 +1476,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, */ if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | current->pid; + newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); ownerdied = 0; lock_taken = 1; } @@ -1470,7 +1539,7 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, /* * Only actually queue now that the atomic ops are done: */ - __queue_me(&q, hb); + queue_me(&q, hb); /* * Now the futex is queued and we have checked the data, we @@ -1500,16 +1569,45 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, * that case: */ if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr); + ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); } else { /* * Catch the rare case, where the lock was released * when we were on the way back before we locked the * hash bucket. */ - if (q.pi_state->owner == curr && - rt_mutex_trylock(&q.pi_state->pi_mutex)) { - ret = 0; + if (q.pi_state->owner == curr) { + /* + * Try to get the rt_mutex now. This might + * fail as some other task acquired the + * rt_mutex after we removed ourself from the + * rt_mutex waiters list. + */ + if (rt_mutex_trylock(&q.pi_state->pi_mutex)) + ret = 0; + else { + /* + * pi_state is incorrect, some other + * task did a lock steal and we + * returned due to timeout or signal + * without taking the rt_mutex. Too + * late. We can access the + * rt_mutex_owner without locking, as + * the other task is now blocked on + * the hash bucket lock. Fix the state + * up. + */ + struct task_struct *owner; + int res; + + owner = rt_mutex_owner(&q.pi_state->pi_mutex); + res = fixup_pi_state_owner(uaddr, &q, owner, + fshared); + + /* propagate -EFAULT, if the fixup failed */ + if (res) + ret = res; + } } else { /* * Paranoia check. If we did not take the lock @@ -1529,6 +1627,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, unqueue_me_pi(&q); futex_unlock_mm(fshared); + if (to) + destroy_hrtimer_on_stack(&to->timer); return ret != -EINTR ? ret : -ERESTARTNOINTR; out_unlock_release_sem: @@ -1536,6 +1636,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, out_release_sem: futex_unlock_mm(fshared); + if (to) + destroy_hrtimer_on_stack(&to->timer); return ret; uaddr_faulted: @@ -1563,6 +1665,8 @@ static int futex_lock_pi(u32 __user *uaddr, struct rw_semaphore *fshared, if (!ret && (uval != -EFAULT)) goto retry; + if (to) + destroy_hrtimer_on_stack(&to->timer); return ret; } @@ -1586,7 +1690,7 @@ retry: /* * We release only a lock we actually own: */ - if ((uval & FUTEX_TID_MASK) != current->pid) + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current)) return -EPERM; /* * First take all the futex related locks: @@ -1607,7 +1711,7 @@ retry_unlocked: * anyone else up: */ if (!(uval & FUTEX_OWNER_DIED)) - uval = cmpxchg_futex_value_locked(uaddr, current->pid, 0); + uval = cmpxchg_futex_value_locked(uaddr, task_pid_vnr(current), 0); if (unlikely(uval == -EFAULT)) @@ -1616,7 +1720,7 @@ retry_unlocked: * Rare case: we managed to release the lock atomically, * no need to wake anyone else up: */ - if (unlikely(uval == current->pid)) + if (unlikely(uval == task_pid_vnr(current))) goto out_unlock; /* @@ -1670,6 +1774,7 @@ pi_faulted: attempt); if (ret) goto out; + uval = 0; goto retry_unlocked; } @@ -1682,121 +1787,6 @@ pi_faulted: return ret; } -static int futex_close(struct inode *inode, struct file *filp) -{ - struct futex_q *q = filp->private_data; - - unqueue_me(q); - kfree(q); - - return 0; -} - -/* This is one-shot: once it's gone off you need a new fd */ -static unsigned int futex_poll(struct file *filp, - struct poll_table_struct *wait) -{ - struct futex_q *q = filp->private_data; - int ret = 0; - - poll_wait(filp, &q->waiters, wait); - - /* - * plist_node_empty() is safe here without any lock. - * q->lock_ptr != 0 is not safe, because of ordering against wakeup. - */ - if (plist_node_empty(&q->list)) - ret = POLLIN | POLLRDNORM; - - return ret; -} - -static const struct file_operations futex_fops = { - .release = futex_close, - .poll = futex_poll, -}; - -/* - * Signal allows caller to avoid the race which would occur if they - * set the sigio stuff up afterwards. - */ -static int futex_fd(u32 __user *uaddr, int signal) -{ - struct futex_q *q; - struct file *filp; - int ret, err; - struct rw_semaphore *fshared; - static unsigned long printk_interval; - - if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) { - printk(KERN_WARNING "Process `%s' used FUTEX_FD, which " - "will be removed from the kernel in June 2007\n", - current->comm); - } - - ret = -EINVAL; - if (!valid_signal(signal)) - goto out; - - ret = get_unused_fd(); - if (ret < 0) - goto out; - filp = get_empty_filp(); - if (!filp) { - put_unused_fd(ret); - ret = -ENFILE; - goto out; - } - filp->f_op = &futex_fops; - filp->f_path.mnt = mntget(futex_mnt); - filp->f_path.dentry = dget(futex_mnt->mnt_root); - filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping; - - if (signal) { - err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1); - if (err < 0) { - goto error; - } - filp->f_owner.signum = signal; - } - - q = kmalloc(sizeof(*q), GFP_KERNEL); - if (!q) { - err = -ENOMEM; - goto error; - } - q->pi_state = NULL; - - fshared = ¤t->mm->mmap_sem; - down_read(fshared); - err = get_futex_key(uaddr, fshared, &q->key); - - if (unlikely(err != 0)) { - up_read(fshared); - kfree(q); - goto error; - } - - /* - * queue_me() must be called before releasing mmap_sem, because - * key->shared.inode needs to be referenced while holding it. - */ - filp->private_data = q; - - queue_me(q, ret, filp); - up_read(fshared); - - /* Now we map fd to filp, so userspace can access it */ - fd_install(ret, filp); -out: - return ret; -error: - put_unused_fd(ret); - put_filp(filp); - ret = err; - goto out; -} - /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. @@ -1821,6 +1811,8 @@ asmlinkage long sys_set_robust_list(struct robust_list_head __user *head, size_t len) { + if (!futex_cmpxchg_enabled) + return -ENOSYS; /* * The kernel knows only one size for now: */ @@ -1844,6 +1836,10 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, { struct robust_list_head __user *head; unsigned long ret; + const struct cred *cred = current_cred(), *pcred; + + if (!futex_cmpxchg_enabled) + return -ENOSYS; if (!pid) head = current->robust_list; @@ -1852,12 +1848,14 @@ sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr, ret = -ESRCH; rcu_read_lock(); - p = find_task_by_pid(pid); + p = find_task_by_vpid(pid); if (!p) goto err_unlock; ret = -EPERM; - if ((current->euid != p->euid) && (current->euid != p->uid) && - !capable(CAP_SYS_PTRACE)) + pcred = __task_cred(p); + if (cred->euid != pcred->euid && + cred->euid != pcred->uid && + !capable(CAP_SYS_PTRACE)) goto err_unlock; head = p->robust_list; rcu_read_unlock(); @@ -1885,7 +1883,7 @@ retry: if (get_user(uval, uaddr)) return -1; - if ((uval & FUTEX_TID_MASK) == curr->pid) { + if ((uval & FUTEX_TID_MASK) == task_pid_vnr(curr)) { /* * Ok, this dying thread is truly holding a futex * of interest. Set the OWNER_DIED bit atomically @@ -1910,7 +1908,8 @@ retry: * PI futexes happens in exit_pi_state(): */ if (!pi && (uval & FUTEX_WAITERS)) - futex_wake(uaddr, &curr->mm->mmap_sem, 1); + futex_wake(uaddr, &curr->mm->mmap_sem, 1, + FUTEX_BITSET_MATCH_ANY); } return 0; } @@ -1942,9 +1941,13 @@ static inline int fetch_robust_entry(struct robust_list __user **entry, void exit_robust_list(struct task_struct *curr) { struct robust_list_head __user *head = curr->robust_list; - struct robust_list __user *entry, *pending; - unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, next_pi, pip; unsigned long futex_offset; + int rc; + + if (!futex_cmpxchg_enabled) + return; /* * Fetch the list head (which was registered earlier, via @@ -1964,12 +1967,14 @@ void exit_robust_list(struct task_struct *curr) if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) return; - if (pending) - handle_futex_death((void __user *)pending + futex_offset, - curr, pip); - + next_entry = NULL; /* avoid warning with gcc */ while (entry != &head->list) { /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); + /* * A pending lock might already be on the list, so * don't process it twice: */ @@ -1977,11 +1982,10 @@ void exit_robust_list(struct task_struct *curr) if (handle_futex_death((void __user *)entry + futex_offset, curr, pi)) return; - /* - * Fetch the next entry in the list: - */ - if (fetch_robust_entry(&entry, &entry->next, &pi)) + if (rc) return; + entry = next_entry; + pi = next_pi; /* * Avoid excessively long or circular lists: */ @@ -1990,12 +1994,16 @@ void exit_robust_list(struct task_struct *curr) cond_resched(); } + + if (pending) + handle_futex_death((void __user *)pending + futex_offset, + curr, pip); } long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, u32 __user *uaddr2, u32 val2, u32 val3) { - int ret; + int ret = -ENOSYS; int cmd = op & FUTEX_CMD_MASK; struct rw_semaphore *fshared = NULL; @@ -2004,14 +2012,14 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, switch (cmd) { case FUTEX_WAIT: - ret = futex_wait(uaddr, fshared, val, timeout); + val3 = FUTEX_BITSET_MATCH_ANY; + case FUTEX_WAIT_BITSET: + ret = futex_wait(uaddr, fshared, val, timeout, val3); break; case FUTEX_WAKE: - ret = futex_wake(uaddr, fshared, val); - break; - case FUTEX_FD: - /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */ - ret = futex_fd(uaddr, val); + val3 = FUTEX_BITSET_MATCH_ANY; + case FUTEX_WAKE_BITSET: + ret = futex_wake(uaddr, fshared, val, val3); break; case FUTEX_REQUEUE: ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); @@ -2023,13 +2031,16 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); break; case FUTEX_LOCK_PI: - ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, val, timeout, 0); break; case FUTEX_UNLOCK_PI: - ret = futex_unlock_pi(uaddr, fshared); + if (futex_cmpxchg_enabled) + ret = futex_unlock_pi(uaddr, fshared); break; case FUTEX_TRYLOCK_PI: - ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); + if (futex_cmpxchg_enabled) + ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; default: ret = -ENOSYS; @@ -2047,7 +2058,8 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, u32 val2 = 0; int cmd = op & FUTEX_CMD_MASK; - if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI)) { + if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || + cmd == FUTEX_WAIT_BITSET)) { if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) @@ -2055,48 +2067,44 @@ asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val, t = timespec_to_ktime(ts); if (cmd == FUTEX_WAIT) - t = ktime_add(ktime_get(), t); + t = ktime_add_safe(ktime_get(), t); tp = &t; } /* * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. + * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ - if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE) + if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); } -static int futexfs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data, - struct vfsmount *mnt) -{ - return get_sb_pseudo(fs_type, "futex", NULL, 0xBAD1DEA, mnt); -} - -static struct file_system_type futex_fs_type = { - .name = "futexfs", - .get_sb = futexfs_get_sb, - .kill_sb = kill_anon_super, -}; - -static int __init init(void) +static int __init futex_init(void) { - int i = register_filesystem(&futex_fs_type); - - if (i) - return i; + u32 curval; + int i; - futex_mnt = kern_mount(&futex_fs_type); - if (IS_ERR(futex_mnt)) { - unregister_filesystem(&futex_fs_type); - return PTR_ERR(futex_mnt); - } + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non functional ones will return + * -ENOSYS. + */ + curval = cmpxchg_futex_value_locked(NULL, 0, 0); + if (curval == -EFAULT) + futex_cmpxchg_enabled = 1; for (i = 0; i < ARRAY_SIZE(futex_queues); i++) { plist_head_init(&futex_queues[i].chain, &futex_queues[i].lock); spin_lock_init(&futex_queues[i].lock); } + return 0; } -__initcall(init); +__initcall(futex_init);