/*
* Get parameters which are the keys for a futex.
*
- * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
+ * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
* offset_within_page). For private mappings, it's (uaddr, current->mm).
* We can usually work out the index without swapping in the page.
*
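For reference, the key union this comment describes looked roughly like the following in kernels of this vintage (a sketch; field names taken from the futex sources of the era, details may vary):

union futex_key {
	struct {
		unsigned long pgoff;
		struct inode *inode;
		int offset;
	} shared;
	struct {
		unsigned long address;
		struct mm_struct *mm;
		int offset;
	} private;
	struct {
		unsigned long word;
		void *ptr;
		int offset;
	} both;
};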
/*
* Linear file mappings are also simple.
*/
- key->shared.inode = vma->vm_file->f_dentry->d_inode;
+ key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
{
int ret;
- inc_preempt_count();
+ pagefault_disable();
ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
- dec_preempt_count();
+ pagefault_enable();
return ret ? -EFAULT : 0;
}
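For context, the pagefault_disable()/pagefault_enable() pair replacing the raw preempt-count operations here was, in this era, a thin wrapper around those same operations plus compiler barriers — a sketch of include/linux/uaccess.h from memory, not verbatim source:

static inline void pagefault_disable(void)
{
	inc_preempt_count();
	/* make sure the store is issued before a page fault can hit */
	barrier();
}

static inline void pagefault_enable(void)
{
	/* finish in-atomic user accesses before faults are allowed again */
	barrier();
	dec_preempt_count();
	barrier();
	preempt_check_resched();
}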
struct vm_area_struct * vma;
struct mm_struct *mm = current->mm;
- if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+ if (attempt > 2 || !(vma = find_vma(mm, address)) ||
vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
return -EFAULT;
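The remainder of this helper (elided above) faults the page in by hand; its assumed shape, based on the futex.c of this era:

	switch (handle_mm_fault(mm, vma, address, 1)) {
	case VM_FAULT_MINOR:
		current->min_flt++;
		break;
	case VM_FAULT_MAJOR:
		current->maj_flt++;
		break;
	default:
		return -EFAULT;
	}
	return 0;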
if (likely(current->pi_state_cache))
return 0;
- pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+ pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
if (!pi_state)
return -ENOMEM;
- memset(pi_state, 0, sizeof(*pi_state));
INIT_LIST_HEAD(&pi_state->list);
/* pi_mutex gets initialized later */
pi_state->owner = NULL;
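The object being pre-allocated here is the kernel-side PI state; a sketch of struct futex_pi_state as it appears in this era's futex.c:

struct futex_pi_state {
	struct list_head list;      /* linked on the owner's ->pi_state_list */
	struct rt_mutex pi_mutex;   /* the kernel-internal PI mutex */
	struct task_struct *owner;
	atomic_t refcount;
	union futex_key key;
};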
{
struct task_struct *p;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
p = find_task_by_pid(pid);
if (!p)
goto out_unlock;
p = NULL;
goto out_unlock;
}
- if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+ if (p->exit_state != 0) {
p = NULL;
goto out_unlock;
}
get_task_struct(p);
out_unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return p;
}
*/
void exit_pi_state_list(struct task_struct *curr)
{
- struct futex_hash_bucket *hb;
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
+ struct futex_hash_bucket *hb;
union futex_key key;
/*
* We are a ZOMBIE and nobody can enqueue itself on
* pi_state_list anymore, but we have to be careful
- * versus waiters unqueueing themselfs
+ * versus waiters unqueueing themselves:
*/
spin_lock_irq(&curr->pi_lock);
while (!list_empty(head)) {
next = head->next;
pi_state = list_entry(next, struct futex_pi_state, list);
key = pi_state->key;
+ hb = hash_futex(&key);
spin_unlock_irq(&curr->pi_lock);
- hb = hash_futex(&key);
spin_lock(&hb->lock);
spin_lock_irq(&curr->pi_lock);
+ /*
+ * We dropped the pi-lock, so re-check whether this
+ * task still owns the PI-state:
+ */
if (head->next != next) {
spin_unlock(&hb->lock);
continue;
}
- list_del_init(&pi_state->list);
-
WARN_ON(pi_state->owner != curr);
-
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
pi_state->owner = NULL;
spin_unlock_irq(&curr->pi_lock);
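A condensed summary of the lock ordering this loop is careful about (my reading of the code above, not kernel documentation):

/*
 * Lock order in exit_pi_state_list():
 *
 *   hb->lock           hash-bucket lock, taken first
 *     curr->pi_lock    task PI-list lock, nested inside
 *
 * Hence curr->pi_lock is dropped before hb->lock is taken, and
 * head->next is re-checked afterwards in case a waiter unqueued
 * itself while neither lock was held.
 */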
head = &hb->chain;
list_for_each_entry_safe(this, next, head, list) {
- if (match_futex (&this->key, &me->key)) {
+ if (match_futex(&this->key, &me->key)) {
/*
* Another waiter already exists - bump up
* the refcount and return its pi_state:
*/
pi_state = this->pi_state;
+ /*
+ * Userspace might have messed up non PI and PI futexes
+ */
+ if (unlikely(!pi_state))
+ return -EINVAL;
+
+ WARN_ON(!atomic_read(&pi_state->refcount));
+
atomic_inc(&pi_state->refcount);
me->pi_state = pi_state;
}
/*
- * We are the first waiter - try to look up the real owner and
- * attach the new pi_state to it:
+ * We are the first waiter - try to look up the real owner and attach
+ * the new pi_state to it, but bail out when the owner died bit is set
+ * and TID = 0:
*/
pid = uval & FUTEX_TID_MASK;
+ if (!pid && (uval & FUTEX_OWNER_DIED))
+ return -ESRCH;
p = futex_find_get_task(pid);
if (!p)
return -ESRCH;
pi_state->key = me->key;
spin_lock_irq(&p->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &p->pi_state_list);
pi_state->owner = p;
spin_unlock_irq(&p->pi_lock);
* at the end of wake_up_all() does not prevent this store from
* moving.
*/
- wmb();
+ smp_wmb();
q->lock_ptr = NULL;
}
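For context, this store is consumed by unqueue_me() (its reader side appears later in this patch); the rough pairing, as I read it:

/*
 * Waker (here):                  Waiter (unqueue_me):
 *
 *   ...unlink q, wake task...      lock_ptr = q->lock_ptr;
 *   smp_wmb();                     barrier();
 *   q->lock_ptr = NULL;            if (lock_ptr) spin_lock(lock_ptr);
 */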
if (!pi_state)
return -EINVAL;
+ spin_lock(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
* kept enabled while there is PI state around. We must also
* preserve the owner died bit.)
*/
- newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
+ if (!(uval & FUTEX_OWNER_DIED)) {
+ newval = FUTEX_WAITERS | new_owner->pid;
- inc_preempt_count();
- curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
- dec_preempt_count();
+ pagefault_disable();
+ curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+ pagefault_enable();
+ if (curval == -EFAULT)
+ return -EFAULT;
+ if (curval != uval)
+ return -EINVAL;
+ }
- if (curval == -EFAULT)
- return -EFAULT;
- if (curval != uval)
- return -EINVAL;
+ spin_lock_irq(&pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&pi_state->list));
+ list_del_init(&pi_state->list);
+ spin_unlock_irq(&pi_state->owner->pi_lock);
- list_del_init(&pi_state->owner->pi_state_list);
+ spin_lock_irq(&new_owner->pi_lock);
+ WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &new_owner->pi_state_list);
pi_state->owner = new_owner;
+ spin_unlock_irq(&new_owner->pi_lock);
+
+ spin_unlock(&pi_state->pi_mutex.wait_lock);
rt_mutex_unlock(&pi_state->pi_mutex);
return 0;
* There is no waiter, so we unlock the futex. The owner died
 * bit need not be preserved here. We are the owner:
*/
- inc_preempt_count();
+ pagefault_disable();
oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
- dec_preempt_count();
+ pagefault_enable();
if (oldval == -EFAULT)
return oldval;
}
/*
+ * Express the locking dependencies for lockdep:
+ */
+static inline void
+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+ if (hb1 <= hb2) {
+ spin_lock(&hb1->lock);
+ if (hb1 < hb2)
+ spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+ } else { /* hb1 > hb2 */
+ spin_lock(&hb2->lock);
+ spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
+ }
+}
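The buckets are later unlocked inline; the order of the unlocks does not matter for deadlock avoidance, so a matching helper — purely illustrative, not part of this era's futex.c — would be:

static inline void
double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
{
	spin_unlock(&hb1->lock);
	if (hb1 != hb2)
		spin_unlock(&hb2->lock);
}

Note that double_lock_hb() takes only one lock when hb1 == hb2, which the helper above mirrors.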
+
+/*
* Wake up all waiters hashed on the physical page that is mapped
* to this virtual address:
*/
list_for_each_entry_safe(this, next, head, list) {
if (match_futex (&this->key, &key)) {
- if (this->pi_state)
- return -EINVAL;
+ if (this->pi_state) {
+ ret = -EINVAL;
+ break;
+ }
wake_futex(this);
if (++ret >= nr_wake)
break;
hb2 = hash_futex(&key2);
retry:
- if (hb1 < hb2)
- spin_lock(&hb1->lock);
- spin_lock(&hb2->lock);
- if (hb1 > hb2)
- spin_lock(&hb1->lock);
+ double_lock_hb(hb1, hb2);
op_ret = futex_atomic_op_inuser(op, uaddr2);
if (unlikely(op_ret < 0)) {
*/
if (attempt++) {
if (futex_handle_fault((unsigned long)uaddr2,
- attempt))
+ attempt)) {
+ ret = -EFAULT;
goto out;
+ }
goto retry;
}
hb1 = hash_futex(&key1);
hb2 = hash_futex(&key2);
- if (hb1 < hb2)
- spin_lock(&hb1->lock);
- spin_lock(&hb2->lock);
- if (hb1 > hb2)
- spin_lock(&hb1->lock);
+ double_lock_hb(hb1, hb2);
if (likely(cmpval != NULL)) {
u32 curval;
if (++ret <= nr_wake) {
wake_futex(this);
} else {
- list_move_tail(&this->list, &hb2->chain);
- this->lock_ptr = &hb2->lock;
+ /*
+ * If key1 and key2 hash to the same bucket, no need to
+ * requeue.
+ */
+ if (likely(head1 != &hb2->chain)) {
+ list_move_tail(&this->list, &hb2->chain);
+ this->lock_ptr = &hb2->lock;
+ }
this->key = key2;
get_key_refs(&key2);
drop_count++;
if (ret - nr_wake >= nr_requeue)
break;
- /* Make sure to stop if key1 == key2: */
- if (head1 == &hb2->chain && head1 != &next->list)
- head1 = &this->list;
}
}
/* In the common case we don't take the spinlock, which is nice. */
retry:
lock_ptr = q->lock_ptr;
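+ /*
+ * q->lock_ptr can be cleared to NULL concurrently by wake_futex().
+ * The barrier() below keeps the compiler from re-reading it after
+ * the NULL check, which could otherwise hand a freshly-NULLed
+ * pointer to spin_lock().
+ */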
+ barrier();
if (lock_ptr != 0) {
spin_lock(lock_ptr);
/*
* if there are waiters then it will block, it does PI, etc. (Due to
* races the kernel might see a 0 value of the futex too.)
*/
-static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
- struct hrtimer_sleeper *to)
+static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
+ long nsec, int trylock)
{
+ struct hrtimer_sleeper timeout, *to = NULL;
struct task_struct *curr = current;
struct futex_hash_bucket *hb;
u32 uval, newval, curval;
if (refill_pi_state_cache())
return -ENOMEM;
+ if (sec != MAX_SCHEDULE_TIMEOUT) {
+ to = &timeout;
+ hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+ hrtimer_init_sleeper(to, current);
+ to->timer.expires = ktime_set(sec, nsec);
+ }
+
q.pi_state = NULL;
retry:
down_read(&curr->mm->mmap_sem);
*/
newval = current->pid;
- inc_preempt_count();
+ pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
- dec_preempt_count();
+ pagefault_enable();
if (unlikely(curval == -EFAULT))
goto uaddr_faulted;
uval = curval;
newval = uval | FUTEX_WAITERS;
- inc_preempt_count();
+ pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
- dec_preempt_count();
+ pagefault_enable();
if (unlikely(curval == -EFAULT))
goto uaddr_faulted;
newval = current->pid |
FUTEX_OWNER_DIED | FUTEX_WAITERS;
- inc_preempt_count();
+ pagefault_disable();
curval = futex_atomic_cmpxchg_inatomic(uaddr,
uval, newval);
- dec_preempt_count();
+ pagefault_enable();
if (unlikely(curval == -EFAULT))
goto uaddr_faulted;
}
down_read(&curr->mm->mmap_sem);
- hb = queue_lock(&q, -1, NULL);
+ spin_lock(q.lock_ptr);
/*
* Got the lock. We might not be the anticipated owner if we
/* Owner died? */
if (q.pi_state->owner != NULL) {
spin_lock_irq(&q.pi_state->owner->pi_lock);
+ WARN_ON(list_empty(&q.pi_state->list));
list_del_init(&q.pi_state->list);
spin_unlock_irq(&q.pi_state->owner->pi_lock);
} else
q.pi_state->owner = current;
spin_lock_irq(&current->pi_lock);
+ WARN_ON(!list_empty(&q.pi_state->list));
list_add(&q.pi_state->list, &current->pi_state_list);
spin_unlock_irq(&current->pi_lock);
if (!detect && ret == -EDEADLK && 0)
force_sig(SIGKILL, current);
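+ /*
+ * A signal arriving while blocked on the rt_mutex yields -EINTR;
+ * mapping it to -ERESTARTNOINTR makes the kernel restart the
+ * syscall transparently with the original arguments, replacing the
+ * hand-rolled restart_block handler removed below.
+ */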
- return ret;
+ return ret != -EINTR ? ret : -ERESTARTNOINTR;
out_unlock_release_sem:
queue_unlock(&q, hb);
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt))
+ if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+ ret = -EFAULT;
goto out_unlock_release_sem;
-
+ }
goto retry_locked;
}
}
/*
- * Restart handler
- */
-static long futex_lock_pi_restart(struct restart_block *restart)
-{
- struct hrtimer_sleeper timeout, *to = NULL;
- int ret;
-
- restart->fn = do_no_restart_syscall;
-
- if (restart->arg2 || restart->arg3) {
- to = &timeout;
- hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
- hrtimer_init_sleeper(to, current);
- to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
- (u64) restart->arg0;
- }
-
- pr_debug("lock_pi restart: %p, %d (%d)\n",
- (u32 __user *)restart->arg0, current->pid);
-
- ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
- 0, to);
-
- if (ret != -EINTR)
- return ret;
-
- restart->fn = futex_lock_pi_restart;
-
- /* The other values are filled in */
- return -ERESTART_RESTARTBLOCK;
-}
-
-/*
- * Called from the syscall entry below.
- */
-static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
- long nsec, int trylock)
-{
- struct hrtimer_sleeper timeout, *to = NULL;
- struct restart_block *restart;
- int ret;
-
- if (sec != MAX_SCHEDULE_TIMEOUT) {
- to = &timeout;
- hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
- hrtimer_init_sleeper(to, current);
- to->timer.expires = ktime_set(sec, nsec);
- }
-
- ret = do_futex_lock_pi(uaddr, detect, trylock, to);
-
- if (ret != -EINTR)
- return ret;
-
- pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
-
- restart = &current_thread_info()->restart_block;
- restart->fn = futex_lock_pi_restart;
- restart->arg0 = (unsigned long) uaddr;
- restart->arg1 = detect;
- if (to) {
- restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
- restart->arg3 = to->timer.expires.tv64 >> 32;
- } else
- restart->arg2 = restart->arg3 = 0;
-
- return -ERESTART_RESTARTBLOCK;
-}
-
-/*
* Userspace attempted a TID -> 0 atomic transition, and failed.
* This is the in-kernel slowpath: we look up the PI state (if any),
* and do the rt-mutex unlock.
* again. If it succeeds then we can return without waking
* anyone else up:
*/
- inc_preempt_count();
- uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
- dec_preempt_count();
+ if (!(uval & FUTEX_OWNER_DIED)) {
+ pagefault_disable();
+ uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+ pagefault_enable();
+ }
if (unlikely(uval == -EFAULT))
goto pi_faulted;
/*
* No waiters - kernel unlocks the futex:
*/
- ret = unlock_futex_pi(uaddr, uval);
- if (ret == -EFAULT)
- goto pi_faulted;
+ if (!(uval & FUTEX_OWNER_DIED)) {
+ ret = unlock_futex_pi(uaddr, uval);
+ if (ret == -EFAULT)
+ goto pi_faulted;
+ }
out_unlock:
spin_unlock(&hb->lock);
* still holding the mmap_sem.
*/
if (attempt++) {
- if (futex_handle_fault((unsigned long)uaddr, attempt))
+ if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+ ret = -EFAULT;
goto out_unlock;
-
+ }
goto retry_locked;
}
return ret;
}
-static struct file_operations futex_fops = {
+static const struct file_operations futex_fops = {
.release = futex_close,
.poll = futex_poll,
};
struct futex_q *q;
struct file *filp;
int ret, err;
+ static unsigned long printk_interval;
+
+ if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
+ printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
+ "will be removed from the kernel in June 2007\n",
+ current->comm);
+ }
ret = -EINVAL;
if (!valid_signal(signal))
goto out;
}
filp->f_op = &futex_fops;
- filp->f_vfsmnt = mntget(futex_mnt);
- filp->f_dentry = dget(futex_mnt->mnt_root);
- filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+ filp->f_path.mnt = mntget(futex_mnt);
+ filp->f_path.dentry = dget(futex_mnt->mnt_root);
+ filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
if (signal) {
- err = f_setown(filp, current->pid, 1);
+ err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
if (err < 0) {
goto error;
}
* @len_ptr: pointer to a length field, the kernel fills in the header size
*/
asmlinkage long
-sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
size_t __user *len_ptr)
{
- struct robust_list_head *head;
+ struct robust_list_head __user *head;
unsigned long ret;
if (!pid)
struct task_struct *p;
ret = -ESRCH;
- read_lock(&tasklist_lock);
+ rcu_read_lock();
p = find_task_by_pid(pid);
if (!p)
goto err_unlock;
!capable(CAP_SYS_PTRACE))
goto err_unlock;
head = p->robust_list;
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
}
if (put_user(sizeof(*head), len_ptr))
return -EFAULT;
return put_user(head, head_ptr);
err_unlock:
- read_unlock(&tasklist_lock);
+ rcu_read_unlock();
return ret;
}
* Process a futex-list entry, check whether it's owned by the
* dying task, and do notification if so:
*/
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
{
- u32 uval, nval;
+ u32 uval, nval, mval;
retry:
if (get_user(uval, uaddr))
* thread-death.) The rest of the cleanup is done in
* userspace.
*/
- nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
- uval | FUTEX_OWNER_DIED);
+ mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+ nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
+
if (nval == -EFAULT)
return -1;
if (nval != uval)
goto retry;
- if (uval & FUTEX_WAITERS)
- futex_wake(uaddr, 1);
+ /*
+ * Wake robust non-PI futexes here. The wakeup of
+ * PI futexes happens in exit_pi_state():
+ */
+ if (!pi) {
+ if (uval & FUTEX_WAITERS)
+ futex_wake(uaddr, 1);
+ }
}
return 0;
}
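The bit manipulation in handle_futex_death() relies on the futex word layout from this era's linux/futex.h:

#define FUTEX_WAITERS		0x80000000	/* there are waiters to wake */
#define FUTEX_OWNER_DIED	0x40000000	/* owner exited without unlocking */
#define FUTEX_TID_MASK		0x3fffffff	/* TID of the current owner */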
/*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+ struct robust_list __user * __user *head,
+ int *pi)
+{
+ unsigned long uentry;
+
+ if (get_user(uentry, (unsigned long __user *)head))
+ return -EFAULT;
+
+ *entry = (void __user *)(uentry & ~1UL);
+ *pi = uentry & 1;
+
+ return 0;
+}
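Robust-list nodes are embedded in user-space mutexes and are at least word-aligned, which is what frees bit 0 to carry the PI flag. A hypothetical user-space encoder (illustrative only, not glibc source):

static inline void set_robust_entry(struct robust_list **slot,
				    struct robust_list *entry, int is_pi)
{
	*slot = (struct robust_list *)
		((unsigned long)entry | (is_pi ? 1UL : 0UL));
}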
+
+/*
* Walk curr->robust_list (very carefully, it's a userspace list!)
* and mark any locks found there dead, and notify any waiters.
*
{
struct robust_list_head __user *head = curr->robust_list;
struct robust_list __user *entry, *pending;
- unsigned int limit = ROBUST_LIST_LIMIT;
+ unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
unsigned long futex_offset;
/*
* Fetch the list head (which was registered earlier, via
* sys_set_robust_list()):
*/
- if (get_user(entry, &head->list.next))
+ if (fetch_robust_entry(&entry, &head->list.next, &pi))
return;
/*
* Fetch the relative futex offset:
* Fetch any possibly pending lock-add first, and handle it
* if it exists:
*/
- if (get_user(pending, &head->list_op_pending))
+ if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
return;
+
if (pending)
- handle_futex_death((void *)pending + futex_offset, curr);
+ handle_futex_death((void __user *)pending + futex_offset, curr, pip);
while (entry != &head->list) {
/*
* don't process it twice:
*/
if (entry != pending)
- if (handle_futex_death((void *)entry + futex_offset,
- curr))
+ if (handle_futex_death((void __user *)entry + futex_offset,
+ curr, pi))
return;
/*
* Fetch the next entry in the list:
*/
- if (get_user(entry, &entry->next))
+ if (fetch_robust_entry(&entry, &entry->next, &pi))
return;
/*
* Avoid excessively long or circular lists:
static int __init init(void)
{
- unsigned int i;
+ int i = register_filesystem(&futex_fs_type);
+
+ if (i)
+ return i;
- register_filesystem(&futex_fs_type);
futex_mnt = kern_mount(&futex_fs_type);
+ if (IS_ERR(futex_mnt)) {
+ unregister_filesystem(&futex_fs_type);
+ return PTR_ERR(futex_mnt);
+ }
for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
INIT_LIST_HEAD(&futex_queues[i].chain);