/* Key which the futex is hashed on: */
union futex_key key;
- /* For fd, sigio sent using these: */
- int fd;
- struct file *filp;
-
/* Optional priority inheritance state: */
struct futex_pi_state *pi_state;
struct task_struct *task;
static struct futex_hash_bucket futex_queues[1<<FUTEX_HASHBITS];
-/* Futex-fs vfsmount entry: */
-static struct vfsmount *futex_mnt;
-
-/*
- * Take mm->mmap_sem, when futex is shared
- */
-static inline void futex_lock_mm(struct rw_semaphore *fshared)
-{
- if (fshared)
- down_read(fshared);
-}
-
-/*
- * Release mm->mmap_sem, when the futex is shared
- */
-static inline void futex_unlock_mm(struct rw_semaphore *fshared)
-{
- if (fshared)
- up_read(fshared);
-}
-
/*
* We hash on the keys returned from get_futex_key (see below).
*/
&& key1->both.offset == key2->both.offset);
}
+/*
+ * Take a reference to the resource addressed by a key.
+ * Can be called while holding spinlocks.
+ *
+ */
+static void get_futex_key_refs(union futex_key *key)
+{
+ if (!key->both.ptr)
+ return;
+
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
+ atomic_inc(&key->shared.inode->i_count);
+ break;
+ case FUT_OFF_MMSHARED:
+ atomic_inc(&key->private.mm->mm_count);
+ break;
+ }
+}
+
+/*
+ * Drop a reference to the resource addressed by a key.
+ * The hash bucket spinlock must not be held.
+ */
+static void drop_futex_key_refs(union futex_key *key)
+{
+ if (!key->both.ptr)
+ return;
+
+ switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
+ case FUT_OFF_INODE:
+ iput(key->shared.inode);
+ break;
+ case FUT_OFF_MMSHARED:
+ mmdrop(key->private.mm);
+ break;
+ }
+}
+
/**
* get_futex_key - Get parameters which are the keys for a futex.
* @uaddr: virtual address of the futex
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
struct page *page;
int err;
key->private.address = address;
return 0;
}
- /*
- * The futex is hashed differently depending on whether
- * it's in a shared or private mapping. So check vma first.
- */
- vma = find_extend_vma(mm, address);
- if (unlikely(!vma))
- return -EFAULT;
- /*
- * Permissions.
- */
- if (unlikely((vma->vm_flags & (VM_IO|VM_READ)) != VM_READ))
- return (vma->vm_flags & VM_IO) ? -EPERM : -EACCES;
+again:
+ down_read(&mm->mmap_sem);
+ err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
+ up_read(&mm->mmap_sem);
+ if (err < 0)
+ return err;
+
+ lock_page(page);
+ if (!page->mapping) {
+ unlock_page(page);
+ put_page(page);
+ goto again;
+ }
/*
* Private mappings are handled in a simple way.
*
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
- * the object not the particular process. Therefore we use
- * VM_MAYSHARE here, not VM_SHARED which is restricted to shared
- * mappings of _writable_ handles.
+ * the object not the particular process.
*/
- if (likely(!(vma->vm_flags & VM_MAYSHARE))) {
- key->both.offset |= FUT_OFF_MMSHARED; /* reference taken on mm */
+ if (PageAnon(page)) {
+ key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
- return 0;
- }
-
- /*
- * Linear file mappings are also simple.
- */
- key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
- key->both.offset |= FUT_OFF_INODE; /* inode-based key. */
- if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
- key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
- + vma->vm_pgoff);
- return 0;
+ } else {
+ key->both.offset |= FUT_OFF_INODE; /* inode-based key */
+ key->shared.inode = page->mapping->host;
+ key->shared.pgoff = page->index;
}
- /*
- * We could walk the page table to read the non-linear
- * pte, and get the page index without fetching the page
- * from swap. But that's a lot of code to duplicate here
- * for a rare case, so we simply fetch the page.
- */
- err = get_user_pages(current, mm, address, 1, 0, 0, &page, NULL);
- if (err >= 0) {
- key->shared.pgoff =
- page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
- put_page(page);
- return 0;
- }
- return err;
-}
+ get_futex_key_refs(key);
-/*
- * Take a reference to the resource addressed by a key.
- * Can be called while holding spinlocks.
- *
- */
-static void get_futex_key_refs(union futex_key *key)
-{
- if (key->both.ptr == NULL)
- return;
- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
- case FUT_OFF_INODE:
- atomic_inc(&key->shared.inode->i_count);
- break;
- case FUT_OFF_MMSHARED:
- atomic_inc(&key->private.mm->mm_count);
- break;
- }
+ unlock_page(page);
+ put_page(page);
+ return 0;
}
-/*
- * Drop a reference to the resource addressed by a key.
- * The hash bucket spinlock must not be held.
- */
-static void drop_futex_key_refs(union futex_key *key)
+static inline
+void put_futex_key(struct rw_semaphore *fshared, union futex_key *key)
{
- if (!key->both.ptr)
- return;
- switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
- case FUT_OFF_INODE:
- iput(key->shared.inode);
- break;
- case FUT_OFF_MMSHARED:
- mmdrop(key->private.mm);
- break;
- }
+ drop_futex_key_refs(key);
}
static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval)
if (attempt > 2)
return ret;
- if (!fshared)
- down_read(&mm->mmap_sem);
+ down_read(&mm->mmap_sem);
vma = find_vma(mm, address);
if (vma && address >= vma->vm_start &&
(vma->vm_flags & VM_WRITE)) {
current->min_flt++;
}
}
- if (!fshared)
- up_read(&mm->mmap_sem);
+ up_read(&mm->mmap_sem);
return ret;
}
/* pi_mutex gets initialized later */
pi_state->owner = NULL;
atomic_set(&pi_state->refcount, 1);
+ pi_state->key = FUTEX_KEY_INIT;
current->pi_state_cache = pi_state;
struct list_head *next, *head = &curr->pi_state_list;
struct futex_pi_state *pi_state;
struct futex_hash_bucket *hb;
- union futex_key key;
+ union futex_key key = FUTEX_KEY_INIT;
if (!futex_cmpxchg_enabled)
return;
static void wake_futex(struct futex_q *q)
{
plist_del(&q->list, &q->list.plist);
- if (q->filp)
- send_sigio(&q->filp->f_owner, q->fd, POLL_IN);
/*
* The lock in wake_up_all() is a crucial memory barrier after the
* plist_del() and also before assigning to q->lock_ptr.
struct futex_hash_bucket *hb;
struct futex_q *this, *next;
struct plist_head *head;
- union futex_key key;
+ union futex_key key = FUTEX_KEY_INIT;
int ret;
if (!bitset)
return -EINVAL;
- futex_lock_mm(fshared);
-
ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
goto out;
spin_unlock(&hb->lock);
out:
- futex_unlock_mm(fshared);
+ put_futex_key(fshared, &key);
return ret;
}
u32 __user *uaddr2,
int nr_wake, int nr_wake2, int op)
{
- union futex_key key1, key2;
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head;
struct futex_q *this, *next;
int ret, op_ret, attempt = 0;
retryfull:
- futex_lock_mm(fshared);
-
ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
goto retry;
}
- /*
- * If we would have faulted, release mmap_sem,
- * fault it in and start all over again.
- */
- futex_unlock_mm(fshared);
-
ret = get_user(dummy, uaddr2);
if (ret)
return ret;
if (hb1 != hb2)
spin_unlock(&hb2->lock);
out:
- futex_unlock_mm(fshared);
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
return ret;
}
u32 __user *uaddr2,
int nr_wake, int nr_requeue, u32 *cmpval)
{
- union futex_key key1, key2;
+ union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT;
struct futex_hash_bucket *hb1, *hb2;
struct plist_head *head1;
struct futex_q *this, *next;
int ret, drop_count = 0;
retry:
- futex_lock_mm(fshared);
-
ret = get_futex_key(uaddr1, fshared, &key1);
if (unlikely(ret != 0))
goto out;
if (hb1 != hb2)
spin_unlock(&hb2->lock);
- /*
- * If we would have faulted, release mmap_sem, fault
- * it in and start all over again.
- */
- futex_unlock_mm(fshared);
-
ret = get_user(curval, uaddr1);
if (!ret)
drop_futex_key_refs(&key1);
out:
- futex_unlock_mm(fshared);
+ put_futex_key(fshared, &key2);
+ put_futex_key(fshared, &key1);
return ret;
}
/* The key must be already stored in q->key. */
-static inline struct futex_hash_bucket *
-queue_lock(struct futex_q *q, int fd, struct file *filp)
+static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
{
struct futex_hash_bucket *hb;
- q->fd = fd;
- q->filp = filp;
-
init_waitqueue_head(&q->waiters);
get_futex_key_refs(&q->key);
return hb;
}
-static inline void __queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
+static inline void queue_me(struct futex_q *q, struct futex_hash_bucket *hb)
{
int prio;
* exactly once. They are called with the hashed spinlock held.
*/
-/* The key must be already stored in q->key. */
-static void queue_me(struct futex_q *q, int fd, struct file *filp)
-{
- struct futex_hash_bucket *hb;
-
- hb = queue_lock(q, fd, filp);
- __queue_me(q, hb);
-}
-
/* Return 1 if we were still queued (ie. 0 means we were woken) */
static int unqueue_me(struct futex_q *q)
{
* private futexes.
*/
static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
- struct task_struct *newowner)
+ struct task_struct *newowner,
+ struct rw_semaphore *fshared)
{
u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
struct futex_pi_state *pi_state = q->pi_state;
+ struct task_struct *oldowner = pi_state->owner;
u32 uval, curval, newval;
- int ret;
+ int ret, attempt = 0;
/* Owner died? */
+ if (!pi_state->owner)
+ newtid |= FUTEX_OWNER_DIED;
+
+ /*
+ * We are here either because we stole the rtmutex from the
+ * pending owner or we are the pending owner which failed to
+ * get the rtmutex. We have to replace the pending owner TID
+ * in the user space variable. This must be atomic as we have
+ * to preserve the owner died bit here.
+ *
+ * Note: We write the user space value _before_ changing the
+ * pi_state because we can fault here. Imagine swapped out
+ * pages or a fork, which was running right before we acquired
+ * mmap_sem, that marked all the anonymous memory readonly for
+ * cow.
+ *
+ * Modifying pi_state _before_ the user space value would
+ * leave the pi_state in an inconsistent state when we fault
+ * here, because we need to drop the hash bucket lock to
+ * handle the fault. This might be observed in the PID check
+ * in lookup_pi_state.
+ */
+retry:
+ if (get_futex_value_locked(&uval, uaddr))
+ goto handle_fault;
+
+ while (1) {
+ newval = (uval & FUTEX_OWNER_DIED) | newtid;
+
+ curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+
+ if (curval == -EFAULT)
+ goto handle_fault;
+ if (curval == uval)
+ break;
+ uval = curval;
+ }
+
+ /*
+ * We fixed up user space. Now we need to fix the pi_state
+ * itself.
+ */
if (pi_state->owner != NULL) {
spin_lock_irq(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
spin_unlock_irq(&pi_state->owner->pi_lock);
- } else
- newtid |= FUTEX_OWNER_DIED;
+ }
pi_state->owner = newowner;
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &newowner->pi_state_list);
spin_unlock_irq(&newowner->pi_lock);
+ return 0;
/*
- * We own it, so we have to replace the pending owner
- * TID. This must be atomic as we have preserve the
- * owner died bit here.
+ * To handle the page fault we need to drop the hash bucket
+ * lock here. That gives the other task (either the pending
+ * owner itself or the task which stole the rtmutex) the
+ * chance to try the fixup of the pi_state. So once we are
+ * back from handling the fault we need to check the pi_state
+ * after reacquiring the hash bucket lock and before trying to
+ * do another fixup. When the fixup has been done already we
+ * simply return.
*/
- ret = get_futex_value_locked(&uval, uaddr);
+handle_fault:
+ spin_unlock(q->lock_ptr);
- while (!ret) {
- newval = (uval & FUTEX_OWNER_DIED) | newtid;
+ ret = futex_handle_fault((unsigned long)uaddr, fshared, attempt++);
- curval = cmpxchg_futex_value_locked(uaddr, uval, newval);
+ spin_lock(q->lock_ptr);
- if (curval == -EFAULT)
- ret = -EFAULT;
- if (curval == uval)
- break;
- uval = curval;
- }
- return ret;
+ /*
+ * Check if someone else fixed it for us:
+ */
+ if (pi_state->owner != oldowner)
+ return 0;
+
+ if (ret)
+ return ret;
+
+ goto retry;
}
/*
q.pi_state = NULL;
q.bitset = bitset;
retry:
- futex_lock_mm(fshared);
-
+ q.key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
- hb = queue_lock(&q, -1, NULL);
+ hb = queue_lock(&q);
/*
* Access the page AFTER the futex is queued.
if (unlikely(ret)) {
queue_unlock(&q, hb);
- /*
- * If we would have faulted, release mmap_sem, fault it in and
- * start all over again.
- */
- futex_unlock_mm(fshared);
-
ret = get_user(uval, uaddr);
if (!ret)
goto out_unlock_release_sem;
/* Only actually queue if *uaddr contained val. */
- __queue_me(&q, hb);
-
- /*
- * Now the futex is queued and we have checked the data, we
- * don't want to hold mmap_sem while we sleep.
- */
- futex_unlock_mm(fshared);
+ queue_me(&q, hb);
/*
* There might have been scheduling since the queue_me(), as we
queue_unlock(&q, hb);
out_release_sem:
- futex_unlock_mm(fshared);
+ put_futex_key(fshared, &q.key);
return ret;
}
q.pi_state = NULL;
retry:
- futex_lock_mm(fshared);
-
+ q.key = FUTEX_KEY_INIT;
ret = get_futex_key(uaddr, fshared, &q.key);
if (unlikely(ret != 0))
goto out_release_sem;
retry_unlocked:
- hb = queue_lock(&q, -1, NULL);
+ hb = queue_lock(&q);
retry_locked:
ret = lock_taken = 0;
* exit to complete.
*/
queue_unlock(&q, hb);
- futex_unlock_mm(fshared);
cond_resched();
goto retry;
/*
* Only actually queue now that the atomic ops are done:
*/
- __queue_me(&q, hb);
-
- /*
- * Now the futex is queued and we have checked the data, we
- * don't want to hold mmap_sem while we sleep.
- */
- futex_unlock_mm(fshared);
+ queue_me(&q, hb);
WARN_ON(!q.pi_state);
/*
ret = ret ? 0 : -EWOULDBLOCK;
}
- futex_lock_mm(fshared);
spin_lock(q.lock_ptr);
if (!ret) {
* that case:
*/
if (q.pi_state->owner != curr)
- ret = fixup_pi_state_owner(uaddr, &q, curr);
+ ret = fixup_pi_state_owner(uaddr, &q, curr, fshared);
} else {
/*
* Catch the rare case, where the lock was released
int res;
owner = rt_mutex_owner(&q.pi_state->pi_mutex);
- res = fixup_pi_state_owner(uaddr, &q, owner);
+ res = fixup_pi_state_owner(uaddr, &q, owner,
+ fshared);
/* propagate -EFAULT, if the fixup failed */
if (res)
/* Unqueue and drop the lock */
unqueue_me_pi(&q);
- futex_unlock_mm(fshared);
if (to)
destroy_hrtimer_on_stack(&to->timer);
queue_unlock(&q, hb);
out_release_sem:
- futex_unlock_mm(fshared);
+ put_futex_key(fshared, &q.key);
if (to)
destroy_hrtimer_on_stack(&to->timer);
return ret;
goto retry_unlocked;
}
- futex_unlock_mm(fshared);
-
ret = get_user(uval, uaddr);
if (!ret && (uval != -EFAULT))
goto retry;
struct futex_q *this, *next;
u32 uval;
struct plist_head *head;
- union futex_key key;
+ union futex_key key = FUTEX_KEY_INIT;
int ret, attempt = 0;
retry:
*/
if ((uval & FUTEX_TID_MASK) != task_pid_vnr(current))
return -EPERM;
- /*
- * First take all the futex related locks:
- */
- futex_lock_mm(fshared);
ret = get_futex_key(uaddr, fshared, &key);
if (unlikely(ret != 0))
out_unlock:
spin_unlock(&hb->lock);
out:
- futex_unlock_mm(fshared);
+ put_futex_key(fshared, &key);
return ret;
goto retry_unlocked;
}
- futex_unlock_mm(fshared);
-
ret = get_user(uval, uaddr);
if (!ret && (uval != -EFAULT))
goto retry;
return ret;
}
-static int futex_close(struct inode *inode, struct file *filp)
-{
- struct futex_q *q = filp->private_data;
-
- unqueue_me(q);
- kfree(q);
-
- return 0;
-}
-
-/* This is one-shot: once it's gone off you need a new fd */
-static unsigned int futex_poll(struct file *filp,
- struct poll_table_struct *wait)
-{
- struct futex_q *q = filp->private_data;
- int ret = 0;
-
- poll_wait(filp, &q->waiters, wait);
-
- /*
- * plist_node_empty() is safe here without any lock.
- * q->lock_ptr != 0 is not safe, because of ordering against wakeup.
- */
- if (plist_node_empty(&q->list))
- ret = POLLIN | POLLRDNORM;
-
- return ret;
-}
-
-static const struct file_operations futex_fops = {
- .release = futex_close,
- .poll = futex_poll,
-};
-
-/*
- * Signal allows caller to avoid the race which would occur if they
- * set the sigio stuff up afterwards.
- */
-static int futex_fd(u32 __user *uaddr, int signal)
-{
- struct futex_q *q;
- struct file *filp;
- int ret, err;
- struct rw_semaphore *fshared;
- static unsigned long printk_interval;
-
- if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
- printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
- "will be removed from the kernel in June 2007\n",
- current->comm);
- }
-
- ret = -EINVAL;
- if (!valid_signal(signal))
- goto out;
-
- ret = get_unused_fd();
- if (ret < 0)
- goto out;
- filp = get_empty_filp();
- if (!filp) {
- put_unused_fd(ret);
- ret = -ENFILE;
- goto out;
- }
- filp->f_op = &futex_fops;
- filp->f_path.mnt = mntget(futex_mnt);
- filp->f_path.dentry = dget(futex_mnt->mnt_root);
- filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
-
- if (signal) {
- err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
- if (err < 0) {
- goto error;
- }
- filp->f_owner.signum = signal;
- }
-
- q = kmalloc(sizeof(*q), GFP_KERNEL);
- if (!q) {
- err = -ENOMEM;
- goto error;
- }
- q->pi_state = NULL;
-
- fshared = ¤t->mm->mmap_sem;
- down_read(fshared);
- err = get_futex_key(uaddr, fshared, &q->key);
-
- if (unlikely(err != 0)) {
- up_read(fshared);
- kfree(q);
- goto error;
- }
-
- /*
- * queue_me() must be called before releasing mmap_sem, because
- * key->shared.inode needs to be referenced while holding it.
- */
- filp->private_data = q;
-
- queue_me(q, ret, filp);
- up_read(fshared);
-
- /* Now we map fd to filp, so userspace can access it */
- fd_install(ret, filp);
-out:
- return ret;
-error:
- put_unused_fd(ret);
- put_filp(filp);
- ret = err;
- goto out;
-}
-
/*
* Support for robust futexes: the kernel cleans up held futexes at
* thread exit time.
case FUTEX_WAKE_BITSET:
ret = futex_wake(uaddr, fshared, val, val3);
break;
- case FUTEX_FD:
- /* non-zero val means F_SETOWN(getpid()) & F_SETSIG(val) */
- ret = futex_fd(uaddr, val);
- break;
case FUTEX_REQUEUE:
ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL);
break;
return do_futex(uaddr, op, val, tp, uaddr2, val2, val3);
}
-static int futexfs_get_sb(struct file_system_type *fs_type,
- int flags, const char *dev_name, void *data,
- struct vfsmount *mnt)
-{
- return get_sb_pseudo(fs_type, "futex", NULL, FUTEXFS_SUPER_MAGIC, mnt);
-}
-
-static struct file_system_type futex_fs_type = {
- .name = "futexfs",
- .get_sb = futexfs_get_sb,
- .kill_sb = kill_anon_super,
-};
-
static int __init futex_init(void)
{
u32 curval;
spin_lock_init(&futex_queues[i].lock);
}
- i = register_filesystem(&futex_fs_type);
- if (i)
- return i;
-
- futex_mnt = kern_mount(&futex_fs_type);
- if (IS_ERR(futex_mnt)) {
- unregister_filesystem(&futex_fs_type);
- return PTR_ERR(futex_mnt);
- }
-
return 0;
}
__initcall(futex_init);