[TCP]: Keep copied_seq, rcv_wup and rcv_next together.
[safe/jmp/linux-2.6] / kernel / futex.c
index b305b7f..5a270b5 100644 (file)
@@ -166,7 +166,7 @@ static inline int match_futex(union futex_key *key1, union futex_key *key2)
 /*
  * Get parameters which are the keys for a futex.
  *
- * For shared mappings, it's (page->index, vma->vm_file->f_dentry->d_inode,
+ * For shared mappings, it's (page->index, vma->vm_file->f_path.dentry->d_inode,
  * offset_within_page).  For private mappings, it's (uaddr, current->mm).
  * We can usually work out the index without swapping in the page.
  *
@@ -223,7 +223,7 @@ static int get_futex_key(u32 __user *uaddr, union futex_key *key)
        /*
         * Linear file mappings are also simple.
         */
-       key->shared.inode = vma->vm_file->f_dentry->d_inode;
+       key->shared.inode = vma->vm_file->f_path.dentry->d_inode;
        key->both.offset++; /* Bit 0 of offset indicates inode-based key. */
        if (likely(!(vma->vm_flags & VM_NONLINEAR))) {
                key->shared.pgoff = (((address - vma->vm_start) >> PAGE_SHIFT)
@@ -282,9 +282,9 @@ static inline int get_futex_value_locked(u32 *dest, u32 __user *from)
 {
        int ret;
 
-       inc_preempt_count();
+       pagefault_disable();
        ret = __copy_from_user_inatomic(dest, from, sizeof(u32));
-       dec_preempt_count();
+       pagefault_enable();
 
        return ret ? -EFAULT : 0;
 }
@@ -297,7 +297,7 @@ static int futex_handle_fault(unsigned long address, int attempt)
        struct vm_area_struct * vma;
        struct mm_struct *mm = current->mm;
 
-       if (attempt >= 2 || !(vma = find_vma(mm, address)) ||
+       if (attempt > 2 || !(vma = find_vma(mm, address)) ||
            vma->vm_start > address || !(vma->vm_flags & VM_WRITE))
                return -EFAULT;
 
@@ -324,12 +324,11 @@ static int refill_pi_state_cache(void)
        if (likely(current->pi_state_cache))
                return 0;
 
-       pi_state = kmalloc(sizeof(*pi_state), GFP_KERNEL);
+       pi_state = kzalloc(sizeof(*pi_state), GFP_KERNEL);
 
        if (!pi_state)
                return -ENOMEM;
 
-       memset(pi_state, 0, sizeof(*pi_state));
        INIT_LIST_HEAD(&pi_state->list);
        /* pi_mutex gets initialized later */
        pi_state->owner = NULL;
@@ -389,7 +388,7 @@ static struct task_struct * futex_find_get_task(pid_t pid)
 {
        struct task_struct *p;
 
-       read_lock(&tasklist_lock);
+       rcu_read_lock();
        p = find_task_by_pid(pid);
        if (!p)
                goto out_unlock;
@@ -397,13 +396,13 @@ static struct task_struct * futex_find_get_task(pid_t pid)
                p = NULL;
                goto out_unlock;
        }
-       if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) {
+       if (p->exit_state != 0) {
                p = NULL;
                goto out_unlock;
        }
        get_task_struct(p);
 out_unlock:
-       read_unlock(&tasklist_lock);
+       rcu_read_unlock();
 
        return p;
 }
@@ -415,15 +414,15 @@ out_unlock:
  */
 void exit_pi_state_list(struct task_struct *curr)
 {
-       struct futex_hash_bucket *hb;
        struct list_head *next, *head = &curr->pi_state_list;
        struct futex_pi_state *pi_state;
+       struct futex_hash_bucket *hb;
        union futex_key key;
 
        /*
         * We are a ZOMBIE and nobody can enqueue itself on
         * pi_state_list anymore, but we have to be careful
-        * versus waiters unqueueing themselfs
+        * versus waiters unqueueing themselves:
         */
        spin_lock_irq(&curr->pi_lock);
        while (!list_empty(head)) {
@@ -431,21 +430,24 @@ void exit_pi_state_list(struct task_struct *curr)
                next = head->next;
                pi_state = list_entry(next, struct futex_pi_state, list);
                key = pi_state->key;
+               hb = hash_futex(&key);
                spin_unlock_irq(&curr->pi_lock);
 
-               hb = hash_futex(&key);
                spin_lock(&hb->lock);
 
                spin_lock_irq(&curr->pi_lock);
+               /*
+                * We dropped the pi-lock, so re-check whether this
+                * task still owns the PI-state:
+                */
                if (head->next != next) {
                        spin_unlock(&hb->lock);
                        continue;
                }
 
-               list_del_init(&pi_state->list);
-
                WARN_ON(pi_state->owner != curr);
-
+               WARN_ON(list_empty(&pi_state->list));
+               list_del_init(&pi_state->list);
                pi_state->owner = NULL;
                spin_unlock_irq(&curr->pi_lock);
 
@@ -470,12 +472,20 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
        head = &hb->chain;
 
        list_for_each_entry_safe(this, next, head, list) {
-               if (match_futex (&this->key, &me->key)) {
+               if (match_futex(&this->key, &me->key)) {
                        /*
                         * Another waiter already exists - bump up
                         * the refcount and return its pi_state:
                         */
                        pi_state = this->pi_state;
+                       /*
+                        * Userspace might have mixed up non-PI and PI futexes
+                        */
+                       if (unlikely(!pi_state))
+                               return -EINVAL;
+
+                       WARN_ON(!atomic_read(&pi_state->refcount));
+
                        atomic_inc(&pi_state->refcount);
                        me->pi_state = pi_state;
 
@@ -484,10 +494,13 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
        }
 
        /*
-        * We are the first waiter - try to look up the real owner and
-        * attach the new pi_state to it:
+        * We are the first waiter - try to look up the real owner and attach
+        * the new pi_state to it, but bail out when the owner died bit is set
+        * and TID = 0:
         */
        pid = uval & FUTEX_TID_MASK;
+       if (!pid && (uval & FUTEX_OWNER_DIED))
+               return -ESRCH;
        p = futex_find_get_task(pid);
        if (!p)
                return -ESRCH;
@@ -504,6 +517,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, struct futex_q *me)
        pi_state->key = me->key;
 
        spin_lock_irq(&p->pi_lock);
+       WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &p->pi_state_list);
        pi_state->owner = p;
        spin_unlock_irq(&p->pi_lock);
@@ -538,7 +552,7 @@ static void wake_futex(struct futex_q *q)
         * at the end of wake_up_all() does not prevent this store from
         * moving.
         */
-       wmb();
+       smp_wmb();
        q->lock_ptr = NULL;
 }
 
@@ -551,6 +565,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
        if (!pi_state)
                return -EINVAL;
 
+       spin_lock(&pi_state->pi_mutex.wait_lock);
        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
 
        /*
@@ -567,20 +582,30 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this)
         * kept enabled while there is PI state around. We must also
         * preserve the owner died bit.)
         */
-       newval = (uval & FUTEX_OWNER_DIED) | FUTEX_WAITERS | new_owner->pid;
+       if (!(uval & FUTEX_OWNER_DIED)) {
+               newval = FUTEX_WAITERS | new_owner->pid;
 
-       inc_preempt_count();
-       curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-       dec_preempt_count();
+               pagefault_disable();
+               curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
+               pagefault_enable();
+               if (curval == -EFAULT)
+                       return -EFAULT;
+               if (curval != uval)
+                       return -EINVAL;
+       }
 
-       if (curval == -EFAULT)
-               return -EFAULT;
-       if (curval != uval)
-               return -EINVAL;
+       spin_lock_irq(&pi_state->owner->pi_lock);
+       WARN_ON(list_empty(&pi_state->list));
+       list_del_init(&pi_state->list);
+       spin_unlock_irq(&pi_state->owner->pi_lock);
 
-       list_del_init(&pi_state->owner->pi_state_list);
+       spin_lock_irq(&new_owner->pi_lock);
+       WARN_ON(!list_empty(&pi_state->list));
        list_add(&pi_state->list, &new_owner->pi_state_list);
        pi_state->owner = new_owner;
+       spin_unlock_irq(&new_owner->pi_lock);
+
+       spin_unlock(&pi_state->pi_mutex.wait_lock);
        rt_mutex_unlock(&pi_state->pi_mutex);
 
        return 0;
@@ -594,9 +619,9 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
         * There is no waiter, so we unlock the futex. The owner died
         * bit has not to be preserved here. We are the owner:
         */
-       inc_preempt_count();
+       pagefault_disable();
        oldval = futex_atomic_cmpxchg_inatomic(uaddr, uval, 0);
-       dec_preempt_count();
+       pagefault_enable();
 
        if (oldval == -EFAULT)
                return oldval;
@@ -607,6 +632,22 @@ static int unlock_futex_pi(u32 __user *uaddr, u32 uval)
 }
 
 /*
+ * Express the locking dependencies for lockdep:
+ */
+static inline void
+double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+       if (hb1 <= hb2) {
+               spin_lock(&hb1->lock);
+               if (hb1 < hb2)
+                       spin_lock_nested(&hb2->lock, SINGLE_DEPTH_NESTING);
+       } else { /* hb1 > hb2 */
+               spin_lock(&hb2->lock);
+               spin_lock_nested(&hb1->lock, SINGLE_DEPTH_NESTING);
+       }
+}
+
+/*
  * Wake up all waiters hashed on the physical page that is mapped
  * to this virtual address:
  */
@@ -630,8 +671,10 @@ static int futex_wake(u32 __user *uaddr, int nr_wake)
 
        list_for_each_entry_safe(this, next, head, list) {
                if (match_futex (&this->key, &key)) {
-                       if (this->pi_state)
-                               return -EINVAL;
+                       if (this->pi_state) {
+                               ret = -EINVAL;
+                               break;
+                       }
                        wake_futex(this);
                        if (++ret >= nr_wake)
                                break;
@@ -672,11 +715,7 @@ retryfull:
        hb2 = hash_futex(&key2);
 
 retry:
-       if (hb1 < hb2)
-               spin_lock(&hb1->lock);
-       spin_lock(&hb2->lock);
-       if (hb1 > hb2)
-               spin_lock(&hb1->lock);
+       double_lock_hb(hb1, hb2);
 
        op_ret = futex_atomic_op_inuser(op, uaddr2);
        if (unlikely(op_ret < 0)) {
@@ -709,8 +748,10 @@ retry:
                 */
                if (attempt++) {
                        if (futex_handle_fault((unsigned long)uaddr2,
-                                              attempt))
+                                               attempt)) {
+                               ret = -EFAULT;
                                goto out;
+                       }
                        goto retry;
                }
 
@@ -785,11 +826,7 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
        hb1 = hash_futex(&key1);
        hb2 = hash_futex(&key2);
 
-       if (hb1 < hb2)
-               spin_lock(&hb1->lock);
-       spin_lock(&hb2->lock);
-       if (hb1 > hb2)
-               spin_lock(&hb1->lock);
+       double_lock_hb(hb1, hb2);
 
        if (likely(cmpval != NULL)) {
                u32 curval;
@@ -827,17 +864,20 @@ static int futex_requeue(u32 __user *uaddr1, u32 __user *uaddr2,
                if (++ret <= nr_wake) {
                        wake_futex(this);
                } else {
-                       list_move_tail(&this->list, &hb2->chain);
-                       this->lock_ptr = &hb2->lock;
+                       /*
+                        * If key1 and key2 hash to the same bucket, no need to
+                        * requeue.
+                        */
+                       if (likely(head1 != &hb2->chain)) {
+                               list_move_tail(&this->list, &hb2->chain);
+                               this->lock_ptr = &hb2->lock;
+                       }
                        this->key = key2;
                        get_key_refs(&key2);
                        drop_count++;
 
                        if (ret - nr_wake >= nr_requeue)
                                break;
-                       /* Make sure to stop if key1 == key2: */
-                       if (head1 == &hb2->chain && head1 != &next->list)
-                               head1 = &this->list;
                }
        }
 
@@ -911,6 +951,7 @@ static int unqueue_me(struct futex_q *q)
        /* In the common case we don't take the spinlock, which is nice. */
  retry:
        lock_ptr = q->lock_ptr;
+       barrier();
        if (lock_ptr != 0) {
                spin_lock(lock_ptr);
                /*
@@ -1080,9 +1121,10 @@ static int futex_wait(u32 __user *uaddr, u32 val, unsigned long time)
  * if there are waiters then it will block, it does PI, etc. (Due to
  * races the kernel might see a 0 value of the futex too.)
  */
-static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
-                           struct hrtimer_sleeper *to)
+static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
+                        long nsec, int trylock)
 {
+       struct hrtimer_sleeper timeout, *to = NULL;
        struct task_struct *curr = current;
        struct futex_hash_bucket *hb;
        u32 uval, newval, curval;
@@ -1092,6 +1134,13 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
        if (refill_pi_state_cache())
                return -ENOMEM;
 
+       if (sec != MAX_SCHEDULE_TIMEOUT) {
+               to = &timeout;
+               hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_MODE_ABS);
+               hrtimer_init_sleeper(to, current);
+               to->timer.expires = ktime_set(sec, nsec);
+       }
+
        q.pi_state = NULL;
  retry:
        down_read(&curr->mm->mmap_sem);
@@ -1110,9 +1159,9 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
         */
        newval = current->pid;
 
-       inc_preempt_count();
+       pagefault_disable();
        curval = futex_atomic_cmpxchg_inatomic(uaddr, 0, newval);
-       dec_preempt_count();
+       pagefault_enable();
 
        if (unlikely(curval == -EFAULT))
                goto uaddr_faulted;
@@ -1135,9 +1184,9 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
        uval = curval;
        newval = uval | FUTEX_WAITERS;
 
-       inc_preempt_count();
+       pagefault_disable();
        curval = futex_atomic_cmpxchg_inatomic(uaddr, uval, newval);
-       dec_preempt_count();
+       pagefault_enable();
 
        if (unlikely(curval == -EFAULT))
                goto uaddr_faulted;
@@ -1167,10 +1216,10 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
                        newval = current->pid |
                                FUTEX_OWNER_DIED | FUTEX_WAITERS;
 
-                       inc_preempt_count();
+                       pagefault_disable();
                        curval = futex_atomic_cmpxchg_inatomic(uaddr,
                                                               uval, newval);
-                       dec_preempt_count();
+                       pagefault_enable();
 
                        if (unlikely(curval == -EFAULT))
                                goto uaddr_faulted;
@@ -1205,7 +1254,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
        }
 
        down_read(&curr->mm->mmap_sem);
-       hb = queue_lock(&q, -1, NULL);
+       spin_lock(q.lock_ptr);
 
        /*
         * Got the lock. We might not be the anticipated owner if we
@@ -1217,6 +1266,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
                /* Owner died? */
                if (q.pi_state->owner != NULL) {
                        spin_lock_irq(&q.pi_state->owner->pi_lock);
+                       WARN_ON(list_empty(&q.pi_state->list));
                        list_del_init(&q.pi_state->list);
                        spin_unlock_irq(&q.pi_state->owner->pi_lock);
                } else
@@ -1225,6 +1275,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
                q.pi_state->owner = current;
 
                spin_lock_irq(&current->pi_lock);
+               WARN_ON(!list_empty(&q.pi_state->list));
                list_add(&q.pi_state->list, &current->pi_state_list);
                spin_unlock_irq(&current->pi_lock);
 
@@ -1265,7 +1316,7 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
        if (!detect && ret == -EDEADLK && 0)
                force_sig(SIGKILL, current);
 
-       return ret;
+       return ret != -EINTR ? ret : -ERESTARTNOINTR;
 
  out_unlock_release_sem:
        queue_unlock(&q, hb);
@@ -1282,9 +1333,10 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
         * still holding the mmap_sem.
         */
        if (attempt++) {
-               if (futex_handle_fault((unsigned long)uaddr, attempt))
+               if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+                       ret = -EFAULT;
                        goto out_unlock_release_sem;
-
+               }
                goto retry_locked;
        }
 
@@ -1299,76 +1351,6 @@ static int do_futex_lock_pi(u32 __user *uaddr, int detect, int trylock,
 }
 
 /*
- * Restart handler
- */
-static long futex_lock_pi_restart(struct restart_block *restart)
-{
-       struct hrtimer_sleeper timeout, *to = NULL;
-       int ret;
-
-       restart->fn = do_no_restart_syscall;
-
-       if (restart->arg2 || restart->arg3) {
-               to = &timeout;
-               hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
-               hrtimer_init_sleeper(to, current);
-               to->timer.expires.tv64 = ((u64)restart->arg1 << 32) |
-                       (u64) restart->arg0;
-       }
-
-       pr_debug("lock_pi restart: %p, %d (%d)\n",
-                (u32 __user *)restart->arg0, current->pid);
-
-       ret = do_futex_lock_pi((u32 __user *)restart->arg0, restart->arg1,
-                              0, to);
-
-       if (ret != -EINTR)
-               return ret;
-
-       restart->fn = futex_lock_pi_restart;
-
-       /* The other values are filled in */
-       return -ERESTART_RESTARTBLOCK;
-}
-
-/*
- * Called from the syscall entry below.
- */
-static int futex_lock_pi(u32 __user *uaddr, int detect, unsigned long sec,
-                        long nsec, int trylock)
-{
-       struct hrtimer_sleeper timeout, *to = NULL;
-       struct restart_block *restart;
-       int ret;
-
-       if (sec != MAX_SCHEDULE_TIMEOUT) {
-               to = &timeout;
-               hrtimer_init(&to->timer, CLOCK_REALTIME, HRTIMER_ABS);
-               hrtimer_init_sleeper(to, current);
-               to->timer.expires = ktime_set(sec, nsec);
-       }
-
-       ret = do_futex_lock_pi(uaddr, detect, trylock, to);
-
-       if (ret != -EINTR)
-               return ret;
-
-       pr_debug("lock_pi interrupted: %p, %d (%d)\n", uaddr, current->pid);
-
-       restart = &current_thread_info()->restart_block;
-       restart->fn = futex_lock_pi_restart;
-       restart->arg0 = (unsigned long) uaddr;
-       restart->arg1 = detect;
-       if (to) {
-               restart->arg2 = to->timer.expires.tv64 & 0xFFFFFFFF;
-               restart->arg3 = to->timer.expires.tv64 >> 32;
-       } else
-               restart->arg2 = restart->arg3 = 0;
-
-       return -ERESTART_RESTARTBLOCK;
-}
-
-/*
  * Userspace attempted a TID -> 0 atomic transition, and failed.
  * This is the in-kernel slowpath: we look up the PI state (if any),
  * and do the rt-mutex unlock.
@@ -1408,9 +1390,11 @@ retry_locked:
         * again. If it succeeds then we can return without waking
         * anyone else up:
         */
-       inc_preempt_count();
-       uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
-       dec_preempt_count();
+       if (!(uval & FUTEX_OWNER_DIED)) {
+               pagefault_disable();
+               uval = futex_atomic_cmpxchg_inatomic(uaddr, current->pid, 0);
+               pagefault_enable();
+       }
 
        if (unlikely(uval == -EFAULT))
                goto pi_faulted;
@@ -1443,9 +1427,11 @@ retry_locked:
        /*
         * No waiters - kernel unlocks the futex:
         */
-       ret = unlock_futex_pi(uaddr, uval);
-       if (ret == -EFAULT)
-               goto pi_faulted;
+       if (!(uval & FUTEX_OWNER_DIED)) {
+               ret = unlock_futex_pi(uaddr, uval);
+               if (ret == -EFAULT)
+                       goto pi_faulted;
+       }
 
 out_unlock:
        spin_unlock(&hb->lock);
@@ -1462,9 +1448,10 @@ pi_faulted:
         * still holding the mmap_sem.
         */
        if (attempt++) {
-               if (futex_handle_fault((unsigned long)uaddr, attempt))
+               if (futex_handle_fault((unsigned long)uaddr, attempt)) {
+                       ret = -EFAULT;
                        goto out_unlock;
-
+               }
                goto retry_locked;
        }
 
@@ -1507,7 +1494,7 @@ static unsigned int futex_poll(struct file *filp,
        return ret;
 }
 
-static struct file_operations futex_fops = {
+static const struct file_operations futex_fops = {
        .release        = futex_close,
        .poll           = futex_poll,
 };
@@ -1521,6 +1508,13 @@ static int futex_fd(u32 __user *uaddr, int signal)
        struct futex_q *q;
        struct file *filp;
        int ret, err;
+       static unsigned long printk_interval;
+
+       if (printk_timed_ratelimit(&printk_interval, 60 * 60 * 1000)) {
+               printk(KERN_WARNING "Process `%s' used FUTEX_FD, which "
+                       "will be removed from the kernel in June 2007\n",
+                       current->comm);
+       }
 
        ret = -EINVAL;
        if (!valid_signal(signal))
@@ -1536,12 +1530,12 @@ static int futex_fd(u32 __user *uaddr, int signal)
                goto out;
        }
        filp->f_op = &futex_fops;
-       filp->f_vfsmnt = mntget(futex_mnt);
-       filp->f_dentry = dget(futex_mnt->mnt_root);
-       filp->f_mapping = filp->f_dentry->d_inode->i_mapping;
+       filp->f_path.mnt = mntget(futex_mnt);
+       filp->f_path.dentry = dget(futex_mnt->mnt_root);
+       filp->f_mapping = filp->f_path.dentry->d_inode->i_mapping;
 
        if (signal) {
-               err = f_setown(filp, current->pid, 1);
+               err = __f_setown(filp, task_pid(current), PIDTYPE_PID, 1);
                if (err < 0) {
                        goto error;
                }
@@ -1626,10 +1620,10 @@ sys_set_robust_list(struct robust_list_head __user *head,
  * @len_ptr: pointer to a length field, the kernel fills in the header size
  */
 asmlinkage long
-sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
+sys_get_robust_list(int pid, struct robust_list_head __user * __user *head_ptr,
                    size_t __user *len_ptr)
 {
-       struct robust_list_head *head;
+       struct robust_list_head __user *head;
        unsigned long ret;
 
        if (!pid)
@@ -1638,7 +1632,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
                struct task_struct *p;
 
                ret = -ESRCH;
-               read_lock(&tasklist_lock);
+               rcu_read_lock();
                p = find_task_by_pid(pid);
                if (!p)
                        goto err_unlock;
@@ -1647,7 +1641,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
                                !capable(CAP_SYS_PTRACE))
                        goto err_unlock;
                head = p->robust_list;
-               read_unlock(&tasklist_lock);
+               rcu_read_unlock();
        }
 
        if (put_user(sizeof(*head), len_ptr))
@@ -1655,7 +1649,7 @@ sys_get_robust_list(int pid, struct robust_list_head __user **head_ptr,
        return put_user(head, head_ptr);
 
 err_unlock:
-       read_unlock(&tasklist_lock);
+       rcu_read_unlock();
 
        return ret;
 }
@@ -1664,9 +1658,9 @@ err_unlock:
  * Process a futex-list entry, check whether it's owned by the
  * dying task, and do notification if so:
  */
-int handle_futex_death(u32 __user *uaddr, struct task_struct *curr)
+int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, int pi)
 {
-       u32 uval, nval;
+       u32 uval, nval, mval;
 
 retry:
        if (get_user(uval, uaddr))
@@ -1683,21 +1677,46 @@ retry:
                 * thread-death.) The rest of the cleanup is done in
                 * userspace.
                 */
-               nval = futex_atomic_cmpxchg_inatomic(uaddr, uval,
-                                                    uval | FUTEX_OWNER_DIED);
+               mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED;
+               nval = futex_atomic_cmpxchg_inatomic(uaddr, uval, mval);
+
                if (nval == -EFAULT)
                        return -1;
 
                if (nval != uval)
                        goto retry;
 
-               if (uval & FUTEX_WAITERS)
-                       futex_wake(uaddr, 1);
+               /*
+                * Wake robust non-PI futexes here. The wakeup of
+                * PI futexes happens in exit_pi_state_list():
+                */
+               if (!pi) {
+                       if (uval & FUTEX_WAITERS)
+                               futex_wake(uaddr, 1);
+               }
        }
        return 0;
 }
 
 /*
+ * Fetch a robust-list pointer. Bit 0 signals PI futexes:
+ */
+static inline int fetch_robust_entry(struct robust_list __user **entry,
+                                    struct robust_list __user * __user *head,
+                                    int *pi)
+{
+       unsigned long uentry;
+
+       if (get_user(uentry, (unsigned long __user *)head))
+               return -EFAULT;
+
+       *entry = (void __user *)(uentry & ~1UL);
+       *pi = uentry & 1;
+
+       return 0;
+}
+
+/*
  * Walk curr->robust_list (very carefully, it's a userspace list!)
  * and mark any locks found there dead, and notify any waiters.
  *
@@ -1707,14 +1726,14 @@ void exit_robust_list(struct task_struct *curr)
 {
        struct robust_list_head __user *head = curr->robust_list;
        struct robust_list __user *entry, *pending;
-       unsigned int limit = ROBUST_LIST_LIMIT;
+       unsigned int limit = ROBUST_LIST_LIMIT, pi, pip;
        unsigned long futex_offset;
 
        /*
         * Fetch the list head (which was registered earlier, via
         * sys_set_robust_list()):
         */
-       if (get_user(entry, &head->list.next))
+       if (fetch_robust_entry(&entry, &head->list.next, &pi))
                return;
        /*
         * Fetch the relative futex offset:
@@ -1725,10 +1744,11 @@ void exit_robust_list(struct task_struct *curr)
         * Fetch any possibly pending lock-add first, and handle it
         * if it exists:
         */
-       if (get_user(pending, &head->list_op_pending))
+       if (fetch_robust_entry(&pending, &head->list_op_pending, &pip))
                return;
+
        if (pending)
-               handle_futex_death((void *)pending + futex_offset, curr);
+               handle_futex_death((void __user *)pending + futex_offset, curr, pip);
 
        while (entry != &head->list) {
                /*
@@ -1736,13 +1756,13 @@ void exit_robust_list(struct task_struct *curr)
                 * don't process it twice:
                 */
                if (entry != pending)
-                       if (handle_futex_death((void *)entry + futex_offset,
-                                               curr))
+                       if (handle_futex_death((void __user *)entry + futex_offset,
+                                               curr, pi))
                                return;
                /*
                 * Fetch the next entry in the list:
                 */
-               if (get_user(entry, &entry->next))
+               if (fetch_robust_entry(&entry, &entry->next, &pi))
                        return;
                /*
                 * Avoid excessively long or circular lists:
@@ -1839,10 +1859,16 @@ static struct file_system_type futex_fs_type = {
 
 static int __init init(void)
 {
-       unsigned int i;
+       int i = register_filesystem(&futex_fs_type);
+
+       if (i)
+               return i;
 
-       register_filesystem(&futex_fs_type);
        futex_mnt = kern_mount(&futex_fs_type);
+       if (IS_ERR(futex_mnt)) {
+               unregister_filesystem(&futex_fs_type);
+               return PTR_ERR(futex_mnt);
+       }
 
        for (i = 0; i < ARRAY_SIZE(futex_queues); i++) {
                INIT_LIST_HEAD(&futex_queues[i].chain);