futex: Pull rt_mutex_futex_unlock() out from under hb->lock
author Peter Zijlstra <peterz@infradead.org>
Wed, 22 Mar 2017 10:35:55 +0000 (11:35 +0100)
committer Thomas Gleixner <tglx@linutronix.de>
Thu, 23 Mar 2017 18:10:08 +0000 (19:10 +0100)
There are a number of 'interesting' problems, all caused by holding
hb->lock while doing the rt_mutex_unlock() equivalent.

Notably:

 - a PI inversion on hb->lock; and,

 - a SCHED_DEADLINE crash because of pointer instability.

The previous changes:

 - changed the locking rules to cover {uval,pi_state} with wait_lock.

 - allowed rt_mutex_futex_unlock() to be done without dropping wait_lock, which
   in turn allows relying on wait_lock atomicity completely.

 - simplified the waiter conundrum.

It is now sufficient to hold rtmutex::wait_lock and a reference on the
pi_state to protect state consistency, so hb->lock can be dropped
before calling rt_mutex_futex_unlock().
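
Condensed, the resulting unlock sequence (as used below in
exit_pi_state_list() and futex_unlock_pi()) looks roughly like the
following sketch; error handling and the surrounding retry logic are
omitted:

    get_pi_state(pi_state);            /* pin pi_state across the unlock */
    spin_unlock(&hb->lock);            /* safe: our reference keeps pi_state alive */

    rt_mutex_futex_unlock(&pi_state->pi_mutex);  /* serialized by rtmutex::wait_lock */
    put_pi_state(pi_state);            /* drop our reference again */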

Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: juri.lelli@arm.com
Cc: bigeasy@linutronix.de
Cc: xlpang@redhat.com
Cc: rostedt@goodmis.org
Cc: mathieu.desnoyers@efficios.com
Cc: jdesfossez@efficios.com
Cc: dvhart@infradead.org
Cc: bristot@redhat.com
Link: http://lkml.kernel.org/r/20170322104151.900002056@infradead.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
kernel/futex.c

index 51a248af1db939971514021f9f7d66c2803d541d..3b0aace334a8118ab0024481466dbd5e50f049ad 100644 (file)
@@ -921,10 +921,12 @@ void exit_pi_state_list(struct task_struct *curr)
                pi_state->owner = NULL;
                raw_spin_unlock_irq(&curr->pi_lock);
 
-               rt_mutex_futex_unlock(&pi_state->pi_mutex);
-
+               get_pi_state(pi_state);
                spin_unlock(&hb->lock);
 
+               rt_mutex_futex_unlock(&pi_state->pi_mutex);
+               put_pi_state(pi_state);
+
                raw_spin_lock_irq(&curr->pi_lock);
        }
        raw_spin_unlock_irq(&curr->pi_lock);
@@ -1037,6 +1039,11 @@ static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
         * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
         * which in turn means that futex_lock_pi() still has a reference on
         * our pi_state.
+        *
+        * The waiter holding a reference on @pi_state also protects against
+        * the unlocked put_pi_state() in futex_unlock_pi(), futex_lock_pi()
+        * and futex_wait_requeue_pi() as it cannot go to 0 and consequently
+        * free pi_state before we can take a reference ourselves.
         */
        WARN_ON(!atomic_read(&pi_state->refcount));
 
@@ -1380,48 +1387,40 @@ static void mark_wake_futex(struct wake_q_head *wake_q, struct futex_q *q)
        smp_store_release(&q->lock_ptr, NULL);
 }
 
-static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter,
-                        struct futex_hash_bucket *hb)
+/*
+ * Caller must hold a reference on @pi_state.
+ */
+static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_state)
 {
-       struct task_struct *new_owner;
-       struct futex_pi_state *pi_state = top_waiter->pi_state;
        u32 uninitialized_var(curval), newval;
+       struct task_struct *new_owner;
+       bool deboost = false;
        DEFINE_WAKE_Q(wake_q);
-       bool deboost;
        int ret = 0;
 
-       if (!pi_state)
-               return -EINVAL;
-
-       /*
-        * If current does not own the pi_state then the futex is
-        * inconsistent and user space fiddled with the futex value.
-        */
-       if (pi_state->owner != current)
-               return -EINVAL;
-
        raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
        new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
-
-       /*
-        * When we interleave with futex_lock_pi() where it does
-        * rt_mutex_timed_futex_lock(), we might observe @this futex_q waiter,
-        * but the rt_mutex's wait_list can be empty (either still, or again,
-        * depending on which side we land).
-        *
-        * When this happens, give up our locks and try again, giving the
-        * futex_lock_pi() instance time to complete, either by waiting on the
-        * rtmutex or removing itself from the futex queue.
-        */
        if (!new_owner) {
-               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-               return -EAGAIN;
+               /*
+                * Since we held neither hb->lock nor wait_lock when coming
+                * into this function, we could have raced with futex_lock_pi()
+                * such that we might observe @this futex_q waiter, but the
+                * rt_mutex's wait_list can be empty (either still, or again,
+                * depending on which side we land).
+                *
+                * When this happens, give up our locks and try again, giving
+                * the futex_lock_pi() instance time to complete, either by
+                * waiting on the rtmutex or removing itself from the futex
+                * queue.
+                */
+               ret = -EAGAIN;
+               goto out_unlock;
        }
 
        /*
-        * We pass it to the next owner. The WAITERS bit is always
-        * kept enabled while there is PI state around. We cleanup the
-        * owner died bit, because we are the owner.
+        * We pass it to the next owner. The WAITERS bit is always kept
+        * enabled while there is PI state around. We cleanup the owner
+        * died bit, because we are the owner.
         */
        newval = FUTEX_WAITERS | task_pid_vnr(new_owner);
 
@@ -1444,10 +1443,8 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
                        ret = -EINVAL;
        }
 
-       if (ret) {
-               raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-               return ret;
-       }
+       if (ret)
+               goto out_unlock;
 
        raw_spin_lock(&pi_state->owner->pi_lock);
        WARN_ON(list_empty(&pi_state->list));
@@ -1465,15 +1462,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
         */
        deboost = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
 
+out_unlock:
        raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
-       spin_unlock(&hb->lock);
 
        if (deboost) {
                wake_up_q(&wake_q);
                rt_mutex_adjust_prio(current);
        }
 
-       return 0;
+       return ret;
 }
 
 /*
@@ -2232,7 +2229,8 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
        /*
         * We are here either because we stole the rtmutex from the
         * previous highest priority waiter or we are the highest priority
-        * waiter but failed to get the rtmutex the first time.
+        * waiter but have failed to get the rtmutex the first time.
+        *
         * We have to replace the newowner TID in the user space variable.
         * This must be atomic as we have to preserve the owner died bit here.
         *
@@ -2249,7 +2247,7 @@ retry:
        if (get_futex_value_locked(&uval, uaddr))
                goto handle_fault;
 
-       while (1) {
+       for (;;) {
                newval = (uval & FUTEX_OWNER_DIED) | newtid;
 
                if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval))
@@ -2345,6 +2343,10 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
                /*
                 * Got the lock. We might not be the anticipated owner if we
                 * did a lock-steal - fix up the PI-state in that case:
+                *
+                * We can safely read pi_state->owner without holding wait_lock
+                * because we now own the rt_mutex, only the owner will attempt
+                * to change it.
                 */
                if (q->pi_state->owner != current)
                        ret = fixup_pi_state_owner(uaddr, q, current);
@@ -2584,6 +2586,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags,
                         ktime_t *time, int trylock)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
+       struct futex_pi_state *pi_state = NULL;
        struct futex_hash_bucket *hb;
        struct futex_q q = futex_q_init;
        int res, ret;
@@ -2670,12 +2673,19 @@ retry_private:
         * If fixup_owner() faulted and was unable to handle the fault, unlock
         * it and return the fault to userspace.
         */
-       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
-               rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
+       if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) {
+               pi_state = q.pi_state;
+               get_pi_state(pi_state);
+       }
 
        /* Unqueue and drop the lock */
        unqueue_me_pi(&q);
 
+       if (pi_state) {
+               rt_mutex_futex_unlock(&pi_state->pi_mutex);
+               put_pi_state(pi_state);
+       }
+
        goto out_put_key;
 
 out_unlock_put_key:
@@ -2738,10 +2748,36 @@ retry:
         */
        top_waiter = futex_top_waiter(hb, &key);
        if (top_waiter) {
-               ret = wake_futex_pi(uaddr, uval, top_waiter, hb);
+               struct futex_pi_state *pi_state = top_waiter->pi_state;
+
+               ret = -EINVAL;
+               if (!pi_state)
+                       goto out_unlock;
+
+               /*
+                * If current does not own the pi_state then the futex is
+                * inconsistent and user space fiddled with the futex value.
+                */
+               if (pi_state->owner != current)
+                       goto out_unlock;
+
                /*
-                * In case of success wake_futex_pi dropped the hash
-                * bucket lock.
+                * Grab a reference on the pi_state and drop hb->lock.
+                *
+                * The reference ensures pi_state lives, dropping the hb->lock
+                * is tricky.. wake_futex_pi() will take rt_mutex::wait_lock to
+                * close the races against futex_lock_pi(), but in case of
+                * _any_ fail we'll abort and retry the whole deal.
+                */
+               get_pi_state(pi_state);
+               spin_unlock(&hb->lock);
+
+               ret = wake_futex_pi(uaddr, uval, pi_state);
+
+               put_pi_state(pi_state);
+
+               /*
+                * Success, we're done! No tricky corner cases.
                 */
                if (!ret)
                        goto out_putkey;
@@ -2756,7 +2792,6 @@ retry:
                 * setting the FUTEX_WAITERS bit. Try again.
                 */
                if (ret == -EAGAIN) {
-                       spin_unlock(&hb->lock);
                        put_futex_key(&key);
                        goto retry;
                }
@@ -2764,7 +2799,7 @@ retry:
                 * wake_futex_pi has detected invalid state. Tell user
                 * space.
                 */
-               goto out_unlock;
+               goto out_putkey;
        }
 
        /*
@@ -2774,8 +2809,10 @@ retry:
         * preserve the WAITERS bit not the OWNER_DIED one. We are the
         * owner.
         */
-       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0))
+       if (cmpxchg_futex_value_locked(&curval, uaddr, uval, 0)) {
+               spin_unlock(&hb->lock);
                goto pi_faulted;
+       }
 
        /*
         * If uval has changed, let user space handle it.
@@ -2789,7 +2826,6 @@ out_putkey:
        return ret;
 
 pi_faulted:
-       spin_unlock(&hb->lock);
        put_futex_key(&key);
 
        ret = fault_in_user_writeable(uaddr);
@@ -2893,6 +2929,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                                 u32 __user *uaddr2)
 {
        struct hrtimer_sleeper timeout, *to = NULL;
+       struct futex_pi_state *pi_state = NULL;
        struct rt_mutex_waiter rt_waiter;
        struct futex_hash_bucket *hb;
        union futex_key key2 = FUTEX_KEY_INIT;
@@ -2977,8 +3014,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                if (q.pi_state && (q.pi_state->owner != current)) {
                        spin_lock(q.lock_ptr);
                        ret = fixup_pi_state_owner(uaddr2, &q, current);
-                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current)
-                               rt_mutex_futex_unlock(&q.pi_state->pi_mutex);
+                       if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+                               pi_state = q.pi_state;
+                               get_pi_state(pi_state);
+                       }
                        /*
                         * Drop the reference to the pi state which
                         * the requeue_pi() code acquired for us.
@@ -3017,13 +3056,20 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
                 * the fault, unlock the rt_mutex and return the fault to
                 * userspace.
                 */
-               if (ret && rt_mutex_owner(pi_mutex) == current)
-                       rt_mutex_futex_unlock(pi_mutex);
+               if (ret && rt_mutex_owner(&q.pi_state->pi_mutex) == current) {
+                       pi_state = q.pi_state;
+                       get_pi_state(pi_state);
+               }
 
                /* Unqueue and drop the lock. */
                unqueue_me_pi(&q);
        }
 
+       if (pi_state) {
+               rt_mutex_futex_unlock(&pi_state->pi_mutex);
+               put_pi_state(pi_state);
+       }
+
        if (ret == -EINTR) {
                /*
                 * We've already been requeued, but cannot restart by calling