futex: Change locking rules

author Peter Zijlstra <peterz@infradead.org>

Wed, 22 Mar 2017 10:35:52 +0000 (11:35 +0100)

committer Thomas Gleixner <tglx@linutronix.de>

Thu, 23 Mar 2017 18:10:07 +0000 (19:10 +0100)
author Peter Zijlstra <peterz@infradead.org>
Wed, 22 Mar 2017 10:35:52 +0000 (11:35 +0100)
committer Thomas Gleixner <tglx@linutronix.de>
Thu, 23 Mar 2017 18:10:07 +0000 (19:10 +0100)
diff --git a/kernel/futex.c b/kernel/futex.c

index af022919933afe79b704f6672d1671a9a818a261..3e71d66cb78814a3bd41452bcc2d5cc2e5662b24 100644 (file)
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -973,6 +973,39 @@ void exit_pi_state_list(struct task_struct *curr)
   *
   * [10] There is no transient state which leaves owner and user space
   *     TID out of sync.
+ *
+ *
+ * Serialization and lifetime rules:
+ *
+ * hb->lock:
+ *
+ *     hb -> futex_q, relation
+ *     futex_q -> pi_state, relation
+ *
+ *     (cannot be raw because hb can contain arbitrary amount
+ *      of futex_q's)
+ *
+ * pi_mutex->wait_lock:
+ *
+ *     {uval, pi_state}
+ *
+ *     (and pi_mutex 'obviously')
+ *
+ * p->pi_lock:
+ *
+ *     p->pi_state_list -> pi_state->list, relation
+ *
+ * pi_state->refcount:
+ *
+ *     pi_state lifetime
+ *
+ *
+ * Lock order:
+ *
+ *   hb->lock
+ *     pi_mutex->wait_lock
+ *       p->pi_lock
+ *
   */
  
  /*
@@ -980,10 +1013,12 @@ void exit_pi_state_list(struct task_struct *curr)
   * the pi_state against the user space value. If correct, attach to
   * it.
   */
-static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
+static int attach_to_pi_state(u32 __user *uaddr, u32 uval,
+                             struct futex_pi_state *pi_state,
                               struct futex_pi_state **ps)
  {
         pid_t pid = uval & FUTEX_TID_MASK;
+       int ret, uval2;
  
         /*
          * Userspace might have messed up non-PI and PI futexes [3]
@@ -991,8 +1026,33 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
         if (unlikely(!pi_state))
                 return -EINVAL;
  
+       /*
+        * We get here with hb->lock held, and having found a
+        * futex_top_waiter(). This means that futex_lock_pi() of said futex_q
+        * has dropped the hb->lock in between queue_me() and unqueue_me_pi(),
+        * which in turn means that futex_lock_pi() still has a reference on
+        * our pi_state.
+        */
         WARN_ON(!atomic_read(&pi_state->refcount));
  
+       /*
+        * Now that we have a pi_state, we can acquire wait_lock
+        * and do the state validation.
+        */
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+       /*
+        * Since {uval, pi_state} is serialized by wait_lock, and our current
+        * uval was read without holding it, it can have changed. Verify it
+        * still is what we expect it to be, otherwise retry the entire
+        * operation.
+        */
+       if (get_futex_value_locked(&uval2, uaddr))
+               goto out_efault;
+
+       if (uval != uval2)
+               goto out_eagain;
+
         /*
          * Handle the owner died case:
          */
@@ -1008,11 +1068,11 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
                          * is not 0. Inconsistent state. [5]
                          */
                         if (pid)
-                               return -EINVAL;
+                               goto out_einval;
                         /*
                          * Take a ref on the state and return success. [4]
                          */
-                       goto out_state;
+                       goto out_attach;
                 }
  
                 /*
@@ -1024,14 +1084,14 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
                  * Take a ref on the state and return success. [6]
                  */
                 if (!pid)
-                       goto out_state;
+                       goto out_attach;
         } else {
                 /*
                  * If the owner died bit is not set, then the pi_state
                  * must have an owner. [7]
                  */
                 if (!pi_state->owner)
-                       return -EINVAL;
+                       goto out_einval;
         }
  
         /*
@@ -1040,11 +1100,29 @@ static int attach_to_pi_state(u32 uval, struct futex_pi_state *pi_state,
          * user space TID. [9/10]
          */
         if (pid != task_pid_vnr(pi_state->owner))
-               return -EINVAL;
-out_state:
+               goto out_einval;
+
+out_attach:
         atomic_inc(&pi_state->refcount);
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
         *ps = pi_state;
         return 0;
+
+out_einval:
+       ret = -EINVAL;
+       goto out_error;
+
+out_eagain:
+       ret = -EAGAIN;
+       goto out_error;
+
+out_efault:
+       ret = -EFAULT;
+       goto out_error;
+
+out_error:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+       return ret;
  }
  
  /*
@@ -1095,6 +1173,9 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
  
         /*
          * No existing pi state. First waiter. [2]
+        *
+        * This creates pi_state, we have hb->lock held, this means nothing can
+        * observe this state, wait_lock is irrelevant.
          */
         pi_state = alloc_pi_state();
  
@@ -1119,7 +1200,8 @@ static int attach_to_pi_owner(u32 uval, union futex_key *key,
         return 0;
  }
  
-static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
+static int lookup_pi_state(u32 __user *uaddr, u32 uval,
+                          struct futex_hash_bucket *hb,
                            union futex_key *key, struct futex_pi_state **ps)
  {
         struct futex_q *top_waiter = futex_top_waiter(hb, key);
@@ -1129,7 +1211,7 @@ static int lookup_pi_state(u32 uval, struct futex_hash_bucket *hb,
          * attach to the pi_state when the validation succeeds.
          */
         if (top_waiter)
-               return attach_to_pi_state(uval, top_waiter->pi_state, ps);
+               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
  
         /*
          * We are the first waiter - try to look up the owner based on
@@ -1148,7 +1230,7 @@ static int lock_pi_update_atomic(u32 __user *uaddr, u32 uval, u32 newval)
         if (unlikely(cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)))
                 return -EFAULT;
  
-       /*If user space value changed, let the caller retry */
+       /* If user space value changed, let the caller retry */
         return curval != uval ? -EAGAIN : 0;
  }
  
@@ -1204,7 +1286,7 @@ static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb,
          */
         top_waiter = futex_top_waiter(hb, key);
         if (top_waiter)
-               return attach_to_pi_state(uval, top_waiter->pi_state, ps);
+               return attach_to_pi_state(uaddr, uval, top_waiter->pi_state, ps);
  
         /*
          * No waiter and user TID is 0. We are here because the
@@ -1336,6 +1418,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
  
         if (cmpxchg_futex_value_locked(&curval, uaddr, uval, newval)) {
                 ret = -EFAULT;
+
         } else if (curval != uval) {
                 /*
                  * If a unconditional UNLOCK_PI operation (user space did not
@@ -1348,6 +1431,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *top_waiter
                 else
                         ret = -EINVAL;
         }
+
         if (ret) {
                 raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
                 return ret;
@@ -1823,7 +1907,7 @@ retry_private:
                          * If that call succeeds then we have pi_state and an
                          * initial refcount on it.
                          */
-                       ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
+                       ret = lookup_pi_state(uaddr2, ret, hb2, &key2, &pi_state);
                 }
  
                 switch (ret) {
@@ -2122,10 +2206,13 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
  {
         u32 newtid = task_pid_vnr(newowner) | FUTEX_WAITERS;
         struct futex_pi_state *pi_state = q->pi_state;
-       struct task_struct *oldowner = pi_state->owner;
         u32 uval, uninitialized_var(curval), newval;
+       struct task_struct *oldowner;
         int ret;
  
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
+
+       oldowner = pi_state->owner;
         /* Owner died? */
         if (!pi_state->owner)
                 newtid |= FUTEX_OWNER_DIED;
@@ -2141,11 +2228,10 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
          * because we can fault here. Imagine swapped out pages or a fork
          * that marked all the anonymous memory readonly for cow.
          *
-        * Modifying pi_state _before_ the user space value would
-        * leave the pi_state in an inconsistent state when we fault
-        * here, because we need to drop the hash bucket lock to
-        * handle the fault. This might be observed in the PID check
-        * in lookup_pi_state.
+        * Modifying pi_state _before_ the user space value would leave the
+        * pi_state in an inconsistent state when we fault here, because we
+        * need to drop the locks to handle the fault. This might be observed
+        * in the PID check in lookup_pi_state.
          */
  retry:
         if (get_futex_value_locked(&uval, uaddr))
@@ -2166,47 +2252,60 @@ retry:
          * itself.
          */
         if (pi_state->owner != NULL) {
-               raw_spin_lock_irq(&pi_state->owner->pi_lock);
+               raw_spin_lock(&pi_state->owner->pi_lock);
                 WARN_ON(list_empty(&pi_state->list));
                 list_del_init(&pi_state->list);
-               raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+               raw_spin_unlock(&pi_state->owner->pi_lock);
         }
  
         pi_state->owner = newowner;
  
-       raw_spin_lock_irq(&newowner->pi_lock);
+       raw_spin_lock(&newowner->pi_lock);
         WARN_ON(!list_empty(&pi_state->list));
         list_add(&pi_state->list, &newowner->pi_state_list);
-       raw_spin_unlock_irq(&newowner->pi_lock);
+       raw_spin_unlock(&newowner->pi_lock);
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+
         return 0;
  
         /*
-        * To handle the page fault we need to drop the hash bucket
-        * lock here. That gives the other task (either the highest priority
-        * waiter itself or the task which stole the rtmutex) the
-        * chance to try the fixup of the pi_state. So once we are
-        * back from handling the fault we need to check the pi_state
-        * after reacquiring the hash bucket lock and before trying to
-        * do another fixup. When the fixup has been done already we
-        * simply return.
+        * To handle the page fault we need to drop the locks here. That gives
+        * the other task (either the highest priority waiter itself or the
+        * task which stole the rtmutex) the chance to try the fixup of the
+        * pi_state. So once we are back from handling the fault we need to
+        * check the pi_state after reacquiring the locks and before trying to
+        * do another fixup. When the fixup has been done already we simply
+        * return.
+        *
+        * Note: we hold both hb->lock and pi_mutex->wait_lock. We can safely
+        * drop hb->lock since the caller owns the hb -> futex_q relation.
+        * Dropping the pi_mutex->wait_lock requires the state revalidate.
          */
  handle_fault:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
         spin_unlock(q->lock_ptr);
  
         ret = fault_in_user_writeable(uaddr);
  
         spin_lock(q->lock_ptr);
+       raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
  
         /*
          * Check if someone else fixed it for us:
          */
-       if (pi_state->owner != oldowner)
-               return 0;
+       if (pi_state->owner != oldowner) {
+               ret = 0;
+               goto out_unlock;
+       }
  
         if (ret)
-               return ret;
+               goto out_unlock;
  
         goto retry;
+
+out_unlock:
+       raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
+       return ret;
  }
  
  static long futex_wait_restart(struct restart_block *restart);
author	Peter Zijlstra <peterz@infradead.org>
	Wed, 22 Mar 2017 10:35:52 +0000 (11:35 +0100)
committer	Thomas Gleixner <tglx@linutronix.de>
	Thu, 23 Mar 2017 18:10:07 +0000 (19:10 +0100)