locking/pvqspinlock: Allow limited lock stealing

author Waiman Long <Waiman.Long@hpe.com>

Tue, 10 Nov 2015 21:18:56 +0000 (16:18 -0500)

committer Ingo Molnar <mingo@kernel.org>

Fri, 4 Dec 2015 10:39:51 +0000 (11:39 +0100)
author Waiman Long <Waiman.Long@hpe.com>
Tue, 10 Nov 2015 21:18:56 +0000 (16:18 -0500)
committer Ingo Molnar <mingo@kernel.org>
Fri, 4 Dec 2015 10:39:51 +0000 (11:39 +0100)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c

index ed9d96708f93c36fdf6ddc5dc480bb006517782e..2ea42999d2d80185caa9056ad7ca2358cc52d2d2 100644 (file)
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -251,15 +251,16 @@ static __always_inline void __pv_init_node(struct mcs_spinlock *node) { }
  static __always_inline void __pv_wait_node(struct mcs_spinlock *node) { }
  static __always_inline void __pv_kick_node(struct qspinlock *lock,
                                            struct mcs_spinlock *node) { }
-static __always_inline void __pv_wait_head(struct qspinlock *lock,
-                                          struct mcs_spinlock *node) { }
+static __always_inline u32  __pv_wait_head_or_lock(struct qspinlock *lock,
+                                                  struct mcs_spinlock *node)
+                                                  { return 0; }
  
  #define pv_enabled()           false
  
  #define pv_init_node           __pv_init_node
  #define pv_wait_node           __pv_wait_node
  #define pv_kick_node           __pv_kick_node
-#define pv_wait_head           __pv_wait_head
+#define pv_wait_head_or_lock   __pv_wait_head_or_lock
  
  #ifdef CONFIG_PARAVIRT_SPINLOCKS
  #define queued_spin_lock_slowpath      native_queued_spin_lock_slowpath
@@ -431,10 +432,22 @@ queue:
          * sequentiality; this is because the set_locked() function below
          * does not imply a full barrier.
          *
+        * The PV pv_wait_head_or_lock function, if active, will acquire
+        * the lock and return a non-zero value. So we have to skip the
+        * smp_cond_acquire() call. As the next PV queue head hasn't been
+        * designated yet, there is no way for the locked value to become
+        * _Q_SLOW_VAL. So both the set_locked() and the
+        * atomic_cmpxchg_relaxed() calls will be safe.
+        *
+        * If PV isn't active, 0 will be returned instead.
+        *
          */
-       pv_wait_head(lock, node);
+       if ((val = pv_wait_head_or_lock(lock, node)))
+               goto locked;
+
         smp_cond_acquire(!((val = atomic_read(&lock->val)) & _Q_LOCKED_PENDING_MASK));
  
+locked:
         /*
          * claim the lock:
          *
@@ -446,7 +459,8 @@ queue:
          * to grab the lock.
          */
         for (;;) {
-               if (val != tail) {
+               /* In the PV case we might already have _Q_LOCKED_VAL set */
+               if ((val & _Q_TAIL_MASK) != tail) {
                         set_locked(lock);
                         break;
                 }
@@ -493,7 +507,7 @@ EXPORT_SYMBOL(queued_spin_lock_slowpath);
  #undef pv_init_node
  #undef pv_wait_node
  #undef pv_kick_node
-#undef pv_wait_head
+#undef pv_wait_head_or_lock
  
  #undef  queued_spin_lock_slowpath
  #define queued_spin_lock_slowpath      __pv_queued_spin_lock_slowpath
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h

index aaeeefb791f8243a050dd1cf9f8ae6c45fbc9c01..ace60a451b4f049a936ddace2dfc77fa53ba40b2 100644 (file)
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -40,6 +40,89 @@ struct pv_node {
         u8                      state;
  };
  
+/*
+ * By replacing the regular queued_spin_trylock() with the function below,
+ * it will be called once when a lock waiter enters the PV slowpath before
+ * being queued. By allowing one lock stealing attempt here when the pending
+ * bit is off, it helps to reduce the performance impact of lock waiter
+ * preemption without the drawback of lock starvation.
+ */
+#define queued_spin_trylock(l) pv_queued_spin_steal_lock(l)
+static inline bool pv_queued_spin_steal_lock(struct qspinlock *lock)
+{
+       struct __qspinlock *l = (void *)lock;
+
+       return !(atomic_read(&lock->val) & _Q_LOCKED_PENDING_MASK) &&
+               (cmpxchg(&l->locked, 0, _Q_LOCKED_VAL) == 0);
+}
+
+/*
+ * The pending bit is used by the queue head vCPU to indicate that it
+ * is actively spinning on the lock and no lock stealing is allowed.
+ */
+#if _Q_PENDING_BITS == 8
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+       struct __qspinlock *l = (void *)lock;
+
+       WRITE_ONCE(l->pending, 1);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+       struct __qspinlock *l = (void *)lock;
+
+       WRITE_ONCE(l->pending, 0);
+}
+
+/*
+ * The pending bit check in pv_queued_spin_steal_lock() isn't a memory
+ * barrier. Therefore, an atomic cmpxchg() is used to acquire the lock
+ * just to be sure that it will get it.
+ */
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+       struct __qspinlock *l = (void *)lock;
+
+       return !READ_ONCE(l->locked) &&
+              (cmpxchg(&l->locked_pending, _Q_PENDING_VAL, _Q_LOCKED_VAL)
+                       == _Q_PENDING_VAL);
+}
+#else /* _Q_PENDING_BITS == 8 */
+static __always_inline void set_pending(struct qspinlock *lock)
+{
+       atomic_set_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline void clear_pending(struct qspinlock *lock)
+{
+       atomic_clear_mask(_Q_PENDING_VAL, &lock->val);
+}
+
+static __always_inline int trylock_clear_pending(struct qspinlock *lock)
+{
+       int val = atomic_read(&lock->val);
+
+       for (;;) {
+               int old, new;
+
+               if (val  & _Q_LOCKED_MASK)
+                       break;
+
+               /*
+                * Try to clear pending bit & set locked bit
+                */
+               old = val;
+               new = (val & ~_Q_PENDING_MASK) | _Q_LOCKED_VAL;
+               val = atomic_cmpxchg(&lock->val, old, new);
+
+               if (val == old)
+                       return 1;
+       }
+       return 0;
+}
+#endif /* _Q_PENDING_BITS == 8 */
+
  /*
   * Include queued spinlock statistics code
   */
@@ -202,8 +285,8 @@ static void pv_wait_node(struct mcs_spinlock *node)
  
                 /*
                  * If pv_kick_node() changed us to vcpu_hashed, retain that
-                * value so that pv_wait_head() knows to not also try to hash
-                * this lock.
+                * value so that pv_wait_head_or_lock() knows to not also try
+                * to hash this lock.
                  */
                 cmpxchg(&pn->state, vcpu_halted, vcpu_running);
  
@@ -227,8 +310,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
  /*
   * Called after setting next->locked = 1 when we're the lock owner.
   *
- * Instead of waking the waiters stuck in pv_wait_node() advance their state such
- * that they're waiting in pv_wait_head(), this avoids a wake/sleep cycle.
+ * Instead of waking the waiters stuck in pv_wait_node() advance their state
+ * such that they're waiting in pv_wait_head_or_lock(), this avoids a
+ * wake/sleep cycle.
   */
  static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
  {
@@ -257,10 +341,14 @@ static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
  }
  
  /*
- * Wait for l->locked to become clear; halt the vcpu after a short spin.
+ * Wait for l->locked to become clear and acquire the lock;
+ * halt the vcpu after a short spin.
   * __pv_queued_spin_unlock() will wake us.
+ *
+ * The current value of the lock will be returned for additional processing.
   */
-static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
+static u32
+pv_wait_head_or_lock(struct qspinlock *lock, struct mcs_spinlock *node)
  {
         struct pv_node *pn = (struct pv_node *)node;
         struct __qspinlock *l = (void *)lock;
@@ -276,11 +364,18 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
                 lp = (struct qspinlock **)1;
  
         for (;; waitcnt++) {
+               /*
+                * Set the pending bit in the active lock spinning loop to
+                * disable lock stealing before attempting to acquire the lock.
+                */
+               set_pending(lock);
                 for (loop = SPIN_THRESHOLD; loop; loop--) {
-                       if (!READ_ONCE(l->locked))
-                               return;
+                       if (trylock_clear_pending(lock))
+                               goto gotlock;
                         cpu_relax();
                 }
+               clear_pending(lock);
+
  
                 if (!lp) { /* ONCE */
                         lp = pv_hash(lock, pn);
@@ -296,36 +391,38 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
                          *
                          * Matches the smp_rmb() in __pv_queued_spin_unlock().
                          */
-                       if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
+                       if (xchg(&l->locked, _Q_SLOW_VAL) == 0) {
                                 /*
-                                * The lock is free and _Q_SLOW_VAL has never
-                                * been set. Therefore we need to unhash before
-                                * getting the lock.
+                                * The lock was free and now we own the lock.
+                                * Change the lock value back to _Q_LOCKED_VAL
+                                * and unhash the table.
                                  */
+                               WRITE_ONCE(l->locked, _Q_LOCKED_VAL);
                                 WRITE_ONCE(*lp, NULL);
-                               return;
+                               goto gotlock;
                         }
                 }
                 qstat_inc(qstat_pv_wait_head, true);
                 qstat_inc(qstat_pv_wait_again, waitcnt);
                 pv_wait(&l->locked, _Q_SLOW_VAL);
  
-               if (!READ_ONCE(l->locked))
-                       return;
                 /*
                  * The unlocker should have freed the lock before kicking the
                  * CPU. So if the lock is still not free, it is a spurious
-                * wakeup and so the vCPU should wait again after spinning for
-                * a while.
+                * wakeup or another vCPU has stolen the lock. The current
+                * vCPU should spin again.
                  */
-               qstat_inc(qstat_pv_spurious_wakeup, true);
+               qstat_inc(qstat_pv_spurious_wakeup, READ_ONCE(l->locked));
         }
  
         /*
-        * Lock is unlocked now; the caller will acquire it without waiting.
-        * As with pv_wait_node() we rely on the caller to do a load-acquire
-        * for us.
+        * The cmpxchg() or xchg() call before coming here provides the
+        * acquire semantics for locking. The dummy ORing of _Q_LOCKED_VAL
+        * here is to indicate to the compiler that the value will always
+        * be nonzero to enable better code optimization.
          */
+gotlock:
+       return (u32)(atomic_read(&lock->val) | _Q_LOCKED_VAL);
  }
  
  /*
@@ -350,7 +447,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
          * so we need a barrier to order the read of the node data in
          * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
          *
-        * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+        * Matches the xchg() in pv_wait_head_or_lock() setting _Q_SLOW_VAL.
          */
         smp_rmb();
  
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h

index b1553adec2e7ff650a4f4978792728c5d29353fb..94d4533fe984b5c81a9e6be4d5679f430760aaeb 100644 (file)
--- a/kernel/locking/qspinlock_stat.h
+++ b/kernel/locking/qspinlock_stat.h
@@ -22,6 +22,7 @@
   *   pv_kick_wake      - # of vCPU kicks used for computing pv_latency_wake
   *   pv_latency_kick   - average latency (ns) of vCPU kick operation
   *   pv_latency_wake   - average latency (ns) from vCPU kick to wakeup
+ *   pv_lock_stealing  - # of lock stealing operations
   *   pv_spurious_wakeup        - # of spurious wakeups
   *   pv_wait_again     - # of vCPU wait's that happened after a vCPU kick
   *   pv_wait_head      - # of vCPU wait's at the queue head
@@ -43,6 +44,7 @@ enum qlock_stats {
         qstat_pv_kick_wake,
         qstat_pv_latency_kick,
         qstat_pv_latency_wake,
+       qstat_pv_lock_stealing,
         qstat_pv_spurious_wakeup,
         qstat_pv_wait_again,
         qstat_pv_wait_head,
@@ -66,6 +68,7 @@ static const char * const qstat_names[qstat_num + 1] = {
         [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
         [qstat_pv_latency_kick]    = "pv_latency_kick",
         [qstat_pv_latency_wake]    = "pv_latency_wake",
+       [qstat_pv_lock_stealing]   = "pv_lock_stealing",
         [qstat_pv_wait_again]      = "pv_wait_again",
         [qstat_pv_wait_head]       = "pv_wait_head",
         [qstat_pv_wait_node]       = "pv_wait_node",
@@ -273,6 +276,19 @@ static inline void __pv_wait(u8 *ptr, u8 val)
  #define pv_kick(c)     __pv_kick(c)
  #define pv_wait(p, v)  __pv_wait(p, v)
  
+/*
+ * PV unfair trylock count tracking function
+ */
+static inline int qstat_spin_steal_lock(struct qspinlock *lock)
+{
+       int ret = pv_queued_spin_steal_lock(lock);
+
+       qstat_inc(qstat_pv_lock_stealing, ret);
+       return ret;
+}
+#undef  queued_spin_trylock
+#define queued_spin_trylock(l) qstat_spin_steal_lock(l)
+
  #else /* CONFIG_QUEUED_LOCK_STAT */
  
  static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
author	Waiman Long <Waiman.Long@hpe.com>
	Tue, 10 Nov 2015 21:18:56 +0000 (16:18 -0500)
committer	Ingo Molnar <mingo@kernel.org>
	Fri, 4 Dec 2015 10:39:51 +0000 (11:39 +0100)
kernel/locking/qspinlock.c		patch \| blob \| blame \| history
kernel/locking/qspinlock_paravirt.h		patch \| blob \| blame \| history
kernel/locking/qspinlock_stat.h		patch \| blob \| blame \| history