locking/qspinlock: Optimize for smaller NR_CPUS

author Peter Zijlstra (Intel) <peterz@infradead.org>

Fri, 24 Apr 2015 18:56:34 +0000 (14:56 -0400)

committer Ingo Molnar <mingo@kernel.org>

Fri, 8 May 2015 10:36:48 +0000 (12:36 +0200)
author Peter Zijlstra (Intel) <peterz@infradead.org>
Fri, 24 Apr 2015 18:56:34 +0000 (14:56 -0400)
committer Ingo Molnar <mingo@kernel.org>
Fri, 8 May 2015 10:36:48 +0000 (12:36 +0200)
diff --git a/include/asm-generic/qspinlock_types.h b/include/asm-generic/qspinlock_types.h

index 3a7f67173bd000fe15c29500d133517b5265a65b..85f888e86761e1b71bd0de2a891a160194e256d7 100644 (file)
--- a/include/asm-generic/qspinlock_types.h
+++ b/include/asm-generic/qspinlock_types.h
@@ -35,6 +35,14 @@ typedef struct qspinlock {
  /*
   * Bitfields in the atomic value:
   *
+ * When NR_CPUS < 16K
+ *  0- 7: locked byte
+ *     8: pending
+ *  9-15: not used
+ * 16-17: tail index
+ * 18-31: tail cpu (+1)
+ *
+ * When NR_CPUS >= 16K
   *  0- 7: locked byte
   *     8: pending
   *  9-10: tail index
@@ -47,7 +55,11 @@ typedef struct qspinlock {
  #define _Q_LOCKED_MASK         _Q_SET_MASK(LOCKED)
  
  #define _Q_PENDING_OFFSET      (_Q_LOCKED_OFFSET + _Q_LOCKED_BITS)
+#if CONFIG_NR_CPUS < (1U << 14)
+#define _Q_PENDING_BITS                8
+#else
  #define _Q_PENDING_BITS                1
+#endif
  #define _Q_PENDING_MASK                _Q_SET_MASK(PENDING)
  
  #define _Q_TAIL_IDX_OFFSET     (_Q_PENDING_OFFSET + _Q_PENDING_BITS)
@@ -58,6 +70,7 @@ typedef struct qspinlock {
  #define _Q_TAIL_CPU_BITS       (32 - _Q_TAIL_CPU_OFFSET)
  #define _Q_TAIL_CPU_MASK       _Q_SET_MASK(TAIL_CPU)
  
+#define _Q_TAIL_OFFSET         _Q_TAIL_IDX_OFFSET
  #define _Q_TAIL_MASK           (_Q_TAIL_IDX_MASK | _Q_TAIL_CPU_MASK)
  
  #define _Q_LOCKED_VAL          (1U << _Q_LOCKED_OFFSET)
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c

index 82bb4a9e9009270c1cf2008e5dbddc60a1f9e950..e17efe7b8d4d6f7c12b4fc915ed98954fa97e0cb 100644 (file)
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -24,6 +24,7 @@
  #include <linux/percpu.h>
  #include <linux/hardirq.h>
  #include <linux/mutex.h>
+#include <asm/byteorder.h>
  #include <asm/qspinlock.h>
  
  /*
@@ -56,6 +57,10 @@
   * node; whereby avoiding the need to carry a node from lock to unlock, and
   * preserving existing lock API. This also makes the unlock code simpler and
   * faster.
+ *
+ * N.B. The current implementation only supports architectures that allow
+ *      atomic operations on smaller 8-bit and 16-bit data types.
+ *
   */
  
  #include "mcs_spinlock.h"
@@ -96,6 +101,62 @@ static inline struct mcs_spinlock *decode_tail(u32 tail)
  
  #define _Q_LOCKED_PENDING_MASK (_Q_LOCKED_MASK | _Q_PENDING_MASK)
  
+/*
+ * By using the whole 2nd least significant byte for the pending bit, we
+ * can allow better optimization of the lock acquisition for the pending
+ * bit holder.
+ */
+#if _Q_PENDING_BITS == 8
+
+struct __qspinlock {
+       union {
+               atomic_t val;           /* the whole lock word */
+               struct {
+#ifdef __LITTLE_ENDIAN
+                       u16     locked_pending; /* bits 0-15 of val */
+                       u16     tail;           /* bits 16-31 of val */
+#else
+                       u16     tail;           /* bits 16-31 of val */
+                       u16     locked_pending; /* bits 0-15 of val */
+#endif
+               };
+       };
+};
+
+/**
+ * clear_pending_set_locked - take ownership and clear the pending bit.
+ * @lock: Pointer to queued spinlock structure
+ *
+ * *,1,0 -> *,0,1 (tail,pending,locked)
+ *
+ * Lock stealing is not allowed if this function is used.
+ */
+static __always_inline void clear_pending_set_locked(struct qspinlock *lock)
+{
+       struct __qspinlock *l = (void *)lock;

+       WRITE_ONCE(l->locked_pending, _Q_LOCKED_VAL); /* one 16-bit store */
+}
+
+/**
+ * xchg_tail - Put in the new queue tail code word & retrieve previous one
+ * @lock : Pointer to queued spinlock structure
+ * @tail : The new queue tail code word (at its _Q_TAIL_OFFSET position)
+ * Return: The previous queue tail code word (at its _Q_TAIL_OFFSET position)
+ *
+ * xchg(lock, tail) on the 16-bit tail field only
+ *
+ * p,*,* -> n,*,* ; prev = xchg(lock, node)
+ */
+static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
+{
+       struct __qspinlock *l = (void *)lock;

+       return (u32)xchg(&l->tail, tail >> _Q_TAIL_OFFSET) << _Q_TAIL_OFFSET;
+}
+
+#else /* _Q_PENDING_BITS == 8 */
+
  /**
   * clear_pending_set_locked - take ownership and clear the pending bit.
   * @lock: Pointer to queued spinlock structure
@@ -131,6 +192,7 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail)
         }
         return old;
  }
+#endif /* _Q_PENDING_BITS == 8 */
  
  /**
   * queued_spin_lock_slowpath - acquire the queued spinlock
@@ -205,8 +267,13 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val)
          * we're pending, wait for the owner to go away.
          *
          * *,1,1 -> *,1,0
+        *
+        * This wait loop must be a load-acquire so that it pairs with the
+        * store-release that clears the locked bit and creates lock
+        * sequentiality; this is needed because not all
+        * clear_pending_set_locked() implementations imply full barriers.
          */
-       while ((val = atomic_read(&lock->val)) & _Q_LOCKED_MASK)
+       while ((val = smp_load_acquire(&lock->val.counter)) & _Q_LOCKED_MASK)
                 cpu_relax();
  
         /*
author	Peter Zijlstra (Intel) <peterz@infradead.org>
	Fri, 24 Apr 2015 18:56:34 +0000 (14:56 -0400)
committer	Ingo Molnar <mingo@kernel.org>
	Fri, 8 May 2015 10:36:48 +0000 (12:36 +0200)
include/asm-generic/qspinlock_types.h		patch \| blob \| blame \| history
kernel/locking/qspinlock.c		patch \| blob \| blame \| history