locking/pvqspinlock: Collect slowpath lock statistics
authorWaiman Long <Waiman.Long@hpe.com>
Tue, 10 Nov 2015 00:09:25 +0000 (19:09 -0500)
committerIngo Molnar <mingo@kernel.org>
Fri, 4 Dec 2015 10:39:50 +0000 (11:39 +0100)
This patch enables the accumulation of kicking and waiting related
PV qspinlock statistics when the new QUEUED_LOCK_STAT configuration
option is selected. It also enables the collection of data which
enables us to calculate the kicking and wakeup latencies, which have
a heavy dependency on the CPUs being used.

The statistical counters are per-cpu variables to minimize the
performance overhead in their updates. These counters are exported
via the debugfs filesystem under the qlockstat directory.  When the
corresponding debugfs files are read, summation and computing of the
required data are then performed.

The measured latencies for different CPUs are:

CPU Wakeup Kicking
--- ------ -------
Haswell-EX 63.6us  7.4us
Westmere-EX 67.6us  9.3us

The measured latencies varied a bit from run-to-run. The wakeup
latency is much higher than the kicking latency.

A sample of statistical counters after system bootup (with vCPU
overcommit) was:

pv_hash_hops=1.00
pv_kick_unlock=1148
pv_kick_wake=1146
pv_latency_kick=11040
pv_latency_wake=194840
pv_spurious_wakeup=7
pv_wait_again=4
pv_wait_head=23
pv_wait_node=1129

Signed-off-by: Waiman Long <Waiman.Long@hpe.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Davidlohr Bueso <dave@stgolabs.net>
Cc: Douglas Hatch <doug.hatch@hpe.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Scott J Norton <scott.norton@hpe.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1447114167-47185-6-git-send-email-Waiman.Long@hpe.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/x86/Kconfig
kernel/locking/qspinlock_paravirt.h
kernel/locking/qspinlock_stat.h [new file with mode: 0644]

index db3622f22b618303a8bc9c3c2f34762f400dbafb..965fc4216f76c0653e32ed2d36390aea161b8adf 100644 (file)
@@ -687,6 +687,14 @@ config PARAVIRT_SPINLOCKS
 
          If you are unsure how to answer this question, answer Y.
 
+config QUEUED_LOCK_STAT
+       bool "Paravirt queued spinlock statistics"
+       depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS
+       ---help---
+         Enable the collection of statistical data on the slowpath
+         behavior of paravirtualized queued spinlocks and report
+         them on debugfs.
+
 source "arch/x86/xen/Kconfig"
 
 config KVM_GUEST
index 4bd323d38c60bce5f5f582e2acf1027d80b023fb..aaeeefb791f8243a050dd1cf9f8ae6c45fbc9c01 100644 (file)
@@ -40,6 +40,11 @@ struct pv_node {
        u8                      state;
 };
 
+/*
+ * Include queued spinlock statistics code
+ */
+#include "qspinlock_stat.h"
+
 /*
  * Lock and MCS node addresses hash table for fast lookup
  *
@@ -100,10 +105,13 @@ static struct qspinlock **pv_hash(struct qspinlock *lock, struct pv_node *node)
 {
        unsigned long offset, hash = hash_ptr(lock, pv_lock_hash_bits);
        struct pv_hash_entry *he;
+       int hopcnt = 0;
 
        for_each_hash_entry(he, offset, hash) {
+               hopcnt++;
                if (!cmpxchg(&he->lock, NULL, lock)) {
                        WRITE_ONCE(he->node, node);
+                       qstat_hop(hopcnt);
                        return &he->lock;
                }
        }
@@ -164,9 +172,11 @@ static void pv_init_node(struct mcs_spinlock *node)
 static void pv_wait_node(struct mcs_spinlock *node)
 {
        struct pv_node *pn = (struct pv_node *)node;
+       int waitcnt = 0;
        int loop;
 
-       for (;;) {
+       /* waitcnt processing will be compiled out if !QUEUED_LOCK_STAT */
+       for (;; waitcnt++) {
                for (loop = SPIN_THRESHOLD; loop; loop--) {
                        if (READ_ONCE(node->locked))
                                return;
@@ -184,12 +194,16 @@ static void pv_wait_node(struct mcs_spinlock *node)
                 */
                smp_store_mb(pn->state, vcpu_halted);
 
-               if (!READ_ONCE(node->locked))
+               if (!READ_ONCE(node->locked)) {
+                       qstat_inc(qstat_pv_wait_node, true);
+                       qstat_inc(qstat_pv_wait_again, waitcnt);
                        pv_wait(&pn->state, vcpu_halted);
+               }
 
                /*
-                * If pv_kick_node() changed us to vcpu_hashed, retain that value
-                * so that pv_wait_head() knows to not also try to hash this lock.
+                * If pv_kick_node() changed us to vcpu_hashed, retain that
+                * value so that pv_wait_head() knows to not also try to hash
+                * this lock.
                 */
                cmpxchg(&pn->state, vcpu_halted, vcpu_running);
 
@@ -200,6 +214,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
                 * So it is better to spin for a while in the hope that the
                 * MCS lock will be released soon.
                 */
+               qstat_inc(qstat_pv_spurious_wakeup, !READ_ONCE(node->locked));
        }
 
        /*
@@ -250,6 +265,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
        struct pv_node *pn = (struct pv_node *)node;
        struct __qspinlock *l = (void *)lock;
        struct qspinlock **lp = NULL;
+       int waitcnt = 0;
        int loop;
 
        /*
@@ -259,7 +275,7 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
        if (READ_ONCE(pn->state) == vcpu_hashed)
                lp = (struct qspinlock **)1;
 
-       for (;;) {
+       for (;; waitcnt++) {
                for (loop = SPIN_THRESHOLD; loop; loop--) {
                        if (!READ_ONCE(l->locked))
                                return;
@@ -290,14 +306,19 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
                                return;
                        }
                }
+               qstat_inc(qstat_pv_wait_head, true);
+               qstat_inc(qstat_pv_wait_again, waitcnt);
                pv_wait(&l->locked, _Q_SLOW_VAL);
 
+               if (!READ_ONCE(l->locked))
+                       return;
                /*
                 * The unlocker should have freed the lock before kicking the
                 * CPU. So if the lock is still not free, it is a spurious
                 * wakeup and so the vCPU should wait again after spinning for
                 * a while.
                 */
+               qstat_inc(qstat_pv_spurious_wakeup, true);
        }
 
        /*
@@ -352,6 +373,7 @@ __pv_queued_spin_unlock_slowpath(struct qspinlock *lock, u8 locked)
         * vCPU is harmless other than the additional latency in completing
         * the unlock.
         */
+       qstat_inc(qstat_pv_kick_unlock, true);
        pv_kick(node->cpu);
 }
 
diff --git a/kernel/locking/qspinlock_stat.h b/kernel/locking/qspinlock_stat.h
new file mode 100644 (file)
index 0000000..b1553ad
--- /dev/null
@@ -0,0 +1,281 @@
+/*
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * Authors: Waiman Long <waiman.long@hpe.com>
+ */
+
+/*
+ * When queued spinlock statistical counters are enabled, the following
+ * debugfs files will be created for reporting the counter values:
+ *
+ * <debugfs>/qlockstat/
+ *   pv_hash_hops      - average # of hops per hashing operation
+ *   pv_kick_unlock    - # of vCPU kicks issued at unlock time
+ *   pv_kick_wake      - # of vCPU kicks used for computing pv_latency_wake
+ *   pv_latency_kick   - average latency (ns) of vCPU kick operation
+ *   pv_latency_wake   - average latency (ns) from vCPU kick to wakeup
+ *   pv_spurious_wakeup        - # of spurious wakeups
+ *   pv_wait_again     - # of vCPU wait's that happened after a vCPU kick
+ *   pv_wait_head      - # of vCPU wait's at the queue head
+ *   pv_wait_node      - # of vCPU wait's at a non-head queue node
+ *
+ * Writing to the "reset_counters" file will reset all the above counter
+ * values.
+ *
+ * These statistical counters are implemented as per-cpu variables which are
+ * summed and computed whenever the corresponding debugfs files are read. This
+ * minimizes added overhead making the counters usable even in a production
+ * environment.
+ *
+ * There may be slight difference between pv_kick_wake and pv_kick_unlock.
+ */
enum qlock_stats {
+       qstat_pv_hash_hops,
+       qstat_pv_kick_unlock,
+       qstat_pv_kick_wake,
+       qstat_pv_latency_kick,
+       qstat_pv_latency_wake,
+       qstat_pv_spurious_wakeup,
+       qstat_pv_wait_again,
+       qstat_pv_wait_head,
+       qstat_pv_wait_node,
+       qstat_num,      /* Total number of statistical counters */
+       qstat_reset_cnts = qstat_num,   /* ID of the write-only reset_counters file */
+};
+
+#ifdef CONFIG_QUEUED_LOCK_STAT
+/*
+ * Collect pvqspinlock statistics
+ */
+#include <linux/debugfs.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+
+/*
+ * Counter names, indexed by counter ID.  Designated initializers make
+ * the listing order here independent of the enum declaration order.
+ * The extra [qstat_reset_cnts] slot names the reset file.
+ */
+static const char * const qstat_names[qstat_num + 1] = {
+       [qstat_pv_hash_hops]       = "pv_hash_hops",
+       [qstat_pv_kick_unlock]     = "pv_kick_unlock",
+       [qstat_pv_kick_wake]       = "pv_kick_wake",
+       [qstat_pv_spurious_wakeup] = "pv_spurious_wakeup",
+       [qstat_pv_latency_kick]    = "pv_latency_kick",
+       [qstat_pv_latency_wake]    = "pv_latency_wake",
+       [qstat_pv_wait_again]      = "pv_wait_again",
+       [qstat_pv_wait_head]       = "pv_wait_head",
+       [qstat_pv_wait_node]       = "pv_wait_node",
+       [qstat_reset_cnts]         = "reset_counters",
+};
+
+/*
+ * Per-cpu counters
+ *
+ * qstats[] holds all event counts; pv_kick_time records, per target CPU,
+ * the sched_clock() timestamp written by __pv_kick() so that __pv_wait()
+ * can compute the kick-to-wakeup latency.
+ */
+static DEFINE_PER_CPU(unsigned long, qstats[qstat_num]);
+static DEFINE_PER_CPU(u64, pv_kick_time);
+
+/*
+ * Function to read and return the qlock statistical counter values
+ *
+ * The following counters are handled specially:
+ * 1. qstat_pv_latency_kick
+ *    Average kick latency (ns) = pv_latency_kick/pv_kick_unlock
+ * 2. qstat_pv_latency_wake
+ *    Average wake latency (ns) = pv_latency_wake/pv_kick_wake
+ * 3. qstat_pv_hash_hops
+ *    Average hops/hash = pv_hash_hops/pv_kick_unlock
+ */
+static ssize_t qstat_read(struct file *file, char __user *user_buf,
+                         size_t count, loff_t *ppos)
+{
+       char buf[64];
+       int cpu, counter, len;
+       u64 stat = 0, kicks = 0;
+
+       /*
+        * Get the counter ID stored in file->f_inode->i_private
+        */
+       if (!file->f_inode) {
+               WARN_ON_ONCE(1);
+               return -EBADF;
+       }
+       counter = (long)(file->f_inode->i_private);
+
+       if (counter >= qstat_num)
+               return -EBADF;
+
+       for_each_possible_cpu(cpu) {
+               stat += per_cpu(qstats[counter], cpu);
+               /*
+                * Need to sum additional counter for some of them
+                */
+               switch (counter) {
+
+               case qstat_pv_latency_kick:
+               case qstat_pv_hash_hops:
+                       kicks += per_cpu(qstats[qstat_pv_kick_unlock], cpu);
+                       break;
+
+               case qstat_pv_latency_wake:
+                       kicks += per_cpu(qstats[qstat_pv_kick_wake], cpu);
+                       break;
+               }
+       }
+
+       if (counter == qstat_pv_hash_hops) {
+               u64 frac = 0;
+
+               /*
+                * Guard against do_div(x, 0): before the first unlock
+                * kick the divisor is still zero.
+                */
+               if (kicks) {
+                       frac = 100ULL * do_div(stat, kicks);
+                       frac = DIV_ROUND_CLOSEST_ULL(frac, kicks);
+               } else {
+                       stat = 0;
+               }
+
+               /*
+                * Return a X.XX decimal number
+                */
+               len = snprintf(buf, sizeof(buf) - 1, "%llu.%02llu\n", stat, frac);
+       } else {
+               /*
+                * Round to the nearest ns.  The accumulated latency must
+                * be divided by the kick count before being reported;
+                * zeroing stat first and then dividing it would always
+                * report 0.
+                */
+               if ((counter == qstat_pv_latency_kick) ||
+                   (counter == qstat_pv_latency_wake))
+                       stat = kicks ? DIV_ROUND_CLOSEST_ULL(stat, kicks) : 0;
+
+               len = snprintf(buf, sizeof(buf) - 1, "%llu\n", stat);
+       }
+
+       return simple_read_from_buffer(user_buf, count, ppos, buf, len);
+}
+
+/*
+ * Function to handle write request
+ *
+ * Only a write to the "reset_counters" file (counter ID stored in
+ * file->f_inode->i_private) has any effect: it clears every counter.
+ * Since counter updates aren't atomic, each per-cpu array is cleared
+ * twice so that the counters are very likely to be all zero afterwards.
+ */
+static ssize_t qstat_write(struct file *file, const char __user *user_buf,
+                          size_t count, loff_t *ppos)
+{
+       int cpu;
+
+       if (!file->f_inode) {
+               WARN_ON_ONCE(1);
+               return -EBADF;
+       }
+       if ((long)(file->f_inode->i_private) != qstat_reset_cnts)
+               return count;
+
+       for_each_possible_cpu(cpu) {
+               unsigned long *ptr = per_cpu_ptr(qstats, cpu);
+               int pass, i;
+
+               for (pass = 0; pass < 2; pass++)
+                       for (i = 0; i < qstat_num; i++)
+                               WRITE_ONCE(ptr[i], 0);
+       }
+       return count;
+}
+
+/*
+ * Debugfs data structures
+ *
+ * All qlockstat files share this one file_operations; the counter ID
+ * distinguishing them is carried in i_private (set by
+ * init_qspinlock_stat() below).
+ */
+static const struct file_operations fops_qstat = {
+       .read = qstat_read,
+       .write = qstat_write,
+       .llseek = default_llseek,
+};
+
+/*
+ * Initialize debugfs for the qspinlock statistical counters
+ *
+ * One file per counter plus the write-only "reset_counters" file is
+ * created under <debugfs>/qlockstat; the counter ID is stashed in the
+ * file's i_private for qstat_read()/qstat_write().
+ */
+static int __init init_qspinlock_stat(void)
+{
+       struct dentry *d_qstat = debugfs_create_dir("qlockstat", NULL);
+       int i;
+
+       if (!d_qstat) {
+               pr_warn("Could not create 'qlockstat' debugfs directory\n");
+               return 0;
+       }
+
+       /*
+        * As reading from and writing to the stat files can be slow, only
+        * root is allowed to do the read/write to limit impact to system
+        * performance: counters are 0400, reset_counters is 0200.
+        */
+       for (i = 0; i <= qstat_reset_cnts; i++) {
+               umode_t mode = (i == qstat_reset_cnts) ? 0200 : 0400;
+
+               debugfs_create_file(qstat_names[i], mode, d_qstat,
+                                   (void *)(long)i, &fops_qstat);
+       }
+
+       return 0;
+}
+fs_initcall(init_qspinlock_stat);
+
+/*
+ * Increment the PV qspinlock statistical counters
+ *
+ * The conditional form lets callers fold the predicate into the call
+ * site; with cond false this is a no-op.
+ */
+static inline void qstat_inc(enum qlock_stats stat, bool cond)
+{
+       if (!cond)
+               return;
+       this_cpu_inc(qstats[stat]);
+}
+
+/*
+ * PV hash hop count
+ *
+ * Accumulates the number of hash-table probes taken to place an entry;
+ * qstat_read() divides the sum by pv_kick_unlock to report the average
+ * hops per hashing operation (pv_hash_hops).
+ */
+static inline void qstat_hop(int hopcnt)
+{
+       this_cpu_add(qstats[qstat_pv_hash_hops], hopcnt);
+}
+
+/*
+ * Replacement function for pv_kick()
+ *
+ * Stores the kick timestamp into the target CPU's pv_kick_time (read by
+ * __pv_wait() on wakeup to compute pv_latency_wake), then accumulates
+ * the time spent in the real pv_kick() into pv_latency_kick.  The
+ * timestamp must be written before the kick so a woken vCPU sees it.
+ */
+static inline void __pv_kick(int cpu)
+{
+       u64 start = sched_clock();
+
+       per_cpu(pv_kick_time, cpu) = start;
+       pv_kick(cpu);
+       this_cpu_add(qstats[qstat_pv_latency_kick], sched_clock() - start);
+}
+
+/*
+ * Replacement function for pv_wait()
+ *
+ * Clears this CPU's pv_kick_time before blocking.  If it is non-zero on
+ * return, the vCPU was woken by a __pv_kick() aimed at it, so the
+ * kick-to-wakeup latency is added to pv_latency_wake and pv_kick_wake
+ * is incremented; a wakeup with the timestamp still zero is counted
+ * elsewhere as spurious.
+ */
+static inline void __pv_wait(u8 *ptr, u8 val)
+{
+       u64 *pkick_time = this_cpu_ptr(&pv_kick_time);
+
+       *pkick_time = 0;
+       pv_wait(ptr, val);
+       if (*pkick_time) {
+               this_cpu_add(qstats[qstat_pv_latency_wake],
+                            sched_clock() - *pkick_time);
+               qstat_inc(qstat_pv_kick_wake, true);
+       }
+}
+
+/* Route the slowpath's pv_kick()/pv_wait() through the timed wrappers */
+#define pv_kick(c)     __pv_kick(c)
+#define pv_wait(p, v)  __pv_wait(p, v)
+
+#else /* CONFIG_QUEUED_LOCK_STAT */
+
+/* Statistics disabled: the hooks compile away to nothing. */
+static inline void qstat_inc(enum qlock_stats stat, bool cond) { }
+static inline void qstat_hop(int hopcnt)                       { }
+
+#endif /* CONFIG_QUEUED_LOCK_STAT */