perfcounters: restructure x86 counter math
author Ingo Molnar <mingo@elte.hu>
Sat, 13 Dec 2008 08:00:03 +0000 (09:00 +0100)
committer Ingo Molnar <mingo@elte.hu>
Sun, 14 Dec 2008 19:30:48 +0000 (20:30 +0100)
Impact: restructure code

Change the counter math from absolute values to delta logic.

We extract elapsed deltas from the raw hw counter and accumulate
them into the generic counter.
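
For illustration, a minimal stand-alone sketch of the new delta math;
update(), count and prev_raw_count below are stand-ins for
x86_perf_counter_update() and the hwc fields, and the NMI-safe
cmpxchg loop is left out:

    #include <stdint.h>
    #include <stdio.h>

    static uint64_t count;            /* stands in for counter->count  */
    static uint64_t prev_raw_count;   /* stands in for hwc->prev_count */

    static void update(uint64_t new_raw_count)
    {
            /* not all hw sign-extends, so clip the delta to 32 bits: */
            uint64_t delta = (uint64_t)(uint32_t)
                    ((int32_t)new_raw_count - (int32_t)prev_raw_count);

            prev_raw_count = new_raw_count;
            count += delta;
    }

    int main(void)
    {
            prev_raw_count = 0xfffffff0;  /* PMC programmed to -16       */
            update(0x0000000c);           /* raw value wrapped past zero */
            printf("elapsed: %llu\n", (unsigned long long)count); /* 28 */
            return 0;
    }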

Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/x86/Kconfig
arch/x86/kernel/cpu/perf_counter.c
include/linux/perf_counter.h
kernel/perf_counter.c

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index f2fdc1867241274e09c248a3ef9f22909018a4a3..fe94490bab614c9326d5ef181e0f124eefc1c66b 100644
@@ -643,7 +643,7 @@ config X86_UP_IOAPIC
 config X86_LOCAL_APIC
        def_bool y
        depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH))
-       select HAVE_PERF_COUNTERS
+       select HAVE_PERF_COUNTERS if (!M386 && !M486)
 
 config X86_IO_APIC
        def_bool y
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index b903f8df72bb5f8b03e635c59d98de693cee03d5..5afae13d8d59b721a493bc0a0a8516d08f9e8656 100644
@@ -53,6 +53,48 @@ const int intel_perfmon_event_map[] =
 
 const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map);
 
+/*
+ * Propagate counter elapsed time into the generic counter.
+ * Can only be executed on the CPU where the counter is active.
+ * The elapsed delta is added to the count and subtracted from period_left.
+ */
+static void
+x86_perf_counter_update(struct perf_counter *counter,
+                       struct hw_perf_counter *hwc, int idx)
+{
+       u64 prev_raw_count, new_raw_count, delta;
+
+       WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE);
+       /*
+        * Careful: an NMI might modify the previous counter value.
+        *
+        * Our tactic to handle this is to first atomically read and
+        * exchange a new raw count - then add that new-prev delta
+        * count to the generic counter atomically:
+        */
+again:
+       prev_raw_count = atomic64_read(&hwc->prev_count);
+       rdmsrl(hwc->counter_base + idx, new_raw_count);
+
+       if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count,
+                                       new_raw_count) != prev_raw_count)
+               goto again;
+
+       /*
+        * Now we have the new raw value and have updated the prev
+        * timestamp already. We can now calculate the elapsed delta
+        * (counter-)time and add that to the generic counter.
+        *
+        * Careful: not all hw sign-extends above the physical width
+        * of the count, so handle wraparound by clipping the delta to 32 bits:
+        */
+       delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count);
+       WARN_ON_ONCE((int)delta < 0);
+
+       atomic64_add(delta, &counter->count);
+       atomic64_sub(delta, &hwc->period_left);
+}
+
 /*
  * Setup the hardware configuration for a given hw_event_type
  */
@@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
         * so we install an artificial 1<<31 period regardless of
         * the generic counter period:
         */
-       if (!hwc->irq_period)
+       if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF)
                hwc->irq_period = 0x7FFFFFFF;
 
-       hwc->next_count = -(s32)hwc->irq_period;
+       atomic64_set(&hwc->period_left, hwc->irq_period);
 
        /*
         * Raw event type provide the config in the event structure
@@ -118,12 +160,6 @@ void hw_perf_enable_all(void)
        wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0);
 }
 
-void hw_perf_restore(u64 ctrl)
-{
-       wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
-}
-EXPORT_SYMBOL_GPL(hw_perf_restore);
-
 u64 hw_perf_save_disable(void)
 {
        u64 ctrl;
@@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void)
 }
 EXPORT_SYMBOL_GPL(hw_perf_save_disable);
 
+void hw_perf_restore(u64 ctrl)
+{
+       wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0);
+}
+EXPORT_SYMBOL_GPL(hw_perf_restore);
+
 static inline void
-__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx)
+__x86_perf_counter_disable(struct perf_counter *counter,
+                          struct hw_perf_counter *hwc, unsigned int idx)
 {
-       wrmsr(hwc->config_base + idx, hwc->config, 0);
+       int err;
+
+       err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0);
+       WARN_ON_ONCE(err);
 }
 
-static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]);
+static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]);
 
-static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx)
+/*
+ * Set the next IRQ period, based on the hwc->period_left value.
+ * To be called with the counter disabled in hw:
+ */
+static void
+__hw_perf_counter_set_period(struct perf_counter *counter,
+                            struct hw_perf_counter *hwc, int idx)
 {
-       per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count;
+       s32 left = atomic64_read(&hwc->period_left);
+       s32 period = hwc->irq_period;
+
+       WARN_ON_ONCE(period <= 0);
+
+       /*
+        * If we are way outside a reasonable range then just skip forward:
+        */
+       if (unlikely(left <= -period)) {
+               left = period;
+               atomic64_set(&hwc->period_left, left);
+       }
+
+       if (unlikely(left <= 0)) {
+               left += period;
+               atomic64_set(&hwc->period_left, left);
+       }
 
-       wrmsr(hwc->counter_base + idx, hwc->next_count, 0);
+       WARN_ON_ONCE(left <= 0);
+
+       per_cpu(prev_left[idx], smp_processor_id()) = left;
+
+       /*
+        * The hw counter starts counting from this counter offset;
+        * record it so that future deltas can be extracted:
+        */
+       atomic64_set(&hwc->prev_count, (u64)(s64)-left);
+
+       wrmsr(hwc->counter_base + idx, -left, 0);
 }
 
-static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx)
+static void
+__x86_perf_counter_enable(struct perf_counter *counter,
+                         struct hw_perf_counter *hwc, int idx)
 {
        wrmsr(hwc->config_base + idx,
              hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0);
 }
 
+/*
+ * Find a PMC slot for the freshly enabled / scheduled in counter:
+ */
 static void x86_perf_counter_enable(struct perf_counter *counter)
 {
        struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter)
 
        perf_counters_lapic_init(hwc->nmi);
 
-       __x86_perf_counter_disable(hwc, idx);
+       __x86_perf_counter_disable(counter, hwc, idx);
 
        cpuc->counters[idx] = counter;
 
-       __hw_perf_counter_set_period(hwc, idx);
-       __x86_perf_counter_enable(hwc, idx);
-}
-
-static void __hw_perf_save_counter(struct perf_counter *counter,
-                                  struct hw_perf_counter *hwc, int idx)
-{
-       s64 raw = -1;
-       s64 delta;
-
-       /*
-        * Get the raw hw counter value:
-        */
-       rdmsrl(hwc->counter_base + idx, raw);
-
-       /*
-        * Rebase it to zero (it started counting at -irq_period),
-        * to see the delta since ->prev_count:
-        */
-       delta = (s64)hwc->irq_period + (s64)(s32)raw;
-
-       atomic64_counter_set(counter, hwc->prev_count + delta);
-
-       /*
-        * Adjust the ->prev_count offset - if we went beyond
-        * irq_period of units, then we got an IRQ and the counter
-        * was set back to -irq_period:
-        */
-       while (delta >= (s64)hwc->irq_period) {
-               hwc->prev_count += hwc->irq_period;
-               delta -= (s64)hwc->irq_period;
-       }
-
-       /*
-        * Calculate the next raw counter value we'll write into
-        * the counter at the next sched-in time:
-        */
-       delta -= (s64)hwc->irq_period;
-
-       hwc->next_count = (s32)delta;
+       __hw_perf_counter_set_period(counter, hwc, idx);
+       __x86_perf_counter_enable(counter, hwc, idx);
 }
 
 void perf_counter_print_debug(void)
 {
-       u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count;
+       u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left;
        int cpu, idx;
 
        if (!nr_hw_counters)
@@ -241,14 +286,14 @@ void perf_counter_print_debug(void)
                rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
                rdmsrl(MSR_ARCH_PERFMON_PERFCTR0  + idx, pmc_count);
 
-               next_count = per_cpu(prev_next_count[idx], cpu);
+               prev_left = per_cpu(prev_left[idx], cpu);
 
                printk(KERN_INFO "CPU#%d: PMC%d ctrl:  %016llx\n",
                        cpu, idx, pmc_ctrl);
                printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n",
                        cpu, idx, pmc_count);
-               printk(KERN_INFO "CPU#%d: PMC%d next:  %016llx\n",
-                       cpu, idx, next_count);
+               printk(KERN_INFO "CPU#%d: PMC%d left:  %016llx\n",
+                       cpu, idx, prev_left);
        }
        local_irq_enable();
 }
@@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter)
        struct hw_perf_counter *hwc = &counter->hw;
        unsigned int idx = hwc->idx;
 
-       __x86_perf_counter_disable(hwc, idx);
+       __x86_perf_counter_disable(counter, hwc, idx);
 
        clear_bit(idx, cpuc->used);
        cpuc->counters[idx] = NULL;
-       __hw_perf_save_counter(counter, hwc, idx);
-}
 
-static void x86_perf_counter_read(struct perf_counter *counter)
-{
-       struct hw_perf_counter *hwc = &counter->hw;
-       unsigned long addr = hwc->counter_base + hwc->idx;
-       s64 offs, val = -1LL;
-       s32 val32;
-
-       /* Careful: NMI might modify the counter offset */
-       do {
-               offs = hwc->prev_count;
-               rdmsrl(addr, val);
-       } while (offs != hwc->prev_count);
-
-       val32 = (s32) val;
-       val = (s64)hwc->irq_period + (s64)val32;
-       atomic64_counter_set(counter, hwc->prev_count + val);
+       /*
+        * Drain the remaining delta count out of a counter
+        * that we are disabling:
+        */
+       x86_perf_counter_update(counter, hwc, idx);
 }
 
 static void perf_store_irq_data(struct perf_counter *counter, u64 data)
@@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data)
 }
 
 /*
- * NMI-safe enable method:
+ * Save and restart an expired counter. Called from NMI context,
+ * so it has to be careful about preempting normal counter ops:
  */
 static void perf_save_and_restart(struct perf_counter *counter)
 {
@@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter)
 
        rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl);
 
-       __hw_perf_save_counter(counter, hwc, idx);
-       __hw_perf_counter_set_period(hwc, idx);
+       x86_perf_counter_update(counter, hwc, idx);
+       __hw_perf_counter_set_period(counter, hwc, idx);
 
        if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE)
-               __x86_perf_counter_enable(hwc, idx);
+               __x86_perf_counter_enable(counter, hwc, idx);
 }
 
 static void
 perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown)
 {
        struct perf_counter *counter, *group_leader = sibling->group_leader;
-       int bit;
-
-       /*
-        * Store the counter's own timestamp first:
-        */
-       perf_store_irq_data(sibling, sibling->hw_event.type);
-       perf_store_irq_data(sibling, atomic64_counter_read(sibling));
 
        /*
-        * Then store sibling timestamps (if any):
+        * Store sibling timestamps (if any):
         */
        list_for_each_entry(counter, &group_leader->sibling_list, list_entry) {
-               if (counter->state != PERF_COUNTER_STATE_ACTIVE) {
-                       /*
-                        * When counter was not in the overflow mask, we have to
-                        * read it from hardware. We read it as well, when it
-                        * has not been read yet and clear the bit in the
-                        * status mask.
-                        */
-                       bit = counter->hw.idx;
-                       if (!test_bit(bit, (unsigned long *) overflown) ||
-                           test_bit(bit, (unsigned long *) status)) {
-                               clear_bit(bit, (unsigned long *) status);
-                               perf_save_and_restart(counter);
-                       }
-               }
+               x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
                perf_store_irq_data(sibling, counter->hw_event.type);
-               perf_store_irq_data(sibling, atomic64_counter_read(counter));
+               perf_store_irq_data(sibling, atomic64_read(&counter->count));
        }
 }
 
@@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void)
        perf_counters_initialized = true;
 }
 
+static void x86_perf_counter_read(struct perf_counter *counter)
+{
+       x86_perf_counter_update(counter, &counter->hw, counter->hw.idx);
+}
+
 static const struct hw_perf_counter_ops x86_perf_counter_ops = {
        .hw_perf_counter_enable         = x86_perf_counter_enable,
        .hw_perf_counter_disable        = x86_perf_counter_disable,
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 8cb095fa442c38ad7d5122ec9d0e5fe2488e1489..72460289c654af7191da8e0a87def5c60ea94518 100644
@@ -91,14 +91,16 @@ struct perf_counter_hw_event {
  * struct hw_perf_counter - performance counter hardware details:
  */
 struct hw_perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
        u64                             config;
        unsigned long                   config_base;
        unsigned long                   counter_base;
        int                             nmi;
        unsigned int                    idx;
-       u64                             prev_count;
+       atomic64_t                      prev_count;
        u64                             irq_period;
-       s32                             next_count;
+       atomic64_t                      period_left;
+#endif
 };
 
 /*
@@ -140,17 +142,15 @@ enum perf_counter_active_state {
  * struct perf_counter - performance counter kernel representation:
  */
 struct perf_counter {
+#ifdef CONFIG_PERF_COUNTERS
        struct list_head                list_entry;
        struct list_head                sibling_list;
        struct perf_counter             *group_leader;
        const struct hw_perf_counter_ops *hw_ops;
 
        enum perf_counter_active_state  state;
-#if BITS_PER_LONG == 64
        atomic64_t                      count;
-#else
-       atomic_t                        count32[2];
-#endif
+
        struct perf_counter_hw_event    hw_event;
        struct hw_perf_counter          hw;
 
@@ -172,6 +172,7 @@ struct perf_counter {
        struct perf_data                *irqdata;
        struct perf_data                *usrdata;
        struct perf_data                data[2];
+#endif
 };
 
 /**
@@ -220,8 +221,6 @@ extern void perf_counter_notify(struct pt_regs *regs);
 extern void perf_counter_print_debug(void);
 extern u64 hw_perf_save_disable(void);
 extern void hw_perf_restore(u64 ctrl);
-extern void atomic64_counter_set(struct perf_counter *counter, u64 val64);
-extern u64 atomic64_counter_read(struct perf_counter *counter);
 extern int perf_counter_task_disable(void);
 extern int perf_counter_task_enable(void);
 
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 559130b8774ded9c1b0a64bb6846b1e41adde6a5..416861ce8b272ca4a66ba812ec6e5f5346e438f8 100644
@@ -44,67 +44,9 @@ hw_perf_counter_init(struct perf_counter *counter)
 }
 
 u64 __weak hw_perf_save_disable(void)          { return 0; }
-void __weak hw_perf_restore(u64 ctrl)  { }
+void __weak hw_perf_restore(u64 ctrl)          { }
 void __weak hw_perf_counter_setup(void)                { }
 
-#if BITS_PER_LONG == 64
-
-/*
- * Read the cached counter in counter safe against cross CPU / NMI
- * modifications. 64 bit version - no complications.
- */
-static inline u64 perf_counter_read_safe(struct perf_counter *counter)
-{
-       return (u64) atomic64_read(&counter->count);
-}
-
-void atomic64_counter_set(struct perf_counter *counter, u64 val)
-{
-       atomic64_set(&counter->count, val);
-}
-
-u64 atomic64_counter_read(struct perf_counter *counter)
-{
-       return atomic64_read(&counter->count);
-}
-
-#else
-
-/*
- * Read the cached counter in counter safe against cross CPU / NMI
- * modifications. 32 bit version.
- */
-static u64 perf_counter_read_safe(struct perf_counter *counter)
-{
-       u32 cntl, cnth;
-
-       local_irq_disable();
-       do {
-               cnth = atomic_read(&counter->count32[1]);
-               cntl = atomic_read(&counter->count32[0]);
-       } while (cnth != atomic_read(&counter->count32[1]));
-
-       local_irq_enable();
-
-       return cntl | ((u64) cnth) << 32;
-}
-
-void atomic64_counter_set(struct perf_counter *counter, u64 val64)
-{
-       u32 *val32 = (void *)&val64;
-
-       atomic_set(counter->count32 + 0, *(val32 + 0));
-       atomic_set(counter->count32 + 1, *(val32 + 1));
-}
-
-u64 atomic64_counter_read(struct perf_counter *counter)
-{
-       return atomic_read(counter->count32 + 0) |
-               (u64) atomic_read(counter->count32 + 1) << 32;
-}
-
-#endif
-
 static void
 list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
@@ -280,11 +222,11 @@ static void __perf_install_in_context(void *info)
        ctx->nr_counters++;
 
        if (cpuctx->active_oncpu < perf_max_counters) {
-               counter->hw_ops->hw_perf_counter_enable(counter);
                counter->state = PERF_COUNTER_STATE_ACTIVE;
                counter->oncpu = cpu;
                ctx->nr_active++;
                cpuctx->active_oncpu++;
+               counter->hw_ops->hw_perf_counter_enable(counter);
        }
 
        if (!ctx->task && cpuctx->max_pertask)
@@ -624,7 +566,7 @@ static u64 perf_counter_read(struct perf_counter *counter)
                                         __hw_perf_counter_read, counter, 1);
        }
 
-       return perf_counter_read_safe(counter);
+       return atomic64_read(&counter->count);
 }
 
 /*
@@ -921,7 +863,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter)
 {
        int cpu = raw_smp_processor_id();
 
-       atomic64_counter_set(counter, cpu_clock(cpu));
+       atomic64_set(&counter->count, cpu_clock(cpu));
 }
 
 static const struct hw_perf_counter_ops perf_ops_cpu_clock = {
@@ -940,7 +882,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter)
 
 static void task_clock_perf_counter_read(struct perf_counter *counter)
 {
-       atomic64_counter_set(counter, current->se.sum_exec_runtime);
+       atomic64_set(&counter->count, current->se.sum_exec_runtime);
 }
 
 static const struct hw_perf_counter_ops perf_ops_task_clock = {