perf: Use hrtimers for event multiplexing
author     Stephane Eranian <eranian@google.com>
           Wed, 3 Apr 2013 12:21:33 +0000 (14:21 +0200)
committer  Ingo Molnar <mingo@kernel.org>
           Tue, 28 May 2013 07:07:10 +0000 (09:07 +0200)
The current scheme of using the timer tick was fine for per-thread
events. However, it was causing bias issues in system-wide mode
(including for uncore PMUs). Event groups would not get their fair
share of runtime on the PMU. With tickless kernels, if a core is idle
there is no timer tick, and thus no event rotation (multiplexing).
However, there are events (especially uncore events) which do count
even though cores are asleep.

This patch changes the timer source used for multiplexing. It
introduces a per-PMU, per-cpu hrtimer. The advantage is that even when
a core goes idle, it still comes back to service the hrtimer, so
multiplexing of system-wide events works much better.
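
For reference, the mechanism boils down to the standard self-rearming,
CPU-pinned hrtimer pattern. Below is a minimal standalone sketch of
that pattern (module and identifier names such as demo_timer are
illustrative only, not part of the patch):

#include <linux/module.h>
#include <linux/hrtimer.h>
#include <linux/ktime.h>

static struct hrtimer demo_timer;
static ktime_t demo_interval;

static enum hrtimer_restart demo_handler(struct hrtimer *hr)
{
	/* periodic work goes here; the patch rotates events at this point */

	/* push the expiry forward by one interval and keep the timer armed */
	hrtimer_forward_now(hr, demo_interval);
	return HRTIMER_RESTART;
}

static int __init demo_init(void)
{
	demo_interval = ns_to_ktime(4 * NSEC_PER_MSEC);

	/* REL_PINNED keeps the timer on the CPU that armed it */
	hrtimer_init(&demo_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
	demo_timer.function = demo_handler;
	hrtimer_start(&demo_timer, demo_interval, HRTIMER_MODE_REL_PINNED);
	return 0;
}

static void __exit demo_exit(void)
{
	hrtimer_cancel(&demo_timer);
}

module_init(demo_init);
module_exit(demo_exit);
MODULE_LICENSE("GPL");

In the patch below, perf_cpu_hrtimer_handler() plays this role, but it
re-arms the timer only when perf_rotate_context() reports that there
was something to rotate.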

The per-PMU implementation (suggested by PeterZ) enables adjusting the
multiplexing interval per PMU. The preferred interval is stashed into
the struct pmu. If not set, it will be forced to the default interval
value.
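
As a hedged sketch of the "use the PMU's preferred interval, otherwise
force the default" step (this excerpt does not show the struct pmu
change, so the field name hrtimer_interval_ms and the helper name below
are assumptions):

/*
 * Sketch only: pick the PMU's preferred multiplexing interval, falling
 * back to the default (one tick). hrtimer_interval_ms is the assumed
 * per-PMU field; it is not shown in the hunks of this excerpt.
 */
static void sketch_set_mux_interval(struct perf_cpu_context *cpuctx)
{
	struct pmu *pmu = cpuctx->ctx.pmu;
	int timer = pmu->hrtimer_interval_ms;

	if (timer < 1)
		timer = pmu->hrtimer_interval_ms = PERF_CPU_HRTIMER;

	cpuctx->hrtimer_interval = ns_to_ktime(NSEC_PER_MSEC * timer);
}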

In order to minimize the impact of the hrtimer, it is turned on and
off on demand. When the PMU on a CPU is overcommitted, the hrtimer is
activated; it is stopped when the PMU is no longer overcommitted (the
handler returns HRTIMER_NORESTART once there is nothing left to
rotate).

In order for this to work properly, we had to change the order of
initialization in start_kernel() such that the hrtimer subsystem
(hrtimers_init()) is set up before perf_event_init() runs.

The default interval is one timer tick (PERF_CPU_HRTIMER = 1000/HZ
milliseconds), just like with the old code. We will provide a sysctl
to tune this in another patch.
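For example, with HZ=1000 that default works out to 1 ms per
multiplexing period, and with HZ=250 to 4 ms.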

Signed-off-by: Stephane Eranian <eranian@google.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Link: http://lkml.kernel.org/r/1364991694-5876-2-git-send-email-eranian@google.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
include/linux/perf_event.h
init/main.c
kernel/events/core.c

include/linux/perf_event.h
index fa38612d70b662f6afca09bf93282a9bd3c3b51c..72138d75a60ae26854a4c6ed52f7c325bef49fad 100644 (file)
@@ -501,8 +501,9 @@ struct perf_cpu_context {
        struct perf_event_context       *task_ctx;
        int                             active_oncpu;
        int                             exclusive;
+       struct hrtimer                  hrtimer;
+       ktime_t                         hrtimer_interval;
        struct list_head                rotation_list;
-       int                             jiffies_interval;
        struct pmu                      *unique_pmu;
        struct perf_cgroup              *cgrp;
 };
init/main.c
index 9484f4ba88d05aa589bb737d96836a1a01db55ce..ec549581d732f633ca002c823a38ed15a6d31a60 100644 (file)
@@ -542,7 +542,6 @@ asmlinkage void __init start_kernel(void)
        if (WARN(!irqs_disabled(), "Interrupts were enabled *very* early, fixing it\n"))
                local_irq_disable();
        idr_init_cache();
-       perf_event_init();
        rcu_init();
        tick_nohz_init();
        radix_tree_init();
@@ -555,6 +554,7 @@ asmlinkage void __init start_kernel(void)
        softirq_init();
        timekeeping_init();
        time_init();
+       perf_event_init();
        profile_init();
        call_function_init();
        WARN(!irqs_disabled(), "Interrupts were enabled early\n");
kernel/events/core.c
index e0dcced282e4f67ace8923ff5a57e56c003e3b3c..97bfac7e6f4585e0e28bd2c23f8e68428550abca 100644 (file)
@@ -170,6 +170,8 @@ int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
 static int max_samples_per_tick __read_mostly =
        DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
 
+static int perf_rotate_context(struct perf_cpu_context *cpuctx);
+
 int perf_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
@@ -658,6 +660,98 @@ perf_cgroup_mark_enabled(struct perf_event *event,
 }
 #endif
 
+/*
+ * set default to be dependent on timer tick just
+ * like original code
+ */
+#define PERF_CPU_HRTIMER (1000 / HZ)
+/*
+ * function must be called with interrupts disabled
+ */
+static enum hrtimer_restart perf_cpu_hrtimer_handler(struct hrtimer *hr)
+{
+       struct perf_cpu_context *cpuctx;
+       enum hrtimer_restart ret = HRTIMER_NORESTART;
+       int rotations = 0;
+
+       WARN_ON(!irqs_disabled());
+
+       cpuctx = container_of(hr, struct perf_cpu_context, hrtimer);
+
+       rotations = perf_rotate_context(cpuctx);
+
+       /*
+        * arm timer if needed
+        */
+       if (rotations) {
+               hrtimer_forward_now(hr, cpuctx->hrtimer_interval);
+               ret = HRTIMER_RESTART;
+       }
+
+       return ret;
+}
+
+/* CPU is going down */
+void perf_cpu_hrtimer_cancel(int cpu)
+{
+       struct perf_cpu_context *cpuctx;
+       struct pmu *pmu;
+       unsigned long flags;
+
+       if (WARN_ON(cpu != smp_processor_id()))
+               return;
+
+       local_irq_save(flags);
+
+       rcu_read_lock();
+
+       list_for_each_entry_rcu(pmu, &pmus, entry) {
+               cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
+
+               if (pmu->task_ctx_nr == perf_sw_context)
+                       continue;
+
+               hrtimer_cancel(&cpuctx->hrtimer);
+       }
+
+       rcu_read_unlock();
+
+       local_irq_restore(flags);
+}
+
+static void __perf_cpu_hrtimer_init(struct perf_cpu_context *cpuctx, int cpu)
+{
+       struct hrtimer *hr = &cpuctx->hrtimer;
+       struct pmu *pmu = cpuctx->ctx.pmu;
+
+       /* no multiplexing needed for SW PMU */
+       if (pmu->task_ctx_nr == perf_sw_context)
+               return;
+
+       cpuctx->hrtimer_interval =
+               ns_to_ktime(NSEC_PER_MSEC * PERF_CPU_HRTIMER);
+
+       hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL_PINNED);
+       hr->function = perf_cpu_hrtimer_handler;
+}
+
+static void perf_cpu_hrtimer_restart(struct perf_cpu_context *cpuctx)
+{
+       struct hrtimer *hr = &cpuctx->hrtimer;
+       struct pmu *pmu = cpuctx->ctx.pmu;
+
+       /* not for SW PMU */
+       if (pmu->task_ctx_nr == perf_sw_context)
+               return;
+
+       if (hrtimer_active(hr))
+               return;
+
+       if (!hrtimer_callback_running(hr))
+               __hrtimer_start_range_ns(hr, cpuctx->hrtimer_interval,
+                                        0, HRTIMER_MODE_REL_PINNED, 0);
+}
+
 void perf_pmu_disable(struct pmu *pmu)
 {
        int *count = this_cpu_ptr(pmu->pmu_disable_count);
@@ -1506,6 +1600,7 @@ group_sched_in(struct perf_event *group_event,
 
        if (event_sched_in(group_event, cpuctx, ctx)) {
                pmu->cancel_txn(pmu);
+               perf_cpu_hrtimer_restart(cpuctx);
                return -EAGAIN;
        }
 
@@ -1552,6 +1647,8 @@ group_error:
 
        pmu->cancel_txn(pmu);
 
+       perf_cpu_hrtimer_restart(cpuctx);
+
        return -EAGAIN;
 }
 
@@ -1807,8 +1904,10 @@ static int __perf_event_enable(void *info)
                 * If this event can't go on and it's part of a
                 * group, then the whole group has to come off.
                 */
-               if (leader != event)
+               if (leader != event) {
                        group_sched_out(leader, cpuctx, ctx);
+                       perf_cpu_hrtimer_restart(cpuctx);
+               }
                if (leader->attr.pinned) {
                        update_group_times(leader);
                        leader->state = PERF_EVENT_STATE_ERROR;
@@ -2555,7 +2654,7 @@ static void rotate_ctx(struct perf_event_context *ctx)
  * because they're strictly cpu affine and rotate_start is called with IRQs
  * disabled, while rotate_context is called from IRQ context.
  */
-static void perf_rotate_context(struct perf_cpu_context *cpuctx)
+static int perf_rotate_context(struct perf_cpu_context *cpuctx)
 {
        struct perf_event_context *ctx = NULL;
        int rotate = 0, remove = 1;
@@ -2594,6 +2693,8 @@ static void perf_rotate_context(struct perf_cpu_context *cpuctx)
 done:
        if (remove)
                list_del_init(&cpuctx->rotation_list);
+
+       return rotate;
 }
 
 #ifdef CONFIG_NO_HZ_FULL
@@ -2625,10 +2726,6 @@ void perf_event_task_tick(void)
                ctx = cpuctx->task_ctx;
                if (ctx)
                        perf_adjust_freq_unthr_context(ctx, throttled);
-
-               if (cpuctx->jiffies_interval == 1 ||
-                               !(jiffies % cpuctx->jiffies_interval))
-                       perf_rotate_context(cpuctx);
        }
 }
 
@@ -6001,7 +6098,9 @@ skip_type:
                lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
                cpuctx->ctx.type = cpu_context;
                cpuctx->ctx.pmu = pmu;
-               cpuctx->jiffies_interval = 1;
+
+               __perf_cpu_hrtimer_init(cpuctx, cpu);
+
                INIT_LIST_HEAD(&cpuctx->rotation_list);
                cpuctx->unique_pmu = pmu;
        }
@@ -7387,7 +7486,6 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
        case CPU_DOWN_PREPARE:
                perf_event_exit_cpu(cpu);
                break;
-
        default:
                break;
        }