perf: Fix race in callchains
authorFrederic Weisbecker <fweisbec@gmail.com>
Thu, 1 Jul 2010 14:20:36 +0000 (16:20 +0200)
committerFrederic Weisbecker <fweisbec@gmail.com>
Wed, 18 Aug 2010 23:32:31 +0000 (01:32 +0200)
Now that software events don't have interrupt disabled anymore in
the event path, callchains can nest on any context. So seperating
nmi and others contexts in two buffers has become racy.

Fix this by providing one buffer per nesting level. Given the size
of the callchain entries (2040 bytes * 4), we now need to allocate
them dynamically.

v2: Fixed put_callchain_entry call after recursion.
    Fix the type of the recursion, it must be an array.

v3: Use a manual pr cpu allocation (temporary solution until NMIs
    can safely access vmalloc'ed memory).
    Do a better separation between callchain reference tracking and
    allocation. Make the "put" path lockless for non-release cases.

v4: Protect the callchain buffers with rcu.

v5: Do the cpu buffers allocations node affine.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Tested-by: Will Deacon <will.deacon@arm.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: David Miller <davem@davemloft.net>
Cc: Borislav Petkov <bp@amd64.org>
arch/x86/kernel/cpu/perf_event.c
include/linux/perf_event.h
kernel/perf_event.c

index a3c922288cc063ace45b012da1bed1a86e6ed058..8e91cf34a9c886a85ba9c77db49c3f5eb5a93674 100644 (file)
@@ -1608,6 +1608,11 @@ static const struct stacktrace_ops backtrace_ops = {
 void
 perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
 {
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return NULL;
+       }
+
        perf_callchain_store(entry, regs->ip);
 
        dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
@@ -1656,6 +1661,10 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
        struct stack_frame frame;
        const void __user *fp;
 
+       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
+               /* TODO: We don't support guest os callchain now */
+               return NULL;
+       }
 
        fp = (void __user *)regs->bp;
 
@@ -1681,19 +1690,6 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
        }
 }
 
-struct perf_callchain_entry *perf_callchain_buffer(void)
-{
-       if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
-               /* TODO: We don't support guest os callchain now */
-               return NULL;
-       }
-
-       if (in_nmi())
-               return &__get_cpu_var(perf_callchain_entry_nmi);
-
-       return &__get_cpu_var(perf_callchain_entry);
-}
-
 unsigned long perf_instruction_pointer(struct pt_regs *regs)
 {
        unsigned long ip;
index 4db61dded388d3386bb3db1b439c2b1a3ea6638d..d7e8ea6908642f9ed7b26ffbb6a2b061ddcaaf23 100644 (file)
@@ -983,7 +983,6 @@ extern void perf_callchain_user(struct perf_callchain_entry *entry,
                                struct pt_regs *regs);
 extern void perf_callchain_kernel(struct perf_callchain_entry *entry,
                                  struct pt_regs *regs);
-extern struct perf_callchain_entry *perf_callchain_buffer(void);
 
 
 static inline void
index 615d024894cf6c110452f8c2d425238acde529dc..75ab8a2df6b2dc8020dbbeeb54fd4b6068e5bc76 100644 (file)
@@ -1763,6 +1763,216 @@ static u64 perf_event_read(struct perf_event *event)
        return perf_event_count(event);
 }
 
+/*
+ * Callchain support
+ */
+
+struct callchain_cpus_entries {
+       struct rcu_head                 rcu_head;
+       struct perf_callchain_entry     *cpu_entries[0];
+};
+
+static DEFINE_PER_CPU(int, callchain_recursion[4]);
+static atomic_t nr_callchain_events;
+static DEFINE_MUTEX(callchain_mutex);
+struct callchain_cpus_entries *callchain_cpus_entries;
+
+
+__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
+                                 struct pt_regs *regs)
+{
+}
+
+__weak void perf_callchain_user(struct perf_callchain_entry *entry,
+                               struct pt_regs *regs)
+{
+}
+
+static void release_callchain_buffers_rcu(struct rcu_head *head)
+{
+       struct callchain_cpus_entries *entries;
+       int cpu;
+
+       entries = container_of(head, struct callchain_cpus_entries, rcu_head);
+
+       for_each_possible_cpu(cpu)
+               kfree(entries->cpu_entries[cpu]);
+
+       kfree(entries);
+}
+
+static void release_callchain_buffers(void)
+{
+       struct callchain_cpus_entries *entries;
+
+       entries = callchain_cpus_entries;
+       rcu_assign_pointer(callchain_cpus_entries, NULL);
+       call_rcu(&entries->rcu_head, release_callchain_buffers_rcu);
+}
+
+static int alloc_callchain_buffers(void)
+{
+       int cpu;
+       int size;
+       struct callchain_cpus_entries *entries;
+
+       /*
+        * We can't use the percpu allocation API for data that can be
+        * accessed from NMI. Use a temporary manual per cpu allocation
+        * until that gets sorted out.
+        */
+       size = sizeof(*entries) + sizeof(struct perf_callchain_entry *) *
+               num_possible_cpus();
+
+       entries = kzalloc(size, GFP_KERNEL);
+       if (!entries)
+               return -ENOMEM;
+
+       size = sizeof(struct perf_callchain_entry) * 4;
+
+       for_each_possible_cpu(cpu) {
+               entries->cpu_entries[cpu] = kmalloc_node(size, GFP_KERNEL,
+                                                        cpu_to_node(cpu));
+               if (!entries->cpu_entries[cpu])
+                       goto fail;
+       }
+
+       rcu_assign_pointer(callchain_cpus_entries, entries);
+
+       return 0;
+
+fail:
+       for_each_possible_cpu(cpu)
+               kfree(entries->cpu_entries[cpu]);
+       kfree(entries);
+
+       return -ENOMEM;
+}
+
+static int get_callchain_buffers(void)
+{
+       int err = 0;
+       int count;
+
+       mutex_lock(&callchain_mutex);
+
+       count = atomic_inc_return(&nr_callchain_events);
+       if (WARN_ON_ONCE(count < 1)) {
+               err = -EINVAL;
+               goto exit;
+       }
+
+       if (count > 1) {
+               /* If the allocation failed, give up */
+               if (!callchain_cpus_entries)
+                       err = -ENOMEM;
+               goto exit;
+       }
+
+       err = alloc_callchain_buffers();
+       if (err)
+               release_callchain_buffers();
+exit:
+       mutex_unlock(&callchain_mutex);
+
+       return err;
+}
+
+static void put_callchain_buffers(void)
+{
+       if (atomic_dec_and_mutex_lock(&nr_callchain_events, &callchain_mutex)) {
+               release_callchain_buffers();
+               mutex_unlock(&callchain_mutex);
+       }
+}
+
+static int get_recursion_context(int *recursion)
+{
+       int rctx;
+
+       if (in_nmi())
+               rctx = 3;
+       else if (in_irq())
+               rctx = 2;
+       else if (in_softirq())
+               rctx = 1;
+       else
+               rctx = 0;
+
+       if (recursion[rctx])
+               return -1;
+
+       recursion[rctx]++;
+       barrier();
+
+       return rctx;
+}
+
+static inline void put_recursion_context(int *recursion, int rctx)
+{
+       barrier();
+       recursion[rctx]--;
+}
+
+static struct perf_callchain_entry *get_callchain_entry(int *rctx)
+{
+       int cpu;
+       struct callchain_cpus_entries *entries;
+
+       *rctx = get_recursion_context(__get_cpu_var(callchain_recursion));
+       if (*rctx == -1)
+               return NULL;
+
+       entries = rcu_dereference(callchain_cpus_entries);
+       if (!entries)
+               return NULL;
+
+       cpu = smp_processor_id();
+
+       return &entries->cpu_entries[cpu][*rctx];
+}
+
+static void
+put_callchain_entry(int rctx)
+{
+       put_recursion_context(__get_cpu_var(callchain_recursion), rctx);
+}
+
+static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
+{
+       int rctx;
+       struct perf_callchain_entry *entry;
+
+
+       entry = get_callchain_entry(&rctx);
+       if (rctx == -1)
+               return NULL;
+
+       if (!entry)
+               goto exit_put;
+
+       entry->nr = 0;
+
+       if (!user_mode(regs)) {
+               perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
+               perf_callchain_kernel(entry, regs);
+               if (current->mm)
+                       regs = task_pt_regs(current);
+               else
+                       regs = NULL;
+       }
+
+       if (regs) {
+               perf_callchain_store(entry, PERF_CONTEXT_USER);
+               perf_callchain_user(entry, regs);
+       }
+
+exit_put:
+       put_callchain_entry(rctx);
+
+       return entry;
+}
+
 /*
  * Initialize the perf_event context in a task_struct:
  */
@@ -1895,6 +2105,8 @@ static void free_event(struct perf_event *event)
                        atomic_dec(&nr_comm_events);
                if (event->attr.task)
                        atomic_dec(&nr_task_events);
+               if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
+                       put_callchain_buffers();
        }
 
        if (event->buffer) {
@@ -2937,55 +3149,6 @@ void perf_event_do_pending(void)
        __perf_pending_run();
 }
 
-DEFINE_PER_CPU(struct perf_callchain_entry, perf_callchain_entry);
-
-/*
- * Callchain support -- arch specific
- */
-
-__weak struct perf_callchain_entry *perf_callchain_buffer(void)
-{
-       return &__get_cpu_var(perf_callchain_entry);
-}
-
-__weak void perf_callchain_kernel(struct perf_callchain_entry *entry,
-                                 struct pt_regs *regs)
-{
-}
-
-__weak void perf_callchain_user(struct perf_callchain_entry *entry,
-                               struct pt_regs *regs)
-{
-}
-
-static struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
-{
-       struct perf_callchain_entry *entry;
-
-       entry = perf_callchain_buffer();
-       if (!entry)
-               return NULL;
-
-       entry->nr = 0;
-
-       if (!user_mode(regs)) {
-               perf_callchain_store(entry, PERF_CONTEXT_KERNEL);
-               perf_callchain_kernel(entry, regs);
-               if (current->mm)
-                       regs = task_pt_regs(current);
-               else
-                       regs = NULL;
-       }
-
-       if (regs) {
-               perf_callchain_store(entry, PERF_CONTEXT_USER);
-               perf_callchain_user(entry, regs);
-       }
-
-       return entry;
-}
-
-
 /*
  * We assume there is only KVM supporting the callbacks.
  * Later on, we might change it to a list if there is
@@ -3480,14 +3643,20 @@ static void perf_event_output(struct perf_event *event, int nmi,
        struct perf_output_handle handle;
        struct perf_event_header header;
 
+       /* protect the callchain buffers */
+       rcu_read_lock();
+
        perf_prepare_sample(&header, data, event, regs);
 
        if (perf_output_begin(&handle, event, header.size, nmi, 1))
-               return;
+               goto exit;
 
        perf_output_sample(&handle, &header, data, event);
 
        perf_output_end(&handle);
+
+exit:
+       rcu_read_unlock();
 }
 
 /*
@@ -4243,32 +4412,16 @@ end:
 int perf_swevent_get_recursion_context(void)
 {
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       int rctx;
 
-       if (in_nmi())
-               rctx = 3;
-       else if (in_irq())
-               rctx = 2;
-       else if (in_softirq())
-               rctx = 1;
-       else
-               rctx = 0;
-
-       if (cpuctx->recursion[rctx])
-               return -1;
-
-       cpuctx->recursion[rctx]++;
-       barrier();
-
-       return rctx;
+       return get_recursion_context(cpuctx->recursion);
 }
 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
 
 void inline perf_swevent_put_recursion_context(int rctx)
 {
        struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context);
-       barrier();
-       cpuctx->recursion[rctx]--;
+
+       put_recursion_context(cpuctx->recursion, rctx);
 }
 
 void __perf_sw_event(u32 event_id, u64 nr, int nmi,
@@ -4968,6 +5121,13 @@ done:
                        atomic_inc(&nr_comm_events);
                if (event->attr.task)
                        atomic_inc(&nr_task_events);
+               if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
+                       err = get_callchain_buffers();
+                       if (err) {
+                               free_event(event);
+                               return ERR_PTR(err);
+                       }
+               }
        }
 
        return event;