perf_counter: Full task tracing
author	Peter Zijlstra <a.p.zijlstra@chello.nl>
Thu, 23 Jul 2009 12:46:33 +0000 (14:46 +0200)
committer	Ingo Molnar <mingo@elte.hu>
Sun, 2 Aug 2009 11:47:56 +0000 (13:47 +0200)
In order to distinguish between no samples due to inactivity and
no samples because the task has exited, Arjan asked for
PERF_EVENT_EXIT events. This is useful for the boot delay
instrumentation (bootchart) app.

This patch changes PERF_EVENT_FORK to be emitted on every
clone, and adds PERF_EVENT_EXIT, emitted on task exit after
the task's counters have been closed.

This task tracing is enabled by attr.comm || attr.mmap, and
also by the new attr.task field.
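
For illustration, a userspace consumer might request these records
like so (a sketch against the perf_counter ABI as of this patch; the
syscall wrapper, event choice and function names are illustrative,
not part of this change):

    #include <linux/perf_counter.h>
    #include <sys/syscall.h>
    #include <sys/types.h>
    #include <unistd.h>
    #include <string.h>

    static int sys_perf_counter_open(struct perf_counter_attr *attr, pid_t pid,
                                     int cpu, int group_fd, unsigned long flags)
    {
            /* __NR_perf_counter_open is arch-specific; wrapper is illustrative */
            return syscall(__NR_perf_counter_open, attr, pid, cpu, group_fd, flags);
    }

    static int open_task_tracing_counter(pid_t pid)
    {
            struct perf_counter_attr attr;

            memset(&attr, 0, sizeof(attr));
            attr.size   = sizeof(attr);
            attr.type   = PERF_TYPE_SOFTWARE;
            attr.config = PERF_COUNT_SW_CPU_CLOCK; /* any event works here */
            attr.task   = 1;  /* new: request PERF_EVENT_FORK/EXIT records */

            /* setting attr.comm or attr.mmap would also enable the records */
            return sys_perf_counter_open(&attr, pid, -1, -1, 0);
    }

The records are then read out of the counter's mmap ring buffer as
usual.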

Suggested-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
[ cleaned up perf_counter.h a bit ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
include/linux/perf_counter.h
kernel/fork.c
kernel/perf_counter.c

index bd15d7a5f5ce39474d4c845e1cb7d85bf9150bd9..e604e6ef72dd5af13b6265d688451cef47fd56fa 100644 (file)
@@ -181,8 +181,9 @@ struct perf_counter_attr {
                                freq           :  1, /* use freq, not period  */
                                inherit_stat   :  1, /* per task counts       */
                                enable_on_exec :  1, /* next exec enables     */
+                               task           :  1, /* trace fork/exit       */
 
-                               __reserved_1   : 51;
+                               __reserved_1   : 50;
 
        __u32                   wakeup_events;  /* wakeup every n events */
        __u32                   __reserved_2;
@@ -308,6 +309,15 @@ enum perf_event_type {
         */
        PERF_EVENT_COMM                 = 3,
 
+       /*
+        * struct {
+        *      struct perf_event_header        header;
+        *      u32                             pid, ppid;
+        *      u32                             tid, ptid;
+        * };
+        */
+       PERF_EVENT_EXIT                 = 4,
+
        /*
         * struct {
         *      struct perf_event_header        header;
@@ -323,6 +333,7 @@ enum perf_event_type {
         * struct {
         *      struct perf_event_header        header;
         *      u32                             pid, ppid;
+        *      u32                             tid, ptid;
         * };
         */
        PERF_EVENT_FORK                 = 7,
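
With the above header change, FORK and EXIT records share the same
body; a consumer could decode them along these lines (a sketch; the
struct and function names are illustrative):

    struct perf_task_record {       /* body of PERF_EVENT_FORK and PERF_EVENT_EXIT */
            struct perf_event_header        header;
            __u32                           pid, ppid;
            __u32                           tid, ptid;
    };

    static void handle_task_record(struct perf_event_header *hdr)
    {
            struct perf_task_record *rec = (struct perf_task_record *)hdr;

            switch (hdr->type) {
            case PERF_EVENT_FORK:
                    /* new task rec->pid/rec->tid, cloned from rec->ppid/rec->ptid */
                    break;
            case PERF_EVENT_EXIT:
                    /* task is gone; no more samples will follow for it */
                    break;
            }
    }
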
index 29b532e718f7cf0a9055e9564fc265b6f7922cf2..466531eb92cce0730cc4ac9161a7af1e927d15a9 100644 (file)
@@ -1269,6 +1269,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
        write_unlock_irq(&tasklist_lock);
        proc_fork_connector(p);
        cgroup_post_fork(p);
+       perf_counter_fork(p);
        return p;
 
 bad_fork_free_pid:
@@ -1410,9 +1411,6 @@ long do_fork(unsigned long clone_flags,
                        init_completion(&vfork);
                }
 
-               if (!(clone_flags & CLONE_THREAD))
-                       perf_counter_fork(p);
-
                audit_finish_fork(p);
                tracehook_report_clone(regs, clone_flags, nr, p);
 
index 48471d75ae0169abd5d92a1cf1b1caae3de392fb..199ed477131508691df9325c8b20b2c07638ef21 100644 (file)
@@ -42,6 +42,7 @@ static int perf_overcommit __read_mostly = 1;
 static atomic_t nr_counters __read_mostly;
 static atomic_t nr_mmap_counters __read_mostly;
 static atomic_t nr_comm_counters __read_mostly;
+static atomic_t nr_task_counters __read_mostly;
 
 /*
  * perf counter paranoia level:
@@ -1654,6 +1655,8 @@ static void free_counter(struct perf_counter *counter)
                        atomic_dec(&nr_mmap_counters);
                if (counter->attr.comm)
                        atomic_dec(&nr_comm_counters);
+               if (counter->attr.task)
+                       atomic_dec(&nr_task_counters);
        }
 
        if (counter->destroy)
@@ -2831,10 +2834,12 @@ perf_counter_read_event(struct perf_counter *counter,
 }
 
 /*
- * fork tracking
+ * task tracking -- fork/exit
+ *
+ * enabled by: attr.comm | attr.mmap | attr.task
  */
 
-struct perf_fork_event {
+struct perf_task_event {
        struct task_struct      *task;
 
        struct {
@@ -2842,37 +2847,42 @@ struct perf_fork_event {
 
                u32                             pid;
                u32                             ppid;
+               u32                             tid;
+               u32                             ptid;
        } event;
 };
 
-static void perf_counter_fork_output(struct perf_counter *counter,
-                                    struct perf_fork_event *fork_event)
+static void perf_counter_task_output(struct perf_counter *counter,
+                                    struct perf_task_event *task_event)
 {
        struct perf_output_handle handle;
-       int size = fork_event->event.header.size;
-       struct task_struct *task = fork_event->task;
+       int size = task_event->event.header.size;
+       struct task_struct *task = task_event->task;
        int ret = perf_output_begin(&handle, counter, size, 0, 0);
 
        if (ret)
                return;
 
-       fork_event->event.pid = perf_counter_pid(counter, task);
-       fork_event->event.ppid = perf_counter_pid(counter, task->real_parent);
+       task_event->event.pid = perf_counter_pid(counter, task);
+       task_event->event.ppid = perf_counter_pid(counter, task->real_parent);
 
-       perf_output_put(&handle, fork_event->event);
+       task_event->event.tid = perf_counter_tid(counter, task);
+       task_event->event.ptid = perf_counter_tid(counter, task->real_parent);
+
+       perf_output_put(&handle, task_event->event);
        perf_output_end(&handle);
 }
 
-static int perf_counter_fork_match(struct perf_counter *counter)
+static int perf_counter_task_match(struct perf_counter *counter)
 {
-       if (counter->attr.comm || counter->attr.mmap)
+       if (counter->attr.comm || counter->attr.mmap || counter->attr.task)
                return 1;
 
        return 0;
 }
 
-static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
-                                 struct perf_fork_event *fork_event)
+static void perf_counter_task_ctx(struct perf_counter_context *ctx,
+                                 struct perf_task_event *task_event)
 {
        struct perf_counter *counter;
 
@@ -2881,19 +2891,19 @@ static void perf_counter_fork_ctx(struct perf_counter_context *ctx,
 
        rcu_read_lock();
        list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-               if (perf_counter_fork_match(counter))
-                       perf_counter_fork_output(counter, fork_event);
+               if (perf_counter_task_match(counter))
+                       perf_counter_task_output(counter, task_event);
        }
        rcu_read_unlock();
 }
 
-static void perf_counter_fork_event(struct perf_fork_event *fork_event)
+static void perf_counter_task_event(struct perf_task_event *task_event)
 {
        struct perf_cpu_context *cpuctx;
        struct perf_counter_context *ctx;
 
        cpuctx = &get_cpu_var(perf_cpu_context);
-       perf_counter_fork_ctx(&cpuctx->ctx, fork_event);
+       perf_counter_task_ctx(&cpuctx->ctx, task_event);
        put_cpu_var(perf_cpu_context);
 
        rcu_read_lock();
@@ -2903,32 +2913,40 @@ static void perf_counter_fork_event(struct perf_fork_event *fork_event)
         */
        ctx = rcu_dereference(current->perf_counter_ctxp);
        if (ctx)
-               perf_counter_fork_ctx(ctx, fork_event);
+               perf_counter_task_ctx(ctx, task_event);
        rcu_read_unlock();
 }
 
-void perf_counter_fork(struct task_struct *task)
+static void perf_counter_task(struct task_struct *task, int new)
 {
-       struct perf_fork_event fork_event;
+       struct perf_task_event task_event;
 
        if (!atomic_read(&nr_comm_counters) &&
-           !atomic_read(&nr_mmap_counters))
+           !atomic_read(&nr_mmap_counters) &&
+           !atomic_read(&nr_task_counters))
                return;
 
-       fork_event = (struct perf_fork_event){
+       task_event = (struct perf_task_event){
                .task   = task,
                .event  = {
                        .header = {
-                               .type = PERF_EVENT_FORK,
+                               .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT,
                                .misc = 0,
-                               .size = sizeof(fork_event.event),
+                               .size = sizeof(task_event.event),
                        },
                        /* .pid  */
                        /* .ppid */
+                       /* .tid  */
+                       /* .ptid */
                },
        };
 
-       perf_counter_fork_event(&fork_event);
+       perf_counter_task_event(&task_event);
+}
+
+void perf_counter_fork(struct task_struct *task)
+{
+       perf_counter_task(task, 1);
 }
 
 /*
@@ -3887,6 +3905,8 @@ done:
                        atomic_inc(&nr_mmap_counters);
                if (counter->attr.comm)
                        atomic_inc(&nr_comm_counters);
+               if (counter->attr.task)
+                       atomic_inc(&nr_task_counters);
        }
 
        return counter;
@@ -4248,8 +4268,10 @@ void perf_counter_exit_task(struct task_struct *child)
        struct perf_counter_context *child_ctx;
        unsigned long flags;
 
-       if (likely(!child->perf_counter_ctxp))
+       if (likely(!child->perf_counter_ctxp)) {
+               perf_counter_task(child, 0);
                return;
+       }
 
        local_irq_save(flags);
        /*
@@ -4267,15 +4289,22 @@ void perf_counter_exit_task(struct task_struct *child)
         * incremented the context's refcount before we do put_ctx below.
         */
        spin_lock(&child_ctx->lock);
-       child->perf_counter_ctxp = NULL;
        /*
         * If this context is a clone; unclone it so it can't get
         * swapped to another process while we're removing all
         * the counters from it.
         */
        unclone_ctx(child_ctx);
-       spin_unlock(&child_ctx->lock);
-       local_irq_restore(flags);
+       spin_unlock_irqrestore(&child_ctx->lock, flags);
+
+       /*
+        * Report the task dead after unscheduling the counters so that we
+        * won't get any samples after PERF_EVENT_EXIT. We can however still
+        * get a few PERF_EVENT_READ events.
+        */
+       perf_counter_task(child, 0);
+
+       child->perf_counter_ctxp = NULL;
 
        /*
         * We can recurse on the same lock type through: