perf intel-pt: Add support for PERF_RECORD_SWITCH
author Adrian Hunter <adrian.hunter@intel.com>
Thu, 13 Aug 2015 09:40:57 +0000 (12:40 +0300)
committer Arnaldo Carvalho de Melo <acme@redhat.com>
Fri, 4 Sep 2015 15:01:05 +0000 (12:01 -0300)
Add support for selecting and processing PERF_RECORD_SWITCH events for
use by Intel PT.  If they are available, they will be used in preference
to sched_switch events.

This enables an unprivileged user to trace multi-threaded or
multi-process workloads with any level of perf_event_paranoid.  However,
it depends on kernel support for PERF_RECORD_SWITCH.

Without this patch, tracing a multi-threaded workload will decode
without error but all the data will be attributed to the main thread.

Without this patch, tracing a multi-process workload will result in
decoder errors because the decoder will not know which executable is
executing.

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1439458857-30636-3-git-send-email-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/arch/x86/util/intel-pt.c
tools/perf/util/intel-pt.c

index 2ca10d796c0bb5bf0b591a070a5835dfd76fa700..b02af064f0f98333b6f90a5d6dd8778e6a7a6bb4 100644 (file)
@@ -624,13 +624,49 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
         * threads.
         */
        if (have_timing_info && !cpu_map__empty(cpus)) {
-               err = intel_pt_track_switches(evlist);
-               if (err == -EPERM)
-                       pr_debug2("Unable to select sched:sched_switch\n");
-               else if (err)
-                       return err;
-               else
-                       ptr->have_sched_switch = 1;
+               if (perf_can_record_switch_events()) {
+                       bool cpu_wide = !target__none(&opts->target) &&
+                                       !target__has_task(&opts->target);
+
+                       if (!cpu_wide && perf_can_record_cpu_wide()) {
+                               struct perf_evsel *switch_evsel;
+
+                               err = parse_events(evlist, "dummy:u", NULL);
+                               if (err)
+                                       return err;
+
+                               switch_evsel = perf_evlist__last(evlist);
+
+                               switch_evsel->attr.freq = 0;
+                               switch_evsel->attr.sample_period = 1;
+                               switch_evsel->attr.context_switch = 1;
+
+                               switch_evsel->system_wide = true;
+                               switch_evsel->no_aux_samples = true;
+                               switch_evsel->immediate = true;
+
+                               perf_evsel__set_sample_bit(switch_evsel, TID);
+                               perf_evsel__set_sample_bit(switch_evsel, TIME);
+                               perf_evsel__set_sample_bit(switch_evsel, CPU);
+
+                               opts->record_switch_events = false;
+                               ptr->have_sched_switch = 3;
+                       } else {
+                               opts->record_switch_events = true;
+                               if (cpu_wide)
+                                       ptr->have_sched_switch = 3;
+                               else
+                                       ptr->have_sched_switch = 2;
+                       }
+               } else {
+                       err = intel_pt_track_switches(evlist);
+                       if (err == -EPERM)
+                               pr_debug2("Unable to select sched:sched_switch\n");
+                       else if (err)
+                               return err;
+                       else
+                               ptr->have_sched_switch = 1;
+               }
        }
 
        if (intel_pt_evsel) {
@@ -663,8 +699,11 @@ static int intel_pt_recording_options(struct auxtrace_record *itr,
                tracking_evsel->attr.sample_period = 1;
 
                /* In per-cpu case, always need the time of mmap events etc */
-               if (!cpu_map__empty(cpus))
+               if (!cpu_map__empty(cpus)) {
                        perf_evsel__set_sample_bit(tracking_evsel, TIME);
+                       /* And the CPU for switch events */
+                       perf_evsel__set_sample_bit(tracking_evsel, CPU);
+               }
        }
 
        /*
index bb41c20e6005e1a2d986656fc94eead62de7a4ee..2968b37ed9b0c8331a8a859eee8664575f346db4 100644 (file)
@@ -1145,11 +1145,13 @@ static int intel_pt_sample(struct intel_pt_queue *ptq)
        return 0;
 }
 
-static u64 intel_pt_switch_ip(struct machine *machine, u64 *ptss_ip)
+static u64 intel_pt_switch_ip(struct intel_pt *pt, u64 *ptss_ip)
 {
+       struct machine *machine = pt->machine;
        struct map *map;
        struct symbol *sym, *start;
        u64 ip, switch_ip = 0;
+       const char *ptss;
 
        if (ptss_ip)
                *ptss_ip = 0;
@@ -1177,8 +1179,13 @@ static u64 intel_pt_switch_ip(struct machine *machine, u64 *ptss_ip)
        if (!switch_ip || !ptss_ip)
                return 0;
 
+       if (pt->have_sched_switch == 1)
+               ptss = "perf_trace_sched_switch";
+       else
+               ptss = "__perf_event_task_sched_out";
+
        for (sym = start; sym; sym = dso__next_symbol(sym)) {
-               if (!strcmp(sym->name, "perf_trace_sched_switch")) {
+               if (!strcmp(sym->name, ptss)) {
                        ip = map->unmap_ip(map, sym->start);
                        if (ip >= map->start && ip < map->end) {
                                *ptss_ip = ip;
@@ -1198,11 +1205,11 @@ static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp)
 
        if (!pt->kernel_start) {
                pt->kernel_start = machine__kernel_start(pt->machine);
-               if (pt->per_cpu_mmaps && pt->have_sched_switch &&
+               if (pt->per_cpu_mmaps &&
+                   (pt->have_sched_switch == 1 || pt->have_sched_switch == 3) &&
                    !pt->timeless_decoding && intel_pt_tracing_kernel(pt) &&
                    !pt->sampling_mode) {
-                       pt->switch_ip = intel_pt_switch_ip(pt->machine,
-                                                          &pt->ptss_ip);
+                       pt->switch_ip = intel_pt_switch_ip(pt, &pt->ptss_ip);
                        if (pt->switch_ip) {
                                intel_pt_log("switch_ip: %"PRIx64" ptss_ip: %"PRIx64"\n",
                                             pt->switch_ip, pt->ptss_ip);
@@ -1387,31 +1394,18 @@ static struct intel_pt_queue *intel_pt_cpu_to_ptq(struct intel_pt *pt, int cpu)
        return NULL;
 }
 
-static int intel_pt_process_switch(struct intel_pt *pt,
-                                  struct perf_sample *sample)
+static int intel_pt_sync_switch(struct intel_pt *pt, int cpu, pid_t tid,
+                               u64 timestamp)
 {
        struct intel_pt_queue *ptq;
-       struct perf_evsel *evsel;
-       pid_t tid;
-       int cpu, err;
-
-       evsel = perf_evlist__id2evsel(pt->session->evlist, sample->id);
-       if (evsel != pt->switch_evsel)
-               return 0;
-
-       tid = perf_evsel__intval(evsel, sample, "next_pid");
-       cpu = sample->cpu;
-
-       intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
-                    cpu, tid, sample->time, perf_time_to_tsc(sample->time,
-                    &pt->tc));
+       int err;
 
        if (!pt->sync_switch)
-               goto out;
+               return 1;
 
        ptq = intel_pt_cpu_to_ptq(pt, cpu);
        if (!ptq)
-               goto out;
+               return 1;
 
        switch (ptq->switch_state) {
        case INTEL_PT_SS_NOT_TRACING:
@@ -1424,7 +1418,7 @@ static int intel_pt_process_switch(struct intel_pt *pt,
                return 0;
        case INTEL_PT_SS_EXPECTING_SWITCH_EVENT:
                if (!ptq->on_heap) {
-                       ptq->timestamp = perf_time_to_tsc(sample->time,
+                       ptq->timestamp = perf_time_to_tsc(timestamp,
                                                          &pt->tc);
                        err = auxtrace_heap__add(&pt->heap, ptq->queue_nr,
                                                 ptq->timestamp);
@@ -1441,10 +1435,76 @@ static int intel_pt_process_switch(struct intel_pt *pt,
        default:
                break;
        }
-out:
+
+       return 1;
+}
+
+static int intel_pt_process_switch(struct intel_pt *pt,
+                                  struct perf_sample *sample)
+{
+       struct perf_evsel *evsel;
+       pid_t tid;
+       int cpu, ret;
+
+       evsel = perf_evlist__id2evsel(pt->session->evlist, sample->id);
+       if (evsel != pt->switch_evsel)
+               return 0;
+
+       tid = perf_evsel__intval(evsel, sample, "next_pid");
+       cpu = sample->cpu;
+
+       intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
+                    cpu, tid, sample->time, perf_time_to_tsc(sample->time,
+                    &pt->tc));
+
+       ret = intel_pt_sync_switch(pt, cpu, tid, sample->time);
+       if (ret <= 0)
+               return ret;
+
        return machine__set_current_tid(pt->machine, cpu, -1, tid);
 }
 
+static int intel_pt_context_switch(struct intel_pt *pt, union perf_event *event,
+                                  struct perf_sample *sample)
+{
+       bool out = event->header.misc & PERF_RECORD_MISC_SWITCH_OUT;
+       pid_t pid, tid;
+       int cpu, ret;
+
+       cpu = sample->cpu;
+
+       if (pt->have_sched_switch == 3) {
+               if (!out)
+                       return 0;
+               if (event->header.type != PERF_RECORD_SWITCH_CPU_WIDE) {
+                       pr_err("Expecting CPU-wide context switch event\n");
+                       return -EINVAL;
+               }
+               pid = event->context_switch.next_prev_pid;
+               tid = event->context_switch.next_prev_tid;
+       } else {
+               if (out)
+                       return 0;
+               pid = sample->pid;
+               tid = sample->tid;
+       }
+
+       if (tid == -1) {
+               pr_err("context_switch event has no tid\n");
+               return -EINVAL;
+       }
+
+       intel_pt_log("context_switch: cpu %d pid %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
+                    cpu, pid, tid, sample->time, perf_time_to_tsc(sample->time,
+                    &pt->tc));
+
+       ret = intel_pt_sync_switch(pt, cpu, tid, sample->time);
+       if (ret <= 0)
+               return ret;
+
+       return machine__set_current_tid(pt->machine, cpu, pid, tid);
+}
+
 static int intel_pt_process_itrace_start(struct intel_pt *pt,
                                         union perf_event *event,
                                         struct perf_sample *sample)
@@ -1515,6 +1575,9 @@ static int intel_pt_process_event(struct perf_session *session,
                err = intel_pt_process_switch(pt, sample);
        else if (event->header.type == PERF_RECORD_ITRACE_START)
                err = intel_pt_process_itrace_start(pt, event, sample);
+       else if (event->header.type == PERF_RECORD_SWITCH ||
+                event->header.type == PERF_RECORD_SWITCH_CPU_WIDE)
+               err = intel_pt_context_switch(pt, event, sample);
 
        intel_pt_log("event %s (%u): cpu %d time %"PRIu64" tsc %#"PRIx64"\n",
                     perf_event__name(event->header.type), event->header.type,
@@ -1777,6 +1840,18 @@ static struct perf_evsel *intel_pt_find_sched_switch(struct perf_evlist *evlist)
        return NULL;
 }
 
+static bool intel_pt_find_switch(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel;
+
+       evlist__for_each(evlist, evsel) {
+               if (evsel->attr.context_switch)
+                       return true;
+       }
+
+       return false;
+}
+
 static const char * const intel_pt_info_fmts[] = {
        [INTEL_PT_PMU_TYPE]             = "  PMU Type            %"PRId64"\n",
        [INTEL_PT_TIME_SHIFT]           = "  Time Shift          %"PRIu64"\n",
@@ -1888,6 +1963,10 @@ int intel_pt_process_auxtrace_info(union perf_event *event,
                        pr_err("%s: missing sched_switch event\n", __func__);
                        goto err_delete_thread;
                }
+       } else if (pt->have_sched_switch == 2 &&
+                  !intel_pt_find_switch(session->evlist)) {
+               pr_err("%s: missing context_switch attribute flag\n", __func__);
+               goto err_delete_thread;
        }
 
        if (session->itrace_synth_opts && session->itrace_synth_opts->set) {