perf tools: Add Intel PT support
author Adrian Hunter <adrian.hunter@intel.com>
Fri, 17 Jul 2015 16:33:41 +0000 (19:33 +0300)
committer Arnaldo Carvalho de Melo <acme@redhat.com>
Mon, 17 Aug 2015 14:11:36 +0000 (11:11 -0300)
Add support for Intel Processor Trace.

Intel PT support fits within the new auxtrace infrastructure.  Recording
is supported by identifying the Intel PT PMU, parsing options and
setting up events.
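
For illustration (not part of the commit message itself), recording is
driven from the perf command line by selecting the Intel PT PMU event;
the workload name below is a placeholder:

    perf record -e intel_pt// -- <workload>       # full trace
    perf record -S -e intel_pt// -- <workload>    # AUX area snapshot mode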

Decoding is supported by queuing up trace data by cpu or thread and then
decoding it synchronously, delivering synthesized event samples into the
session processing for tools to consume.
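
For illustration (again, not part of the commit message itself), the
synthesized samples are consumed through the usual reporting path and can
be tuned with the --itrace option from the auxtrace infrastructure, e.g.:

    perf script --itrace=i100us    # instruction samples with a 100us period
    perf report --itrace=ibxe      # instructions, branches, transactions, errors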

Signed-off-by: Adrian Hunter <adrian.hunter@intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Link: http://lkml.kernel.org/r/1437150840-31811-7-git-send-email-adrian.hunter@intel.com
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
tools/perf/arch/x86/util/Build
tools/perf/arch/x86/util/intel-pt.c [new file with mode: 0644]
tools/perf/util/Build
tools/perf/util/intel-pt.c [new file with mode: 0644]
tools/perf/util/intel-pt.h [new file with mode: 0644]

index cfbccc4e31874bc049ede79763255eace0e4da8a..1396088788883e64439e0516558aecb99cea62e3 100644
@@ -6,3 +6,5 @@ libperf-$(CONFIG_DWARF) += dwarf-regs.o
 
 libperf-$(CONFIG_LIBUNWIND)          += unwind-libunwind.o
 libperf-$(CONFIG_LIBDW_DWARF_UNWIND) += unwind-libdw.o
+
+libperf-$(CONFIG_AUXTRACE) += intel-pt.o
diff --git a/tools/perf/arch/x86/util/intel-pt.c b/tools/perf/arch/x86/util/intel-pt.c
new file mode 100644
index 0000000..da7d2c1
--- /dev/null
+++ b/tools/perf/arch/x86/util/intel-pt.c
@@ -0,0 +1,752 @@
+/*
+ * intel_pt.c: Intel Processor Trace support
+ * Copyright (c) 2013-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <stdbool.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+#include <linux/bitops.h>
+#include <linux/log2.h>
+
+#include "../../perf.h"
+#include "../../util/session.h"
+#include "../../util/event.h"
+#include "../../util/evlist.h"
+#include "../../util/evsel.h"
+#include "../../util/cpumap.h"
+#include "../../util/parse-options.h"
+#include "../../util/parse-events.h"
+#include "../../util/pmu.h"
+#include "../../util/debug.h"
+#include "../../util/auxtrace.h"
+#include "../../util/tsc.h"
+#include "../../util/intel-pt.h"
+
+#define KiB(x) ((x) * 1024)
+#define MiB(x) ((x) * 1024 * 1024)
+#define KiB_MASK(x) (KiB(x) - 1)
+#define MiB_MASK(x) (MiB(x) - 1)
+
+#define INTEL_PT_DEFAULT_SAMPLE_SIZE   KiB(4)
+
+#define INTEL_PT_MAX_SAMPLE_SIZE       KiB(60)
+
+#define INTEL_PT_PSB_PERIOD_NEAR       256
+
+struct intel_pt_snapshot_ref {
+       void *ref_buf;
+       size_t ref_offset;
+       bool wrapped;
+};
+
+struct intel_pt_recording {
+       struct auxtrace_record          itr;
+       struct perf_pmu                 *intel_pt_pmu;
+       int                             have_sched_switch;
+       struct perf_evlist              *evlist;
+       bool                            snapshot_mode;
+       bool                            snapshot_init_done;
+       size_t                          snapshot_size;
+       size_t                          snapshot_ref_buf_size;
+       int                             snapshot_ref_cnt;
+       struct intel_pt_snapshot_ref    *snapshot_refs;
+};
+
+static int intel_pt_parse_terms_with_default(struct list_head *formats,
+                                            const char *str,
+                                            u64 *config)
+{
+       struct list_head *terms;
+       struct perf_event_attr attr = { .size = 0, };
+       int err;
+
+       terms = malloc(sizeof(struct list_head));
+       if (!terms)
+               return -ENOMEM;
+
+       INIT_LIST_HEAD(terms);
+
+       err = parse_events_terms(terms, str);
+       if (err)
+               goto out_free;
+
+       attr.config = *config;
+       err = perf_pmu__config_terms(formats, &attr, terms, true, NULL);
+       if (err)
+               goto out_free;
+
+       *config = attr.config;
+out_free:
+       parse_events__free_terms(terms);
+       return err;
+}
+
+static int intel_pt_parse_terms(struct list_head *formats, const char *str,
+                               u64 *config)
+{
+       *config = 0;
+       return intel_pt_parse_terms_with_default(formats, str, config);
+}
+
+static size_t intel_pt_psb_period(struct perf_pmu *intel_pt_pmu __maybe_unused,
+                                 struct perf_evlist *evlist __maybe_unused)
+{
+       return 256;
+}
+
+static u64 intel_pt_default_config(struct perf_pmu *intel_pt_pmu)
+{
+       u64 config;
+
+       intel_pt_parse_terms(&intel_pt_pmu->format, "tsc", &config);
+       return config;
+}
+
+static int intel_pt_parse_snapshot_options(struct auxtrace_record *itr,
+                                          struct record_opts *opts,
+                                          const char *str)
+{
+       struct intel_pt_recording *ptr =
+                       container_of(itr, struct intel_pt_recording, itr);
+       unsigned long long snapshot_size = 0;
+       char *endptr;
+
+       if (str) {
+               snapshot_size = strtoull(str, &endptr, 0);
+               if (*endptr || snapshot_size > SIZE_MAX)
+                       return -1;
+       }
+
+       opts->auxtrace_snapshot_mode = true;
+       opts->auxtrace_snapshot_size = snapshot_size;
+
+       ptr->snapshot_size = snapshot_size;
+
+       return 0;
+}
+
+struct perf_event_attr *
+intel_pt_pmu_default_config(struct perf_pmu *intel_pt_pmu)
+{
+       struct perf_event_attr *attr;
+
+       attr = zalloc(sizeof(struct perf_event_attr));
+       if (!attr)
+               return NULL;
+
+       attr->config = intel_pt_default_config(intel_pt_pmu);
+
+       intel_pt_pmu->selectable = true;
+
+       return attr;
+}
+
+static size_t intel_pt_info_priv_size(struct auxtrace_record *itr __maybe_unused)
+{
+       return INTEL_PT_AUXTRACE_PRIV_SIZE;
+}
+
+static int intel_pt_info_fill(struct auxtrace_record *itr,
+                             struct perf_session *session,
+                             struct auxtrace_info_event *auxtrace_info,
+                             size_t priv_size)
+{
+       struct intel_pt_recording *ptr =
+                       container_of(itr, struct intel_pt_recording, itr);
+       struct perf_pmu *intel_pt_pmu = ptr->intel_pt_pmu;
+       struct perf_event_mmap_page *pc;
+       struct perf_tsc_conversion tc = { .time_mult = 0, };
+       bool cap_user_time_zero = false, per_cpu_mmaps;
+       u64 tsc_bit, noretcomp_bit;
+       int err;
+
+       if (priv_size != INTEL_PT_AUXTRACE_PRIV_SIZE)
+               return -EINVAL;
+
+       intel_pt_parse_terms(&intel_pt_pmu->format, "tsc", &tsc_bit);
+       intel_pt_parse_terms(&intel_pt_pmu->format, "noretcomp",
+                            &noretcomp_bit);
+
+       if (!session->evlist->nr_mmaps)
+               return -EINVAL;
+
+       pc = session->evlist->mmap[0].base;
+       if (pc) {
+               err = perf_read_tsc_conversion(pc, &tc);
+               if (err) {
+                       if (err != -EOPNOTSUPP)
+                               return err;
+               } else {
+                       cap_user_time_zero = tc.time_mult != 0;
+               }
+               if (!cap_user_time_zero)
+                       ui__warning("Intel Processor Trace: TSC not available\n");
+       }
+
+       per_cpu_mmaps = !cpu_map__empty(session->evlist->cpus);
+
+       auxtrace_info->type = PERF_AUXTRACE_INTEL_PT;
+       auxtrace_info->priv[INTEL_PT_PMU_TYPE] = intel_pt_pmu->type;
+       auxtrace_info->priv[INTEL_PT_TIME_SHIFT] = tc.time_shift;
+       auxtrace_info->priv[INTEL_PT_TIME_MULT] = tc.time_mult;
+       auxtrace_info->priv[INTEL_PT_TIME_ZERO] = tc.time_zero;
+       auxtrace_info->priv[INTEL_PT_CAP_USER_TIME_ZERO] = cap_user_time_zero;
+       auxtrace_info->priv[INTEL_PT_TSC_BIT] = tsc_bit;
+       auxtrace_info->priv[INTEL_PT_NORETCOMP_BIT] = noretcomp_bit;
+       auxtrace_info->priv[INTEL_PT_HAVE_SCHED_SWITCH] = ptr->have_sched_switch;
+       auxtrace_info->priv[INTEL_PT_SNAPSHOT_MODE] = ptr->snapshot_mode;
+       auxtrace_info->priv[INTEL_PT_PER_CPU_MMAPS] = per_cpu_mmaps;
+
+       return 0;
+}
+
+static int intel_pt_track_switches(struct perf_evlist *evlist)
+{
+       const char *sched_switch = "sched:sched_switch";
+       struct perf_evsel *evsel;
+       int err;
+
+       if (!perf_evlist__can_select_event(evlist, sched_switch))
+               return -EPERM;
+
+       err = parse_events(evlist, sched_switch, NULL);
+       if (err) {
+               pr_debug2("%s: failed to parse %s, error %d\n",
+                         __func__, sched_switch, err);
+               return err;
+       }
+
+       evsel = perf_evlist__last(evlist);
+
+       perf_evsel__set_sample_bit(evsel, CPU);
+       perf_evsel__set_sample_bit(evsel, TIME);
+
+       evsel->system_wide = true;
+       evsel->no_aux_samples = true;
+       evsel->immediate = true;
+
+       return 0;
+}
+
+static int intel_pt_recording_options(struct auxtrace_record *itr,
+                                     struct perf_evlist *evlist,
+                                     struct record_opts *opts)
+{
+       struct intel_pt_recording *ptr =
+                       container_of(itr, struct intel_pt_recording, itr);
+       struct perf_pmu *intel_pt_pmu = ptr->intel_pt_pmu;
+       bool have_timing_info;
+       struct perf_evsel *evsel, *intel_pt_evsel = NULL;
+       const struct cpu_map *cpus = evlist->cpus;
+       bool privileged = geteuid() == 0 || perf_event_paranoid() < 0;
+       u64 tsc_bit;
+
+       ptr->evlist = evlist;
+       ptr->snapshot_mode = opts->auxtrace_snapshot_mode;
+
+       evlist__for_each(evlist, evsel) {
+               if (evsel->attr.type == intel_pt_pmu->type) {
+                       if (intel_pt_evsel) {
+                               pr_err("There may be only one " INTEL_PT_PMU_NAME " event\n");
+                               return -EINVAL;
+                       }
+                       evsel->attr.freq = 0;
+                       evsel->attr.sample_period = 1;
+                       intel_pt_evsel = evsel;
+                       opts->full_auxtrace = true;
+               }
+       }
+
+       if (opts->auxtrace_snapshot_mode && !opts->full_auxtrace) {
+               pr_err("Snapshot mode (-S option) requires " INTEL_PT_PMU_NAME " PMU event (-e " INTEL_PT_PMU_NAME ")\n");
+               return -EINVAL;
+       }
+
+       if (opts->use_clockid) {
+               pr_err("Cannot use clockid (-k option) with " INTEL_PT_PMU_NAME "\n");
+               return -EINVAL;
+       }
+
+       if (!opts->full_auxtrace)
+               return 0;
+
+       /* Set default sizes for snapshot mode */
+       if (opts->auxtrace_snapshot_mode) {
+               size_t psb_period = intel_pt_psb_period(intel_pt_pmu, evlist);
+
+               if (!opts->auxtrace_snapshot_size && !opts->auxtrace_mmap_pages) {
+                       if (privileged) {
+                               opts->auxtrace_mmap_pages = MiB(4) / page_size;
+                       } else {
+                               opts->auxtrace_mmap_pages = KiB(128) / page_size;
+                               if (opts->mmap_pages == UINT_MAX)
+                                       opts->mmap_pages = KiB(256) / page_size;
+                       }
+               } else if (!opts->auxtrace_mmap_pages && !privileged &&
+                          opts->mmap_pages == UINT_MAX) {
+                       opts->mmap_pages = KiB(256) / page_size;
+               }
+               if (!opts->auxtrace_snapshot_size)
+                       opts->auxtrace_snapshot_size =
+                               opts->auxtrace_mmap_pages * (size_t)page_size;
+               if (!opts->auxtrace_mmap_pages) {
+                       size_t sz = opts->auxtrace_snapshot_size;
+
+                       sz = round_up(sz, page_size) / page_size;
+                       opts->auxtrace_mmap_pages = roundup_pow_of_two(sz);
+               }
+               if (opts->auxtrace_snapshot_size >
+                               opts->auxtrace_mmap_pages * (size_t)page_size) {
+                       pr_err("Snapshot size %zu must not be greater than AUX area tracing mmap size %zu\n",
+                              opts->auxtrace_snapshot_size,
+                              opts->auxtrace_mmap_pages * (size_t)page_size);
+                       return -EINVAL;
+               }
+               if (!opts->auxtrace_snapshot_size || !opts->auxtrace_mmap_pages) {
+                       pr_err("Failed to calculate default snapshot size and/or AUX area tracing mmap pages\n");
+                       return -EINVAL;
+               }
+               pr_debug2("Intel PT snapshot size: %zu\n",
+                         opts->auxtrace_snapshot_size);
+               if (psb_period &&
+                   opts->auxtrace_snapshot_size <= psb_period +
+                                                 INTEL_PT_PSB_PERIOD_NEAR)
+                       ui__warning("Intel PT snapshot size (%zu) may be too small for PSB period (%zu)\n",
+                                   opts->auxtrace_snapshot_size, psb_period);
+       }
+
+       /* Set default sizes for full trace mode */
+       if (opts->full_auxtrace && !opts->auxtrace_mmap_pages) {
+               if (privileged) {
+                       opts->auxtrace_mmap_pages = MiB(4) / page_size;
+               } else {
+                       opts->auxtrace_mmap_pages = KiB(128) / page_size;
+                       if (opts->mmap_pages == UINT_MAX)
+                               opts->mmap_pages = KiB(256) / page_size;
+               }
+       }
+
+       /* Validate auxtrace_mmap_pages */
+       if (opts->auxtrace_mmap_pages) {
+               size_t sz = opts->auxtrace_mmap_pages * (size_t)page_size;
+               size_t min_sz;
+
+               if (opts->auxtrace_snapshot_mode)
+                       min_sz = KiB(4);
+               else
+                       min_sz = KiB(8);
+
+               if (sz < min_sz || !is_power_of_2(sz)) {
+                       pr_err("Invalid mmap size for Intel Processor Trace: must be at least %zuKiB and a power of 2\n",
+                              min_sz / 1024);
+                       return -EINVAL;
+               }
+       }
+
+       intel_pt_parse_terms(&intel_pt_pmu->format, "tsc", &tsc_bit);
+
+       if (opts->full_auxtrace && (intel_pt_evsel->attr.config & tsc_bit))
+               have_timing_info = true;
+       else
+               have_timing_info = false;
+
+       /*
+        * Per-cpu recording needs sched_switch events to distinguish different
+        * threads.
+        */
+       if (have_timing_info && !cpu_map__empty(cpus)) {
+               int err;
+
+               err = intel_pt_track_switches(evlist);
+               if (err == -EPERM)
+                       pr_debug2("Unable to select sched:sched_switch\n");
+               else if (err)
+                       return err;
+               else
+                       ptr->have_sched_switch = 1;
+       }
+
+       if (intel_pt_evsel) {
+               /*
+                * To obtain the auxtrace buffer file descriptor, the auxtrace
+                * event must come first.
+                */
+               perf_evlist__to_front(evlist, intel_pt_evsel);
+               /*
+                * In the case of per-cpu mmaps, we need the CPU on the
+                * AUX event.
+                */
+               if (!cpu_map__empty(cpus))
+                       perf_evsel__set_sample_bit(intel_pt_evsel, CPU);
+       }
+
+       /* Add dummy event to keep tracking */
+       if (opts->full_auxtrace) {
+               struct perf_evsel *tracking_evsel;
+               int err;
+
+               err = parse_events(evlist, "dummy:u", NULL);
+               if (err)
+                       return err;
+
+               tracking_evsel = perf_evlist__last(evlist);
+
+               perf_evlist__set_tracking_event(evlist, tracking_evsel);
+
+               tracking_evsel->attr.freq = 0;
+               tracking_evsel->attr.sample_period = 1;
+
+               /* In per-cpu case, always need the time of mmap events etc */
+               if (!cpu_map__empty(cpus))
+                       perf_evsel__set_sample_bit(tracking_evsel, TIME);
+       }
+
+       /*
+        * Warn the user when we do not have enough information to decode i.e.
+        * per-cpu with no sched_switch (except workload-only).
+        */
+       if (!ptr->have_sched_switch && !cpu_map__empty(cpus) &&
+           !target__none(&opts->target))
+               ui__warning("Intel Processor Trace decoding will not be possible except for kernel tracing!\n");
+
+       return 0;
+}
+
+static int intel_pt_snapshot_start(struct auxtrace_record *itr)
+{
+       struct intel_pt_recording *ptr =
+                       container_of(itr, struct intel_pt_recording, itr);
+       struct perf_evsel *evsel;
+
+       evlist__for_each(ptr->evlist, evsel) {
+               if (evsel->attr.type == ptr->intel_pt_pmu->type)
+                       return perf_evlist__disable_event(ptr->evlist, evsel);
+       }
+       return -EINVAL;
+}
+
+static int intel_pt_snapshot_finish(struct auxtrace_record *itr)
+{
+       struct intel_pt_recording *ptr =
+                       container_of(itr, struct intel_pt_recording, itr);
+       struct perf_evsel *evsel;
+
+       evlist__for_each(ptr->evlist, evsel) {
+               if (evsel->attr.type == ptr->intel_pt_pmu->type)
+                       return perf_evlist__enable_event(ptr->evlist, evsel);
+       }
+       return -EINVAL;
+}
+
+static int intel_pt_alloc_snapshot_refs(struct intel_pt_recording *ptr, int idx)
+{
+       const size_t sz = sizeof(struct intel_pt_snapshot_ref);
+       int cnt = ptr->snapshot_ref_cnt, new_cnt = cnt * 2;
+       struct intel_pt_snapshot_ref *refs;
+
+       if (!new_cnt)
+               new_cnt = 16;
+
+       while (new_cnt <= idx)
+               new_cnt *= 2;
+
+       refs = calloc(new_cnt, sz);
+       if (!refs)
+               return -ENOMEM;
+
+       memcpy(refs, ptr->snapshot_refs, cnt * sz);
+
+       ptr->snapshot_refs = refs;
+       ptr->snapshot_ref_cnt = new_cnt;
+
+       return 0;
+}
+
+static void intel_pt_free_snapshot_refs(struct intel_pt_recording *ptr)
+{
+       int i;
+
+       for (i = 0; i < ptr->snapshot_ref_cnt; i++)
+               zfree(&ptr->snapshot_refs[i].ref_buf);
+       zfree(&ptr->snapshot_refs);
+}
+
+static void intel_pt_recording_free(struct auxtrace_record *itr)
+{
+       struct intel_pt_recording *ptr =
+                       container_of(itr, struct intel_pt_recording, itr);
+
+       intel_pt_free_snapshot_refs(ptr);
+       free(ptr);
+}
+
+static int intel_pt_alloc_snapshot_ref(struct intel_pt_recording *ptr, int idx,
+                                      size_t snapshot_buf_size)
+{
+       size_t ref_buf_size = ptr->snapshot_ref_buf_size;
+       void *ref_buf;
+
+       ref_buf = zalloc(ref_buf_size);
+       if (!ref_buf)
+               return -ENOMEM;
+
+       ptr->snapshot_refs[idx].ref_buf = ref_buf;
+       ptr->snapshot_refs[idx].ref_offset = snapshot_buf_size - ref_buf_size;
+
+       return 0;
+}
+
+static size_t intel_pt_snapshot_ref_buf_size(struct intel_pt_recording *ptr,
+                                            size_t snapshot_buf_size)
+{
+       const size_t max_size = 256 * 1024;
+       size_t buf_size = 0, psb_period;
+
+       if (ptr->snapshot_size <= 64 * 1024)
+               return 0;
+
+       psb_period = intel_pt_psb_period(ptr->intel_pt_pmu, ptr->evlist);
+       if (psb_period)
+               buf_size = psb_period * 2;
+
+       if (!buf_size || buf_size > max_size)
+               buf_size = max_size;
+
+       if (buf_size >= snapshot_buf_size)
+               return 0;
+
+       if (buf_size >= ptr->snapshot_size / 2)
+               return 0;
+
+       return buf_size;
+}
+
+static int intel_pt_snapshot_init(struct intel_pt_recording *ptr,
+                                 size_t snapshot_buf_size)
+{
+       if (ptr->snapshot_init_done)
+               return 0;
+
+       ptr->snapshot_init_done = true;
+
+       ptr->snapshot_ref_buf_size = intel_pt_snapshot_ref_buf_size(ptr,
+                                                       snapshot_buf_size);
+
+       return 0;
+}
+
+/**
+ * intel_pt_compare_buffers - compare bytes in a buffer to a circular buffer.
+ * @buf1: first buffer
+ * @compare_size: number of bytes to compare
+ * @buf2: second buffer (a circular buffer)
+ * @offs2: offset in second buffer
+ * @buf2_size: size of second buffer
+ *
+ * The comparison allows for the possibility that the bytes to compare in the
+ * circular buffer are not contiguous.  It is assumed that @compare_size <=
+ * @buf2_size.  This function returns %false if the bytes are identical, %true
+ * otherwise.
+ */
+static bool intel_pt_compare_buffers(void *buf1, size_t compare_size,
+                                    void *buf2, size_t offs2, size_t buf2_size)
+{
+       size_t end2 = offs2 + compare_size, part_size;
+
+       if (end2 <= buf2_size)
+               return memcmp(buf1, buf2 + offs2, compare_size);
+
+       part_size = end2 - buf2_size;
+       if (memcmp(buf1, buf2 + offs2, part_size))
+               return true;
+
+       compare_size -= part_size;
+
+       return memcmp(buf1 + part_size, buf2, compare_size);
+}
+
+static bool intel_pt_compare_ref(void *ref_buf, size_t ref_offset,
+                                size_t ref_size, size_t buf_size,
+                                void *data, size_t head)
+{
+       size_t ref_end = ref_offset + ref_size;
+
+       if (ref_end > buf_size) {
+               if (head > ref_offset || head < ref_end - buf_size)
+                       return true;
+       } else if (head > ref_offset && head < ref_end) {
+               return true;
+       }
+
+       return intel_pt_compare_buffers(ref_buf, ref_size, data, ref_offset,
+                                       buf_size);
+}
+
+static void intel_pt_copy_ref(void *ref_buf, size_t ref_size, size_t buf_size,
+                             void *data, size_t head)
+{
+       if (head >= ref_size) {
+               memcpy(ref_buf, data + head - ref_size, ref_size);
+       } else {
+               memcpy(ref_buf, data, head);
+               ref_size -= head;
+               memcpy(ref_buf + head, data + buf_size - ref_size, ref_size);
+       }
+}
+
+static bool intel_pt_wrapped(struct intel_pt_recording *ptr, int idx,
+                            struct auxtrace_mmap *mm, unsigned char *data,
+                            u64 head)
+{
+       struct intel_pt_snapshot_ref *ref = &ptr->snapshot_refs[idx];
+       bool wrapped;
+
+       wrapped = intel_pt_compare_ref(ref->ref_buf, ref->ref_offset,
+                                      ptr->snapshot_ref_buf_size, mm->len,
+                                      data, head);
+
+       intel_pt_copy_ref(ref->ref_buf, ptr->snapshot_ref_buf_size, mm->len,
+                         data, head);
+
+       return wrapped;
+}
+
+static bool intel_pt_first_wrap(u64 *data, size_t buf_size)
+{
+       int i, a, b;
+
+       b = buf_size >> 3;
+       a = b - 512;
+       if (a < 0)
+               a = 0;
+
+       for (i = a; i < b; i++) {
+               if (data[i])
+                       return true;
+       }
+
+       return false;
+}
+
+static int intel_pt_find_snapshot(struct auxtrace_record *itr, int idx,
+                                 struct auxtrace_mmap *mm, unsigned char *data,
+                                 u64 *head, u64 *old)
+{
+       struct intel_pt_recording *ptr =
+                       container_of(itr, struct intel_pt_recording, itr);
+       bool wrapped;
+       int err;
+
+       pr_debug3("%s: mmap index %d old head %zu new head %zu\n",
+                 __func__, idx, (size_t)*old, (size_t)*head);
+
+       err = intel_pt_snapshot_init(ptr, mm->len);
+       if (err)
+               goto out_err;
+
+       if (idx >= ptr->snapshot_ref_cnt) {
+               err = intel_pt_alloc_snapshot_refs(ptr, idx);
+               if (err)
+                       goto out_err;
+       }
+
+       if (ptr->snapshot_ref_buf_size) {
+               if (!ptr->snapshot_refs[idx].ref_buf) {
+                       err = intel_pt_alloc_snapshot_ref(ptr, idx, mm->len);
+                       if (err)
+                               goto out_err;
+               }
+               wrapped = intel_pt_wrapped(ptr, idx, mm, data, *head);
+       } else {
+               wrapped = ptr->snapshot_refs[idx].wrapped;
+               if (!wrapped && intel_pt_first_wrap((u64 *)data, mm->len)) {
+                       ptr->snapshot_refs[idx].wrapped = true;
+                       wrapped = true;
+               }
+       }
+
+       /*
+        * In full trace mode 'head' continually increases.  However in snapshot
+        * mode 'head' is an offset within the buffer.  Here 'old' and 'head'
+        * are adjusted to match the full trace case which expects that 'old' is
+        * always less than 'head'.
+        */
+       if (wrapped) {
+               *old = *head;
+               *head += mm->len;
+       } else {
+               if (mm->mask)
+                       *old &= mm->mask;
+               else
+                       *old %= mm->len;
+               if (*old > *head)
+                       *head += mm->len;
+       }
+
+       pr_debug3("%s: wrap-around %sdetected, adjusted old head %zu adjusted new head %zu\n",
+                 __func__, wrapped ? "" : "not ", (size_t)*old, (size_t)*head);
+
+       return 0;
+
+out_err:
+       pr_err("%s: failed, error %d\n", __func__, err);
+       return err;
+}
+
+static u64 intel_pt_reference(struct auxtrace_record *itr __maybe_unused)
+{
+       return rdtsc();
+}
+
+static int intel_pt_read_finish(struct auxtrace_record *itr, int idx)
+{
+       struct intel_pt_recording *ptr =
+                       container_of(itr, struct intel_pt_recording, itr);
+       struct perf_evsel *evsel;
+
+       evlist__for_each(ptr->evlist, evsel) {
+               if (evsel->attr.type == ptr->intel_pt_pmu->type)
+                       return perf_evlist__enable_event_idx(ptr->evlist, evsel,
+                                                            idx);
+       }
+       return -EINVAL;
+}
+
+struct auxtrace_record *intel_pt_recording_init(int *err)
+{
+       struct perf_pmu *intel_pt_pmu = perf_pmu__find(INTEL_PT_PMU_NAME);
+       struct intel_pt_recording *ptr;
+
+       if (!intel_pt_pmu)
+               return NULL;
+
+       ptr = zalloc(sizeof(struct intel_pt_recording));
+       if (!ptr) {
+               *err = -ENOMEM;
+               return NULL;
+       }
+
+       ptr->intel_pt_pmu = intel_pt_pmu;
+       ptr->itr.recording_options = intel_pt_recording_options;
+       ptr->itr.info_priv_size = intel_pt_info_priv_size;
+       ptr->itr.info_fill = intel_pt_info_fill;
+       ptr->itr.free = intel_pt_recording_free;
+       ptr->itr.snapshot_start = intel_pt_snapshot_start;
+       ptr->itr.snapshot_finish = intel_pt_snapshot_finish;
+       ptr->itr.find_snapshot = intel_pt_find_snapshot;
+       ptr->itr.parse_snapshot_options = intel_pt_parse_snapshot_options;
+       ptr->itr.reference = intel_pt_reference;
+       ptr->itr.read_finish = intel_pt_read_finish;
+       return &ptr->itr;
+}
index 615ca12c2e44980227b0021cec9ab63b8f603e8a..c20473d1369e36b6f1e5cb307b950f62ebc20177 100644
@@ -79,6 +79,7 @@ libperf-y += cloexec.o
 libperf-y += thread-stack.o
 libperf-$(CONFIG_AUXTRACE) += auxtrace.o
 libperf-$(CONFIG_AUXTRACE) += intel-pt-decoder/
+libperf-$(CONFIG_AUXTRACE) += intel-pt.o
 libperf-y += parse-branch-options.o
 
 libperf-$(CONFIG_LIBELF) += symbol-elf.o
diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c
new file mode 100644
index 0000000..2a4a412
--- /dev/null
+++ b/tools/perf/util/intel-pt.c
@@ -0,0 +1,1911 @@
+/*
+ * intel_pt.c: Intel Processor Trace support
+ * Copyright (c) 2013-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <stdio.h>
+#include <stdbool.h>
+#include <errno.h>
+#include <linux/kernel.h>
+#include <linux/types.h>
+
+#include "../perf.h"
+#include "session.h"
+#include "machine.h"
+#include "tool.h"
+#include "event.h"
+#include "evlist.h"
+#include "evsel.h"
+#include "map.h"
+#include "color.h"
+#include "util.h"
+#include "thread.h"
+#include "thread-stack.h"
+#include "symbol.h"
+#include "callchain.h"
+#include "dso.h"
+#include "debug.h"
+#include "auxtrace.h"
+#include "tsc.h"
+#include "intel-pt.h"
+
+#include "intel-pt-decoder/intel-pt-log.h"
+#include "intel-pt-decoder/intel-pt-decoder.h"
+#include "intel-pt-decoder/intel-pt-insn-decoder.h"
+#include "intel-pt-decoder/intel-pt-pkt-decoder.h"
+
+#define MAX_TIMESTAMP (~0ULL)
+
+struct intel_pt {
+       struct auxtrace auxtrace;
+       struct auxtrace_queues queues;
+       struct auxtrace_heap heap;
+       u32 auxtrace_type;
+       struct perf_session *session;
+       struct machine *machine;
+       struct perf_evsel *switch_evsel;
+       struct thread *unknown_thread;
+       bool timeless_decoding;
+       bool sampling_mode;
+       bool snapshot_mode;
+       bool per_cpu_mmaps;
+       bool have_tsc;
+       bool data_queued;
+       bool est_tsc;
+       bool sync_switch;
+       int have_sched_switch;
+       u32 pmu_type;
+       u64 kernel_start;
+       u64 switch_ip;
+       u64 ptss_ip;
+
+       struct perf_tsc_conversion tc;
+       bool cap_user_time_zero;
+
+       struct itrace_synth_opts synth_opts;
+
+       bool sample_instructions;
+       u64 instructions_sample_type;
+       u64 instructions_sample_period;
+       u64 instructions_id;
+
+       bool sample_branches;
+       u32 branches_filter;
+       u64 branches_sample_type;
+       u64 branches_id;
+
+       bool sample_transactions;
+       u64 transactions_sample_type;
+       u64 transactions_id;
+
+       bool synth_needs_swap;
+
+       u64 tsc_bit;
+       u64 noretcomp_bit;
+       unsigned max_non_turbo_ratio;
+};
+
+enum switch_state {
+       INTEL_PT_SS_NOT_TRACING,
+       INTEL_PT_SS_UNKNOWN,
+       INTEL_PT_SS_TRACING,
+       INTEL_PT_SS_EXPECTING_SWITCH_EVENT,
+       INTEL_PT_SS_EXPECTING_SWITCH_IP,
+};
+
+struct intel_pt_queue {
+       struct intel_pt *pt;
+       unsigned int queue_nr;
+       struct auxtrace_buffer *buffer;
+       void *decoder;
+       const struct intel_pt_state *state;
+       struct ip_callchain *chain;
+       union perf_event *event_buf;
+       bool on_heap;
+       bool stop;
+       bool step_through_buffers;
+       bool use_buffer_pid_tid;
+       pid_t pid, tid;
+       int cpu;
+       int switch_state;
+       pid_t next_tid;
+       struct thread *thread;
+       bool exclude_kernel;
+       bool have_sample;
+       u64 time;
+       u64 timestamp;
+       u32 flags;
+       u16 insn_len;
+};
+
+static void intel_pt_dump(struct intel_pt *pt __maybe_unused,
+                         unsigned char *buf, size_t len)
+{
+       struct intel_pt_pkt packet;
+       size_t pos = 0;
+       int ret, pkt_len, i;
+       char desc[INTEL_PT_PKT_DESC_MAX];
+       const char *color = PERF_COLOR_BLUE;
+
+       color_fprintf(stdout, color,
+                     ". ... Intel Processor Trace data: size %zu bytes\n",
+                     len);
+
+       while (len) {
+               ret = intel_pt_get_packet(buf, len, &packet);
+               if (ret > 0)
+                       pkt_len = ret;
+               else
+                       pkt_len = 1;
+               printf(".");
+               color_fprintf(stdout, color, "  %08x: ", pos);
+               for (i = 0; i < pkt_len; i++)
+                       color_fprintf(stdout, color, " %02x", buf[i]);
+               for (; i < 16; i++)
+                       color_fprintf(stdout, color, "   ");
+               if (ret > 0) {
+                       ret = intel_pt_pkt_desc(&packet, desc,
+                                               INTEL_PT_PKT_DESC_MAX);
+                       if (ret > 0)
+                               color_fprintf(stdout, color, " %s\n", desc);
+               } else {
+                       color_fprintf(stdout, color, " Bad packet!\n");
+               }
+               pos += pkt_len;
+               buf += pkt_len;
+               len -= pkt_len;
+       }
+}
+
+static void intel_pt_dump_event(struct intel_pt *pt, unsigned char *buf,
+                               size_t len)
+{
+       printf(".\n");
+       intel_pt_dump(pt, buf, len);
+}
+
+static int intel_pt_do_fix_overlap(struct intel_pt *pt, struct auxtrace_buffer *a,
+                                  struct auxtrace_buffer *b)
+{
+       void *start;
+
+       start = intel_pt_find_overlap(a->data, a->size, b->data, b->size,
+                                     pt->have_tsc);
+       if (!start)
+               return -EINVAL;
+       b->use_size = b->data + b->size - start;
+       b->use_data = start;
+       return 0;
+}
+
+static void intel_pt_use_buffer_pid_tid(struct intel_pt_queue *ptq,
+                                       struct auxtrace_queue *queue,
+                                       struct auxtrace_buffer *buffer)
+{
+       if (queue->cpu == -1 && buffer->cpu != -1)
+               ptq->cpu = buffer->cpu;
+
+       ptq->pid = buffer->pid;
+       ptq->tid = buffer->tid;
+
+       intel_pt_log("queue %u cpu %d pid %d tid %d\n",
+                    ptq->queue_nr, ptq->cpu, ptq->pid, ptq->tid);
+
+       thread__zput(ptq->thread);
+
+       if (ptq->tid != -1) {
+               if (ptq->pid != -1)
+                       ptq->thread = machine__findnew_thread(ptq->pt->machine,
+                                                             ptq->pid,
+                                                             ptq->tid);
+               else
+                       ptq->thread = machine__find_thread(ptq->pt->machine, -1,
+                                                          ptq->tid);
+       }
+}
+
+/* This function assumes data is processed sequentially only */
+static int intel_pt_get_trace(struct intel_pt_buffer *b, void *data)
+{
+       struct intel_pt_queue *ptq = data;
+       struct auxtrace_buffer *buffer = ptq->buffer, *old_buffer = buffer;
+       struct auxtrace_queue *queue;
+
+       if (ptq->stop) {
+               b->len = 0;
+               return 0;
+       }
+
+       queue = &ptq->pt->queues.queue_array[ptq->queue_nr];
+
+       buffer = auxtrace_buffer__next(queue, buffer);
+       if (!buffer) {
+               if (old_buffer)
+                       auxtrace_buffer__drop_data(old_buffer);
+               b->len = 0;
+               return 0;
+       }
+
+       ptq->buffer = buffer;
+
+       if (!buffer->data) {
+               int fd = perf_data_file__fd(ptq->pt->session->file);
+
+               buffer->data = auxtrace_buffer__get_data(buffer, fd);
+               if (!buffer->data)
+                       return -ENOMEM;
+       }
+
+       if (ptq->pt->snapshot_mode && !buffer->consecutive && old_buffer &&
+           intel_pt_do_fix_overlap(ptq->pt, old_buffer, buffer))
+               return -ENOMEM;
+
+       if (old_buffer)
+               auxtrace_buffer__drop_data(old_buffer);
+
+       if (buffer->use_data) {
+               b->len = buffer->use_size;
+               b->buf = buffer->use_data;
+       } else {
+               b->len = buffer->size;
+               b->buf = buffer->data;
+       }
+       b->ref_timestamp = buffer->reference;
+
+       if (!old_buffer || ptq->pt->sampling_mode || (ptq->pt->snapshot_mode &&
+                                                     !buffer->consecutive)) {
+               b->consecutive = false;
+               b->trace_nr = buffer->buffer_nr + 1;
+       } else {
+               b->consecutive = true;
+       }
+
+       if (ptq->use_buffer_pid_tid && (ptq->pid != buffer->pid ||
+                                       ptq->tid != buffer->tid))
+               intel_pt_use_buffer_pid_tid(ptq, queue, buffer);
+
+       if (ptq->step_through_buffers)
+               ptq->stop = true;
+
+       if (!b->len)
+               return intel_pt_get_trace(b, data);
+
+       return 0;
+}
+
+struct intel_pt_cache_entry {
+       struct auxtrace_cache_entry     entry;
+       u64                             insn_cnt;
+       u64                             byte_cnt;
+       enum intel_pt_insn_op           op;
+       enum intel_pt_insn_branch       branch;
+       int                             length;
+       int32_t                         rel;
+};
+
+static int intel_pt_config_div(const char *var, const char *value, void *data)
+{
+       int *d = data;
+       long val;
+
+       if (!strcmp(var, "intel-pt.cache-divisor")) {
+               val = strtol(value, NULL, 0);
+               if (val > 0 && val <= INT_MAX)
+                       *d = val;
+       }
+
+       return 0;
+}
+
+static int intel_pt_cache_divisor(void)
+{
+       static int d;
+
+       if (d)
+               return d;
+
+       perf_config(intel_pt_config_div, &d);
+
+       if (!d)
+               d = 64;
+
+       return d;
+}
+
+static unsigned int intel_pt_cache_size(struct dso *dso,
+                                       struct machine *machine)
+{
+       off_t size;
+
+       size = dso__data_size(dso, machine);
+       size /= intel_pt_cache_divisor();
+       if (size < 1000)
+               return 10;
+       if (size > (1 << 21))
+               return 21;
+       return 32 - __builtin_clz(size);
+}
+
+static struct auxtrace_cache *intel_pt_cache(struct dso *dso,
+                                            struct machine *machine)
+{
+       struct auxtrace_cache *c;
+       unsigned int bits;
+
+       if (dso->auxtrace_cache)
+               return dso->auxtrace_cache;
+
+       bits = intel_pt_cache_size(dso, machine);
+
+       /* Ignoring cache creation failure */
+       c = auxtrace_cache__new(bits, sizeof(struct intel_pt_cache_entry), 200);
+
+       dso->auxtrace_cache = c;
+
+       return c;
+}
+
+static int intel_pt_cache_add(struct dso *dso, struct machine *machine,
+                             u64 offset, u64 insn_cnt, u64 byte_cnt,
+                             struct intel_pt_insn *intel_pt_insn)
+{
+       struct auxtrace_cache *c = intel_pt_cache(dso, machine);
+       struct intel_pt_cache_entry *e;
+       int err;
+
+       if (!c)
+               return -ENOMEM;
+
+       e = auxtrace_cache__alloc_entry(c);
+       if (!e)
+               return -ENOMEM;
+
+       e->insn_cnt = insn_cnt;
+       e->byte_cnt = byte_cnt;
+       e->op = intel_pt_insn->op;
+       e->branch = intel_pt_insn->branch;
+       e->length = intel_pt_insn->length;
+       e->rel = intel_pt_insn->rel;
+
+       err = auxtrace_cache__add(c, offset, &e->entry);
+       if (err)
+               auxtrace_cache__free_entry(c, e);
+
+       return err;
+}
+
+static struct intel_pt_cache_entry *
+intel_pt_cache_lookup(struct dso *dso, struct machine *machine, u64 offset)
+{
+       struct auxtrace_cache *c = intel_pt_cache(dso, machine);
+
+       if (!c)
+               return NULL;
+
+       return auxtrace_cache__lookup(dso->auxtrace_cache, offset);
+}
+
+static int intel_pt_walk_next_insn(struct intel_pt_insn *intel_pt_insn,
+                                  uint64_t *insn_cnt_ptr, uint64_t *ip,
+                                  uint64_t to_ip, uint64_t max_insn_cnt,
+                                  void *data)
+{
+       struct intel_pt_queue *ptq = data;
+       struct machine *machine = ptq->pt->machine;
+       struct thread *thread;
+       struct addr_location al;
+       unsigned char buf[1024];
+       size_t bufsz;
+       ssize_t len;
+       int x86_64;
+       u8 cpumode;
+       u64 offset, start_offset, start_ip;
+       u64 insn_cnt = 0;
+       bool one_map = true;
+
+       if (to_ip && *ip == to_ip)
+               goto out_no_cache;
+
+       bufsz = intel_pt_insn_max_size();
+
+       if (*ip >= ptq->pt->kernel_start)
+               cpumode = PERF_RECORD_MISC_KERNEL;
+       else
+               cpumode = PERF_RECORD_MISC_USER;
+
+       thread = ptq->thread;
+       if (!thread) {
+               if (cpumode != PERF_RECORD_MISC_KERNEL)
+                       return -EINVAL;
+               thread = ptq->pt->unknown_thread;
+       }
+
+       while (1) {
+               thread__find_addr_map(thread, cpumode, MAP__FUNCTION, *ip, &al);
+               if (!al.map || !al.map->dso)
+                       return -EINVAL;
+
+               if (al.map->dso->data.status == DSO_DATA_STATUS_ERROR &&
+                   dso__data_status_seen(al.map->dso,
+                                         DSO_DATA_STATUS_SEEN_ITRACE))
+                       return -ENOENT;
+
+               offset = al.map->map_ip(al.map, *ip);
+
+               if (!to_ip && one_map) {
+                       struct intel_pt_cache_entry *e;
+
+                       e = intel_pt_cache_lookup(al.map->dso, machine, offset);
+                       if (e &&
+                           (!max_insn_cnt || e->insn_cnt <= max_insn_cnt)) {
+                               *insn_cnt_ptr = e->insn_cnt;
+                               *ip += e->byte_cnt;
+                               intel_pt_insn->op = e->op;
+                               intel_pt_insn->branch = e->branch;
+                               intel_pt_insn->length = e->length;
+                               intel_pt_insn->rel = e->rel;
+                               intel_pt_log_insn_no_data(intel_pt_insn, *ip);
+                               return 0;
+                       }
+               }
+
+               start_offset = offset;
+               start_ip = *ip;
+
+               /* Load maps to ensure dso->is_64_bit has been updated */
+               map__load(al.map, machine->symbol_filter);
+
+               x86_64 = al.map->dso->is_64_bit;
+
+               while (1) {
+                       len = dso__data_read_offset(al.map->dso, machine,
+                                                   offset, buf, bufsz);
+                       if (len <= 0)
+                               return -EINVAL;
+
+                       if (intel_pt_get_insn(buf, len, x86_64, intel_pt_insn))
+                               return -EINVAL;
+
+                       intel_pt_log_insn(intel_pt_insn, *ip);
+
+                       insn_cnt += 1;
+
+                       if (intel_pt_insn->branch != INTEL_PT_BR_NO_BRANCH)
+                               goto out;
+
+                       if (max_insn_cnt && insn_cnt >= max_insn_cnt)
+                               goto out_no_cache;
+
+                       *ip += intel_pt_insn->length;
+
+                       if (to_ip && *ip == to_ip)
+                               goto out_no_cache;
+
+                       if (*ip >= al.map->end)
+                               break;
+
+                       offset += intel_pt_insn->length;
+               }
+               one_map = false;
+       }
+out:
+       *insn_cnt_ptr = insn_cnt;
+
+       if (!one_map)
+               goto out_no_cache;
+
+       /*
+        * Didn't lookup in the 'to_ip' case, so do it now to prevent duplicate
+        * entries.
+        */
+       if (to_ip) {
+               struct intel_pt_cache_entry *e;
+
+               e = intel_pt_cache_lookup(al.map->dso, machine, start_offset);
+               if (e)
+                       return 0;
+       }
+
+       /* Ignore cache errors */
+       intel_pt_cache_add(al.map->dso, machine, start_offset, insn_cnt,
+                          *ip - start_ip, intel_pt_insn);
+
+       return 0;
+
+out_no_cache:
+       *insn_cnt_ptr = insn_cnt;
+       return 0;
+}
+
+static bool intel_pt_get_config(struct intel_pt *pt,
+                               struct perf_event_attr *attr, u64 *config)
+{
+       if (attr->type == pt->pmu_type) {
+               if (config)
+                       *config = attr->config;
+               return true;
+       }
+
+       return false;
+}
+
+static bool intel_pt_exclude_kernel(struct intel_pt *pt)
+{
+       struct perf_evsel *evsel;
+
+       evlist__for_each(pt->session->evlist, evsel) {
+               if (intel_pt_get_config(pt, &evsel->attr, NULL) &&
+                   !evsel->attr.exclude_kernel)
+                       return false;
+       }
+       return true;
+}
+
+static bool intel_pt_return_compression(struct intel_pt *pt)
+{
+       struct perf_evsel *evsel;
+       u64 config;
+
+       if (!pt->noretcomp_bit)
+               return true;
+
+       evlist__for_each(pt->session->evlist, evsel) {
+               if (intel_pt_get_config(pt, &evsel->attr, &config) &&
+                   (config & pt->noretcomp_bit))
+                       return false;
+       }
+       return true;
+}
+
+static bool intel_pt_timeless_decoding(struct intel_pt *pt)
+{
+       struct perf_evsel *evsel;
+       bool timeless_decoding = true;
+       u64 config;
+
+       if (!pt->tsc_bit || !pt->cap_user_time_zero)
+               return true;
+
+       evlist__for_each(pt->session->evlist, evsel) {
+               if (!(evsel->attr.sample_type & PERF_SAMPLE_TIME))
+                       return true;
+               if (intel_pt_get_config(pt, &evsel->attr, &config)) {
+                       if (config & pt->tsc_bit)
+                               timeless_decoding = false;
+                       else
+                               return true;
+               }
+       }
+       return timeless_decoding;
+}
+
+static bool intel_pt_tracing_kernel(struct intel_pt *pt)
+{
+       struct perf_evsel *evsel;
+
+       evlist__for_each(pt->session->evlist, evsel) {
+               if (intel_pt_get_config(pt, &evsel->attr, NULL) &&
+                   !evsel->attr.exclude_kernel)
+                       return true;
+       }
+       return false;
+}
+
+static bool intel_pt_have_tsc(struct intel_pt *pt)
+{
+       struct perf_evsel *evsel;
+       bool have_tsc = false;
+       u64 config;
+
+       if (!pt->tsc_bit)
+               return false;
+
+       evlist__for_each(pt->session->evlist, evsel) {
+               if (intel_pt_get_config(pt, &evsel->attr, &config)) {
+                       if (config & pt->tsc_bit)
+                               have_tsc = true;
+                       else
+                               return false;
+               }
+       }
+       return have_tsc;
+}
+
+static u64 intel_pt_ns_to_ticks(const struct intel_pt *pt, u64 ns)
+{
+       u64 quot, rem;
+
+       quot = ns / pt->tc.time_mult;
+       rem  = ns % pt->tc.time_mult;
+       return (quot << pt->tc.time_shift) + (rem << pt->tc.time_shift) /
+               pt->tc.time_mult;
+}
+
+static struct intel_pt_queue *intel_pt_alloc_queue(struct intel_pt *pt,
+                                                  unsigned int queue_nr)
+{
+       struct intel_pt_params params = { .get_trace = 0, };
+       struct intel_pt_queue *ptq;
+
+       ptq = zalloc(sizeof(struct intel_pt_queue));
+       if (!ptq)
+               return NULL;
+
+       if (pt->synth_opts.callchain) {
+               size_t sz = sizeof(struct ip_callchain);
+
+               sz += pt->synth_opts.callchain_sz * sizeof(u64);
+               ptq->chain = zalloc(sz);
+               if (!ptq->chain)
+                       goto out_free;
+       }
+
+       ptq->event_buf = malloc(PERF_SAMPLE_MAX_SIZE);
+       if (!ptq->event_buf)
+               goto out_free;
+
+       ptq->pt = pt;
+       ptq->queue_nr = queue_nr;
+       ptq->exclude_kernel = intel_pt_exclude_kernel(pt);
+       ptq->pid = -1;
+       ptq->tid = -1;
+       ptq->cpu = -1;
+       ptq->next_tid = -1;
+
+       params.get_trace = intel_pt_get_trace;
+       params.walk_insn = intel_pt_walk_next_insn;
+       params.data = ptq;
+       params.return_compression = intel_pt_return_compression(pt);
+       params.max_non_turbo_ratio = pt->max_non_turbo_ratio;
+
+       if (pt->synth_opts.instructions) {
+               if (pt->synth_opts.period) {
+                       switch (pt->synth_opts.period_type) {
+                       case PERF_ITRACE_PERIOD_INSTRUCTIONS:
+                               params.period_type =
+                                               INTEL_PT_PERIOD_INSTRUCTIONS;
+                               params.period = pt->synth_opts.period;
+                               break;
+                       case PERF_ITRACE_PERIOD_TICKS:
+                               params.period_type = INTEL_PT_PERIOD_TICKS;
+                               params.period = pt->synth_opts.period;
+                               break;
+                       case PERF_ITRACE_PERIOD_NANOSECS:
+                               params.period_type = INTEL_PT_PERIOD_TICKS;
+                               params.period = intel_pt_ns_to_ticks(pt,
+                                                       pt->synth_opts.period);
+                               break;
+                       default:
+                               break;
+                       }
+               }
+
+               if (!params.period) {
+                       params.period_type = INTEL_PT_PERIOD_INSTRUCTIONS;
+                       params.period = 1000;
+               }
+       }
+
+       ptq->decoder = intel_pt_decoder_new(&params);
+       if (!ptq->decoder)
+               goto out_free;
+
+       return ptq;
+
+out_free:
+       zfree(&ptq->event_buf);
+       zfree(&ptq->chain);
+       free(ptq);
+       return NULL;
+}
+
+static void intel_pt_free_queue(void *priv)
+{
+       struct intel_pt_queue *ptq = priv;
+
+       if (!ptq)
+               return;
+       thread__zput(ptq->thread);
+       intel_pt_decoder_free(ptq->decoder);
+       zfree(&ptq->event_buf);
+       zfree(&ptq->chain);
+       free(ptq);
+}
+
+static void intel_pt_set_pid_tid_cpu(struct intel_pt *pt,
+                                    struct auxtrace_queue *queue)
+{
+       struct intel_pt_queue *ptq = queue->priv;
+
+       if (queue->tid == -1 || pt->have_sched_switch) {
+               ptq->tid = machine__get_current_tid(pt->machine, ptq->cpu);
+               thread__zput(ptq->thread);
+       }
+
+       if (!ptq->thread && ptq->tid != -1)
+               ptq->thread = machine__find_thread(pt->machine, -1, ptq->tid);
+
+       if (ptq->thread) {
+               ptq->pid = ptq->thread->pid_;
+               if (queue->cpu == -1)
+                       ptq->cpu = ptq->thread->cpu;
+       }
+}
+
+static void intel_pt_sample_flags(struct intel_pt_queue *ptq)
+{
+       if (ptq->state->flags & INTEL_PT_ABORT_TX) {
+               ptq->flags = PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_TX_ABORT;
+       } else if (ptq->state->flags & INTEL_PT_ASYNC) {
+               if (ptq->state->to_ip)
+                       ptq->flags = PERF_IP_FLAG_BRANCH | PERF_IP_FLAG_CALL |
+                                    PERF_IP_FLAG_ASYNC |
+                                    PERF_IP_FLAG_INTERRUPT;
+               else
+                       ptq->flags = PERF_IP_FLAG_BRANCH |
+                                    PERF_IP_FLAG_TRACE_END;
+               ptq->insn_len = 0;
+       } else {
+               if (ptq->state->from_ip)
+                       ptq->flags = intel_pt_insn_type(ptq->state->insn_op);
+               else
+                       ptq->flags = PERF_IP_FLAG_BRANCH |
+                                    PERF_IP_FLAG_TRACE_BEGIN;
+               if (ptq->state->flags & INTEL_PT_IN_TX)
+                       ptq->flags |= PERF_IP_FLAG_IN_TX;
+               ptq->insn_len = ptq->state->insn_len;
+       }
+}
+
+static int intel_pt_setup_queue(struct intel_pt *pt,
+                               struct auxtrace_queue *queue,
+                               unsigned int queue_nr)
+{
+       struct intel_pt_queue *ptq = queue->priv;
+
+       if (list_empty(&queue->head))
+               return 0;
+
+       if (!ptq) {
+               ptq = intel_pt_alloc_queue(pt, queue_nr);
+               if (!ptq)
+                       return -ENOMEM;
+               queue->priv = ptq;
+
+               if (queue->cpu != -1)
+                       ptq->cpu = queue->cpu;
+               ptq->tid = queue->tid;
+
+               if (pt->sampling_mode) {
+                       if (pt->timeless_decoding)
+                               ptq->step_through_buffers = true;
+                       if (pt->timeless_decoding || !pt->have_sched_switch)
+                               ptq->use_buffer_pid_tid = true;
+               }
+       }
+
+       if (!ptq->on_heap &&
+           (!pt->sync_switch ||
+            ptq->switch_state != INTEL_PT_SS_EXPECTING_SWITCH_EVENT)) {
+               const struct intel_pt_state *state;
+               int ret;
+
+               if (pt->timeless_decoding)
+                       return 0;
+
+               intel_pt_log("queue %u getting timestamp\n", queue_nr);
+               intel_pt_log("queue %u decoding cpu %d pid %d tid %d\n",
+                            queue_nr, ptq->cpu, ptq->pid, ptq->tid);
+               while (1) {
+                       state = intel_pt_decode(ptq->decoder);
+                       if (state->err) {
+                               if (state->err == INTEL_PT_ERR_NODATA) {
+                                       intel_pt_log("queue %u has no timestamp\n",
+                                                    queue_nr);
+                                       return 0;
+                               }
+                               continue;
+                       }
+                       if (state->timestamp)
+                               break;
+               }
+
+               ptq->timestamp = state->timestamp;
+               intel_pt_log("queue %u timestamp 0x%" PRIx64 "\n",
+                            queue_nr, ptq->timestamp);
+               ptq->state = state;
+               ptq->have_sample = true;
+               intel_pt_sample_flags(ptq);
+               ret = auxtrace_heap__add(&pt->heap, queue_nr, ptq->timestamp);
+               if (ret)
+                       return ret;
+               ptq->on_heap = true;
+       }
+
+       return 0;
+}
+
+static int intel_pt_setup_queues(struct intel_pt *pt)
+{
+       unsigned int i;
+       int ret;
+
+       for (i = 0; i < pt->queues.nr_queues; i++) {
+               ret = intel_pt_setup_queue(pt, &pt->queues.queue_array[i], i);
+               if (ret)
+                       return ret;
+       }
+       return 0;
+}
+
+static int intel_pt_inject_event(union perf_event *event,
+                                struct perf_sample *sample, u64 type,
+                                bool swapped)
+{
+       event->header.size = perf_event__sample_event_size(sample, type, 0);
+       return perf_event__synthesize_sample(event, type, 0, sample, swapped);
+}
+
+static int intel_pt_synth_branch_sample(struct intel_pt_queue *ptq)
+{
+       int ret;
+       struct intel_pt *pt = ptq->pt;
+       union perf_event *event = ptq->event_buf;
+       struct perf_sample sample = { .ip = 0, };
+
+       event->sample.header.type = PERF_RECORD_SAMPLE;
+       event->sample.header.misc = PERF_RECORD_MISC_USER;
+       event->sample.header.size = sizeof(struct perf_event_header);
+
+       if (!pt->timeless_decoding)
+               sample.time = tsc_to_perf_time(ptq->timestamp, &pt->tc);
+
+       sample.ip = ptq->state->from_ip;
+       sample.pid = ptq->pid;
+       sample.tid = ptq->tid;
+       sample.addr = ptq->state->to_ip;
+       sample.id = ptq->pt->branches_id;
+       sample.stream_id = ptq->pt->branches_id;
+       sample.period = 1;
+       sample.cpu = ptq->cpu;
+       sample.flags = ptq->flags;
+       sample.insn_len = ptq->insn_len;
+
+       if (pt->branches_filter && !(pt->branches_filter & ptq->flags))
+               return 0;
+
+       if (pt->synth_opts.inject) {
+               ret = intel_pt_inject_event(event, &sample,
+                                           pt->branches_sample_type,
+                                           pt->synth_needs_swap);
+               if (ret)
+                       return ret;
+       }
+
+       ret = perf_session__deliver_synth_event(pt->session, event, &sample);
+       if (ret)
+               pr_err("Intel Processor Trace: failed to deliver branch event, error %d\n",
+                      ret);
+
+       return ret;
+}
+
+static int intel_pt_synth_instruction_sample(struct intel_pt_queue *ptq)
+{
+       int ret;
+       struct intel_pt *pt = ptq->pt;
+       union perf_event *event = ptq->event_buf;
+       struct perf_sample sample = { .ip = 0, };
+
+       event->sample.header.type = PERF_RECORD_SAMPLE;
+       event->sample.header.misc = PERF_RECORD_MISC_USER;
+       event->sample.header.size = sizeof(struct perf_event_header);
+
+       if (!pt->timeless_decoding)
+               sample.time = tsc_to_perf_time(ptq->timestamp, &pt->tc);
+
+       sample.ip = ptq->state->from_ip;
+       sample.pid = ptq->pid;
+       sample.tid = ptq->tid;
+       sample.addr = ptq->state->to_ip;
+       sample.id = ptq->pt->instructions_id;
+       sample.stream_id = ptq->pt->instructions_id;
+       sample.period = ptq->pt->instructions_sample_period;
+       sample.cpu = ptq->cpu;
+       sample.flags = ptq->flags;
+       sample.insn_len = ptq->insn_len;
+
+       if (pt->synth_opts.callchain) {
+               thread_stack__sample(ptq->thread, ptq->chain,
+                                    pt->synth_opts.callchain_sz, sample.ip);
+               sample.callchain = ptq->chain;
+       }
+
+       if (pt->synth_opts.inject) {
+               ret = intel_pt_inject_event(event, &sample,
+                                           pt->instructions_sample_type,
+                                           pt->synth_needs_swap);
+               if (ret)
+                       return ret;
+       }
+
+       ret = perf_session__deliver_synth_event(pt->session, event, &sample);
+       if (ret)
+               pr_err("Intel Processor Trace: failed to deliver instruction event, error %d\n",
+                      ret);
+
+       return ret;
+}
+
+static int intel_pt_synth_transaction_sample(struct intel_pt_queue *ptq)
+{
+       int ret;
+       struct intel_pt *pt = ptq->pt;
+       union perf_event *event = ptq->event_buf;
+       struct perf_sample sample = { .ip = 0, };
+
+       event->sample.header.type = PERF_RECORD_SAMPLE;
+       event->sample.header.misc = PERF_RECORD_MISC_USER;
+       event->sample.header.size = sizeof(struct perf_event_header);
+
+       if (!pt->timeless_decoding)
+               sample.time = tsc_to_perf_time(ptq->timestamp, &pt->tc);
+
+       sample.ip = ptq->state->from_ip;
+       sample.pid = ptq->pid;
+       sample.tid = ptq->tid;
+       sample.addr = ptq->state->to_ip;
+       sample.id = ptq->pt->transactions_id;
+       sample.stream_id = ptq->pt->transactions_id;
+       sample.period = 1;
+       sample.cpu = ptq->cpu;
+       sample.flags = ptq->flags;
+       sample.insn_len = ptq->insn_len;
+
+       if (pt->synth_opts.callchain) {
+               thread_stack__sample(ptq->thread, ptq->chain,
+                                    pt->synth_opts.callchain_sz, sample.ip);
+               sample.callchain = ptq->chain;
+       }
+
+       if (pt->synth_opts.inject) {
+               ret = intel_pt_inject_event(event, &sample,
+                                           pt->transactions_sample_type,
+                                           pt->synth_needs_swap);
+               if (ret)
+                       return ret;
+       }
+
+       ret = perf_session__deliver_synth_event(pt->session, event, &sample);
+       if (ret)
+               pr_err("Intel Processor Trace: failed to deliver transaction event, error %d\n",
+                      ret);
+
+       return ret;
+}
+
+static int intel_pt_synth_error(struct intel_pt *pt, int code, int cpu,
+                               pid_t pid, pid_t tid, u64 ip)
+{
+       union perf_event event;
+       char msg[MAX_AUXTRACE_ERROR_MSG];
+       int err;
+
+       intel_pt__strerror(code, msg, MAX_AUXTRACE_ERROR_MSG);
+
+       auxtrace_synth_error(&event.auxtrace_error, PERF_AUXTRACE_ERROR_ITRACE,
+                            code, cpu, pid, tid, ip, msg);
+
+       err = perf_session__deliver_synth_event(pt->session, &event, NULL);
+       if (err)
+               pr_err("Intel Processor Trace: failed to deliver error event, error %d\n",
+                      err);
+
+       return err;
+}
+
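+/*
+ * Apply a deferred context switch: update the machine's current tid for this
+ * cpu and re-resolve the queue's pid/tid.
+ */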
+static int intel_pt_next_tid(struct intel_pt *pt, struct intel_pt_queue *ptq)
+{
+       struct auxtrace_queue *queue;
+       pid_t tid = ptq->next_tid;
+       int err;
+
+       if (tid == -1)
+               return 0;
+
+       intel_pt_log("switch: cpu %d tid %d\n", ptq->cpu, tid);
+
+       err = machine__set_current_tid(pt->machine, ptq->cpu, -1, tid);
+
+       queue = &pt->queues.queue_array[ptq->queue_nr];
+       intel_pt_set_pid_tid_cpu(pt, queue);
+
+       ptq->next_tid = -1;
+
+       return err;
+}
+
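+/*
+ * A direct (not conditional, asynchronous, interrupt or TX abort) branch to
+ * __switch_to is taken to indicate a context switch.
+ */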
+static inline bool intel_pt_is_switch_ip(struct intel_pt_queue *ptq, u64 ip)
+{
+       struct intel_pt *pt = ptq->pt;
+
+       return ip == pt->switch_ip &&
+              (ptq->flags & PERF_IP_FLAG_BRANCH) &&
+              !(ptq->flags & (PERF_IP_FLAG_CONDITIONAL | PERF_IP_FLAG_ASYNC |
+                              PERF_IP_FLAG_INTERRUPT | PERF_IP_FLAG_TX_ABORT));
+}
+
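+/*
+ * Synthesize any samples due for the current decoder state and drive the
+ * context-switch state machine.
+ */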
+static int intel_pt_sample(struct intel_pt_queue *ptq)
+{
+       const struct intel_pt_state *state = ptq->state;
+       struct intel_pt *pt = ptq->pt;
+       int err;
+
+       if (!ptq->have_sample)
+               return 0;
+
+       ptq->have_sample = false;
+
+       if (pt->sample_instructions &&
+           (state->type & INTEL_PT_INSTRUCTION)) {
+               err = intel_pt_synth_instruction_sample(ptq);
+               if (err)
+                       return err;
+       }
+
+       if (pt->sample_transactions &&
+           (state->type & INTEL_PT_TRANSACTION)) {
+               err = intel_pt_synth_transaction_sample(ptq);
+               if (err)
+                       return err;
+       }
+
+       if (!(state->type & INTEL_PT_BRANCH))
+               return 0;
+
+       if (pt->synth_opts.callchain)
+               thread_stack__event(ptq->thread, ptq->flags, state->from_ip,
+                                   state->to_ip, ptq->insn_len,
+                                   state->trace_nr);
+       else
+               thread_stack__set_trace_nr(ptq->thread, state->trace_nr);
+
+       if (pt->sample_branches) {
+               err = intel_pt_synth_branch_sample(ptq);
+               if (err)
+                       return err;
+       }
+
+       if (!pt->sync_switch)
+               return 0;
+
+       if (intel_pt_is_switch_ip(ptq, state->to_ip)) {
+               switch (ptq->switch_state) {
+               case INTEL_PT_SS_UNKNOWN:
+               case INTEL_PT_SS_EXPECTING_SWITCH_IP:
+                       err = intel_pt_next_tid(pt, ptq);
+                       if (err)
+                               return err;
+                       ptq->switch_state = INTEL_PT_SS_TRACING;
+                       break;
+               default:
+                       ptq->switch_state = INTEL_PT_SS_EXPECTING_SWITCH_EVENT;
+                       return 1;
+               }
+       } else if (!state->to_ip) {
+               ptq->switch_state = INTEL_PT_SS_NOT_TRACING;
+       } else if (ptq->switch_state == INTEL_PT_SS_NOT_TRACING) {
+               ptq->switch_state = INTEL_PT_SS_UNKNOWN;
+       } else if (ptq->switch_state == INTEL_PT_SS_UNKNOWN &&
+                  state->to_ip == pt->ptss_ip &&
+                  (ptq->flags & PERF_IP_FLAG_CALL)) {
+               ptq->switch_state = INTEL_PT_SS_TRACING;
+       }
+
+       return 0;
+}
+
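+/*
+ * Find the kernel's __switch_to address, and perf_trace_sched_switch for
+ * ptss_ip, which are used to recognize context switches in the trace.
+ */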
+static u64 intel_pt_switch_ip(struct machine *machine, u64 *ptss_ip)
+{
+       struct map *map;
+       struct symbol *sym, *start;
+       u64 ip, switch_ip = 0;
+
+       if (ptss_ip)
+               *ptss_ip = 0;
+
+       map = machine__kernel_map(machine, MAP__FUNCTION);
+       if (!map)
+               return 0;
+
+       if (map__load(map, machine->symbol_filter))
+               return 0;
+
+       start = dso__first_symbol(map->dso, MAP__FUNCTION);
+
+       for (sym = start; sym; sym = dso__next_symbol(sym)) {
+               if (sym->binding == STB_GLOBAL &&
+                   !strcmp(sym->name, "__switch_to")) {
+                       ip = map->unmap_ip(map, sym->start);
+                       if (ip >= map->start && ip < map->end) {
+                               switch_ip = ip;
+                               break;
+                       }
+               }
+       }
+
+       if (!switch_ip || !ptss_ip)
+               return 0;
+
+       for (sym = start; sym; sym = dso__next_symbol(sym)) {
+               if (!strcmp(sym->name, "perf_trace_sched_switch")) {
+                       ip = map->unmap_ip(map, sym->start);
+                       if (ip >= map->start && ip < map->end) {
+                               *ptss_ip = ip;
+                               break;
+                       }
+               }
+       }
+
+       return switch_ip;
+}
+
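+/*
+ * Decode this queue's trace, synthesizing samples, until its timestamp reaches
+ * *timestamp.  Returns 0 with *timestamp updated, a positive value when the
+ * queue should come off the heap, or a negative error code.
+ */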
+static int intel_pt_run_decoder(struct intel_pt_queue *ptq, u64 *timestamp)
+{
+       const struct intel_pt_state *state = ptq->state;
+       struct intel_pt *pt = ptq->pt;
+       int err;
+
+       if (!pt->kernel_start) {
+               pt->kernel_start = machine__kernel_start(pt->machine);
+               if (pt->per_cpu_mmaps && pt->have_sched_switch &&
+                   !pt->timeless_decoding && intel_pt_tracing_kernel(pt) &&
+                   !pt->sampling_mode) {
+                       pt->switch_ip = intel_pt_switch_ip(pt->machine,
+                                                          &pt->ptss_ip);
+                       if (pt->switch_ip) {
+                               intel_pt_log("switch_ip: %"PRIx64" ptss_ip: %"PRIx64"\n",
+                                            pt->switch_ip, pt->ptss_ip);
+                               pt->sync_switch = true;
+                       }
+               }
+       }
+
+       intel_pt_log("queue %u decoding cpu %d pid %d tid %d\n",
+                    ptq->queue_nr, ptq->cpu, ptq->pid, ptq->tid);
+       while (1) {
+               err = intel_pt_sample(ptq);
+               if (err)
+                       return err;
+
+               state = intel_pt_decode(ptq->decoder);
+               if (state->err) {
+                       if (state->err == INTEL_PT_ERR_NODATA)
+                               return 1;
+                       if (pt->sync_switch &&
+                           state->from_ip >= pt->kernel_start) {
+                               pt->sync_switch = false;
+                               intel_pt_next_tid(pt, ptq);
+                       }
+                       if (pt->synth_opts.errors) {
+                               err = intel_pt_synth_error(pt, state->err,
+                                                          ptq->cpu, ptq->pid,
+                                                          ptq->tid,
+                                                          state->from_ip);
+                               if (err)
+                                       return err;
+                       }
+                       continue;
+               }
+
+               ptq->state = state;
+               ptq->have_sample = true;
+               intel_pt_sample_flags(ptq);
+
+               /* Use estimated TSC upon return to user space */
+               if (pt->est_tsc &&
+                   (state->from_ip >= pt->kernel_start || !state->from_ip) &&
+                   state->to_ip && state->to_ip < pt->kernel_start) {
+                       intel_pt_log("TSC %"PRIx64" est. TSC %"PRIx64"\n",
+                                    state->timestamp, state->est_timestamp);
+                       ptq->timestamp = state->est_timestamp;
+               /* Use estimated TSC in unknown switch state */
+               } else if (pt->sync_switch &&
+                          ptq->switch_state == INTEL_PT_SS_UNKNOWN &&
+                          intel_pt_is_switch_ip(ptq, state->to_ip) &&
+                          ptq->next_tid == -1) {
+                       intel_pt_log("TSC %"PRIx64" est. TSC %"PRIx64"\n",
+                                    state->timestamp, state->est_timestamp);
+                       ptq->timestamp = state->est_timestamp;
+               } else if (state->timestamp > ptq->timestamp) {
+                       ptq->timestamp = state->timestamp;
+               }
+
+               if (!pt->timeless_decoding && ptq->timestamp >= *timestamp) {
+                       *timestamp = ptq->timestamp;
+                       return 0;
+               }
+       }
+       return 0;
+}
+
+static inline int intel_pt_update_queues(struct intel_pt *pt)
+{
+       if (pt->queues.new_data) {
+               pt->queues.new_data = false;
+               return intel_pt_setup_queues(pt);
+       }
+       return 0;
+}
+
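+/*
+ * Decode queues in timestamp order, using the auxtrace heap to pick the queue
+ * with the oldest timestamp, until all have caught up to 'timestamp'.
+ */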
+static int intel_pt_process_queues(struct intel_pt *pt, u64 timestamp)
+{
+       unsigned int queue_nr;
+       u64 ts;
+       int ret;
+
+       while (1) {
+               struct auxtrace_queue *queue;
+               struct intel_pt_queue *ptq;
+
+               if (!pt->heap.heap_cnt)
+                       return 0;
+
+               if (pt->heap.heap_array[0].ordinal >= timestamp)
+                       return 0;
+
+               queue_nr = pt->heap.heap_array[0].queue_nr;
+               queue = &pt->queues.queue_array[queue_nr];
+               ptq = queue->priv;
+
+               intel_pt_log("queue %u processing 0x%" PRIx64 " to 0x%" PRIx64 "\n",
+                            queue_nr, pt->heap.heap_array[0].ordinal,
+                            timestamp);
+
+               auxtrace_heap__pop(&pt->heap);
+
+               if (pt->heap.heap_cnt) {
+                       ts = pt->heap.heap_array[0].ordinal + 1;
+                       if (ts > timestamp)
+                               ts = timestamp;
+               } else {
+                       ts = timestamp;
+               }
+
+               intel_pt_set_pid_tid_cpu(pt, queue);
+
+               ret = intel_pt_run_decoder(ptq, &ts);
+
+               if (ret < 0) {
+                       auxtrace_heap__add(&pt->heap, queue_nr, ts);
+                       return ret;
+               }
+
+               if (!ret) {
+                       ret = auxtrace_heap__add(&pt->heap, queue_nr, ts);
+                       if (ret < 0)
+                               return ret;
+               } else {
+                       ptq->on_heap = false;
+               }
+       }
+
+       return 0;
+}
+
+static int intel_pt_process_timeless_queues(struct intel_pt *pt, pid_t tid,
+                                           u64 time_)
+{
+       struct auxtrace_queues *queues = &pt->queues;
+       unsigned int i;
+       u64 ts = 0;
+
+       for (i = 0; i < queues->nr_queues; i++) {
+               struct auxtrace_queue *queue = &pt->queues.queue_array[i];
+               struct intel_pt_queue *ptq = queue->priv;
+
+               if (ptq && (tid == -1 || ptq->tid == tid)) {
+                       ptq->time = time_;
+                       intel_pt_set_pid_tid_cpu(pt, queue);
+                       intel_pt_run_decoder(ptq, &ts);
+               }
+       }
+       return 0;
+}
+
+static int intel_pt_lost(struct intel_pt *pt, struct perf_sample *sample)
+{
+       return intel_pt_synth_error(pt, INTEL_PT_ERR_LOST, sample->cpu,
+                                   sample->pid, sample->tid, 0);
+}
+
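+/*
+ * Find the decode queue for a cpu.  Queue numbers normally correspond to cpu
+ * numbers, so start there and search the remainder if necessary.
+ */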
+static struct intel_pt_queue *intel_pt_cpu_to_ptq(struct intel_pt *pt, int cpu)
+{
+       unsigned i, j;
+
+       if (cpu < 0 || !pt->queues.nr_queues)
+               return NULL;
+
+       if ((unsigned)cpu >= pt->queues.nr_queues)
+               i = pt->queues.nr_queues - 1;
+       else
+               i = cpu;
+
+       if (pt->queues.queue_array[i].cpu == cpu)
+               return pt->queues.queue_array[i].priv;
+
+       for (j = 0; i > 0; j++) {
+               if (pt->queues.queue_array[--i].cpu == cpu)
+                       return pt->queues.queue_array[i].priv;
+       }
+
+       for (; j < pt->queues.nr_queues; j++) {
+               if (pt->queues.queue_array[j].cpu == cpu)
+                       return pt->queues.queue_array[j].priv;
+       }
+
+       return NULL;
+}
+
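+/*
+ * Handle a sched:sched_switch sample, updating the cpu's queue switch state
+ * and, where appropriate, the machine's current tid.
+ */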
+static int intel_pt_process_switch(struct intel_pt *pt,
+                                  struct perf_sample *sample)
+{
+       struct intel_pt_queue *ptq;
+       struct perf_evsel *evsel;
+       pid_t tid;
+       int cpu, err;
+
+       evsel = perf_evlist__id2evsel(pt->session->evlist, sample->id);
+       if (evsel != pt->switch_evsel)
+               return 0;
+
+       tid = perf_evsel__intval(evsel, sample, "next_pid");
+       cpu = sample->cpu;
+
+       intel_pt_log("sched_switch: cpu %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
+                    cpu, tid, sample->time, perf_time_to_tsc(sample->time,
+                    &pt->tc));
+
+       if (!pt->sync_switch)
+               goto out;
+
+       ptq = intel_pt_cpu_to_ptq(pt, cpu);
+       if (!ptq)
+               goto out;
+
+       switch (ptq->switch_state) {
+       case INTEL_PT_SS_NOT_TRACING:
+               ptq->next_tid = -1;
+               break;
+       case INTEL_PT_SS_UNKNOWN:
+       case INTEL_PT_SS_TRACING:
+               ptq->next_tid = tid;
+               ptq->switch_state = INTEL_PT_SS_EXPECTING_SWITCH_IP;
+               return 0;
+       case INTEL_PT_SS_EXPECTING_SWITCH_EVENT:
+               if (!ptq->on_heap) {
+                       ptq->timestamp = perf_time_to_tsc(sample->time,
+                                                         &pt->tc);
+                       err = auxtrace_heap__add(&pt->heap, ptq->queue_nr,
+                                                ptq->timestamp);
+                       if (err)
+                               return err;
+                       ptq->on_heap = true;
+               }
+               ptq->switch_state = INTEL_PT_SS_TRACING;
+               break;
+       case INTEL_PT_SS_EXPECTING_SWITCH_IP:
+               ptq->next_tid = tid;
+               intel_pt_log("ERROR: cpu %d expecting switch ip\n", cpu);
+               break;
+       default:
+               break;
+       }
+out:
+       return machine__set_current_tid(pt->machine, cpu, -1, tid);
+}
+
+static int intel_pt_process_itrace_start(struct intel_pt *pt,
+                                        union perf_event *event,
+                                        struct perf_sample *sample)
+{
+       if (!pt->per_cpu_mmaps)
+               return 0;
+
+       intel_pt_log("itrace_start: cpu %d pid %d tid %d time %"PRIu64" tsc %#"PRIx64"\n",
+                    sample->cpu, event->itrace_start.pid,
+                    event->itrace_start.tid, sample->time,
+                    perf_time_to_tsc(sample->time, &pt->tc));
+
+       return machine__set_current_tid(pt->machine, sample->cpu,
+                                       event->itrace_start.pid,
+                                       event->itrace_start.tid);
+}
+
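+/*
+ * Main per-event callback: advance decoding up to the event's timestamp, then
+ * handle exit, truncated AUX data, sched_switch and itrace_start events.
+ */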
+static int intel_pt_process_event(struct perf_session *session,
+                                 union perf_event *event,
+                                 struct perf_sample *sample,
+                                 struct perf_tool *tool)
+{
+       struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt,
+                                          auxtrace);
+       u64 timestamp;
+       int err = 0;
+
+       if (dump_trace)
+               return 0;
+
+       if (!tool->ordered_events) {
+               pr_err("Intel Processor Trace requires ordered events\n");
+               return -EINVAL;
+       }
+
+       if (sample->time)
+               timestamp = perf_time_to_tsc(sample->time, &pt->tc);
+       else
+               timestamp = 0;
+
+       if (timestamp || pt->timeless_decoding) {
+               err = intel_pt_update_queues(pt);
+               if (err)
+                       return err;
+       }
+
+       if (pt->timeless_decoding) {
+               if (event->header.type == PERF_RECORD_EXIT) {
+                       err = intel_pt_process_timeless_queues(pt,
+                                                              event->fork.tid,
+                                                              sample->time);
+               }
+       } else if (timestamp) {
+               err = intel_pt_process_queues(pt, timestamp);
+       }
+       if (err)
+               return err;
+
+       if (event->header.type == PERF_RECORD_AUX &&
+           (event->aux.flags & PERF_AUX_FLAG_TRUNCATED) &&
+           pt->synth_opts.errors) {
+               err = intel_pt_lost(pt, sample);
+               if (err)
+                       return err;
+       }
+
+       if (pt->switch_evsel && event->header.type == PERF_RECORD_SAMPLE)
+               err = intel_pt_process_switch(pt, sample);
+       else if (event->header.type == PERF_RECORD_ITRACE_START)
+               err = intel_pt_process_itrace_start(pt, event, sample);
+
+       intel_pt_log("event %s (%u): cpu %d time %"PRIu64" tsc %#"PRIx64"\n",
+                    perf_event__name(event->header.type), event->header.type,
+                    sample->cpu, sample->time, timestamp);
+
+       return err;
+}
+
+static int intel_pt_flush(struct perf_session *session, struct perf_tool *tool)
+{
+       struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt,
+                                          auxtrace);
+       int ret;
+
+       if (dump_trace)
+               return 0;
+
+       if (!tool->ordered_events)
+               return -EINVAL;
+
+       ret = intel_pt_update_queues(pt);
+       if (ret < 0)
+               return ret;
+
+       if (pt->timeless_decoding)
+               return intel_pt_process_timeless_queues(pt, -1,
+                                                       MAX_TIMESTAMP - 1);
+
+       return intel_pt_process_queues(pt, MAX_TIMESTAMP);
+}
+
+static void intel_pt_free_events(struct perf_session *session)
+{
+       struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt,
+                                          auxtrace);
+       struct auxtrace_queues *queues = &pt->queues;
+       unsigned int i;
+
+       for (i = 0; i < queues->nr_queues; i++) {
+               intel_pt_free_queue(queues->queue_array[i].priv);
+               queues->queue_array[i].priv = NULL;
+       }
+       intel_pt_log_disable();
+       auxtrace_queues__free(queues);
+}
+
+static void intel_pt_free(struct perf_session *session)
+{
+       struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt,
+                                          auxtrace);
+
+       auxtrace_heap__free(&pt->heap);
+       intel_pt_free_events(session);
+       session->auxtrace = NULL;
+       thread__delete(pt->unknown_thread);
+       free(pt);
+}
+
+static int intel_pt_process_auxtrace_event(struct perf_session *session,
+                                          union perf_event *event,
+                                          struct perf_tool *tool __maybe_unused)
+{
+       struct intel_pt *pt = container_of(session->auxtrace, struct intel_pt,
+                                          auxtrace);
+
+       if (pt->sampling_mode)
+               return 0;
+
+       if (!pt->data_queued) {
+               struct auxtrace_buffer *buffer;
+               off_t data_offset;
+               int fd = perf_data_file__fd(session->file);
+               int err;
+
+               if (perf_data_file__is_pipe(session->file)) {
+                       data_offset = 0;
+               } else {
+                       data_offset = lseek(fd, 0, SEEK_CUR);
+                       if (data_offset == -1)
+                               return -errno;
+               }
+
+               err = auxtrace_queues__add_event(&pt->queues, session, event,
+                                                data_offset, &buffer);
+               if (err)
+                       return err;
+
+               /* Dump here now that we have copied a piped trace out of the pipe */
+               if (dump_trace) {
+                       if (auxtrace_buffer__get_data(buffer, fd)) {
+                               intel_pt_dump_event(pt, buffer->data,
+                                                   buffer->size);
+                               auxtrace_buffer__put_data(buffer);
+                       }
+               }
+       }
+
+       return 0;
+}
+
+struct intel_pt_synth {
+       struct perf_tool dummy_tool;
+       struct perf_session *session;
+};
+
+static int intel_pt_event_synth(struct perf_tool *tool,
+                               union perf_event *event,
+                               struct perf_sample *sample __maybe_unused,
+                               struct machine *machine __maybe_unused)
+{
+       struct intel_pt_synth *intel_pt_synth =
+                       container_of(tool, struct intel_pt_synth, dummy_tool);
+
+       return perf_session__deliver_synth_event(intel_pt_synth->session, event,
+                                                NULL);
+}
+
+static int intel_pt_synth_event(struct perf_session *session,
+                               struct perf_event_attr *attr, u64 id)
+{
+       struct intel_pt_synth intel_pt_synth;
+
+       memset(&intel_pt_synth, 0, sizeof(struct intel_pt_synth));
+       intel_pt_synth.session = session;
+
+       return perf_event__synthesize_attr(&intel_pt_synth.dummy_tool, attr, 1,
+                                          &id, intel_pt_event_synth);
+}
+
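+/*
+ * Set up the attributes and ids of the events that will be synthesized
+ * (instructions, transactions, branches), based on the Intel PT evsel, and
+ * deliver their attribute records into the session.
+ */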
+static int intel_pt_synth_events(struct intel_pt *pt,
+                                struct perf_session *session)
+{
+       struct perf_evlist *evlist = session->evlist;
+       struct perf_evsel *evsel;
+       struct perf_event_attr attr;
+       bool found = false;
+       u64 id;
+       int err;
+
+       evlist__for_each(evlist, evsel) {
+               if (evsel->attr.type == pt->pmu_type && evsel->ids) {
+                       found = true;
+                       break;
+               }
+       }
+
+       if (!found) {
+               pr_debug("There are no selected events with Intel Processor Trace data\n");
+               return 0;
+       }
+
+       memset(&attr, 0, sizeof(struct perf_event_attr));
+       attr.size = sizeof(struct perf_event_attr);
+       attr.type = PERF_TYPE_HARDWARE;
+       attr.sample_type = evsel->attr.sample_type & PERF_SAMPLE_MASK;
+       attr.sample_type |= PERF_SAMPLE_IP | PERF_SAMPLE_TID |
+                           PERF_SAMPLE_PERIOD;
+       if (pt->timeless_decoding)
+               attr.sample_type &= ~(u64)PERF_SAMPLE_TIME;
+       else
+               attr.sample_type |= PERF_SAMPLE_TIME;
+       if (!pt->per_cpu_mmaps)
+               attr.sample_type &= ~(u64)PERF_SAMPLE_CPU;
+       attr.exclude_user = evsel->attr.exclude_user;
+       attr.exclude_kernel = evsel->attr.exclude_kernel;
+       attr.exclude_hv = evsel->attr.exclude_hv;
+       attr.exclude_host = evsel->attr.exclude_host;
+       attr.exclude_guest = evsel->attr.exclude_guest;
+       attr.sample_id_all = evsel->attr.sample_id_all;
+       attr.read_format = evsel->attr.read_format;
+
+       id = evsel->id[0] + 1000000000;
+       if (!id)
+               id = 1;
+
+       if (pt->synth_opts.instructions) {
+               attr.config = PERF_COUNT_HW_INSTRUCTIONS;
+               if (pt->synth_opts.period_type == PERF_ITRACE_PERIOD_NANOSECS)
+                       attr.sample_period =
+                               intel_pt_ns_to_ticks(pt, pt->synth_opts.period);
+               else
+                       attr.sample_period = pt->synth_opts.period;
+               pt->instructions_sample_period = attr.sample_period;
+               if (pt->synth_opts.callchain)
+                       attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
+               pr_debug("Synthesizing 'instructions' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
+                        id, (u64)attr.sample_type);
+               err = intel_pt_synth_event(session, &attr, id);
+               if (err) {
+                       pr_err("%s: failed to synthesize 'instructions' event type\n",
+                              __func__);
+                       return err;
+               }
+               pt->sample_instructions = true;
+               pt->instructions_sample_type = attr.sample_type;
+               pt->instructions_id = id;
+               id += 1;
+       }
+
+       if (pt->synth_opts.transactions) {
+               attr.config = PERF_COUNT_HW_INSTRUCTIONS;
+               attr.sample_period = 1;
+               if (pt->synth_opts.callchain)
+                       attr.sample_type |= PERF_SAMPLE_CALLCHAIN;
+               pr_debug("Synthesizing 'transactions' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
+                        id, (u64)attr.sample_type);
+               err = intel_pt_synth_event(session, &attr, id);
+               if (err) {
+                       pr_err("%s: failed to synthesize 'transactions' event type\n",
+                              __func__);
+                       return err;
+               }
+               pt->sample_transactions = true;
+               pt->transactions_sample_type = attr.sample_type;
+               pt->transactions_id = id;
+               id += 1;
+               evlist__for_each(evlist, evsel) {
+                       if (evsel->id && evsel->id[0] == pt->transactions_id) {
+                               if (evsel->name)
+                                       zfree(&evsel->name);
+                               evsel->name = strdup("transactions");
+                               break;
+                       }
+               }
+       }
+
+       if (pt->synth_opts.branches) {
+               attr.config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS;
+               attr.sample_period = 1;
+               attr.sample_type |= PERF_SAMPLE_ADDR;
+               attr.sample_type &= ~(u64)PERF_SAMPLE_CALLCHAIN;
+               pr_debug("Synthesizing 'branches' event with id %" PRIu64 " sample type %#" PRIx64 "\n",
+                        id, (u64)attr.sample_type);
+               err = intel_pt_synth_event(session, &attr, id);
+               if (err) {
+                       pr_err("%s: failed to synthesize 'branches' event type\n",
+                              __func__);
+                       return err;
+               }
+               pt->sample_branches = true;
+               pt->branches_sample_type = attr.sample_type;
+               pt->branches_id = id;
+       }
+
+       pt->synth_needs_swap = evsel->needs_swap;
+
+       return 0;
+}
+
+static struct perf_evsel *intel_pt_find_sched_switch(struct perf_evlist *evlist)
+{
+       struct perf_evsel *evsel;
+
+       evlist__for_each_reverse(evlist, evsel) {
+               const char *name = perf_evsel__name(evsel);
+
+               if (!strcmp(name, "sched:sched_switch"))
+                       return evsel;
+       }
+
+       return NULL;
+}
+
+static const char * const intel_pt_info_fmts[] = {
+       [INTEL_PT_PMU_TYPE]             = "  PMU Type           %"PRId64"\n",
+       [INTEL_PT_TIME_SHIFT]           = "  Time Shift         %"PRIu64"\n",
+       [INTEL_PT_TIME_MULT]            = "  Time Multiplier    %"PRIu64"\n",
+       [INTEL_PT_TIME_ZERO]            = "  Time Zero          %"PRIu64"\n",
+       [INTEL_PT_CAP_USER_TIME_ZERO]   = "  Cap Time Zero      %"PRId64"\n",
+       [INTEL_PT_TSC_BIT]              = "  TSC bit            %#"PRIx64"\n",
+       [INTEL_PT_NORETCOMP_BIT]        = "  NoRETComp bit      %#"PRIx64"\n",
+       [INTEL_PT_HAVE_SCHED_SWITCH]    = "  Have sched_switch  %"PRId64"\n",
+       [INTEL_PT_SNAPSHOT_MODE]        = "  Snapshot mode      %"PRId64"\n",
+       [INTEL_PT_PER_CPU_MMAPS]        = "  Per-cpu maps       %"PRId64"\n",
+};
+
+static void intel_pt_print_info(u64 *arr, int start, int finish)
+{
+       int i;
+
+       if (!dump_trace)
+               return;
+
+       for (i = start; i <= finish; i++)
+               fprintf(stdout, intel_pt_info_fmts[i], arr[i]);
+}
+
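+/*
+ * Handle the PERF_RECORD_AUXTRACE_INFO event: read the parameters recorded by
+ * 'perf record' and set up the session for Intel PT decoding.
+ */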
+int intel_pt_process_auxtrace_info(union perf_event *event,
+                                  struct perf_session *session)
+{
+       struct auxtrace_info_event *auxtrace_info = &event->auxtrace_info;
+       size_t min_sz = sizeof(u64) * INTEL_PT_PER_CPU_MMAPS;
+       struct intel_pt *pt;
+       int err;
+
+       if (auxtrace_info->header.size < sizeof(struct auxtrace_info_event) +
+                                       min_sz)
+               return -EINVAL;
+
+       pt = zalloc(sizeof(struct intel_pt));
+       if (!pt)
+               return -ENOMEM;
+
+       err = auxtrace_queues__init(&pt->queues);
+       if (err)
+               goto err_free;
+
+       intel_pt_log_set_name(INTEL_PT_PMU_NAME);
+
+       pt->session = session;
+       pt->machine = &session->machines.host; /* No kvm support */
+       pt->auxtrace_type = auxtrace_info->type;
+       pt->pmu_type = auxtrace_info->priv[INTEL_PT_PMU_TYPE];
+       pt->tc.time_shift = auxtrace_info->priv[INTEL_PT_TIME_SHIFT];
+       pt->tc.time_mult = auxtrace_info->priv[INTEL_PT_TIME_MULT];
+       pt->tc.time_zero = auxtrace_info->priv[INTEL_PT_TIME_ZERO];
+       pt->cap_user_time_zero = auxtrace_info->priv[INTEL_PT_CAP_USER_TIME_ZERO];
+       pt->tsc_bit = auxtrace_info->priv[INTEL_PT_TSC_BIT];
+       pt->noretcomp_bit = auxtrace_info->priv[INTEL_PT_NORETCOMP_BIT];
+       pt->have_sched_switch = auxtrace_info->priv[INTEL_PT_HAVE_SCHED_SWITCH];
+       pt->snapshot_mode = auxtrace_info->priv[INTEL_PT_SNAPSHOT_MODE];
+       pt->per_cpu_mmaps = auxtrace_info->priv[INTEL_PT_PER_CPU_MMAPS];
+       intel_pt_print_info(&auxtrace_info->priv[0], INTEL_PT_PMU_TYPE,
+                           INTEL_PT_PER_CPU_MMAPS);
+
+       pt->timeless_decoding = intel_pt_timeless_decoding(pt);
+       pt->have_tsc = intel_pt_have_tsc(pt);
+       pt->sampling_mode = false;
+       pt->est_tsc = !pt->timeless_decoding;
+
+       pt->unknown_thread = thread__new(999999999, 999999999);
+       if (!pt->unknown_thread) {
+               err = -ENOMEM;
+               goto err_free_queues;
+       }
+       err = thread__set_comm(pt->unknown_thread, "unknown", 0);
+       if (err)
+               goto err_delete_thread;
+       if (thread__init_map_groups(pt->unknown_thread, pt->machine)) {
+               err = -ENOMEM;
+               goto err_delete_thread;
+       }
+
+       pt->auxtrace.process_event = intel_pt_process_event;
+       pt->auxtrace.process_auxtrace_event = intel_pt_process_auxtrace_event;
+       pt->auxtrace.flush_events = intel_pt_flush;
+       pt->auxtrace.free_events = intel_pt_free_events;
+       pt->auxtrace.free = intel_pt_free;
+       session->auxtrace = &pt->auxtrace;
+
+       if (dump_trace)
+               return 0;
+
+       if (pt->have_sched_switch == 1) {
+               pt->switch_evsel = intel_pt_find_sched_switch(session->evlist);
+               if (!pt->switch_evsel) {
+                       pr_err("%s: missing sched_switch event\n", __func__);
+                       err = -EINVAL;
+                       goto err_delete_thread;
+               }
+       }
+
+       if (session->itrace_synth_opts && session->itrace_synth_opts->set) {
+               pt->synth_opts = *session->itrace_synth_opts;
+       } else {
+               itrace_synth_opts__set_default(&pt->synth_opts);
+               if (use_browser != -1) {
+                       pt->synth_opts.branches = false;
+                       pt->synth_opts.callchain = true;
+               }
+       }
+
+       if (pt->synth_opts.log)
+               intel_pt_log_enable();
+
+       /* Maximum non-turbo ratio is TSC freq / 100 MHz */
+       if (pt->tc.time_mult) {
+               u64 tsc_freq = intel_pt_ns_to_ticks(pt, 1000000000);
+
+               pt->max_non_turbo_ratio = (tsc_freq + 50000000) / 100000000;
+               intel_pt_log("TSC frequency %"PRIu64"\n", tsc_freq);
+               intel_pt_log("Maximum non-turbo ratio %u\n",
+                            pt->max_non_turbo_ratio);
+       }
+
+       if (pt->synth_opts.calls)
+               pt->branches_filter |= PERF_IP_FLAG_CALL | PERF_IP_FLAG_ASYNC |
+                                      PERF_IP_FLAG_TRACE_END;
+       if (pt->synth_opts.returns)
+               pt->branches_filter |= PERF_IP_FLAG_RETURN |
+                                      PERF_IP_FLAG_TRACE_BEGIN;
+
+       if (pt->synth_opts.callchain && !symbol_conf.use_callchain) {
+               symbol_conf.use_callchain = true;
+               if (callchain_register_param(&callchain_param) < 0) {
+                       symbol_conf.use_callchain = false;
+                       pt->synth_opts.callchain = false;
+               }
+       }
+
+       err = intel_pt_synth_events(pt, session);
+       if (err)
+               goto err_delete_thread;
+
+       err = auxtrace_queues__process_index(&pt->queues, session);
+       if (err)
+               goto err_delete_thread;
+
+       if (pt->queues.populated)
+               pt->data_queued = true;
+
+       if (pt->timeless_decoding)
+               pr_debug2("Intel PT decoding without timestamps\n");
+
+       return 0;
+
+err_delete_thread:
+       thread__delete(pt->unknown_thread);
+err_free_queues:
+       intel_pt_log_disable();
+       auxtrace_queues__free(&pt->queues);
+       session->auxtrace = NULL;
+err_free:
+       free(pt);
+       return err;
+}
diff --git a/tools/perf/util/intel-pt.h b/tools/perf/util/intel-pt.h
new file mode 100644 (file)
index 0000000..a1bfe93
--- /dev/null
@@ -0,0 +1,51 @@
+/*
+ * intel_pt.h: Intel Processor Trace support
+ * Copyright (c) 2013-2015, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#ifndef INCLUDE__PERF_INTEL_PT_H__
+#define INCLUDE__PERF_INTEL_PT_H__
+
+#define INTEL_PT_PMU_NAME "intel_pt"
+
+enum {
+       INTEL_PT_PMU_TYPE,
+       INTEL_PT_TIME_SHIFT,
+       INTEL_PT_TIME_MULT,
+       INTEL_PT_TIME_ZERO,
+       INTEL_PT_CAP_USER_TIME_ZERO,
+       INTEL_PT_TSC_BIT,
+       INTEL_PT_NORETCOMP_BIT,
+       INTEL_PT_HAVE_SCHED_SWITCH,
+       INTEL_PT_SNAPSHOT_MODE,
+       INTEL_PT_PER_CPU_MMAPS,
+       INTEL_PT_AUXTRACE_PRIV_MAX,
+};
+
+#define INTEL_PT_AUXTRACE_PRIV_SIZE (INTEL_PT_AUXTRACE_PRIV_MAX * sizeof(u64))
+
+struct auxtrace_record;
+struct perf_tool;
+union perf_event;
+struct perf_session;
+struct perf_event_attr;
+struct perf_pmu;
+
+struct auxtrace_record *intel_pt_recording_init(int *err);
+
+int intel_pt_process_auxtrace_info(union perf_event *event,
+                                  struct perf_session *session);
+
+struct perf_event_attr *intel_pt_pmu_default_config(struct perf_pmu *pmu);
+
+#endif