perf/core, x86: Add PERF_SAMPLE_PHYS_ADDR
Author: Kan Liang <kan.liang@intel.com>
Tue, 29 Aug 2017 00:52:49 +0000 (20:52 -0400)
Committer: Ingo Molnar <mingo@kernel.org>
Tue, 29 Aug 2017 13:09:25 +0000 (15:09 +0200)
For understanding how the workload maps to memory channels and hardware
behavior, it's very important to collect address maps with physical
addresses. For example, 3D XPoint access can only be found by filtering
the physical address.

Add a new sample type for physical address.

perf already has a facility to collect the data virtual address. This patch
introduces a function to convert the virtual address to a physical address.
The function is quite generic and can be extended to any architecture as
long as a virtual address is provided.

 - For kernel direct mapping addresses, virt_to_phys is used to convert
   the virtual addresses to physical addresses.

 - For user virtual addresses, __get_user_pages_fast is used to walk the
   page tables and find the corresponding user physical address.

 - This does not work for vmalloc addresses right now. These are not
   resolved, but code to do that could be added.

The new sample type requires collecting the virtual address. The
virtual address will not be output unless PERF_SAMPLE_ADDR is also applied.

For security, the physical address can only be exposed to root or a
privileged user.

Tested-by: Madhavan Srinivasan <maddy@linux.vnet.ibm.com>
Signed-off-by: Kan Liang <kan.liang@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stephane Eranian <eranian@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vince Weaver <vincent.weaver@maine.edu>
Cc: acme@kernel.org
Cc: mpe@ellerman.id.au
Link: http://lkml.kernel.org/r/1503967969-48278-1-git-send-email-kan.liang@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
arch/powerpc/perf/core-book3s.c
arch/x86/events/intel/ds.c
arch/x86/events/perf_event.h
include/linux/perf_event.h
include/uapi/linux/perf_event.h
kernel/events/core.c

index 6c2d4168daec9a426ff2d9d5fb622fe3338dc836..2e3eb7431571603fc9cd2452df25a32c6cd4c6da 100644 (file)
@@ -2039,7 +2039,8 @@ static void record_and_restart(struct perf_event *event, unsigned long val,
 
                perf_sample_data_init(&data, ~0ULL, event->hw.last_period);
 
-               if (event->attr.sample_type & PERF_SAMPLE_ADDR)
+               if (event->attr.sample_type &
+                   (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR))
                        perf_get_data_addr(regs, &data.addr);
 
                if (event->attr.sample_type & PERF_SAMPLE_BRANCH_STACK) {
index 98e36e0c791c7e1eb473b138bf44784da59e23ba..e1965e5ff570835be004e753a718858473bc8325 100644 (file)
@@ -1185,7 +1185,7 @@ static void setup_pebs_sample_data(struct perf_event *event,
        else
                regs->flags &= ~PERF_EFLAGS_EXACT;
 
-       if ((sample_type & PERF_SAMPLE_ADDR) &&
+       if ((sample_type & (PERF_SAMPLE_ADDR | PERF_SAMPLE_PHYS_ADDR)) &&
            x86_pmu.intel_cap.pebs_format >= 1)
                data->addr = pebs->dla;
 
index 9337589014cce31112a3cfe9d61ba9e44a0a53c7..4196f81ec0e1b0de71483cbc999837ec48b51f33 100644 (file)
@@ -91,7 +91,7 @@ struct amd_nb {
        (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \
        PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \
        PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \
-       PERF_SAMPLE_TRANSACTION)
+       PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR)
 
 /*
  * A debug store configuration.
index adda0aaae6c8182252af5c52207feac07ee2c106..718ba163c1b939abf85be7a19d35fc31915a0300 100644 (file)
@@ -943,6 +943,8 @@ struct perf_sample_data {
 
        struct perf_regs                regs_intr;
        u64                             stack_user_size;
+
+       u64                             phys_addr;
 } ____cacheline_aligned;
 
 /* default value for data source */
index 2a37ae925d854cca62961ed52a9d2a916e2cc0a9..140ae638cfd618b931ec1d298a80d1dffa4c1680 100644 (file)
@@ -139,8 +139,9 @@ enum perf_event_sample_format {
        PERF_SAMPLE_IDENTIFIER                  = 1U << 16,
        PERF_SAMPLE_TRANSACTION                 = 1U << 17,
        PERF_SAMPLE_REGS_INTR                   = 1U << 18,
+       PERF_SAMPLE_PHYS_ADDR                   = 1U << 19,
 
-       PERF_SAMPLE_MAX = 1U << 19,             /* non-ABI */
+       PERF_SAMPLE_MAX = 1U << 20,             /* non-ABI */
 };
 
 /*
@@ -814,6 +815,7 @@ enum perf_event_type {
         *      { u64                   transaction; } && PERF_SAMPLE_TRANSACTION
         *      { u64                   abi; # enum perf_sample_regs_abi
         *        u64                   regs[weight(mask)]; } && PERF_SAMPLE_REGS_INTR
+        *      { u64                   phys_addr;} && PERF_SAMPLE_PHYS_ADDR
         * };
         */
        PERF_RECORD_SAMPLE                      = 9,
index 77fd6b11ef225f4db76a4384a2aa1c6917911f4f..ce64f3fed5c64e9f258ce6a97a8cb6b66a8c3fc6 100644 (file)
@@ -1575,6 +1575,9 @@ static void __perf_event_header_size(struct perf_event *event, u64 sample_type)
        if (sample_type & PERF_SAMPLE_TRANSACTION)
                size += sizeof(data->txn);
 
+       if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+               size += sizeof(data->phys_addr);
+
        event->header_size = size;
 }
 
@@ -6017,6 +6020,9 @@ void perf_output_sample(struct perf_output_handle *handle,
                }
        }
 
+       if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+               perf_output_put(handle, data->phys_addr);
+
        if (!event->attr.watermark) {
                int wakeup_events = event->attr.wakeup_events;
 
@@ -6032,6 +6038,38 @@ void perf_output_sample(struct perf_output_handle *handle,
        }
 }
 
+static u64 perf_virt_to_phys(u64 virt)
+{
+       u64 phys_addr = 0;
+       struct page *p = NULL;
+
+       if (!virt)
+               return 0;
+
+       if (virt >= TASK_SIZE) {
+               /* If it's vmalloc()d memory, leave phys_addr as 0 */
+               if (virt_addr_valid((void *)(uintptr_t)virt) &&
+                   !(virt >= VMALLOC_START && virt < VMALLOC_END))
+                       phys_addr = (u64)virt_to_phys((void *)(uintptr_t)virt);
+       } else {
+               /*
+                * Walking the pages tables for user address.
+                * Interrupts are disabled, so it prevents any tear down
+                * of the page tables.
+                * Try IRQ-safe __get_user_pages_fast first.
+                * If failed, leave phys_addr as 0.
+                */
+               if ((current->mm != NULL) &&
+                   (__get_user_pages_fast(virt, 1, 0, &p) == 1))
+                       phys_addr = page_to_phys(p) + virt % PAGE_SIZE;
+
+               if (p)
+                       put_page(p);
+       }
+
+       return phys_addr;
+}
+
 void perf_prepare_sample(struct perf_event_header *header,
                         struct perf_sample_data *data,
                         struct perf_event *event,
@@ -6150,6 +6188,9 @@ void perf_prepare_sample(struct perf_event_header *header,
 
                header->size += size;
        }
+
+       if (sample_type & PERF_SAMPLE_PHYS_ADDR)
+               data->phys_addr = perf_virt_to_phys(data->addr);
 }
 
 static void __always_inline
@@ -9909,6 +9950,11 @@ SYSCALL_DEFINE5(perf_event_open,
                        return -EINVAL;
        }
 
+       /* Only privileged users can get physical addresses */
+       if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+           perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
+               return -EACCES;
+
        if (!attr.sample_max_stack)
                attr.sample_max_stack = sysctl_perf_event_max_stack;