samples: bpf: add bpf_perf_event_output example
author: Alexei Starovoitov <ast@plumgrid.com>
Wed, 21 Oct 2015 03:02:35 +0000 (20:02 -0700)
committer: David S. Miller <davem@davemloft.net>
Thu, 22 Oct 2015 13:42:15 +0000 (06:42 -0700)
Performance test and example of bpf_perf_event_output().
A kprobe is attached to sys_write() and a trivial bpf program streams
pid+cookie into userspace via the PERF_COUNT_SW_BPF_OUTPUT event.

Usage:
$ sudo ./bld_x64/samples/bpf/trace_output
recv 2968913 events per sec

Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
samples/bpf/Makefile
samples/bpf/bpf_helpers.h
samples/bpf/trace_output_kern.c [new file with mode: 0644]
samples/bpf/trace_output_user.c [new file with mode: 0644]

index 63e7d50e6a4fe3ca46cbd3b7ae064600bf6696ff..b30514514e370e1de4fed734d1dcc7d28c927695 100644 (file)
@@ -13,6 +13,7 @@ hostprogs-y += tracex3
 hostprogs-y += tracex4
 hostprogs-y += tracex5
 hostprogs-y += tracex6
+hostprogs-y += trace_output
 hostprogs-y += lathist
 
 test_verifier-objs := test_verifier.o libbpf.o
@@ -27,6 +28,7 @@ tracex3-objs := bpf_load.o libbpf.o tracex3_user.o
 tracex4-objs := bpf_load.o libbpf.o tracex4_user.o
 tracex5-objs := bpf_load.o libbpf.o tracex5_user.o
 tracex6-objs := bpf_load.o libbpf.o tracex6_user.o
+trace_output-objs := bpf_load.o libbpf.o trace_output_user.o
 lathist-objs := bpf_load.o libbpf.o lathist_user.o
 
 # Tell kbuild to always build the programs
@@ -40,6 +42,7 @@ always += tracex3_kern.o
 always += tracex4_kern.o
 always += tracex5_kern.o
 always += tracex6_kern.o
+always += trace_output_kern.o
 always += tcbpf1_kern.o
 always += lathist_kern.o
 
@@ -55,6 +58,7 @@ HOSTLOADLIBES_tracex3 += -lelf
 HOSTLOADLIBES_tracex4 += -lelf -lrt
 HOSTLOADLIBES_tracex5 += -lelf
 HOSTLOADLIBES_tracex6 += -lelf
+HOSTLOADLIBES_trace_output += -lelf -lrt
 HOSTLOADLIBES_lathist += -lelf
 
 # point this to your LLVM backend with bpf support
@@ -64,3 +68,6 @@ $(obj)/%.o: $(src)/%.c
        clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
                -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
                -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=obj -o $@
+       clang $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(EXTRA_CFLAGS) \
+               -D__KERNEL__ -Wno-unused-value -Wno-pointer-sign \
+               -O2 -emit-llvm -c $< -o -| $(LLC) -march=bpf -filetype=asm -o $@.s
index 21aa1b44c30ca1ff8ba369de84f7492df01aa518..b35c21e0b43f68a6dd57fe784e3183c821aaefa9 100644 (file)
@@ -37,6 +37,8 @@ static int (*bpf_clone_redirect)(void *ctx, int ifindex, int flags) =
        (void *) BPF_FUNC_clone_redirect;
 static int (*bpf_redirect)(int ifindex, int flags) =
        (void *) BPF_FUNC_redirect;
+static int (*bpf_perf_event_output)(void *ctx, void *map, int index, void *data, int size) =
+       (void *) BPF_FUNC_perf_event_output; /* helper ID cast to a function pointer, same pattern as the bpf_* stubs above */
 
 /* llvm builtin functions that eBPF C program may use to
  * emit BPF_LD_ABS and BPF_LD_IND instructions
diff --git a/samples/bpf/trace_output_kern.c b/samples/bpf/trace_output_kern.c
new file mode 100644 (file)
index 0000000..8d8d1ec
--- /dev/null
@@ -0,0 +1,31 @@
+#include <linux/ptrace.h>
+#include <linux/version.h>
+#include <uapi/linux/bpf.h>
+#include "bpf_helpers.h"
+
+struct bpf_map_def SEC("maps") my_map = { /* perf event array that bpf_prog1 passes to bpf_perf_event_output() */
+       .type = BPF_MAP_TYPE_PERF_EVENT_ARRAY,
+       .key_size = sizeof(int),
+       .value_size = sizeof(u32),
+       .max_entries = 2, /* bpf_prog1 itself only outputs through index 0 */
+};
+
+/* kprobe program for sys_write(): sends the value of
+ * bpf_get_current_pid_tgid() plus a fixed cookie through index 0 of my_map
+ * using bpf_perf_event_output().  Always returns 0.
+ */
+SEC("kprobe/sys_write")
+int bpf_prog1(struct pt_regs *ctx)
+{
+       /* two u64 fields, so the initializer below fills every byte */
+       struct S {
+               u64 pid;
+               u64 cookie;
+       } data = {
+               .pid = bpf_get_current_pid_tgid(),
+               .cookie = 0x12345678,
+       };
+
+       bpf_perf_event_output(ctx, &my_map, 0, &data, sizeof(data));
+
+       return 0;
+}
+
+char _license[] SEC("license") = "GPL"; /* license string; presumably picked up by the loader from this section -- confirm in bpf_load.c */
+u32 _version SEC("version") = LINUX_VERSION_CODE; /* LINUX_VERSION_CODE of the tree this object was built from */
diff --git a/samples/bpf/trace_output_user.c b/samples/bpf/trace_output_user.c
new file mode 100644 (file)
index 0000000..661a7d0
--- /dev/null
@@ -0,0 +1,196 @@
+/* This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdbool.h>
+#include <string.h>
+#include <fcntl.h>
+#include <poll.h>
+#include <sys/ioctl.h>
+#include <linux/perf_event.h>
+#include <linux/bpf.h>
+#include <errno.h>
+#include <assert.h>
+#include <sys/syscall.h>
+#include <sys/ioctl.h>
+#include <sys/mman.h>
+#include <time.h>
+#include <signal.h>
+#include "libbpf.h"
+#include "bpf_load.h"
+
+static int pmu_fd; /* fd from perf_event_open(); stored in the BPF map, then mmap'ed and polled by main() */
+
+int page_size; /* set from getpagesize() in perf_event_mmap() */
+int page_cnt = 8; /* number of data pages; the mapping is page_cnt + 1 pages long */
+volatile struct perf_event_mmap_page *header; /* start of the mmap'ed region, set in perf_event_mmap() */
+
+typedef void (*print_fn)(void *data, int size); /* callback perf_event_read() invokes for each PERF_RECORD_SAMPLE */
+
+/* Map the perf buffer of @fd: page_cnt + 1 pages, where the first page is
+ * the struct perf_event_mmap_page that the global 'header' is pointed at.
+ * Also records the system page size in the global 'page_size'.
+ *
+ * Returns 0 on success, -1 (after printing a message) if mmap() fails.
+ */
+static int perf_event_mmap(int fd)
+{
+       int len;
+       void *map;
+
+       page_size = getpagesize();
+       len = (page_cnt + 1) * page_size;
+
+       map = mmap(NULL, len, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+       if (map != MAP_FAILED) {
+               header = map;
+               return 0;
+       }
+
+       printf("mmap err\n");
+       return -1;
+}
+
+/* Wait up to 1000 ms for POLLIN on @fd and return poll()'s result. */
+static int perf_event_poll(int fd)
+{
+       struct pollfd pfd = { 0 };
+
+       pfd.fd = fd;
+       pfd.events = POLLIN;
+
+       return poll(&pfd, 1, 1000);
+}
+
+struct perf_event_sample { /* layout perf_event_read() applies to PERF_RECORD_SAMPLE records */
+       struct perf_event_header header;
+       __u32 size; /* handed to the callback as the length of data[] */
+       char data[]; /* payload handed to the callback */
+};
+
+/* Drain every record currently in the mmap'ed perf ring buffer.
+ *
+ * The payload of each PERF_RECORD_SAMPLE is handed to @fn, the count of a
+ * PERF_RECORD_LOST record is printed, and any other record type is reported
+ * as unknown.  On return data_tail has been advanced to the data_head value
+ * sampled on entry, so the consumed space can be reused.
+ */
+void perf_event_read(print_fn fn)
+{
+       __u64 data_tail = header->data_tail;
+       __u64 data_head = header->data_head;
+       __u64 buffer_size = page_cnt * page_size;
+       char *base, *begin, *end;
+       /* bounce buffer for a record that wraps around the end of the ring;
+        * aligned so the __u64 fields of a copied record can be read
+        * through a struct pointer
+        */
+       char buf[256] __attribute__((aligned(8)));
+
+       /* data_head has to be read before the records it covers; a
+        * compiler-only barrier does not order the loads on weakly
+        * ordered CPUs, so use a full barrier
+        */
+       __sync_synchronize();
+       if (data_head == data_tail)
+               return;
+
+       base = ((char *)header) + page_size;
+
+       begin = base + data_tail % buffer_size;
+       end = base + data_head % buffer_size;
+
+       while (begin != end) {
+               struct perf_event_sample *e = (void *) begin;
+               __u16 size = e->header.size;
+
+               if (size == 0) {
+                       /* a zero-sized record would never advance 'begin' */
+                       printf("corrupt event: size=0, dropping rest of buffer\n");
+                       break;
+               }
+
+               if (begin + size > base + buffer_size) {
+                       /* record wraps: 'len' bytes sit at the end of the
+                        * ring, the remainder at its start
+                        */
+                       long len = base + buffer_size - begin;
+
+                       assert(len < size);
+                       begin = base + size - len;
+                       if (size > sizeof(buf)) {
+                               /* does not fit the bounce buffer: skip it
+                                * instead of overflowing the stack
+                                */
+                               printf("skipping wrapped event size=%d\n",
+                                      size);
+                               continue;
+                       }
+                       memcpy(buf, e, len);
+                       memcpy(buf + len, base, size - len);
+                       e = (void *) buf;
+               } else if (begin + size == base + buffer_size) {
+                       begin = base;
+               } else {
+                       begin += size;
+               }
+
+               if (e->header.type == PERF_RECORD_SAMPLE) {
+                       fn(e->data, e->size);
+               } else if (e->header.type == PERF_RECORD_LOST) {
+                       struct {
+                               struct perf_event_header header;
+                               __u64 id;
+                               __u64 lost;
+                       } *lost = (void *) e;
+                       printf("lost %llu events\n", lost->lost);
+               } else {
+                       printf("unknown event type=%d size=%d\n",
+                              e->header.type, e->header.size);
+               }
+       }
+
+       __sync_synchronize(); /* smp_mb() */
+       header->data_tail = data_head;
+}
+
+/* Current CLOCK_MONOTONIC reading converted to nanoseconds. */
+static __u64 time_get_ns(void)
+{
+       struct timespec now;
+       __u64 ns;
+
+       clock_gettime(CLOCK_MONOTONIC, &now);
+       ns = now.tv_sec * 1000000000ull;
+       ns += now.tv_nsec;
+
+       return ns;
+}
+
+static __u64 start_time; /* time_get_ns() value taken by main() right before the read loop */
+
+#define MAX_CNT 100000ll /* samples print_bpf_output() counts before printing the rate */
+
+/* Per-sample callback for perf_event_read().
+ *
+ * @data is read as a pid/cookie pair.  A cookie other than 0x12345678 is
+ * reported as a bug and SIGINT is sent to the process group.  After the
+ * MAX_CNT-th sample the receive rate since start_time is printed and
+ * SIGINT is sent to the process group.
+ */
+static void print_bpf_output(void *data, int size)
+{
+       struct sample {
+               __u64 pid;
+               __u64 cookie;
+       };
+       static __u64 cnt;
+       struct sample *rec = data;
+       __u64 elapsed_ns;
+
+       if (rec->cookie != 0x12345678) {
+               printf("BUG pid %llx cookie %llx sized %d\n",
+                      rec->pid, rec->cookie, size);
+               kill(0, SIGINT);
+       }
+
+       if (++cnt != MAX_CNT)
+               return;
+
+       elapsed_ns = time_get_ns() - start_time;
+       printf("recv %lld events per sec\n",
+              MAX_CNT * 1000000000ll / elapsed_ns);
+       kill(0, SIGINT);
+}
+
+/* Open a PERF_COUNT_SW_BPF_OUTPUT software event on cpu 0 (any pid), store
+ * its fd in element 0 of the first loaded BPF map and enable the event.
+ *
+ * The fd is left in the global pmu_fd.  The process exits with status 1
+ * if the event cannot be opened or the map cannot be updated.
+ */
+static void test_bpf_perf_event(void)
+{
+       struct perf_event_attr attr = {
+               .sample_type = PERF_SAMPLE_RAW,
+               .type = PERF_TYPE_SOFTWARE,
+               .config = PERF_COUNT_SW_BPF_OUTPUT,
+       };
+       int key = 0;
+
+       pmu_fd = perf_event_open(&attr, -1/*pid*/, 0/*cpu*/, -1/*group_fd*/, 0);
+       if (pmu_fd < 0) {
+               printf("perf_event_open err\n");
+               exit(1);
+       }
+
+       /* Keep the map update out of assert(): with NDEBUG defined the
+        * argument of assert() is not evaluated and the update would be
+        * dropped.
+        */
+       if (bpf_update_elem(map_fd[0], &key, &pmu_fd, BPF_ANY) != 0) {
+               printf("bpf_update_elem err\n");
+               exit(1);
+       }
+
+       /* report, but tolerate, a failed enable request */
+       if (ioctl(pmu_fd, PERF_EVENT_IOC_ENABLE, 0) < 0)
+               printf("PERF_EVENT_IOC_ENABLE err: %s\n", strerror(errno));
+}
+
+/* Load "<argv[0]>_kern.o", set up the perf event and its ring buffer, start
+ * a dd process as a source of write() calls and then read events forever.
+ *
+ * The loop has no exit of its own; the run ends when print_bpf_output()
+ * sends SIGINT to the process group.  Returns 1 if a setup step fails.
+ */
+int main(int argc, char **argv)
+{
+       char filename[256];
+       FILE *f;
+
+       snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]);
+
+       if (load_bpf_file(filename)) {
+               printf("%s", bpf_log_buf);
+               return 1;
+       }
+
+       test_bpf_perf_event();
+
+       if (perf_event_mmap(pmu_fd) < 0)
+               return 1;
+
+       /* load generator: dd pinned by taskset with affinity mask 1.
+        * Without it there may be too few events to ever reach MAX_CNT,
+        * so a failure to start it is fatal.  The stream is intentionally
+        * never read or closed.
+        */
+       f = popen("taskset 1 dd if=/dev/zero of=/dev/null", "r");
+       if (!f) {
+               printf("popen err\n");
+               return 1;
+       }
+
+       start_time = time_get_ns();
+       for (;;) {
+               perf_event_poll(pmu_fd);
+               perf_event_read(print_bpf_output);
+       }
+
+       return 0;
+}