perf trace: Support multiple "vfs_getname" probes
[GitHub/moto-9609/android_kernel_motorola_exynos9610.git] / tools / perf / builtin-trace.c
1 /*
2 * builtin-trace.c
3 *
4 * Builtin 'trace' command:
5 *
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
9 *
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11 *
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
13 *
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15 *
16 * Released under the GPL v2. (and only v2, not any later version)
17 */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60
61 #include "sane_ctype.h"
62
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC 02000000
65 #endif
66
67 #ifndef F_LINUX_SPECIFIC_BASE
68 # define F_LINUX_SPECIFIC_BASE 1024
69 #endif
70
/*
 * Global state for one 'perf trace' session: tool callbacks, the syscall
 * table, collected events/threads and every user-selected option.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;	/* highest syscall id in 'table' */
		struct syscall  *table;	/* indexed by syscall id */
		struct {
			struct perf_evsel *sys_enter,	/* raw_syscalls:sys_enter (or syscalls:* on old kernels) */
					  *sys_exit;
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread of the most recent event */
	u64			base_time;	/* first timestamp, for relative tstamps */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* syscall-name qualifier list */
	struct {
		size_t		nr;
		int		*entries;	/* syscall ids resolved from ev_qualifier */
	} ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids to be filtered out */
	} filter_pids;
	double			duration_filter;	/* in ms, see trace__filter_duration() */
	double			runtime_ms;
	struct {
		u64		vfs_getname,	/* filenames obtained from vfs_getname probes */
				proc_getname;	/* filenames obtained via /proc readlink fallback */
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* qualifier list is an exclusion list */
	bool			live;
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* is a vfs_getname probe in the evlist? */
	int			trace_pgfaults;
	int			open_id;	/* id of "open", used for filename beautifying */
};
122
/*
 * Accessor for one tracepoint field: 'offset' into the raw sample payload
 * plus a typed reader, either as an integer or as a raw pointer.
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};

/*
 * Generate tp_field__u{8,16,32,64}(): read a host-endian unsigned integer of
 * the given width out of the raw sample data.  memcpy avoids unaligned loads.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value; \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/*
 * Same readers, but byte-swapping, for samples recorded on a machine of the
 * opposite endianness.
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
155
156 static int tp_field__init_uint(struct tp_field *field,
157 struct format_field *format_field,
158 bool needs_swap)
159 {
160 field->offset = format_field->offset;
161
162 switch (format_field->size) {
163 case 1:
164 field->integer = tp_field__u8;
165 break;
166 case 2:
167 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
168 break;
169 case 4:
170 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
171 break;
172 case 8:
173 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
174 break;
175 default:
176 return -1;
177 }
178
179 return 0;
180 }
181
/* Return a pointer straight into the raw sample payload (no copy). */
static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
{
	return sample->raw_data + field->offset;
}

/* Bind the raw-pointer reader; always succeeds, kept int for symmetry. */
static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
{
	field->offset = format_field->offset;
	field->pointer = tp_field__ptr;
	return 0;
}
193
/*
 * Field accessors for the sys_enter/sys_exit tracepoints: the syscall 'id'
 * plus either the entry 'args' array or the exit 'ret' value -- the two are
 * mutually exclusive per event, hence the union.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
200
/*
 * Look up tracepoint field 'name' on 'evsel' and wire an integer accessor
 * for it into 'field'.  Returns -1 when the field doesn't exist.
 */
static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
					  struct tp_field *field,
					  const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_uint(field, format_field, evsel->needs_swap);
}

/* Same, addressing the field by its member name in struct syscall_tp. */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })

/* As above, but wiring a raw-pointer accessor. */
static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
					 struct tp_field *field,
					 const char *name)
{
	struct format_field *format_field = perf_evsel__field(evsel, name);

	if (format_field == NULL)
		return -1;

	return tp_field__init_ptr(field, format_field);
}

#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
232
/* Free the evsel's private syscall_tp before deleting the evsel itself. */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}

/*
 * Allocate the per-evsel syscall_tp and bind the 'id' field accessor.
 * Returns 0 on success, -ENOMEM if allocation fails, -ENOENT when the
 * tracepoint has no 'id' field.
 */
static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
{
	evsel->priv = malloc(sizeof(struct syscall_tp));
	if (evsel->priv != NULL) {
		if (perf_evsel__init_sc_tp_uint_field(evsel, id))
			goto out_delete;

		evsel->handler = handler;
		return 0;
	}

	return -ENOMEM;

out_delete:
	zfree(&evsel->priv);
	return -ENOENT;
}
256
/*
 * Create an evsel for raw_syscalls:sys_{enter,exit} ('direction'), falling
 * back to the older syscalls:* names, and attach 'handler'.  Returns NULL
 * on failure (evsel and its priv are cleaned up).
 */
static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
{
	struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);

	/* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
	if (IS_ERR(evsel))
		evsel = perf_evsel__newtp("syscalls", direction);

	if (IS_ERR(evsel))
		return NULL;

	if (perf_evsel__init_syscall_tp(evsel, handler))
		goto out_delete;

	return evsel;

out_delete:
	perf_evsel__delete_priv(evsel);
	return NULL;
}

/* Read tracepoint field 'name' from 'sample' as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Read tracepoint field 'name' from 'sample' as a raw pointer. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
285
286 size_t strarray__scnprintf(struct strarray *sa, char *bf, size_t size, const char *intfmt, int val)
287 {
288 int idx = val - sa->offset;
289
290 if (idx < 0 || idx >= sa->nr_entries)
291 return scnprintf(bf, size, intfmt, val);
292
293 return scnprintf(bf, size, "%s", sa->entries[idx]);
294 }
295
/* strarray__scnprintf() adapter taking the table from arg->parm. */
static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
						const char *intfmt,
					        struct syscall_arg *arg)
{
	return strarray__scnprintf(arg->parm, bf, size, intfmt, arg->val);
}

/* Default strarray beautifier: out-of-range values printed as decimal. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
310
/*
 * A set of string tables consulted in order, for values such as fcntl cmds
 * that are split over several disjoint numeric ranges.
 */
struct strarrays {
	int		nr_entries;
	struct strarray **entries;
};

#define DEFINE_STRARRAYS(array) struct strarrays strarrays__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
320
321 size_t syscall_arg__scnprintf_strarrays(char *bf, size_t size,
322 struct syscall_arg *arg)
323 {
324 struct strarrays *sas = arg->parm;
325 int i;
326
327 for (i = 0; i < sas->nr_entries; ++i) {
328 struct strarray *sa = sas->entries[i];
329 int idx = arg->val - sa->offset;
330
331 if (idx >= 0 && idx < sa->nr_entries) {
332 if (sa->entries[idx] == NULL)
333 break;
334 return scnprintf(bf, size, "%s", sa->entries[idx]);
335 }
336 }
337
338 return scnprintf(bf, size, "%d", arg->val);
339 }
340
#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif

/* Like the fd beautifier, but prints "CWD" for the AT_FDCWD sentinel. */
static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
					   struct syscall_arg *arg)
{
	int fd = arg->val;

	if (fd == AT_FDCWD)
		return scnprintf(bf, size, "CWD");

	return syscall_arg__scnprintf_fd(bf, size, arg);
}

#define SCA_FDAT syscall_arg__scnprintf_fd_at

static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

/* Generic printers for hex, int and long valued arguments. */
size_t syscall_arg__scnprintf_hex(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

/* NOTE(review): arg->val is wider than int; "%d" relies on the value fitting. */
size_t syscall_arg__scnprintf_int(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

size_t syscall_arg__scnprintf_long(char *bf, size_t size, struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%ld", arg->val);
}
377
/*
 * String tables used with STRARRAY()/SCA_STRARRAY to translate numeric
 * syscall arguments into symbolic names.  Ordering follows the kernel UAPI
 * enum values; tables with a non-zero first value use DEFINE_STRARRAY_OFFSET.
 */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "GETLK64",
	"SETLK64", "SETLKW64", "SETOWN_EX", "GETOWN_EX",
	"GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* Linux-specific fcntl cmds start at F_LINUX_SPECIFIC_BASE (1024). */
static const char *fcntl_linux_specific_cmds[] = {
	"SETLEASE", "GETLEASE", "NOTIFY", [5] =	"CANCELLK", "DUPFD_CLOEXEC",
	"SETPIPE_SZ", "GETPIPE_SZ", "ADD_SEALS", "GET_SEALS",
	"GET_RW_HINT", "SET_RW_HINT", "GET_FILE_RW_HINT", "SET_FILE_RW_HINT",
};

static DEFINE_STRARRAY_OFFSET(fcntl_linux_specific_cmds, F_LINUX_SPECIFIC_BASE);

static struct strarray *fcntl_cmds_arrays[] = {
	&strarray__fcntl_cmds,
	&strarray__fcntl_linux_specific_cmds,
};

static DEFINE_STRARRAYS(fcntl_cmds_arrays);

static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
458
/*
 * Beautify access(2)'s mode argument: "F" for F_OK, otherwise a
 * concatenation of R/W/X, with any leftover bits appended in hex.
 */
static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	size_t printed = 0;
	int mode = arg->val;

	if (mode == F_OK) /* 0 */
		return scnprintf(bf, size, "F");
#define	P_MODE(n) \
	if (mode & n##_OK) { \
		printed += scnprintf(bf + printed, size - printed, "%s", #n); \
		mode &= ~n##_OK; \
	}

	P_MODE(R);
	P_MODE(W);
	P_MODE(X);
#undef P_MODE

	if (mode)
		printed += scnprintf(bf + printed, size - printed, "|%#x", mode);

	return printed;
}

#define SCA_ACCMODE syscall_arg__scnprintf_access_mode

static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
490
/*
 * Beautify pipe2(2)'s flags: "CLOEXEC|NONBLOCK" style, with unknown bits
 * appended in hex.
 */
static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
						struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & O_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~O_##n; \
	}

	P_FLAG(CLOEXEC);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
513
/* Fallback definitions for libcs whose headers predate getrandom(2). */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif

/* Beautify getrandom(2)'s flags, unknown bits appended in hex. */
static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
						   struct syscall_arg *arg)
{
	int printed = 0, flags = arg->val;

#define	P_FLAG(n) \
	if (flags & GRND_##n) { \
		printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
		flags &= ~GRND_##n; \
	}

	P_FLAG(RANDOM);
	P_FLAG(NONBLOCK);
#undef P_FLAG

	if (flags)
		printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);

	return printed;
}

#define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
543
/* Initializer for a syscall_arg_fmt entry backed by a string table. */
#define STRARRAY(name, array) \
	  { .scnprintf	= SCA_STRARRAY, \
	    .parm	= &strarray__##array, }
547
548 #include "trace/beauty/eventfd.c"
549 #include "trace/beauty/flock.c"
550 #include "trace/beauty/futex_op.c"
551 #include "trace/beauty/mmap.c"
552 #include "trace/beauty/mode_t.c"
553 #include "trace/beauty/msg_flags.c"
554 #include "trace/beauty/open_flags.c"
555 #include "trace/beauty/perf_event_open.c"
556 #include "trace/beauty/pid.c"
557 #include "trace/beauty/sched_policy.c"
558 #include "trace/beauty/seccomp.c"
559 #include "trace/beauty/signum.c"
560 #include "trace/beauty/socket_type.c"
561 #include "trace/beauty/waitid_options.c"
562
/*
 * How to pretty-print one syscall argument: formatter callback, an opaque
 * parameter for it, an override name, and whether to print zero values
 * (presumably overriding the default of suppressing them -- verify against
 * the argument printing loop).
 */
struct syscall_arg_fmt {
	size_t	   (*scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void	   *parm;
	const char *name;
	bool	   show_zero;
};
569
/*
 * Per-syscall pretty-printing overrides: argument formatters, an alias to
 * the name found in the syscall table, and flags (errpid: return value is a
 * pid; timeout: last arg is a timeout; hexret: print return value in hex).
 *
 * NOTE: looked up with bsearch() in syscall_fmt__find(), so entries MUST
 * stay sorted by ->name.
 */
static struct syscall_fmt {
	const char *name;
	const char *alias;
	struct syscall_arg_fmt arg[6];
	u8	   nr_args;
	bool	   errpid;
	bool	   timeout;
	bool	   hexret;
} syscall_fmts[] = {
	{ .name	    = "access",
	  .arg = { [1] = { .scnprintf = SCA_ACCMODE,  /* mode */ }, }, },
	{ .name	    = "arch_prctl", .alias = "prctl", },
	{ .name	    = "bpf",
	  .arg = { [0] = STRARRAY(cmd, bpf_cmd), }, },
	{ .name	    = "brk",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* brk */ }, }, },
	{ .name     = "clock_gettime",
	  .arg = { [0] = STRARRAY(clk_id, clockid), }, },
	{ .name	    = "clone",	    .errpid = true, .nr_args = 5,
	  .arg = { [0] = { .name = "flags",	    .scnprintf = SCA_CLONE_FLAGS, },
		   [1] = { .name = "child_stack",   .scnprintf = SCA_HEX, },
		   [2] = { .name = "parent_tidptr", .scnprintf = SCA_HEX, },
		   [3] = { .name = "child_tidptr",  .scnprintf = SCA_HEX, },
		   [4] = { .name = "tls",	    .scnprintf = SCA_HEX, }, }, },
	{ .name	    = "close",
	  .arg = { [0] = { .scnprintf = SCA_CLOSE_FD, /* fd */ }, }, },
	{ .name	    = "epoll_ctl",
	  .arg = { [1] = STRARRAY(op, epoll_ctl_ops), }, },
	{ .name	    = "eventfd2",
	  .arg = { [1] = { .scnprintf = SCA_EFD_FLAGS, /* flags */ }, }, },
	{ .name	    = "fchmodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fchownat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "fcntl",
	  .arg = { [1] = { .scnprintf = SCA_FCNTL_CMD, /* cmd */
			   .parm      = &strarrays__fcntl_cmds_arrays,
			   .show_zero = true, },
		   [2] = { .scnprintf =  SCA_FCNTL_ARG, /* arg */ }, }, },
	{ .name	    = "flock",
	  .arg = { [1] = { .scnprintf = SCA_FLOCK, /* cmd */ }, }, },
	{ .name	    = "fstat", .alias = "newfstat", },
	{ .name	    = "fstatat", .alias = "newfstatat", },
	{ .name	    = "futex",
	  .arg = { [1] = { .scnprintf = SCA_FUTEX_OP, /* op */ }, }, },
	{ .name	    = "futimesat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "getitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "getpid",	    .errpid = true, },
	{ .name	    = "getpgid",    .errpid = true, },
	{ .name	    = "getppid",    .errpid = true, },
	{ .name	    = "getrandom",
	  .arg = { [2] = { .scnprintf = SCA_GETRANDOM_FLAGS, /* flags */ }, }, },
	{ .name	    = "getrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "ioctl",
	  .arg = {
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
		   [1] = { .scnprintf = SCA_IOCTL_CMD, /* cmd */ },
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#else
		   [2] = { .scnprintf = SCA_HEX, /* arg */ }, }, },
#endif
	{ .name	    = "keyctl",
	  .arg = { [0] = STRARRAY(option, keyctl_options), }, },
	{ .name	    = "kill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "linkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "lseek",
	  .arg = { [2] = STRARRAY(whence, whences), }, },
	{ .name	    = "lstat", .alias = "newlstat", },
	{ .name     = "madvise",
	  .arg = { [0] = { .scnprintf = SCA_HEX,      /* start */ },
		   [2] = { .scnprintf = SCA_MADV_BHV, /* behavior */ }, }, },
	{ .name	    = "mkdirat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mknodat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* fd */ }, }, },
	{ .name	    = "mlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mlockall",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "mmap",	    .hexret = true,
/* The standard mmap maps to old_mmap on s390x */
#if defined(__s390x__)
	.alias = "old_mmap",
#endif
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* addr */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_MMAP_FLAGS,	/* flags */ }, }, },
	{ .name	    = "mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ }, }, },
	{ .name	    = "mq_unlink",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* u_name */ }, }, },
	{ .name	    = "mremap",	    .hexret = true,
	  .arg = { [0] = { .scnprintf = SCA_HEX,	  /* addr */ },
		   [3] = { .scnprintf = SCA_MREMAP_FLAGS, /* flags */ },
		   [4] = { .scnprintf = SCA_HEX,	  /* new_addr */ }, }, },
	{ .name	    = "munlock",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "munmap",
	  .arg = { [0] = { .scnprintf = SCA_HEX, /* addr */ }, }, },
	{ .name	    = "name_to_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "newfstatat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "open",
	  .arg = { [1] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "open_by_handle_at",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "openat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	/* dfd */ },
		   [2] = { .scnprintf = SCA_OPEN_FLAGS, /* flags */ }, }, },
	{ .name	    = "perf_event_open",
	  .arg = { [2] = { .scnprintf = SCA_INT,	/* cpu */ },
		   [3] = { .scnprintf = SCA_FD,		/* group_fd */ },
		   [4] = { .scnprintf = SCA_PERF_FLAGS, /* flags */ }, }, },
	{ .name	    = "pipe2",
	  .arg = { [1] = { .scnprintf = SCA_PIPE_FLAGS, /* flags */ }, }, },
	{ .name	    = "pkey_alloc",
	  .arg = { [1] = { .scnprintf = SCA_PKEY_ALLOC_ACCESS_RIGHTS,	/* access_rights */ }, }, },
	{ .name	    = "pkey_free",
	  .arg = { [0] = { .scnprintf = SCA_INT,	/* key */ }, }, },
	{ .name	    = "pkey_mprotect",
	  .arg = { [0] = { .scnprintf = SCA_HEX,	/* start */ },
		   [2] = { .scnprintf = SCA_MMAP_PROT,	/* prot */ },
		   [3] = { .scnprintf = SCA_INT,	/* pkey */ }, }, },
	{ .name	    = "poll", .timeout = true, },
	{ .name	    = "ppoll", .timeout = true, },
	{ .name	    = "pread", .alias = "pread64", },
	{ .name	    = "preadv", .alias = "pread", },
	{ .name	    = "prlimit64",
	  .arg = { [1] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "pwrite", .alias = "pwrite64", },
	{ .name	    = "readlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "recvfrom",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "recvmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "renameat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "rt_sigaction",
	  .arg = { [0] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_sigprocmask",
	  .arg = { [0] = STRARRAY(how, sighow), }, },
	{ .name	    = "rt_sigqueueinfo",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "rt_tgsigqueueinfo",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "sched_setscheduler",
	  .arg = { [1] = { .scnprintf = SCA_SCHED_POLICY, /* policy */ }, }, },
	{ .name	    = "seccomp",
	  .arg = { [0] = { .scnprintf = SCA_SECCOMP_OP,	   /* op */ },
		   [1] = { .scnprintf = SCA_SECCOMP_FLAGS, /* flags */ }, }, },
	{ .name	    = "select", .timeout = true, },
	{ .name	    = "sendmmsg",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendmsg",
	  .arg = { [2] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "sendto",
	  .arg = { [3] = { .scnprintf = SCA_MSG_FLAGS, /* flags */ }, }, },
	{ .name	    = "set_tid_address", .errpid = true, },
	{ .name	    = "setitimer",
	  .arg = { [0] = STRARRAY(which, itimers), }, },
	{ .name	    = "setrlimit",
	  .arg = { [0] = STRARRAY(resource, rlimit_resources), }, },
	{ .name	    = "socket",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "socketpair",
	  .arg = { [0] = STRARRAY(family, socket_families),
		   [1] = { .scnprintf = SCA_SK_TYPE, /* type */ }, }, },
	{ .name	    = "stat", .alias = "newstat", },
	{ .name	    = "statx",
	  .arg = { [0] = { .scnprintf = SCA_FDAT,	 /* fdat */ },
		   [2] = { .scnprintf = SCA_STATX_FLAGS, /* flags */ } ,
		   [3] = { .scnprintf = SCA_STATX_MASK,	 /* mask */ }, }, },
	{ .name	    = "swapoff",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "swapon",
	  .arg = { [0] = { .scnprintf = SCA_FILENAME, /* specialfile */ }, }, },
	{ .name	    = "symlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "tgkill",
	  .arg = { [2] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "tkill",
	  .arg = { [1] = { .scnprintf = SCA_SIGNUM, /* sig */ }, }, },
	{ .name	    = "uname", .alias = "newuname", },
	{ .name	    = "unlinkat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dfd */ }, }, },
	{ .name	    = "utimensat",
	  .arg = { [0] = { .scnprintf = SCA_FDAT, /* dirfd */ }, }, },
	{ .name	    = "wait4",	    .errpid = true,
	  .arg = { [2] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
	{ .name	    = "waitid",	    .errpid = true,
	  .arg = { [3] = { .scnprintf = SCA_WAITID_OPTIONS, /* options */ }, }, },
};
777
778 static int syscall_fmt__cmp(const void *name, const void *fmtp)
779 {
780 const struct syscall_fmt *fmt = fmtp;
781 return strcmp(name, fmt->name);
782 }
783
784 static struct syscall_fmt *syscall_fmt__find(const char *name)
785 {
786 const int nmemb = ARRAY_SIZE(syscall_fmts);
787 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
788 }
789
/*
 * Everything known about one syscall: its tracepoint format, the parsed
 * argument fields, and the optional pretty-printing overrides from
 * syscall_fmts[] / per-argument formatters.
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;
	const char	    *name;
	bool		    is_exit;	/* never returns (exit, exit_group) */
	struct syscall_fmt  *fmt;
	struct syscall_arg_fmt *arg_fmt;
};
799
800 /*
801 * We need to have this 'calculated' boolean because in some cases we really
802 * don't know what is the duration of a syscall, for instance, when we start
803 * a session and some threads are waiting for a syscall to finish, say 'poll',
804 * in which case all we can do is to print "( ? ) for duration and for the
805 * start timestamp.
806 */
807 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
808 {
809 double duration = (double)t / NSEC_PER_MSEC;
810 size_t printed = fprintf(fp, "(");
811
812 if (!calculated)
813 printed += fprintf(fp, " ? ");
814 else if (duration >= 1.0)
815 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
816 else if (duration >= 0.01)
817 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
818 else
819 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
820 return printed + fprintf(fp, "): ");
821 }
822
/**
 * Per-thread state accumulated while tracing.
 *
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 * ret_scnprintf: syscall args may set this to a different syscall return
 *                formatter, for instance, fcntl may return fds, file flags, etc.
 */
struct thread_trace {
	u64		  entry_time;	/* tstamp of the pending sys_enter */
	bool		  entry_pending;
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counts */
	char		  *entry_str;	/* partially formatted enter line, completed at exit */
	double		  runtime_ms;
	size_t		  (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
        struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;		/* highest fd cached in 'table', -1 when empty */
		char	  **table;	/* fd -> pathname cache */
	} paths;

	struct intlist *syscall_stats;
};
852
853 static struct thread_trace *thread_trace__new(void)
854 {
855 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
856
857 if (ttrace)
858 ttrace->paths.max = -1;
859
860 ttrace->syscall_stats = intlist__new(NULL);
861
862 return ttrace;
863 }
864
/*
 * Get (lazily creating) the thread_trace attached to 'thread' and bump its
 * event count.  On any failure a warning is printed to 'fp' and NULL is
 * returned -- callers then drop the sample.
 */
static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
{
	struct thread_trace *ttrace;

	if (thread == NULL)
		goto fail;

	if (thread__priv(thread) == NULL)
		thread__set_priv(thread, thread_trace__new());

	if (thread__priv(thread) == NULL)
		goto fail;

	ttrace = thread__priv(thread);
	++ttrace->nr_events;

	return ttrace;
fail:
	color_fprintf(fp, PERF_COLOR_RED,
		      "WARNING: not enough memory, dropping samples!\n");
	return NULL;
}


/* Install a per-call return-value formatter (e.g. fcntl returning an fd). */
void syscall_arg__set_ret_scnprintf(struct syscall_arg *arg,
				    size_t (*ret_scnprintf)(char *bf, size_t size, struct syscall_arg *arg))
{
	struct thread_trace *ttrace = thread__priv(arg->thread);

	ttrace->ret_scnprintf = ret_scnprintf;
}
896
/* Bits for trace->trace_pgfaults: trace major and/or minor page faults. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the per-thread buffer holding the formatted sys_enter line. */
static const size_t trace__entry_str_size = 2048;
901
/*
 * Cache 'pathname' for 'fd' in the thread's fd->path table, growing the
 * table (and zeroing the new slots) as needed.  Returns 0 on success,
 * -1 on allocation failure.
 */
static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (fd > ttrace->paths.max) {
		char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));

		if (npath == NULL)
			return -1;

		if (ttrace->paths.max != -1) {
			/* Zero only the newly added slots. */
			memset(npath + ttrace->paths.max + 1, 0,
			       (fd - ttrace->paths.max) * sizeof(char *));
		} else {
			/* First allocation: zero the whole table. */
			memset(npath, 0, (fd + 1) * sizeof(char *));
		}

		ttrace->paths.table = npath;
		ttrace->paths.max   = fd;
	}

	ttrace->paths.table[fd] = strdup(pathname);

	return ttrace->paths.table[fd] != NULL ? 0 : -1;
}
927
/*
 * Resolve 'fd' to a pathname by readlink'ing the thread's /proc fd entry
 * and cache the result.  Returns 0 on success, -1 when the link can't be
 * read (process gone, fd closed, path too long, or truncated readlink).
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* lstat gives the link's own size, used to detect truncation below. */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	if (ret < 0 || ret > st.st_size)
		return -1;

	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
953
/*
 * Return the cached pathname for 'fd', resolving it via /proc on a cache
 * miss -- but only in live mode, since /proc is stale when replaying a
 * perf.data file.  Returns NULL when unknown.
 */
static const char *thread__fd_path(struct thread *thread, int fd,
				   struct trace *trace)
{
	struct thread_trace *ttrace = thread__priv(thread);

	if (ttrace == NULL)
		return NULL;

	if (fd < 0)
		return NULL;

	if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
		if (!trace->live)
			return NULL;
		++trace->stats.proc_getname;	/* account the /proc fallback */
		if (thread__read_fd_path(thread, fd))
			return NULL;
	}

	return ttrace->paths.table[fd];
}
975
/* Print an fd as "N<path>" when the path is known, plain "N" otherwise. */
size_t syscall_arg__scnprintf_fd(char *bf, size_t size, struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = scnprintf(bf, size, "%d", fd);
	const char *path = thread__fd_path(arg->thread, fd, arg->trace);

	if (path)
		printed += scnprintf(bf + printed, size - printed, "<%s>", path);

	return printed;
}

/*
 * close(2) variant: print the fd like above, then drop its cached pathname
 * since the fd is going away.
 */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	int fd = arg->val;
	size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
	struct thread_trace *ttrace = thread__priv(arg->thread);

	if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
		zfree(&ttrace->paths.table[fd]);

	return printed;
}

/*
 * Remember which filename pointer is pending and where in the formatted
 * entry string its resolved name should be spliced in by vfs_getname.
 */
static void thread__set_filename_pos(struct thread *thread, const char *bf,
				     unsigned long ptr)
{
	struct thread_trace *ttrace = thread__priv(thread);

	ttrace->filename.ptr = ptr;
	ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
}
1009
1010 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1011 struct syscall_arg *arg)
1012 {
1013 unsigned long ptr = arg->val;
1014
1015 if (!arg->trace->vfs_getname)
1016 return scnprintf(bf, size, "%#x", ptr);
1017
1018 thread__set_filename_pos(arg->thread, bf, ptr);
1019 return 0;
1020 }
1021
1022 static bool trace__filter_duration(struct trace *trace, double t)
1023 {
1024 return t < (trace->duration_filter * NSEC_PER_MSEC);
1025 }
1026
1027 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1028 {
1029 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1030
1031 return fprintf(fp, "%10.3f ", ts);
1032 }
1033
1034 /*
1035 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1036 * using ttrace->entry_time for a thread that receives a sys_exit without
1037 * first having received a sys_enter ("poll" issued before tracing session
1038 * starts, lost sys_enter exit due to ring buffer overflow).
1039 */
1040 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1041 {
1042 if (tstamp > 0)
1043 return __trace__fprintf_tstamp(trace, tstamp, fp);
1044
1045 return fprintf(fp, " ? ");
1046 }
1047
/*
 * Set asynchronously from sig_handler(); volatile so the main event loop,
 * which polls these flags, re-reads them from memory on every iteration
 * instead of a possibly stale register copy.
 */
static volatile bool done = false;
static volatile bool interrupted = false;
1050
/*
 * Signal handler: only raise flags (async-signal-safe), the main loop
 * notices 'done' and winds the session down. 'interrupted' records
 * whether the signal was SIGINT specifically.
 */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1056
1057 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1058 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1059 {
1060 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1061 printed += fprintf_duration(duration, duration_calculated, fp);
1062
1063 if (trace->multiple_threads) {
1064 if (trace->show_comm)
1065 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1066 printed += fprintf(fp, "%d ", thread->tid);
1067 }
1068
1069 return printed;
1070 }
1071
1072 static int trace__process_event(struct trace *trace, struct machine *machine,
1073 union perf_event *event, struct perf_sample *sample)
1074 {
1075 int ret = 0;
1076
1077 switch (event->header.type) {
1078 case PERF_RECORD_LOST:
1079 color_fprintf(trace->output, PERF_COLOR_RED,
1080 "LOST %" PRIu64 " events!\n", event->lost.lost);
1081 ret = machine__process_lost_event(machine, event, sample);
1082 break;
1083 default:
1084 ret = machine__process_event(machine, event, sample);
1085 break;
1086 }
1087
1088 return ret;
1089 }
1090
1091 static int trace__tool_process(struct perf_tool *tool,
1092 union perf_event *event,
1093 struct perf_sample *sample,
1094 struct machine *machine)
1095 {
1096 struct trace *trace = container_of(tool, struct trace, tool);
1097 return trace__process_event(trace, machine, event, sample);
1098 }
1099
1100 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1101 {
1102 struct machine *machine = vmachine;
1103
1104 if (machine->kptr_restrict_warned)
1105 return NULL;
1106
1107 if (symbol_conf.kptr_restrict) {
1108 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1109 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1110 "Kernel samples will not be resolved.\n");
1111 machine->kptr_restrict_warned = true;
1112 return NULL;
1113 }
1114
1115 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1116 }
1117
/*
 * Bring up the symbol machinery, create the "host" machine representation
 * and synthesize the pre-existing threads so their samples can be
 * resolved. Returns 0 or a negative error; on synthesize failure the
 * symbol subsystem is torn down again.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	/* NOTE(review): assumes the failing registration set errno — confirm */
	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}
1140
/* Counterpart of trace__symbols_init(): drop the host machine and symbols. */
static void trace__symbols__exit(struct trace *trace)
{
	machine__exit(trace->host);
	trace->host = NULL;

	symbol__exit();
}
1148
1149 static int syscall__alloc_arg_fmts(struct syscall *sc, int nr_args)
1150 {
1151 int idx;
1152
1153 if (nr_args == 6 && sc->fmt && sc->fmt->nr_args != 0)
1154 nr_args = sc->fmt->nr_args;
1155
1156 sc->arg_fmt = calloc(nr_args, sizeof(*sc->arg_fmt));
1157 if (sc->arg_fmt == NULL)
1158 return -1;
1159
1160 for (idx = 0; idx < nr_args; ++idx) {
1161 if (sc->fmt)
1162 sc->arg_fmt[idx] = sc->fmt->arg[idx];
1163 }
1164
1165 sc->nr_args = nr_args;
1166 return 0;
1167 }
1168
/*
 * Choose a pretty-printer for each tracepoint argument, keyed on the
 * field's type and name as read from the tracefs format file. Slots
 * already filled from the static table are left untouched.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	for (field = sc->args; field; field = field->next, ++idx) {
		/* A hand-picked formatter always wins over the heuristics. */
		if (sc->fmt && sc->fmt->arg[idx].scnprintf)
			continue;

		if (strcmp(field->type, "const char *") == 0 &&
		    (strcmp(field->name, "filename") == 0 ||
		     strcmp(field->name, "path") == 0 ||
		     strcmp(field->name, "pathname") == 0))
			sc->arg_fmt[idx].scnprintf = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_fmt[idx].scnprintf = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_fmt[idx].scnprintf = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_fmt[idx].scnprintf = SCA_FD;
		}
	}

	return 0;
}
1207
/*
 * Lazily fill trace->syscalls.table[id]: resolve the syscall name, find
 * its static format entry and read the sys_enter tracepoint format from
 * tracefs, growing the table as needed. Returns 0 or -1.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	/* Grow the table up to 'id', zeroing only the newly added entries. */
	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			/* First allocation: clear the whole table. */
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls go by an alias in the tracepoint name, retry with it. */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	/* No format available: fall back to 6 raw argument slots. */
	if (syscall__alloc_arg_fmts(sc, IS_ERR(sc->tp_format) ? 6 : sc->tp_format->format.nr_fields))
		return -1;

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	/* exit()/exit_group() never return, so no sys_exit will be seen. */
	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1268
/*
 * Translate the user-supplied syscall name list (trace->ev_qualifier)
 * into an array of syscall ids, expanding glob patterns into possibly
 * several ids. All invalid names are accumulated into one error message;
 * returns 0 or a negative error, in which case the id array is freed.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	size_t nr_allocated;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		      trace->output);
		err = -EINVAL;
		goto out;
	}

	nr_allocated = trace->ev_qualifier_ids.nr;
	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc), match_next = -1;

		if (id < 0) {
			/* Not an exact name: maybe it is a glob pattern. */
			id = syscalltbl__strglobmatch_first(trace->sctbl, sc, &match_next);
			if (id >= 0)
				goto matches;

			/* Neither: fold this name into the error message. */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}
matches:
		trace->ev_qualifier_ids.entries[i++] = id;
		if (match_next == -1)
			continue;

		/* Glob matched: collect all remaining matching ids as well. */
		while (1) {
			id = syscalltbl__strglobmatch_next(trace->sctbl, sc, &match_next);
			if (id < 0)
				break;
			/* Grow the id array in chunks of 8 as globs expand. */
			if (nr_allocated == trace->ev_qualifier_ids.nr) {
				void *entries;

				nr_allocated += 8;
				entries = realloc(trace->ev_qualifier_ids.entries,
						  nr_allocated * sizeof(trace->ev_qualifier_ids.entries[0]));
				if (entries == NULL) {
					err = -ENOMEM;
					fputs("\nError:\t Not enough memory for parsing\n", trace->output);
					goto out_free;
				}
				trace->ev_qualifier_ids.entries = entries;
			}
			trace->ev_qualifier_ids.nr++;
			trace->ev_qualifier_ids.entries[i++] = id;
		}
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
out_free:
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1344
1345 /*
1346 * args is to be interpreted as a series of longs but we need to handle
1347 * 8-byte unaligned accesses. args points to raw_data within the event
1348 * and raw_data is guaranteed to be 8-byte unaligned because it is
1349 * preceded by raw_size which is a u32. So we need to copy args to a temp
1350 * variable to read it. Most notably this avoids extended load instructions
1351 * on unaligned addresses
1352 */
1353 unsigned long syscall_arg__val(struct syscall_arg *arg, u8 idx)
1354 {
1355 unsigned long val;
1356 unsigned char *p = arg->args + sizeof(unsigned long) * idx;
1357
1358 memcpy(&val, p, sizeof(val));
1359 return val;
1360 }
1361
1362 static size_t syscall__scnprintf_name(struct syscall *sc, char *bf, size_t size,
1363 struct syscall_arg *arg)
1364 {
1365 if (sc->arg_fmt && sc->arg_fmt[arg->idx].name)
1366 return scnprintf(bf, size, "%s: ", sc->arg_fmt[arg->idx].name);
1367
1368 return scnprintf(bf, size, "arg%d: ", arg->idx);
1369 }
1370
1371 static size_t syscall__scnprintf_val(struct syscall *sc, char *bf, size_t size,
1372 struct syscall_arg *arg, unsigned long val)
1373 {
1374 if (sc->arg_fmt && sc->arg_fmt[arg->idx].scnprintf) {
1375 arg->val = val;
1376 if (sc->arg_fmt[arg->idx].parm)
1377 arg->parm = sc->arg_fmt[arg->idx].parm;
1378 return sc->arg_fmt[arg->idx].scnprintf(bf, size, arg);
1379 }
1380 return scnprintf(bf, size, "%ld", val);
1381 }
1382
/*
 * Format all arguments of one syscall into 'bf'. When the tracepoint
 * format was read, walk its fields; otherwise fall back to the raw
 * argument slots. 'arg.mask' bits suppress args already consumed by
 * another arg's formatter.
 */
static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
				      unsigned char *args, struct trace *trace,
				      struct thread *thread)
{
	size_t printed = 0;
	unsigned long val;
	u8 bit = 1;
	struct syscall_arg arg = {
		.args	= args,
		.idx	= 0,
		.mask	= 0,
		.trace  = trace,
		.thread = thread,
	};
	struct thread_trace *ttrace = thread__priv(thread);

	/*
	 * Things like fcntl will set this in its 'cmd' formatter to pick the
	 * right formatter for the return value (an fd? file flags?), which is
	 * not needed for syscalls that always return a given type, say an fd.
	 */
	ttrace->ret_scnprintf = NULL;

	if (sc->args != NULL) {
		struct format_field *field;

		for (field = sc->args; field;
		     field = field->next, ++arg.idx, bit <<= 1) {
			if (arg.mask & bit)
				continue;

			val = syscall_arg__val(&arg, arg.idx);

			/*
			 * Suppress this argument if its value is zero and
			 * and we don't have a string associated in an
			 * strarray for it.
			 */
			if (val == 0 &&
			    !(sc->arg_fmt &&
			      (sc->arg_fmt[arg.idx].show_zero ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAY ||
			       sc->arg_fmt[arg.idx].scnprintf == SCA_STRARRAYS) &&
			      sc->arg_fmt[arg.idx].parm))
				continue;

			printed += scnprintf(bf + printed, size - printed,
					     "%s%s: ", printed ? ", " : "", field->name);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
		}
	} else if (IS_ERR(sc->tp_format)) {
		/*
		 * If we managed to read the tracepoint /format file, then we
		 * may end up not having any args, like with gettid(), so only
		 * print the raw args when we didn't manage to read it.
		 */
		while (arg.idx < sc->nr_args) {
			if (arg.mask & bit)
				goto next_arg;
			val = syscall_arg__val(&arg, arg.idx);
			if (printed)
				printed += scnprintf(bf + printed, size - printed, ", ");
			printed += syscall__scnprintf_name(sc, bf + printed, size - printed, &arg);
			printed += syscall__scnprintf_val(sc, bf + printed, size - printed, &arg, val);
next_arg:
			++arg.idx;
			bit <<= 1;
		}
	}

	return printed;
}
1455
/* Per-tracepoint sample handler; installed in evsel->handler. */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1459
/*
 * Return the syscall descriptor for 'id', reading its info on first use.
 * NULL when the id is invalid (seen in the wild, see comment below) or
 * its information cannot be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* Entry not populated yet: read it now. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Paranoia: re-check that reading actually produced a usable entry. */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1502
1503 static void thread__update_stats(struct thread_trace *ttrace,
1504 int id, struct perf_sample *sample)
1505 {
1506 struct int_node *inode;
1507 struct stats *stats;
1508 u64 duration = 0;
1509
1510 inode = intlist__findnew(ttrace->syscall_stats, id);
1511 if (inode == NULL)
1512 return;
1513
1514 stats = inode->priv;
1515 if (stats == NULL) {
1516 stats = malloc(sizeof(struct stats));
1517 if (stats == NULL)
1518 return;
1519 init_stats(stats);
1520 inode->priv = stats;
1521 }
1522
1523 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1524 duration = sample->time - ttrace->entry_time;
1525
1526 update_stats(stats, duration);
1527 }
1528
1529 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1530 {
1531 struct thread_trace *ttrace;
1532 u64 duration;
1533 size_t printed;
1534
1535 if (trace->current == NULL)
1536 return 0;
1537
1538 ttrace = thread__priv(trace->current);
1539
1540 if (!ttrace->entry_pending)
1541 return 0;
1542
1543 duration = sample->time - ttrace->entry_time;
1544
1545 printed = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1546 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1547 ttrace->entry_pending = false;
1548
1549 return printed;
1550 }
1551
/*
 * sys_enter tracepoint handler: format the syscall name and arguments
 * into ttrace->entry_str. Printing is deferred to trace__sys_exit so the
 * duration can be shown first — except for exit/exit_group, which never
 * return and are printed immediately.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the per-thread buffer the entry line is staged in. */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will follow: print right away. */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread seen, for trace__printf_interrupted_entry(). */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1610
1611 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1612 struct perf_sample *sample,
1613 struct callchain_cursor *cursor)
1614 {
1615 struct addr_location al;
1616
1617 if (machine__resolve(trace->host, &al, sample) < 0 ||
1618 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1619 return -1;
1620
1621 return 0;
1622 }
1623
1624 static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
1625 {
1626 /* TODO: user-configurable print_opts */
1627 const unsigned int print_opts = EVSEL__PRINT_SYM |
1628 EVSEL__PRINT_DSO |
1629 EVSEL__PRINT_UNKNOWN_AS_ADDR;
1630
1631 return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
1632 }
1633
/*
 * sys_exit tracepoint handler: compute the syscall duration, print the
 * staged entry line (or a "continued" marker when the entry was already
 * flushed) and format the return value per the syscall's fmt entry
 * (errno name, hex, child pid, timeout, or a formatter picked at entry
 * time via ttrace->ret_scnprintf).
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* open() returned: bind the name captured by vfs_getname to the fd. */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* Entry was flushed earlier (or lost); mark as continued. */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
		if (ret < 0)
			goto errno_print;
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0) {
		/* Note: also jumped into from the sc->fmt == NULL branch above. */
errno_print: {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	}
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (ttrace->ret_scnprintf) {
		char bf[1024];
		struct syscall_arg arg = {
			.val = ret,
			.thread = thread,
			.trace = trace,
		};
		/* One-shot formatter chosen by an arg formatter at entry time. */
		ttrace->ret_scnprintf(bf, sizeof(bf), &arg);
		ttrace->ret_scnprintf = NULL;
		fprintf(trace->output, ") = %s", bf);
	} else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1747
/*
 * probe:vfs_getname handler: the kernel-resolved pathname only becomes
 * available after the sys_enter line was formatted, so keep a copy for
 * binding to the fd at sys_exit (pending_open) and splice it into the
 * staged entry string at the position recorded by
 * thread__set_filename_pos().
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the per-thread copy buffer when this name is longer. */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No splice position recorded: nothing to patch into entry_str. */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Too long: keep only the tail of the filename that fits. */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Shift the rest of the entry string right, then drop the name in. */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1808
/*
 * sched_stat_runtime handler: accumulate on-CPU time per thread and
 * globally; when no thread_trace can be obtained, dump the raw fields.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
		evsel->name,
		perf_evsel__strval(evsel, sample, "comm"),
		(pid_t)perf_evsel__intval(evsel, sample, "pid"),
		runtime,
		perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
1838
1839 static void bpf_output__printer(enum binary_printer_ops op,
1840 unsigned int val, void *extra)
1841 {
1842 FILE *output = extra;
1843 unsigned char ch = (unsigned char)val;
1844
1845 switch (op) {
1846 case BINARY_PRINT_CHAR_DATA:
1847 fprintf(output, "%c", isprint(ch) ? ch : '.');
1848 break;
1849 case BINARY_PRINT_DATA_BEGIN:
1850 case BINARY_PRINT_LINE_BEGIN:
1851 case BINARY_PRINT_ADDR:
1852 case BINARY_PRINT_NUM_DATA:
1853 case BINARY_PRINT_NUM_PAD:
1854 case BINARY_PRINT_SEP:
1855 case BINARY_PRINT_CHAR_PAD:
1856 case BINARY_PRINT_LINE_END:
1857 case BINARY_PRINT_DATA_END:
1858 default:
1859 break;
1860 }
1861 }
1862
/* Dump a BPF output sample's raw payload as printable characters. */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	print_binary(sample->raw_data, sample->raw_size, 8,
		     bpf_output__printer, trace->output);
}
1869
/*
 * Handler for other (non-syscall) tracepoints/events: print the tstamp
 * and event name, then let the BPF output dumper or the tracepoint
 * format machinery render the payload.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Below the requested minimum stack depth: skip. */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Placeholder where syscall lines carry their duration column. */
	if (trace->trace_syscalls)
		fprintf(trace->output, "( ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1910
1911 static void print_location(FILE *f, struct perf_sample *sample,
1912 struct addr_location *al,
1913 bool print_dso, bool print_sym)
1914 {
1915
1916 if ((verbose > 0 || print_dso) && al->map)
1917 fprintf(f, "%s@", al->map->dso->long_name);
1918
1919 if ((verbose > 0 || print_sym) && al->sym)
1920 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1921 al->addr - al->sym->start);
1922 else if (al->map)
1923 fprintf(f, "0x%" PRIx64, al->addr);
1924 else
1925 fprintf(f, "0x%" PRIx64, sample->addr);
1926 }
1927
/*
 * Page fault software event handler (major and minor): bump the
 * per-thread counters and print
 * "maj/minfault [ip location] => target location (map type, level)".
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	/* Counted, but nothing printed in summary-only mode. */
	if (trace->summary_only)
		goto out;

	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
				   sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	/* Not a data map: retry as code ('x'), else mark unknown ('?'). */
	if (!al.map) {
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
2003
2004 static void trace__set_base_time(struct trace *trace,
2005 struct perf_evsel *evsel,
2006 struct perf_sample *sample)
2007 {
2008 /*
2009 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
2010 * and don't use sample->time unconditionally, we may end up having
2011 * some other event in the future without PERF_SAMPLE_TIME for good
2012 * reason, i.e. we may not be interested in its timestamps, just in
2013 * it taking place, picking some piece of information when it
2014 * appears in our event stream (vfs_getname comes to mind).
2015 */
2016 if (trace->base_time == 0 && !trace->full_time &&
2017 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
2018 trace->base_time = sample->time;
2019 }
2020
2021 static int trace__process_sample(struct perf_tool *tool,
2022 union perf_event *event,
2023 struct perf_sample *sample,
2024 struct perf_evsel *evsel,
2025 struct machine *machine __maybe_unused)
2026 {
2027 struct trace *trace = container_of(tool, struct trace, tool);
2028 struct thread *thread;
2029 int err = 0;
2030
2031 tracepoint_handler handler = evsel->handler;
2032
2033 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
2034 if (thread && thread__is_filtered(thread))
2035 goto out;
2036
2037 trace__set_base_time(trace, evsel, sample);
2038
2039 if (handler) {
2040 ++trace->nr_events;
2041 handler(trace, evsel, event, sample);
2042 }
2043 out:
2044 thread__put(thread);
2045 return err;
2046 }
2047
/*
 * 'perf trace record': build an argv for 'perf record' with the raw
 * syscall tracepoints (falling back to the pre-raw_syscalls names on
 * older kernels) and the requested pagefault events prepended to the
 * user's arguments, then hand off to cmd_record().
 */
static int trace__record(struct trace *trace, int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	const char **rec_argv;
	const char * const record_args[] = {
		"record",
		"-R",
		"-m", "1024",
		"-c", "1",
	};

	const char * const sc_args[] = { "-e", };
	unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
	const char * const majpf_args[] = { "-e", "major-faults" };
	unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
	const char * const minpf_args[] = { "-e", "minor-faults" };
	unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);

	/* +1 is for the event string below */
	rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
		majpf_args_nr + minpf_args_nr + argc;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));

	if (rec_argv == NULL)
		return -ENOMEM;

	j = 0;
	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[j++] = record_args[i];

	if (trace->trace_syscalls) {
		for (i = 0; i < sc_args_nr; i++)
			rec_argv[j++] = sc_args[i];

		/* event string may be different for older kernels - e.g., RHEL6 */
		if (is_valid_tracepoint("raw_syscalls:sys_enter"))
			rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
		else if (is_valid_tracepoint("syscalls:sys_enter"))
			rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
		else {
			pr_err("Neither raw_syscalls nor syscalls events exist.\n");
			free(rec_argv);
			return -1;
		}
	}

	if (trace->trace_pgfaults & TRACE_PFMAJ)
		for (i = 0; i < majpf_args_nr; i++)
			rec_argv[j++] = majpf_args[i];

	if (trace->trace_pgfaults & TRACE_PFMIN)
		for (i = 0; i < minpf_args_nr; i++)
			rec_argv[j++] = minpf_args[i];

	for (i = 0; i < (unsigned int)argc; i++)
		rec_argv[j++] = argv[i];

	return cmd_record(j, rec_argv);
}
2107
2108 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2109
2110 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2111 {
2112 bool found = false;
2113 struct perf_evsel *evsel, *tmp;
2114 struct parse_events_error err = { .idx = 0, };
2115 int ret = parse_events(evlist, "probe:vfs_getname*", &err);
2116
2117 if (ret)
2118 return false;
2119
2120 evlist__for_each_entry_safe(evlist, evsel, tmp) {
2121 if (!strstarts(perf_evsel__name(evsel), "probe:vfs_getname"))
2122 continue;
2123
2124 if (perf_evsel__field(evsel, "pathname")) {
2125 evsel->handler = trace__vfs_getname;
2126 found = true;
2127 continue;
2128 }
2129
2130 list_del_init(&evsel->node);
2131 evsel->evlist = NULL;
2132 perf_evsel__delete(evsel);
2133 }
2134
2135 return found;
2136 }
2137
2138 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2139 {
2140 struct perf_evsel *evsel;
2141 struct perf_event_attr attr = {
2142 .type = PERF_TYPE_SOFTWARE,
2143 .mmap_data = 1,
2144 };
2145
2146 attr.config = config;
2147 attr.sample_period = 1;
2148
2149 event_attr_init(&attr);
2150
2151 evsel = perf_evsel__new(&attr);
2152 if (evsel)
2153 evsel->handler = trace__pgfault;
2154
2155 return evsel;
2156 }
2157
2158 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2159 {
2160 const u32 type = event->header.type;
2161 struct perf_evsel *evsel;
2162
2163 if (type != PERF_RECORD_SAMPLE) {
2164 trace__process_event(trace, trace->host, event, sample);
2165 return;
2166 }
2167
2168 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2169 if (evsel == NULL) {
2170 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2171 return;
2172 }
2173
2174 trace__set_base_time(trace, evsel, sample);
2175
2176 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2177 sample->raw_data == NULL) {
2178 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2179 perf_evsel__name(evsel), sample->tid,
2180 sample->cpu, sample->raw_size);
2181 } else {
2182 tracepoint_handler handler = evsel->handler;
2183 handler(trace, evsel, event, sample);
2184 }
2185 }
2186
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels that drive
 * the strace-like output, cache the offsets of their payload fields, and
 * add both to the evlist.  On success the evsels are also stashed in
 * trace->syscalls.events for direct access; on failure anything created
 * so far is torn down.  Returns 0 on success, -1 on failure.
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	/* cache the format/offset of the "args" payload field */
	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	/* cache the format/offset of the "ret" payload field */
	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

/* error unwinding: delete in reverse order of creation */
out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2232
2233 static int trace__set_ev_qualifier_filter(struct trace *trace)
2234 {
2235 int err = -1;
2236 struct perf_evsel *sys_exit;
2237 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2238 trace->ev_qualifier_ids.nr,
2239 trace->ev_qualifier_ids.entries);
2240
2241 if (filter == NULL)
2242 goto out_enomem;
2243
2244 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2245 filter)) {
2246 sys_exit = trace->syscalls.events.sys_exit;
2247 err = perf_evsel__append_tp_filter(sys_exit, filter);
2248 }
2249
2250 free(filter);
2251 out:
2252 return err;
2253 out_enomem:
2254 errno = ENOMEM;
2255 goto out;
2256 }
2257
2258 static int trace__set_filter_loop_pids(struct trace *trace)
2259 {
2260 unsigned int nr = 1;
2261 pid_t pids[32] = {
2262 getpid(),
2263 };
2264 struct thread *thread = machine__find_thread(trace->host, pids[0], pids[0]);
2265
2266 while (thread && nr < ARRAY_SIZE(pids)) {
2267 struct thread *parent = machine__find_thread(trace->host, thread->ppid, thread->ppid);
2268
2269 if (parent == NULL)
2270 break;
2271
2272 if (!strcmp(thread__comm_str(parent), "sshd")) {
2273 pids[nr++] = parent->tid;
2274 break;
2275 }
2276 thread = parent;
2277 }
2278
2279 return perf_evlist__set_filter_pids(trace->evlist, nr, pids);
2280 }
2281
/*
 * Live mode driver: set up the requested events (syscall tracepoints,
 * vfs_getname probes, page faults, sched_stat_runtime), create the
 * target maps, open/mmap the events, optionally fork and kick off a
 * workload, then loop reading the ring buffers and dispatching events
 * until interrupted or the workload exits.  Returns 0 on success or a
 * negative error, printing a diagnostic to trace->output on failure.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;	/* a workload command line was given */
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
			/*
			 * Now we have evsels with different sample_ids, use
			 * PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			 * from a fixed position in each ring buffer record.
			 *
			 * As of this the changeset introducing this comment, this
			 * isn't strictly needed, as the fields that can come before
			 * PERF_SAMPLE_ID are all used, but we'll probably disable
			 * some of those for things like copying the payload of
			 * pointer syscall arguments, and for vfs_getname we don't
			 * need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			 * here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			 */
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		/* fork the workload now; it is started after the mmaps are set up */
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = trace__set_filter_loop_pids(trace);

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	/* enable now unless --delay asked us to wait for the workload start */
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	/* multiple threads => print the tid with each event */
	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	/* drain every per-cpu ring buffer */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			if (done && !draining) {
				/* workload done: stop producing, keep draining */
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		/* nothing new this pass: poll (bounded once 'done') before retrying */
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	trace__symbols__exit(trace);

	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * This brace block sits after the return and is reached only via the
 * goto labels inside it; it exists purely to scope errbuf for the
 * strerror-style error reporters below.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2547
/*
 * Replay mode ('perf trace -i perf.data'): open a recorded session, wire
 * up the perf_tool callbacks and the syscall/pgfault/vfs_getname handlers
 * for the events found in the file, then process all events in timestamp
 * order.  Returns 0 on success or a negative error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	/* sort events by time so the strace-like output reads coherently */
	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	    perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	    perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* hook the pgfault handler on any software page-fault event recorded */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2646
/*
 * Print the per-thread summary banner; returns the number of
 * characters written.
 */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2655
/*
 * DEFINE_RESORT_RB (rb_resort.h) generates a helper that re-sorts an
 * rbtree into a new order; here the per-thread syscall stats intlist is
 * re-sorted by total time spent (entry->msecs, descending per the
 * "a->msecs > b->msecs" comparator).  This body fills one sorted entry
 * from the source intlist node.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats	*stats;
	double		msecs;
	int		syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats   = stats;
	/* total time = call count * average, converted from ns to ms */
	entry->msecs   = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2669
/*
 * Print one thread's per-syscall statistics table (calls, total, min,
 * avg, max, stddev), sorted by total time via the syscall_stats resort
 * helper above.  Returns the number of characters written, 0 if the
 * stats could not be sorted.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* raw stats are in nanoseconds; columns are in msec */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* stddev as a percentage of the average */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2712
/*
 * Print the summary line for one thread (comm, tid, event count, share
 * of all events, fault counts, runtime when --sched) followed by its
 * per-syscall stats table.  Returns the number of characters written,
 * 0 for threads with no trace state.
 */
static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
{
	size_t printed = 0;
	struct thread_trace *ttrace = thread__priv(thread);
	double ratio;

	if (ttrace == NULL)
		return 0;

	/* this thread's share of all events seen, in percent */
	ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;

	printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
	printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
	printed += fprintf(fp, "%.1f%%", ratio);
	if (ttrace->pfmaj)
		printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
	if (ttrace->pfmin)
		printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
	if (trace->sched)
		printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
	else if (fputc('\n', fp) != EOF)	/* count the newline too */
		++printed;

	printed += thread__dump_stats(ttrace, trace, fp);

	return printed;
}
2740
2741 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2742 {
2743 return ttrace ? ttrace->nr_events : 0;
2744 }
2745
/*
 * Re-sort the machine's thread rbtree by event count, busiest thread
 * first (comparator "a < b" with this macro yields descending order);
 * used to order the end-of-run per-thread summary.
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2752
/*
 * Print the end-of-run summary: one section per traced thread, ordered
 * by number of events (busiest first).  Returns the number of characters
 * written, 0 if the threads could not be sorted.
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2771
2772 static int trace__set_duration(const struct option *opt, const char *str,
2773 int unset __maybe_unused)
2774 {
2775 struct trace *trace = opt->value;
2776
2777 trace->duration_filter = atof(str);
2778 return 0;
2779 }
2780
2781 static int trace__set_filter_pids(const struct option *opt, const char *str,
2782 int unset __maybe_unused)
2783 {
2784 int ret = -1;
2785 size_t i;
2786 struct trace *trace = opt->value;
2787 /*
2788 * FIXME: introduce a intarray class, plain parse csv and create a
2789 * { int nr, int entries[] } struct...
2790 */
2791 struct intlist *list = intlist__new(str);
2792
2793 if (list == NULL)
2794 return -1;
2795
2796 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2797 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2798
2799 if (trace->filter_pids.entries == NULL)
2800 goto out;
2801
2802 trace->filter_pids.entries[0] = getpid();
2803
2804 for (i = 1; i < trace->filter_pids.nr; ++i)
2805 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2806
2807 intlist__delete(list);
2808 ret = 0;
2809 out:
2810 return ret;
2811 }
2812
2813 static int trace__open_output(struct trace *trace, const char *filename)
2814 {
2815 struct stat st;
2816
2817 if (!stat(filename, &st) && st.st_size) {
2818 char oldname[PATH_MAX];
2819
2820 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2821 unlink(oldname);
2822 rename(filename, oldname);
2823 }
2824
2825 trace->output = fopen(filename, "w");
2826
2827 return trace->output == NULL ? -errno : 0;
2828 }
2829
2830 static int parse_pagefaults(const struct option *opt, const char *str,
2831 int unset __maybe_unused)
2832 {
2833 int *trace_pgfaults = opt->value;
2834
2835 if (strcmp(str, "all") == 0)
2836 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2837 else if (strcmp(str, "maj") == 0)
2838 *trace_pgfaults |= TRACE_PFMAJ;
2839 else if (strcmp(str, "min") == 0)
2840 *trace_pgfaults |= TRACE_PFMIN;
2841 else
2842 return -1;
2843
2844 return 0;
2845 }
2846
/* Assign the same sample handler to every evsel in the evlist. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2854
/*
 * XXX: Hackish, just splitting the combined -e+--event (syscalls
 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
 *
 * It'd be better to introduce a parse_options() variant that would return a
 * list with the terms it didn't match to an event...
 */
static int trace__parse_events_option(const struct option *opt, const char *str,
				      int unset __maybe_unused)
{
	struct trace *trace = (struct trace *)opt->value;
	const char *s = str;
	/* lists[0]: non-syscall events, lists[1]: syscall names/groups */
	char *sep = NULL, *lists[2] = { NULL, NULL, };
	int len = strlen(str) + 1, err = -1, list, idx;
	char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
	char group_name[PATH_MAX];

	if (strace_groups_dir == NULL)
		return -1;

	/* a leading '!' negates the whole syscall qualifier */
	if (*s == '!') {
		++s;
		trace->not_ev_qualifier = true;
	}

	while (1) {
		/* temporarily terminate at the comma; restored below */
		if ((sep = strchr(s, ',')) != NULL)
			*sep = '\0';

		/*
		 * A term is a "syscall" (list 1) if it names/globs a syscall
		 * in the table or matches a strace group file on disk;
		 * everything else goes to parse_events() (list 0).
		 */
		list = 0;
		if (syscalltbl__id(trace->sctbl, s) >= 0 ||
		    syscalltbl__strglobmatch_first(trace->sctbl, s, &idx) >= 0) {
			list = 1;
		} else {
			path__join(group_name, sizeof(group_name), strace_groups_dir, s);
			if (access(group_name, R_OK) == 0)
				list = 1;
		}

		if (lists[list]) {
			/* appending never exceeds len: terms come from str itself */
			sprintf(lists[list] + strlen(lists[list]), ",%s", s);
		} else {
			lists[list] = malloc(len);
			if (lists[list] == NULL)
				goto out;
			strcpy(lists[list], s);
		}

		if (!sep)
			break;

		*sep = ',';
		s = sep + 1;
	}

	if (lists[1] != NULL) {
		struct strlist_config slist_config = {
			.dirname = strace_groups_dir,
		};

		trace->ev_qualifier = strlist__new(lists[1], &slist_config);
		if (trace->ev_qualifier == NULL) {
			fputs("Not enough memory to parse event qualifier", trace->output);
			goto out;
		}

		if (trace__validate_ev_qualifier(trace))
			goto out;
	}

	err = 0;

	if (lists[0]) {
		/* hand the non-syscall terms to the stock -e parser */
		struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
					       "event selector. use 'perf list' to list available events",
					       parse_events_option);
		err = parse_events_option(&o, lists[0], 0);
	}
out:
	/* undo the in-place '\0' termination of the last term on error exits */
	if (sep)
		*sep = ',';

	return err;
}
2940
/*
 * Entry point for 'perf trace': parse options, decide between the
 * 'record' subcommand, replaying a perf.data file (-i) and live tracing,
 * then dispatch accordingly.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			.max = -1,	/* syscall table not yet sized */
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,	/* UINT_MAX == "not set by user" */
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,	/* UINT_MAX == "not set by user" */
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	/* page fault samples need address and time to be useful */
	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	/* asking for stack limits implies callchains; default to dwarf unwind */
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		/* root gets larger buffers so callchain-heavy traces don't drop events */
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* no workload and no target: trace the whole system */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}