Merge branch 'bind_unbind' of git://git.kernel.org/pub/scm/linux/kernel/git/gregkh...
[GitHub/LineageOS/android_kernel_motorola_exynos9610.git] / tools / perf / builtin-trace.c
1 /*
2 * builtin-trace.c
3 *
4 * Builtin 'trace' command:
5 *
6 * Display a continuously updated trace of any workload, CPU, specific PID,
7 * system wide, etc. Default format is loosely strace like, but any other
8 * event may be specified using --event.
9 *
10 * Copyright (C) 2012, 2013, 2014, 2015 Red Hat Inc, Arnaldo Carvalho de Melo <acme@redhat.com>
11 *
12 * Initially based on the 'trace' prototype by Thomas Gleixner:
13 *
14 * http://lwn.net/Articles/415728/ ("Announcing a new utility: 'trace'")
15 *
16 * Released under the GPL v2. (and only v2, not any later version)
17 */
18
19 #include <traceevent/event-parse.h>
20 #include <api/fs/tracing_path.h>
21 #include "builtin.h"
22 #include "util/color.h"
23 #include "util/debug.h"
24 #include "util/event.h"
25 #include "util/evlist.h"
26 #include <subcmd/exec-cmd.h>
27 #include "util/machine.h"
28 #include "util/path.h"
29 #include "util/session.h"
30 #include "util/thread.h"
31 #include <subcmd/parse-options.h>
32 #include "util/strlist.h"
33 #include "util/intlist.h"
34 #include "util/thread_map.h"
35 #include "util/stat.h"
36 #include "trace/beauty/beauty.h"
37 #include "trace-event.h"
38 #include "util/parse-events.h"
39 #include "util/bpf-loader.h"
40 #include "callchain.h"
41 #include "print_binary.h"
42 #include "string2.h"
43 #include "syscalltbl.h"
44 #include "rb_resort.h"
45
46 #include <errno.h>
47 #include <inttypes.h>
48 #include <libaudit.h> /* FIXME: Still needed for audit_errno_to_name */
49 #include <poll.h>
50 #include <signal.h>
51 #include <stdlib.h>
52 #include <string.h>
53 #include <linux/err.h>
54 #include <linux/filter.h>
55 #include <linux/audit.h>
56 #include <linux/kernel.h>
57 #include <linux/random.h>
58 #include <linux/stringify.h>
59 #include <linux/time64.h>
60
61 #include "sane_ctype.h"
62
63 #ifndef O_CLOEXEC
64 # define O_CLOEXEC 02000000
65 #endif
66
/*
 * Global state for one 'perf trace' session: the tool callbacks, the
 * syscall table and per-syscall info, the evlist being monitored,
 * output/filter options and accumulated statistics.
 */
struct trace {
	struct perf_tool	tool;
	struct syscalltbl	*sctbl;
	struct {
		int		max;		/* highest syscall nr seen in the table */
		struct syscall  *table;		/* indexed by syscall number */
		struct {
			struct perf_evsel *sys_enter,	/* raw_syscalls:sys_enter evsel */
					  *sys_exit;	/* raw_syscalls:sys_exit evsel */
		}		events;
	} syscalls;
	struct record_opts	opts;
	struct perf_evlist	*evlist;
	struct machine		*host;
	struct thread		*current;	/* thread that made the last sys_enter */
	u64			base_time;	/* first sample timestamp, for relative output */
	FILE			*output;
	unsigned long		nr_events;
	struct strlist		*ev_qualifier;	/* names from -e/--expr */
	struct {
		size_t		nr;
		int		*entries;	/* syscall ids resolved from ev_qualifier */
	}			ev_qualifier_ids;
	struct {
		size_t		nr;
		pid_t		*entries;	/* pids filtered out of the trace */
	}			filter_pids;
	double			duration_filter;	/* only show syscalls longer than this (ms) */
	double			runtime_ms;
	struct {
		u64		vfs_getname,	/* filenames resolved via the vfs_getname probe */
				proc_getname;	/* filenames resolved via /proc readlink fallback */
	} stats;
	unsigned int		max_stack;
	unsigned int		min_stack;
	bool			not_ev_qualifier;	/* ev_qualifier is an exclude list */
	bool			live;		/* live mode vs. perf.data replay */
	bool			full_time;
	bool			sched;
	bool			multiple_threads;
	bool			summary;
	bool			summary_only;
	bool			show_comm;
	bool			show_tool_stats;
	bool			trace_syscalls;
	bool			kernel_syscallchains;
	bool			force;
	bool			vfs_getname;	/* the vfs_getname probe is in place */
	int			trace_pgfaults;	/* TRACE_PFMAJ/TRACE_PFMIN mask */
	int			open_id;	/* syscall id of open(), for vfs_getname matching */
};
118
/*
 * Accessor for one field of a tracepoint sample: the byte offset into
 * the raw payload plus a typed fetch callback (integer or pointer,
 * depending on which init function was used).
 */
struct tp_field {
	int offset;
	union {
		u64 (*integer)(struct tp_field *field, struct perf_sample *sample);
		void *(*pointer)(struct tp_field *field, struct perf_sample *sample);
	};
};
126
/*
 * Generate tp_field__u{8,16,32,64}(): fetch a host-endian unsigned
 * integer of the given width from the sample's raw payload.  memcpy()
 * is used because the field may not be naturally aligned there.
 */
#define TP_UINT_FIELD(bits) \
static u64 tp_field__u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return value; \
}

TP_UINT_FIELD(8);
TP_UINT_FIELD(16);
TP_UINT_FIELD(32);
TP_UINT_FIELD(64);

/*
 * Byte-swapping variants, for perf.data files recorded on a machine of
 * the opposite endianness (selected via evsel->needs_swap).
 */
#define TP_UINT_FIELD__SWAPPED(bits) \
static u64 tp_field__swapped_u##bits(struct tp_field *field, struct perf_sample *sample) \
{ \
	u##bits value; \
	memcpy(&value, sample->raw_data + field->offset, sizeof(value)); \
	return bswap_##bits(value);\
}

TP_UINT_FIELD__SWAPPED(16);
TP_UINT_FIELD__SWAPPED(32);
TP_UINT_FIELD__SWAPPED(64);
151
152 static int tp_field__init_uint(struct tp_field *field,
153 struct format_field *format_field,
154 bool needs_swap)
155 {
156 field->offset = format_field->offset;
157
158 switch (format_field->size) {
159 case 1:
160 field->integer = tp_field__u8;
161 break;
162 case 2:
163 field->integer = needs_swap ? tp_field__swapped_u16 : tp_field__u16;
164 break;
165 case 4:
166 field->integer = needs_swap ? tp_field__swapped_u32 : tp_field__u32;
167 break;
168 case 8:
169 field->integer = needs_swap ? tp_field__swapped_u64 : tp_field__u64;
170 break;
171 default:
172 return -1;
173 }
174
175 return 0;
176 }
177
178 static void *tp_field__ptr(struct tp_field *field, struct perf_sample *sample)
179 {
180 return sample->raw_data + field->offset;
181 }
182
183 static int tp_field__init_ptr(struct tp_field *field, struct format_field *format_field)
184 {
185 field->offset = format_field->offset;
186 field->pointer = tp_field__ptr;
187 return 0;
188 }
189
/*
 * Field accessors for the raw_syscalls:sys_{enter,exit} tracepoints:
 * the syscall id plus either the argument array (enter) or the return
 * value (exit) — a union since an evsel is one or the other.
 */
struct syscall_tp {
	struct tp_field id;
	union {
		struct tp_field args, ret;
	};
};
196
197 static int perf_evsel__init_tp_uint_field(struct perf_evsel *evsel,
198 struct tp_field *field,
199 const char *name)
200 {
201 struct format_field *format_field = perf_evsel__field(evsel, name);
202
203 if (format_field == NULL)
204 return -1;
205
206 return tp_field__init_uint(field, format_field, evsel->needs_swap);
207 }
208
/*
 * Bind the syscall_tp member 'name' (in evsel->priv) to the tracepoint
 * field of the same name, as an integer accessor.
 */
#define perf_evsel__init_sc_tp_uint_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_uint_field(evsel, &sc->name, #name); })
212
213 static int perf_evsel__init_tp_ptr_field(struct perf_evsel *evsel,
214 struct tp_field *field,
215 const char *name)
216 {
217 struct format_field *format_field = perf_evsel__field(evsel, name);
218
219 if (format_field == NULL)
220 return -1;
221
222 return tp_field__init_ptr(field, format_field);
223 }
224
/*
 * Bind the syscall_tp member 'name' (in evsel->priv) to the tracepoint
 * field of the same name, as a pointer accessor.
 */
#define perf_evsel__init_sc_tp_ptr_field(evsel, name) \
	({ struct syscall_tp *sc = evsel->priv;\
	   perf_evsel__init_tp_ptr_field(evsel, &sc->name, #name); })
228
/*
 * Free the evsel together with its ->priv (syscall_tp); priv is freed
 * first since perf_evsel__delete() releases the evsel itself.
 */
static void perf_evsel__delete_priv(struct perf_evsel *evsel)
{
	zfree(&evsel->priv);
	perf_evsel__delete(evsel);
}
234
235 static int perf_evsel__init_syscall_tp(struct perf_evsel *evsel, void *handler)
236 {
237 evsel->priv = malloc(sizeof(struct syscall_tp));
238 if (evsel->priv != NULL) {
239 if (perf_evsel__init_sc_tp_uint_field(evsel, id))
240 goto out_delete;
241
242 evsel->handler = handler;
243 return 0;
244 }
245
246 return -ENOMEM;
247
248 out_delete:
249 zfree(&evsel->priv);
250 return -ENOENT;
251 }
252
253 static struct perf_evsel *perf_evsel__syscall_newtp(const char *direction, void *handler)
254 {
255 struct perf_evsel *evsel = perf_evsel__newtp("raw_syscalls", direction);
256
257 /* older kernel (e.g., RHEL6) use syscalls:{enter,exit} */
258 if (IS_ERR(evsel))
259 evsel = perf_evsel__newtp("syscalls", direction);
260
261 if (IS_ERR(evsel))
262 return NULL;
263
264 if (perf_evsel__init_syscall_tp(evsel, handler))
265 goto out_delete;
266
267 return evsel;
268
269 out_delete:
270 perf_evsel__delete_priv(evsel);
271 return NULL;
272 }
273
/* Fetch the syscall_tp member 'name' from a sample, as an integer. */
#define perf_evsel__sc_tp_uint(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.integer(&fields->name, sample); })

/* Fetch the syscall_tp member 'name' from a sample, as a pointer. */
#define perf_evsel__sc_tp_ptr(evsel, name, sample) \
	({ struct syscall_tp *fields = evsel->priv; \
	   fields->name.pointer(&fields->name, sample); })
281
/*
 * A value -> name table for pretty-printing enum-like syscall args.
 * 'offset' is subtracted from the value before indexing, for tables
 * whose first entry does not correspond to value 0 (e.g. tioctls).
 */
struct strarray {
	int offset;
	int nr_entries;
	const char **entries;
};

#define DEFINE_STRARRAY(array) struct strarray strarray__##array = { \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}

#define DEFINE_STRARRAY_OFFSET(array, off) struct strarray strarray__##array = { \
	.offset = off, \
	.nr_entries = ARRAY_SIZE(array), \
	.entries = array, \
}
298
299 static size_t __syscall_arg__scnprintf_strarray(char *bf, size_t size,
300 const char *intfmt,
301 struct syscall_arg *arg)
302 {
303 struct strarray *sa = arg->parm;
304 int idx = arg->val - sa->offset;
305
306 if (idx < 0 || idx >= sa->nr_entries)
307 return scnprintf(bf, size, intfmt, arg->val);
308
309 return scnprintf(bf, size, "%s", sa->entries[idx]);
310 }
311
/* strarray printer with a decimal fallback for unknown values. */
static size_t syscall_arg__scnprintf_strarray(char *bf, size_t size,
					      struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%d", arg);
}

#define SCA_STRARRAY syscall_arg__scnprintf_strarray
319
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches as soon as the ioctl beautifier
 * 	  gets rewritten to support all arches.
 */
/* strarray printer with a hex fallback, used for ioctl cmd numbers. */
static size_t syscall_arg__scnprintf_strhexarray(char *bf, size_t size,
						 struct syscall_arg *arg)
{
	return __syscall_arg__scnprintf_strarray(bf, size, "%#x", arg);
}

#define SCA_STRHEXARRAY syscall_arg__scnprintf_strhexarray
#endif /* defined(__i386__) || defined(__x86_64__) */
333
/* Forward declaration: defined below, after thread__fd_path() & friends. */
static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
					struct syscall_arg *arg);

#define SCA_FD syscall_arg__scnprintf_fd

#ifndef AT_FDCWD
#define AT_FDCWD	-100
#endif
342
343 static size_t syscall_arg__scnprintf_fd_at(char *bf, size_t size,
344 struct syscall_arg *arg)
345 {
346 int fd = arg->val;
347
348 if (fd == AT_FDCWD)
349 return scnprintf(bf, size, "CWD");
350
351 return syscall_arg__scnprintf_fd(bf, size, arg);
352 }
353
354 #define SCA_FDAT syscall_arg__scnprintf_fd_at
355
/* Forward declaration: needs thread_trace, defined near the bottom. */
static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_CLOSE_FD syscall_arg__scnprintf_close_fd

/* Print an argument as a hex number (addresses, opaque values). */
static size_t syscall_arg__scnprintf_hex(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%#lx", arg->val);
}

#define SCA_HEX syscall_arg__scnprintf_hex

/* Print an argument as a signed decimal number. */
static size_t syscall_arg__scnprintf_int(char *bf, size_t size,
					 struct syscall_arg *arg)
{
	return scnprintf(bf, size, "%d", arg->val);
}

#define SCA_INT syscall_arg__scnprintf_int
376
/* bpf(2) 'cmd' argument. */
static const char *bpf_cmd[] = {
	"MAP_CREATE", "MAP_LOOKUP_ELEM", "MAP_UPDATE_ELEM", "MAP_DELETE_ELEM",
	"MAP_GET_NEXT_KEY", "PROG_LOAD",
};
static DEFINE_STRARRAY(bpf_cmd);

/* epoll_ctl(2) 'op' argument; EPOLL_CTL_ADD starts at 1, hence the offset. */
static const char *epoll_ctl_ops[] = { "ADD", "DEL", "MOD", };
static DEFINE_STRARRAY_OFFSET(epoll_ctl_ops, 1);

/* {get,set}itimer(2) 'which' argument. */
static const char *itimers[] = { "REAL", "VIRTUAL", "PROF", };
static DEFINE_STRARRAY(itimers);

/* keyctl(2) 'option' argument. */
static const char *keyctl_options[] = {
	"GET_KEYRING_ID", "JOIN_SESSION_KEYRING", "UPDATE", "REVOKE", "CHOWN",
	"SETPERM", "DESCRIBE", "CLEAR", "LINK", "UNLINK", "SEARCH", "READ",
	"INSTANTIATE", "NEGATE", "SET_REQKEY_KEYRING", "SET_TIMEOUT",
	"ASSUME_AUTHORITY", "GET_SECURITY", "SESSION_TO_PARENT", "REJECT",
	"INSTANTIATE_IOV", "INVALIDATE", "GET_PERSISTENT",
};
static DEFINE_STRARRAY(keyctl_options);

/* lseek(2) 'whence' argument; DATA/HOLE only where libc defines them. */
static const char *whences[] = { "SET", "CUR", "END",
#ifdef SEEK_DATA
"DATA",
#endif
#ifdef SEEK_HOLE
"HOLE",
#endif
};
static DEFINE_STRARRAY(whences);

/* fcntl(2) 'cmd' argument. */
static const char *fcntl_cmds[] = {
	"DUPFD", "GETFD", "SETFD", "GETFL", "SETFL", "GETLK", "SETLK",
	"SETLKW", "SETOWN", "GETOWN", "SETSIG", "GETSIG", "F_GETLK64",
	"F_SETLK64", "F_SETLKW64", "F_SETOWN_EX", "F_GETOWN_EX",
	"F_GETOWNER_UIDS",
};
static DEFINE_STRARRAY(fcntl_cmds);

/* {get,set,pr}limit 'resource' argument. */
static const char *rlimit_resources[] = {
	"CPU", "FSIZE", "DATA", "STACK", "CORE", "RSS", "NPROC", "NOFILE",
	"MEMLOCK", "AS", "LOCKS", "SIGPENDING", "MSGQUEUE", "NICE", "RTPRIO",
	"RTTIME",
};
static DEFINE_STRARRAY(rlimit_resources);

/* rt_sigprocmask(2) 'how' argument. */
static const char *sighow[] = { "BLOCK", "UNBLOCK", "SETMASK", };
static DEFINE_STRARRAY(sighow);

/* clock_gettime(2) & friends 'clk_id' argument. */
static const char *clockid[] = {
	"REALTIME", "MONOTONIC", "PROCESS_CPUTIME_ID", "THREAD_CPUTIME_ID",
	"MONOTONIC_RAW", "REALTIME_COARSE", "MONOTONIC_COARSE", "BOOTTIME",
	"REALTIME_ALARM", "BOOTTIME_ALARM", "SGI_CYCLE", "TAI"
};
static DEFINE_STRARRAY(clockid);

/* socket(2)/socketpair(2) 'family' argument (AF_* in numeric order). */
static const char *socket_families[] = {
	"UNSPEC", "LOCAL", "INET", "AX25", "IPX", "APPLETALK", "NETROM",
	"BRIDGE", "ATMPVC", "X25", "INET6", "ROSE", "DECnet", "NETBEUI",
	"SECURITY", "KEY", "NETLINK", "PACKET", "ASH", "ECONET", "ATMSVC",
	"RDS", "SNA", "IRDA", "PPPOX", "WANPIPE", "LLC", "IB", "CAN", "TIPC",
	"BLUETOOTH", "IUCV", "RXRPC", "ISDN", "PHONET", "IEEE802154", "CAIF",
	"ALG", "NFC", "VSOCK",
};
static DEFINE_STRARRAY(socket_families);
442
443 static size_t syscall_arg__scnprintf_access_mode(char *bf, size_t size,
444 struct syscall_arg *arg)
445 {
446 size_t printed = 0;
447 int mode = arg->val;
448
449 if (mode == F_OK) /* 0 */
450 return scnprintf(bf, size, "F");
451 #define P_MODE(n) \
452 if (mode & n##_OK) { \
453 printed += scnprintf(bf + printed, size - printed, "%s", #n); \
454 mode &= ~n##_OK; \
455 }
456
457 P_MODE(R);
458 P_MODE(W);
459 P_MODE(X);
460 #undef P_MODE
461
462 if (mode)
463 printed += scnprintf(bf + printed, size - printed, "|%#x", mode);
464
465 return printed;
466 }
467
468 #define SCA_ACCMODE syscall_arg__scnprintf_access_mode
469
/* Forward declaration: needs thread_trace/vfs_getname state, defined later. */
static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
					      struct syscall_arg *arg);

#define SCA_FILENAME syscall_arg__scnprintf_filename
474
475 static size_t syscall_arg__scnprintf_pipe_flags(char *bf, size_t size,
476 struct syscall_arg *arg)
477 {
478 int printed = 0, flags = arg->val;
479
480 #define P_FLAG(n) \
481 if (flags & O_##n) { \
482 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
483 flags &= ~O_##n; \
484 }
485
486 P_FLAG(CLOEXEC);
487 P_FLAG(NONBLOCK);
488 #undef P_FLAG
489
490 if (flags)
491 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
492
493 return printed;
494 }
495
496 #define SCA_PIPE_FLAGS syscall_arg__scnprintf_pipe_flags
497
#if defined(__i386__) || defined(__x86_64__)
/*
 * FIXME: Make this available to all arches.
 */
#define TCGETS		0x5401

/*
 * tty ioctl cmd names, indexed from TCGETS (0x5401) via the strarray
 * offset; designated initializers skip the gaps in the cmd space.
 */
static const char *tioctls[] = {
	"TCGETS", "TCSETS", "TCSETSW", "TCSETSF", "TCGETA", "TCSETA", "TCSETAW",
	"TCSETAF", "TCSBRK", "TCXONC", "TCFLSH", "TIOCEXCL", "TIOCNXCL",
	"TIOCSCTTY", "TIOCGPGRP", "TIOCSPGRP", "TIOCOUTQ", "TIOCSTI",
	"TIOCGWINSZ", "TIOCSWINSZ", "TIOCMGET", "TIOCMBIS", "TIOCMBIC",
	"TIOCMSET", "TIOCGSOFTCAR", "TIOCSSOFTCAR", "FIONREAD", "TIOCLINUX",
	"TIOCCONS", "TIOCGSERIAL", "TIOCSSERIAL", "TIOCPKT", "FIONBIO",
	"TIOCNOTTY", "TIOCSETD", "TIOCGETD", "TCSBRKP", [0x27] = "TIOCSBRK",
	"TIOCCBRK", "TIOCGSID", "TCGETS2", "TCSETS2", "TCSETSW2", "TCSETSF2",
	"TIOCGRS485", "TIOCSRS485", "TIOCGPTN", "TIOCSPTLCK",
	"TIOCGDEV||TCGETX", "TCSETX", "TCSETXF", "TCSETXW", "TIOCSIG",
	"TIOCVHANGUP", "TIOCGPKT", "TIOCGPTLCK", "TIOCGEXCL",
	[0x50] = "FIONCLEX", "FIOCLEX", "FIOASYNC", "TIOCSERCONFIG",
	"TIOCSERGWILD", "TIOCSERSWILD", "TIOCGLCKTRMIOS", "TIOCSLCKTRMIOS",
	"TIOCSERGSTRUCT", "TIOCSERGETLSR", "TIOCSERGETMULTI", "TIOCSERSETMULTI",
	"TIOCMIWAIT", "TIOCGICOUNT", [0x60] = "FIOQSIZE",
};

static DEFINE_STRARRAY_OFFSET(tioctls, 0x5401);
#endif /* defined(__i386__) || defined(__x86_64__) */

/* getrandom(2) flags; defined here for libcs that predate the syscall. */
#ifndef GRND_NONBLOCK
#define GRND_NONBLOCK	0x0001
#endif
#ifndef GRND_RANDOM
#define GRND_RANDOM	0x0002
#endif
531
532 static size_t syscall_arg__scnprintf_getrandom_flags(char *bf, size_t size,
533 struct syscall_arg *arg)
534 {
535 int printed = 0, flags = arg->val;
536
537 #define P_FLAG(n) \
538 if (flags & GRND_##n) { \
539 printed += scnprintf(bf + printed, size - printed, "%s%s", printed ? "|" : "", #n); \
540 flags &= ~GRND_##n; \
541 }
542
543 P_FLAG(RANDOM);
544 P_FLAG(NONBLOCK);
545 #undef P_FLAG
546
547 if (flags)
548 printed += scnprintf(bf + printed, size - printed, "%s%#x", printed ? "|" : "", flags);
549
550 return printed;
551 }
552
553 #define SCA_GETRANDOM_FLAGS syscall_arg__scnprintf_getrandom_flags
554
/*
 * Shorthand for syscall_fmts[] entries: print argument 'arg' via the
 * given strarray.
 */
#define STRARRAY(arg, name, array) \
	  .arg_scnprintf = { [arg] = SCA_STRARRAY, }, \
	  .arg_parm	 = { [arg] = &strarray__##array, }
558
559 #include "trace/beauty/eventfd.c"
560 #include "trace/beauty/flock.c"
561 #include "trace/beauty/futex_op.c"
562 #include "trace/beauty/mmap.c"
563 #include "trace/beauty/mode_t.c"
564 #include "trace/beauty/msg_flags.c"
565 #include "trace/beauty/open_flags.c"
566 #include "trace/beauty/perf_event_open.c"
567 #include "trace/beauty/pid.c"
568 #include "trace/beauty/sched_policy.c"
569 #include "trace/beauty/seccomp.c"
570 #include "trace/beauty/signum.c"
571 #include "trace/beauty/socket_type.c"
572 #include "trace/beauty/waitid_options.c"
573
574 static struct syscall_fmt {
575 const char *name;
576 const char *alias;
577 size_t (*arg_scnprintf[6])(char *bf, size_t size, struct syscall_arg *arg);
578 void *arg_parm[6];
579 bool errmsg;
580 bool errpid;
581 bool timeout;
582 bool hexret;
583 } syscall_fmts[] = {
584 { .name = "access", .errmsg = true,
585 .arg_scnprintf = { [1] = SCA_ACCMODE, /* mode */ }, },
586 { .name = "arch_prctl", .errmsg = true, .alias = "prctl", },
587 { .name = "bpf", .errmsg = true, STRARRAY(0, cmd, bpf_cmd), },
588 { .name = "brk", .hexret = true,
589 .arg_scnprintf = { [0] = SCA_HEX, /* brk */ }, },
590 { .name = "chdir", .errmsg = true, },
591 { .name = "chmod", .errmsg = true, },
592 { .name = "chroot", .errmsg = true, },
593 { .name = "clock_gettime", .errmsg = true, STRARRAY(0, clk_id, clockid), },
594 { .name = "clone", .errpid = true, },
595 { .name = "close", .errmsg = true,
596 .arg_scnprintf = { [0] = SCA_CLOSE_FD, /* fd */ }, },
597 { .name = "connect", .errmsg = true, },
598 { .name = "creat", .errmsg = true, },
599 { .name = "dup", .errmsg = true, },
600 { .name = "dup2", .errmsg = true, },
601 { .name = "dup3", .errmsg = true, },
602 { .name = "epoll_ctl", .errmsg = true, STRARRAY(1, op, epoll_ctl_ops), },
603 { .name = "eventfd2", .errmsg = true,
604 .arg_scnprintf = { [1] = SCA_EFD_FLAGS, /* flags */ }, },
605 { .name = "faccessat", .errmsg = true, },
606 { .name = "fadvise64", .errmsg = true, },
607 { .name = "fallocate", .errmsg = true, },
608 { .name = "fchdir", .errmsg = true, },
609 { .name = "fchmod", .errmsg = true, },
610 { .name = "fchmodat", .errmsg = true,
611 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
612 { .name = "fchown", .errmsg = true, },
613 { .name = "fchownat", .errmsg = true,
614 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
615 { .name = "fcntl", .errmsg = true,
616 .arg_scnprintf = { [1] = SCA_STRARRAY, /* cmd */ },
617 .arg_parm = { [1] = &strarray__fcntl_cmds, /* cmd */ }, },
618 { .name = "fdatasync", .errmsg = true, },
619 { .name = "flock", .errmsg = true,
620 .arg_scnprintf = { [1] = SCA_FLOCK, /* cmd */ }, },
621 { .name = "fsetxattr", .errmsg = true, },
622 { .name = "fstat", .errmsg = true, .alias = "newfstat", },
623 { .name = "fstatat", .errmsg = true, .alias = "newfstatat", },
624 { .name = "fstatfs", .errmsg = true, },
625 { .name = "fsync", .errmsg = true, },
626 { .name = "ftruncate", .errmsg = true, },
627 { .name = "futex", .errmsg = true,
628 .arg_scnprintf = { [1] = SCA_FUTEX_OP, /* op */ }, },
629 { .name = "futimesat", .errmsg = true,
630 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
631 { .name = "getdents", .errmsg = true, },
632 { .name = "getdents64", .errmsg = true, },
633 { .name = "getitimer", .errmsg = true, STRARRAY(0, which, itimers), },
634 { .name = "getpid", .errpid = true, },
635 { .name = "getpgid", .errpid = true, },
636 { .name = "getppid", .errpid = true, },
637 { .name = "getrandom", .errmsg = true,
638 .arg_scnprintf = { [2] = SCA_GETRANDOM_FLAGS, /* flags */ }, },
639 { .name = "getrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
640 { .name = "getxattr", .errmsg = true, },
641 { .name = "inotify_add_watch", .errmsg = true, },
642 { .name = "ioctl", .errmsg = true,
643 .arg_scnprintf = {
644 #if defined(__i386__) || defined(__x86_64__)
645 /*
646 * FIXME: Make this available to all arches.
647 */
648 [1] = SCA_STRHEXARRAY, /* cmd */
649 [2] = SCA_HEX, /* arg */ },
650 .arg_parm = { [1] = &strarray__tioctls, /* cmd */ }, },
651 #else
652 [2] = SCA_HEX, /* arg */ }, },
653 #endif
654 { .name = "keyctl", .errmsg = true, STRARRAY(0, option, keyctl_options), },
655 { .name = "kill", .errmsg = true,
656 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
657 { .name = "lchown", .errmsg = true, },
658 { .name = "lgetxattr", .errmsg = true, },
659 { .name = "linkat", .errmsg = true,
660 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
661 { .name = "listxattr", .errmsg = true, },
662 { .name = "llistxattr", .errmsg = true, },
663 { .name = "lremovexattr", .errmsg = true, },
664 { .name = "lseek", .errmsg = true,
665 .arg_scnprintf = { [2] = SCA_STRARRAY, /* whence */ },
666 .arg_parm = { [2] = &strarray__whences, /* whence */ }, },
667 { .name = "lsetxattr", .errmsg = true, },
668 { .name = "lstat", .errmsg = true, .alias = "newlstat", },
669 { .name = "lsxattr", .errmsg = true, },
670 { .name = "madvise", .errmsg = true,
671 .arg_scnprintf = { [0] = SCA_HEX, /* start */
672 [2] = SCA_MADV_BHV, /* behavior */ }, },
673 { .name = "mkdir", .errmsg = true, },
674 { .name = "mkdirat", .errmsg = true,
675 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
676 { .name = "mknod", .errmsg = true, },
677 { .name = "mknodat", .errmsg = true,
678 .arg_scnprintf = { [0] = SCA_FDAT, /* fd */ }, },
679 { .name = "mlock", .errmsg = true,
680 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
681 { .name = "mlockall", .errmsg = true,
682 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
683 { .name = "mmap", .hexret = true,
684 /* The standard mmap maps to old_mmap on s390x */
685 #if defined(__s390x__)
686 .alias = "old_mmap",
687 #endif
688 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
689 [2] = SCA_MMAP_PROT, /* prot */
690 [3] = SCA_MMAP_FLAGS, /* flags */ }, },
691 { .name = "mprotect", .errmsg = true,
692 .arg_scnprintf = { [0] = SCA_HEX, /* start */
693 [2] = SCA_MMAP_PROT, /* prot */ }, },
694 { .name = "mq_unlink", .errmsg = true,
695 .arg_scnprintf = { [0] = SCA_FILENAME, /* u_name */ }, },
696 { .name = "mremap", .hexret = true,
697 .arg_scnprintf = { [0] = SCA_HEX, /* addr */
698 [3] = SCA_MREMAP_FLAGS, /* flags */
699 [4] = SCA_HEX, /* new_addr */ }, },
700 { .name = "munlock", .errmsg = true,
701 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
702 { .name = "munmap", .errmsg = true,
703 .arg_scnprintf = { [0] = SCA_HEX, /* addr */ }, },
704 { .name = "name_to_handle_at", .errmsg = true,
705 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
706 { .name = "newfstatat", .errmsg = true,
707 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
708 { .name = "open", .errmsg = true,
709 .arg_scnprintf = { [1] = SCA_OPEN_FLAGS, /* flags */ }, },
710 { .name = "open_by_handle_at", .errmsg = true,
711 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
712 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
713 { .name = "openat", .errmsg = true,
714 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */
715 [2] = SCA_OPEN_FLAGS, /* flags */ }, },
716 { .name = "perf_event_open", .errmsg = true,
717 .arg_scnprintf = { [2] = SCA_INT, /* cpu */
718 [3] = SCA_FD, /* group_fd */
719 [4] = SCA_PERF_FLAGS, /* flags */ }, },
720 { .name = "pipe2", .errmsg = true,
721 .arg_scnprintf = { [1] = SCA_PIPE_FLAGS, /* flags */ }, },
722 { .name = "poll", .errmsg = true, .timeout = true, },
723 { .name = "ppoll", .errmsg = true, .timeout = true, },
724 { .name = "pread", .errmsg = true, .alias = "pread64", },
725 { .name = "preadv", .errmsg = true, .alias = "pread", },
726 { .name = "prlimit64", .errmsg = true, STRARRAY(1, resource, rlimit_resources), },
727 { .name = "pwrite", .errmsg = true, .alias = "pwrite64", },
728 { .name = "pwritev", .errmsg = true, },
729 { .name = "read", .errmsg = true, },
730 { .name = "readlink", .errmsg = true, },
731 { .name = "readlinkat", .errmsg = true,
732 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
733 { .name = "readv", .errmsg = true, },
734 { .name = "recvfrom", .errmsg = true,
735 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
736 { .name = "recvmmsg", .errmsg = true,
737 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
738 { .name = "recvmsg", .errmsg = true,
739 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
740 { .name = "removexattr", .errmsg = true, },
741 { .name = "renameat", .errmsg = true,
742 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
743 { .name = "rmdir", .errmsg = true, },
744 { .name = "rt_sigaction", .errmsg = true,
745 .arg_scnprintf = { [0] = SCA_SIGNUM, /* sig */ }, },
746 { .name = "rt_sigprocmask", .errmsg = true, STRARRAY(0, how, sighow), },
747 { .name = "rt_sigqueueinfo", .errmsg = true,
748 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
749 { .name = "rt_tgsigqueueinfo", .errmsg = true,
750 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
751 { .name = "sched_getattr", .errmsg = true, },
752 { .name = "sched_setattr", .errmsg = true, },
753 { .name = "sched_setscheduler", .errmsg = true,
754 .arg_scnprintf = { [1] = SCA_SCHED_POLICY, /* policy */ }, },
755 { .name = "seccomp", .errmsg = true,
756 .arg_scnprintf = { [0] = SCA_SECCOMP_OP, /* op */
757 [1] = SCA_SECCOMP_FLAGS, /* flags */ }, },
758 { .name = "select", .errmsg = true, .timeout = true, },
759 { .name = "sendmmsg", .errmsg = true,
760 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
761 { .name = "sendmsg", .errmsg = true,
762 .arg_scnprintf = { [2] = SCA_MSG_FLAGS, /* flags */ }, },
763 { .name = "sendto", .errmsg = true,
764 .arg_scnprintf = { [3] = SCA_MSG_FLAGS, /* flags */ }, },
765 { .name = "set_tid_address", .errpid = true, },
766 { .name = "setitimer", .errmsg = true, STRARRAY(0, which, itimers), },
767 { .name = "setpgid", .errmsg = true, },
768 { .name = "setrlimit", .errmsg = true, STRARRAY(0, resource, rlimit_resources), },
769 { .name = "setxattr", .errmsg = true, },
770 { .name = "shutdown", .errmsg = true, },
771 { .name = "socket", .errmsg = true,
772 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
773 [1] = SCA_SK_TYPE, /* type */ },
774 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
775 { .name = "socketpair", .errmsg = true,
776 .arg_scnprintf = { [0] = SCA_STRARRAY, /* family */
777 [1] = SCA_SK_TYPE, /* type */ },
778 .arg_parm = { [0] = &strarray__socket_families, /* family */ }, },
779 { .name = "stat", .errmsg = true, .alias = "newstat", },
780 { .name = "statfs", .errmsg = true, },
781 { .name = "statx", .errmsg = true,
782 .arg_scnprintf = { [0] = SCA_FDAT, /* flags */
783 [2] = SCA_STATX_FLAGS, /* flags */
784 [3] = SCA_STATX_MASK, /* mask */ }, },
785 { .name = "swapoff", .errmsg = true,
786 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
787 { .name = "swapon", .errmsg = true,
788 .arg_scnprintf = { [0] = SCA_FILENAME, /* specialfile */ }, },
789 { .name = "symlinkat", .errmsg = true,
790 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
791 { .name = "tgkill", .errmsg = true,
792 .arg_scnprintf = { [2] = SCA_SIGNUM, /* sig */ }, },
793 { .name = "tkill", .errmsg = true,
794 .arg_scnprintf = { [1] = SCA_SIGNUM, /* sig */ }, },
795 { .name = "truncate", .errmsg = true, },
796 { .name = "uname", .errmsg = true, .alias = "newuname", },
797 { .name = "unlinkat", .errmsg = true,
798 .arg_scnprintf = { [0] = SCA_FDAT, /* dfd */ }, },
799 { .name = "utime", .errmsg = true, },
800 { .name = "utimensat", .errmsg = true,
801 .arg_scnprintf = { [0] = SCA_FDAT, /* dirfd */ }, },
802 { .name = "utimes", .errmsg = true, },
803 { .name = "vmsplice", .errmsg = true, },
804 { .name = "wait4", .errpid = true,
805 .arg_scnprintf = { [2] = SCA_WAITID_OPTIONS, /* options */ }, },
806 { .name = "waitid", .errpid = true,
807 .arg_scnprintf = { [3] = SCA_WAITID_OPTIONS, /* options */ }, },
808 { .name = "write", .errmsg = true, },
809 { .name = "writev", .errmsg = true, },
810 };
811
/*
 * bsearch() comparator: 'name' is the key (a syscall name string),
 * 'fmtp' points at a syscall_fmts[] entry.
 */
static int syscall_fmt__cmp(const void *name, const void *fmtp)
{
	const struct syscall_fmt *fmt = fmtp;
	return strcmp(name, fmt->name);
}
817
818 static struct syscall_fmt *syscall_fmt__find(const char *name)
819 {
820 const int nmemb = ARRAY_SIZE(syscall_fmts);
821 return bsearch(name, syscall_fmts, nmemb, sizeof(struct syscall_fmt), syscall_fmt__cmp);
822 }
823
/*
 * Everything needed to format one syscall: its tracepoint format, the
 * argument fields, and the optional per-syscall formatter overrides
 * resolved from syscall_fmts[].
 */
struct syscall {
	struct event_format *tp_format;
	int		    nr_args;
	struct format_field *args;	/* linked list of argument fields */
	const char	    *name;
	bool		    is_exit;	/* exit_group etc: no sys_exit will follow */
	struct syscall_fmt  *fmt;
	size_t		    (**arg_scnprintf)(char *bf, size_t size, struct syscall_arg *arg);
	void		    **arg_parm;
};
834
835 /*
836 * We need to have this 'calculated' boolean because in some cases we really
837 * don't know what is the duration of a syscall, for instance, when we start
838 * a session and some threads are waiting for a syscall to finish, say 'poll',
839 * in which case all we can do is to print "( ? ) for duration and for the
840 * start timestamp.
841 */
842 static size_t fprintf_duration(unsigned long t, bool calculated, FILE *fp)
843 {
844 double duration = (double)t / NSEC_PER_MSEC;
845 size_t printed = fprintf(fp, "(");
846
847 if (!calculated)
848 printed += fprintf(fp, " ? ");
849 else if (duration >= 1.0)
850 printed += color_fprintf(fp, PERF_COLOR_RED, "%6.3f ms", duration);
851 else if (duration >= 0.01)
852 printed += color_fprintf(fp, PERF_COLOR_YELLOW, "%6.3f ms", duration);
853 else
854 printed += color_fprintf(fp, PERF_COLOR_NORMAL, "%6.3f ms", duration);
855 return printed + fprintf(fp, "): ");
856 }
857
/**
 * filename.ptr: The filename char pointer that will be vfs_getname'd
 * filename.entry_str_pos: Where to insert the string translated from
 *                         filename.ptr by the vfs_getname tracepoint/kprobe.
 *
 * Per-thread trace state, hung off thread->priv.
 */
struct thread_trace {
	u64		  entry_time;	/* timestamp of the pending sys_enter */
	bool		  entry_pending; /* sys_enter printed, waiting for sys_exit */
	unsigned long	  nr_events;
	unsigned long	  pfmaj, pfmin;	/* major/minor page fault counts */
	char		  *entry_str;	/* formatted sys_enter line, completed at exit */
	double		  runtime_ms;
	struct {
		unsigned long ptr;
		short int     entry_str_pos;
		bool	      pending_open;
		unsigned int  namelen;
		char	      *name;
	} filename;
	struct {
		int	  max;		/* highest fd cached; -1 when table is empty */
		char	  **table;	/* fd -> pathname cache */
	} paths;

	struct intlist *syscall_stats;
};
884
885 static struct thread_trace *thread_trace__new(void)
886 {
887 struct thread_trace *ttrace = zalloc(sizeof(struct thread_trace));
888
889 if (ttrace)
890 ttrace->paths.max = -1;
891
892 ttrace->syscall_stats = intlist__new(NULL);
893
894 return ttrace;
895 }
896
897 static struct thread_trace *thread__trace(struct thread *thread, FILE *fp)
898 {
899 struct thread_trace *ttrace;
900
901 if (thread == NULL)
902 goto fail;
903
904 if (thread__priv(thread) == NULL)
905 thread__set_priv(thread, thread_trace__new());
906
907 if (thread__priv(thread) == NULL)
908 goto fail;
909
910 ttrace = thread__priv(thread);
911 ++ttrace->nr_events;
912
913 return ttrace;
914 fail:
915 color_fprintf(fp, PERF_COLOR_RED,
916 "WARNING: not enough memory, dropping samples!\n");
917 return NULL;
918 }
919
/* Bits for trace->trace_pgfaults: trace major and/or minor page faults. */
#define TRACE_PFMAJ		(1 << 0)
#define TRACE_PFMIN		(1 << 1)

/* Size of the buffer holding the formatted sys_enter line. */
static const size_t trace__entry_str_size = 2048;
924
925 static int trace__set_fd_pathname(struct thread *thread, int fd, const char *pathname)
926 {
927 struct thread_trace *ttrace = thread__priv(thread);
928
929 if (fd > ttrace->paths.max) {
930 char **npath = realloc(ttrace->paths.table, (fd + 1) * sizeof(char *));
931
932 if (npath == NULL)
933 return -1;
934
935 if (ttrace->paths.max != -1) {
936 memset(npath + ttrace->paths.max + 1, 0,
937 (fd - ttrace->paths.max) * sizeof(char *));
938 } else {
939 memset(npath, 0, (fd + 1) * sizeof(char *));
940 }
941
942 ttrace->paths.table = npath;
943 ttrace->paths.max = fd;
944 }
945
946 ttrace->paths.table[fd] = strdup(pathname);
947
948 return ttrace->paths.table[fd] != NULL ? 0 : -1;
949 }
950
/*
 * Resolve an fd to its pathname by reading the /proc/<pid>(/task/<tid>)/fd
 * symlink, then cache it via trace__set_fd_pathname().
 *
 * Returns 0 on success, -1 when the link cannot be read or its target
 * doesn't fit in PATH_MAX.
 */
static int thread__read_fd_path(struct thread *thread, int fd)
{
	char linkname[PATH_MAX], pathname[PATH_MAX];
	struct stat st;
	int ret;

	/* Thread group leaders have their fds directly at /proc/<pid>/fd */
	if (thread->pid_ == thread->tid) {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/fd/%d", thread->pid_, fd);
	} else {
		scnprintf(linkname, sizeof(linkname),
			  "/proc/%d/task/%d/fd/%d", thread->pid_, thread->tid, fd);
	}

	/* For a symlink, st_size is the length of the target path */
	if (lstat(linkname, &st) < 0 || st.st_size + 1 > (off_t)sizeof(pathname))
		return -1;

	ret = readlink(linkname, pathname, sizeof(pathname));

	/* The link may have changed between lstat() and readlink() */
	if (ret < 0 || ret > st.st_size)
		return -1;

	/* readlink() does not NUL-terminate */
	pathname[ret] = '\0';
	return trace__set_fd_pathname(thread, fd, pathname);
}
976
977 static const char *thread__fd_path(struct thread *thread, int fd,
978 struct trace *trace)
979 {
980 struct thread_trace *ttrace = thread__priv(thread);
981
982 if (ttrace == NULL)
983 return NULL;
984
985 if (fd < 0)
986 return NULL;
987
988 if ((fd > ttrace->paths.max || ttrace->paths.table[fd] == NULL)) {
989 if (!trace->live)
990 return NULL;
991 ++trace->stats.proc_getname;
992 if (thread__read_fd_path(thread, fd))
993 return NULL;
994 }
995
996 return ttrace->paths.table[fd];
997 }
998
999 static size_t syscall_arg__scnprintf_fd(char *bf, size_t size,
1000 struct syscall_arg *arg)
1001 {
1002 int fd = arg->val;
1003 size_t printed = scnprintf(bf, size, "%d", fd);
1004 const char *path = thread__fd_path(arg->thread, fd, arg->trace);
1005
1006 if (path)
1007 printed += scnprintf(bf + printed, size - printed, "<%s>", path);
1008
1009 return printed;
1010 }
1011
1012 static size_t syscall_arg__scnprintf_close_fd(char *bf, size_t size,
1013 struct syscall_arg *arg)
1014 {
1015 int fd = arg->val;
1016 size_t printed = syscall_arg__scnprintf_fd(bf, size, arg);
1017 struct thread_trace *ttrace = thread__priv(arg->thread);
1018
1019 if (ttrace && fd >= 0 && fd <= ttrace->paths.max)
1020 zfree(&ttrace->paths.table[fd]);
1021
1022 return printed;
1023 }
1024
1025 static void thread__set_filename_pos(struct thread *thread, const char *bf,
1026 unsigned long ptr)
1027 {
1028 struct thread_trace *ttrace = thread__priv(thread);
1029
1030 ttrace->filename.ptr = ptr;
1031 ttrace->filename.entry_str_pos = bf - ttrace->entry_str;
1032 }
1033
1034 static size_t syscall_arg__scnprintf_filename(char *bf, size_t size,
1035 struct syscall_arg *arg)
1036 {
1037 unsigned long ptr = arg->val;
1038
1039 if (!arg->trace->vfs_getname)
1040 return scnprintf(bf, size, "%#x", ptr);
1041
1042 thread__set_filename_pos(arg->thread, bf, ptr);
1043 return 0;
1044 }
1045
1046 static bool trace__filter_duration(struct trace *trace, double t)
1047 {
1048 return t < (trace->duration_filter * NSEC_PER_MSEC);
1049 }
1050
1051 static size_t __trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1052 {
1053 double ts = (double)(tstamp - trace->base_time) / NSEC_PER_MSEC;
1054
1055 return fprintf(fp, "%10.3f ", ts);
1056 }
1057
1058 /*
1059 * We're handling tstamp=0 as an undefined tstamp, i.e. like when we are
1060 * using ttrace->entry_time for a thread that receives a sys_exit without
1061 * first having received a sys_enter ("poll" issued before tracing session
1062 * starts, lost sys_enter exit due to ring buffer overflow).
1063 */
1064 static size_t trace__fprintf_tstamp(struct trace *trace, u64 tstamp, FILE *fp)
1065 {
1066 if (tstamp > 0)
1067 return __trace__fprintf_tstamp(trace, tstamp, fp);
1068
1069 return fprintf(fp, " ? ");
1070 }
1071
/*
 * Session termination flags, written from sig_handler(); 'interrupted'
 * distinguishes an interactive ^C (SIGINT) from other termination signals.
 * NOTE(review): these are plain bools set from a signal handler; strictly
 * they should be volatile sig_atomic_t — confirm against how the main
 * loop polls them.
 */
static bool done = false;
static bool interrupted = false;

/* Signal handler: request termination of the trace loop. */
static void sig_handler(int sig)
{
	done = true;
	interrupted = sig == SIGINT;
}
1080
1081 static size_t trace__fprintf_entry_head(struct trace *trace, struct thread *thread,
1082 u64 duration, bool duration_calculated, u64 tstamp, FILE *fp)
1083 {
1084 size_t printed = trace__fprintf_tstamp(trace, tstamp, fp);
1085 printed += fprintf_duration(duration, duration_calculated, fp);
1086
1087 if (trace->multiple_threads) {
1088 if (trace->show_comm)
1089 printed += fprintf(fp, "%.14s/", thread__comm_str(thread));
1090 printed += fprintf(fp, "%d ", thread->tid);
1091 }
1092
1093 return printed;
1094 }
1095
1096 static int trace__process_event(struct trace *trace, struct machine *machine,
1097 union perf_event *event, struct perf_sample *sample)
1098 {
1099 int ret = 0;
1100
1101 switch (event->header.type) {
1102 case PERF_RECORD_LOST:
1103 color_fprintf(trace->output, PERF_COLOR_RED,
1104 "LOST %" PRIu64 " events!\n", event->lost.lost);
1105 ret = machine__process_lost_event(machine, event, sample);
1106 break;
1107 default:
1108 ret = machine__process_event(machine, event, sample);
1109 break;
1110 }
1111
1112 return ret;
1113 }
1114
1115 static int trace__tool_process(struct perf_tool *tool,
1116 union perf_event *event,
1117 struct perf_sample *sample,
1118 struct machine *machine)
1119 {
1120 struct trace *trace = container_of(tool, struct trace, tool);
1121 return trace__process_event(trace, machine, event, sample);
1122 }
1123
1124 static char *trace__machine__resolve_kernel_addr(void *vmachine, unsigned long long *addrp, char **modp)
1125 {
1126 struct machine *machine = vmachine;
1127
1128 if (machine->kptr_restrict_warned)
1129 return NULL;
1130
1131 if (symbol_conf.kptr_restrict) {
1132 pr_warning("Kernel address maps (/proc/{kallsyms,modules}) are restricted.\n\n"
1133 "Check /proc/sys/kernel/kptr_restrict.\n\n"
1134 "Kernel samples will not be resolved.\n");
1135 machine->kptr_restrict_warned = true;
1136 return NULL;
1137 }
1138
1139 return machine__resolve_kernel_addr(vmachine, addrp, modp);
1140 }
1141
/*
 * Initialize the symbol machinery and a host machine representation,
 * then synthesize events for the already-running threads we will trace.
 *
 * Returns 0 on success or a negative errno-style value; when thread
 * synthesis fails the symbol subsystem is torn down again.
 */
static int trace__symbols_init(struct trace *trace, struct perf_evlist *evlist)
{
	int err = symbol__init(NULL);

	if (err)
		return err;

	trace->host = machine__new_host();
	if (trace->host == NULL)
		return -ENOMEM;

	if (trace_event__register_resolver(trace->host, trace__machine__resolve_kernel_addr) < 0)
		return -errno;

	err = __machine__synthesize_threads(trace->host, &trace->tool, &trace->opts.target,
					    evlist->threads, trace__tool_process, false,
					    trace->opts.proc_map_timeout);
	if (err)
		symbol__exit();

	return err;
}
1164
/*
 * Choose a pretty-printer for each argument of syscall 'sc', in field
 * order: an explicit formatter from the syscall_fmt table wins, then
 * heuristics based on the tracepoint field's type and name kick in
 * (filename-like strings, pointers, pid_t, umode_t, *fd integers).
 *
 * Returns 0 on success, -1 if the formatter array can't be allocated.
 */
static int syscall__set_arg_fmts(struct syscall *sc)
{
	struct format_field *field;
	int idx = 0, len;

	sc->arg_scnprintf = calloc(sc->nr_args, sizeof(void *));
	if (sc->arg_scnprintf == NULL)
		return -1;

	if (sc->fmt)
		sc->arg_parm = sc->fmt->arg_parm;

	for (field = sc->args; field; field = field->next) {
		/* An explicit per-syscall formatter takes precedence */
		if (sc->fmt && sc->fmt->arg_scnprintf[idx])
			sc->arg_scnprintf[idx] = sc->fmt->arg_scnprintf[idx];
		else if (strcmp(field->type, "const char *") == 0 &&
			 (strcmp(field->name, "filename") == 0 ||
			  strcmp(field->name, "path") == 0 ||
			  strcmp(field->name, "pathname") == 0))
			sc->arg_scnprintf[idx] = SCA_FILENAME;
		else if (field->flags & FIELD_IS_POINTER)
			sc->arg_scnprintf[idx] = syscall_arg__scnprintf_hex;
		else if (strcmp(field->type, "pid_t") == 0)
			sc->arg_scnprintf[idx] = SCA_PID;
		else if (strcmp(field->type, "umode_t") == 0)
			sc->arg_scnprintf[idx] = SCA_MODE_T;
		else if ((strcmp(field->type, "int") == 0 ||
			  strcmp(field->type, "unsigned int") == 0 ||
			  strcmp(field->type, "long") == 0) &&
			 (len = strlen(field->name)) >= 2 &&
			 strcmp(field->name + len - 2, "fd") == 0) {
			/*
			 * /sys/kernel/tracing/events/syscalls/sys_enter*
			 * egrep 'field:.*fd;' .../format|sed -r 's/.*field:([a-z ]+) [a-z_]*fd.+/\1/g'|sort|uniq -c
			 * 65 int
			 * 23 unsigned int
			 * 7 unsigned long
			 */
			sc->arg_scnprintf[idx] = SCA_FD;
		}
		++idx;
	}

	return 0;
}
1210
/*
 * Fill trace->syscalls.table[id] with the syscall's name, formatting
 * hints and sys_enter tracepoint format, growing the id-indexed table
 * on demand.
 *
 * Returns 0 on success, -1 on allocation failure, unknown id, or when
 * the syscall's tracepoint format cannot be found.
 */
static int trace__read_syscall_info(struct trace *trace, int id)
{
	char tp_name[128];
	struct syscall *sc;
	const char *name = syscalltbl__name(trace->sctbl, id);

	if (name == NULL)
		return -1;

	if (id > trace->syscalls.max) {
		struct syscall *nsyscalls = realloc(trace->syscalls.table, (id + 1) * sizeof(*sc));

		if (nsyscalls == NULL)
			return -1;

		/* Zero the newly added slots: name == NULL means "not read yet" */
		if (trace->syscalls.max != -1) {
			memset(nsyscalls + trace->syscalls.max + 1, 0,
			       (id - trace->syscalls.max) * sizeof(*sc));
		} else {
			memset(nsyscalls, 0, (id + 1) * sizeof(*sc));
		}

		trace->syscalls.table = nsyscalls;
		trace->syscalls.max = id;
	}

	sc = trace->syscalls.table + id;
	sc->name = name;

	sc->fmt = syscall_fmt__find(sc->name);

	snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->name);
	sc->tp_format = trace_event__tp_format("syscalls", tp_name);

	/* Some syscalls' tracepoints live under an alias name */
	if (IS_ERR(sc->tp_format) && sc->fmt && sc->fmt->alias) {
		snprintf(tp_name, sizeof(tp_name), "sys_enter_%s", sc->fmt->alias);
		sc->tp_format = trace_event__tp_format("syscalls", tp_name);
	}

	if (IS_ERR(sc->tp_format))
		return -1;

	sc->args = sc->tp_format->format.fields;
	sc->nr_args = sc->tp_format->format.nr_fields;
	/*
	 * We need to check and discard the first variable '__syscall_nr'
	 * or 'nr' that mean the syscall number. It is needless here.
	 * So drop '__syscall_nr' or 'nr' field but does not exist on older kernels.
	 */
	if (sc->args && (!strcmp(sc->args->name, "__syscall_nr") || !strcmp(sc->args->name, "nr"))) {
		sc->args = sc->args->next;
		--sc->nr_args;
	}

	sc->is_exit = !strcmp(name, "exit_group") || !strcmp(name, "exit");

	return syscall__set_arg_fmts(sc);
}
1269
/*
 * Translate the user supplied event qualifier list (syscall names in
 * trace->ev_qualifier) into syscall ids in trace->ev_qualifier_ids.
 * All unknown names are reported in a single message; on any error the
 * ids array is released again.
 *
 * Returns 0 on success, -EINVAL on allocation failure or invalid names.
 */
static int trace__validate_ev_qualifier(struct trace *trace)
{
	int err = 0, i;
	struct str_node *pos;

	trace->ev_qualifier_ids.nr = strlist__nr_entries(trace->ev_qualifier);
	trace->ev_qualifier_ids.entries = malloc(trace->ev_qualifier_ids.nr *
						 sizeof(trace->ev_qualifier_ids.entries[0]));

	if (trace->ev_qualifier_ids.entries == NULL) {
		fputs("Error:\tNot enough memory for allocating events qualifier ids\n",
		       trace->output);
		err = -EINVAL;
		goto out;
	}

	i = 0;

	strlist__for_each_entry(pos, trace->ev_qualifier) {
		const char *sc = pos->s;
		int id = syscalltbl__id(trace->sctbl, sc);

		if (id < 0) {
			/* First bad name opens the error message, the rest append */
			if (err == 0) {
				fputs("Error:\tInvalid syscall ", trace->output);
				err = -EINVAL;
			} else {
				fputs(", ", trace->output);
			}

			fputs(sc, trace->output);
		}

		trace->ev_qualifier_ids.entries[i++] = id;
	}

	if (err < 0) {
		fputs("\nHint:\ttry 'perf list syscalls:sys_enter_*'"
		      "\nHint:\tand: 'man syscalls'\n", trace->output);
		zfree(&trace->ev_qualifier_ids.entries);
		trace->ev_qualifier_ids.nr = 0;
	}
out:
	return err;
}
1315
1316 /*
1317 * args is to be interpreted as a series of longs but we need to handle
1318 * 8-byte unaligned accesses. args points to raw_data within the event
1319 * and raw_data is guaranteed to be 8-byte unaligned because it is
1320 * preceded by raw_size which is a u32. So we need to copy args to a temp
1321 * variable to read it. Most notably this avoids extended load instructions
1322 * on unaligned addresses
1323 */
1324
1325 static size_t syscall__scnprintf_args(struct syscall *sc, char *bf, size_t size,
1326 unsigned char *args, struct trace *trace,
1327 struct thread *thread)
1328 {
1329 size_t printed = 0;
1330 unsigned char *p;
1331 unsigned long val;
1332
1333 if (sc->args != NULL) {
1334 struct format_field *field;
1335 u8 bit = 1;
1336 struct syscall_arg arg = {
1337 .idx = 0,
1338 .mask = 0,
1339 .trace = trace,
1340 .thread = thread,
1341 };
1342
1343 for (field = sc->args; field;
1344 field = field->next, ++arg.idx, bit <<= 1) {
1345 if (arg.mask & bit)
1346 continue;
1347
1348 /* special care for unaligned accesses */
1349 p = args + sizeof(unsigned long) * arg.idx;
1350 memcpy(&val, p, sizeof(val));
1351
1352 /*
1353 * Suppress this argument if its value is zero and
1354 * and we don't have a string associated in an
1355 * strarray for it.
1356 */
1357 if (val == 0 &&
1358 !(sc->arg_scnprintf &&
1359 sc->arg_scnprintf[arg.idx] == SCA_STRARRAY &&
1360 sc->arg_parm[arg.idx]))
1361 continue;
1362
1363 printed += scnprintf(bf + printed, size - printed,
1364 "%s%s: ", printed ? ", " : "", field->name);
1365 if (sc->arg_scnprintf && sc->arg_scnprintf[arg.idx]) {
1366 arg.val = val;
1367 if (sc->arg_parm)
1368 arg.parm = sc->arg_parm[arg.idx];
1369 printed += sc->arg_scnprintf[arg.idx](bf + printed,
1370 size - printed, &arg);
1371 } else {
1372 printed += scnprintf(bf + printed, size - printed,
1373 "%ld", val);
1374 }
1375 }
1376 } else if (IS_ERR(sc->tp_format)) {
1377 /*
1378 * If we managed to read the tracepoint /format file, then we
1379 * may end up not having any args, like with gettid(), so only
1380 * print the raw args when we didn't manage to read it.
1381 */
1382 int i = 0;
1383
1384 while (i < 6) {
1385 /* special care for unaligned accesses */
1386 p = args + sizeof(unsigned long) * i;
1387 memcpy(&val, p, sizeof(val));
1388 printed += scnprintf(bf + printed, size - printed,
1389 "%sarg%d: %ld",
1390 printed ? ", " : "", i, val);
1391 ++i;
1392 }
1393 }
1394
1395 return printed;
1396 }
1397
/*
 * Signature of per-evsel sample handlers (trace__sys_enter,
 * trace__sys_exit, trace__vfs_getname, ...), stored in evsel->handler.
 */
typedef int (*tracepoint_handler)(struct trace *trace, struct perf_evsel *evsel,
				  union perf_event *event,
				  struct perf_sample *sample);
1401
/*
 * Map a syscall id to its struct syscall descriptor, reading the
 * tracepoint format lazily on first use. Returns NULL (with a message
 * at the appropriate verbosity) when the id is invalid or its info
 * can't be read.
 */
static struct syscall *trace__syscall_info(struct trace *trace,
					   struct perf_evsel *evsel, int id)
{

	if (id < 0) {

		/*
		 * XXX: Noticed on x86_64, reproduced as far back as 3.0.36, haven't tried
		 * before that, leaving at a higher verbosity level till that is
		 * explained. Reproduced with plain ftrace with:
		 *
		 * echo 1 > /t/events/raw_syscalls/sys_exit/enable
		 * grep "NR -1 " /t/trace_pipe
		 *
		 * After generating some load on the machine.
		 */
		if (verbose > 1) {
			static u64 n;
			fprintf(trace->output, "Invalid syscall %d id, skipping (%s, %" PRIu64 ") ...\n",
				id, perf_evsel__name(evsel), ++n);
		}
		return NULL;
	}

	/* name == NULL means the slot exists but hasn't been read yet */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL) &&
	    trace__read_syscall_info(trace, id))
		goto out_cant_read;

	/* Re-check: trace__read_syscall_info() may have failed silently */
	if ((id > trace->syscalls.max || trace->syscalls.table[id].name == NULL))
		goto out_cant_read;

	return &trace->syscalls.table[id];

out_cant_read:
	if (verbose > 0) {
		fprintf(trace->output, "Problems reading syscall %d", id);
		if (id <= trace->syscalls.max && trace->syscalls.table[id].name != NULL)
			fprintf(trace->output, "(%s)", trace->syscalls.table[id].name);
		fputs(" information\n", trace->output);
	}
	return NULL;
}
1444
1445 static void thread__update_stats(struct thread_trace *ttrace,
1446 int id, struct perf_sample *sample)
1447 {
1448 struct int_node *inode;
1449 struct stats *stats;
1450 u64 duration = 0;
1451
1452 inode = intlist__findnew(ttrace->syscall_stats, id);
1453 if (inode == NULL)
1454 return;
1455
1456 stats = inode->priv;
1457 if (stats == NULL) {
1458 stats = malloc(sizeof(struct stats));
1459 if (stats == NULL)
1460 return;
1461 init_stats(stats);
1462 inode->priv = stats;
1463 }
1464
1465 if (ttrace->entry_time && sample->time > ttrace->entry_time)
1466 duration = sample->time - ttrace->entry_time;
1467
1468 update_stats(stats, duration);
1469 }
1470
1471 static int trace__printf_interrupted_entry(struct trace *trace, struct perf_sample *sample)
1472 {
1473 struct thread_trace *ttrace;
1474 u64 duration;
1475 size_t printed;
1476
1477 if (trace->current == NULL)
1478 return 0;
1479
1480 ttrace = thread__priv(trace->current);
1481
1482 if (!ttrace->entry_pending)
1483 return 0;
1484
1485 duration = sample->time - ttrace->entry_time;
1486
1487 printed = trace__fprintf_entry_head(trace, trace->current, duration, true, ttrace->entry_time, trace->output);
1488 printed += fprintf(trace->output, "%-70s) ...\n", ttrace->entry_str);
1489 ttrace->entry_pending = false;
1490
1491 return printed;
1492 }
1493
/*
 * Handle raw_syscalls:sys_enter: format the syscall name and arguments
 * into the thread's entry_str buffer. The line is normally printed at
 * sys_exit time (so the return value can be appended); syscalls that
 * never return (exit, exit_group) are printed immediately.
 *
 * Returns 0 on success, -1 when the syscall or thread state can't be
 * obtained.
 */
static int trace__sys_enter(struct trace *trace, struct perf_evsel *evsel,
			    union perf_event *event __maybe_unused,
			    struct perf_sample *sample)
{
	char *msg;
	void *args;
	size_t printed = 0;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	args = perf_evsel__sc_tp_ptr(evsel, args, sample);

	/* Lazily allocate the per-thread scratch buffer for the entry line */
	if (ttrace->entry_str == NULL) {
		ttrace->entry_str = malloc(trace__entry_str_size);
		if (!ttrace->entry_str)
			goto out_put;
	}

	if (!(trace->duration_filter || trace->summary_only || trace->min_stack))
		trace__printf_interrupted_entry(trace, sample);

	ttrace->entry_time = sample->time;
	msg = ttrace->entry_str;
	printed += scnprintf(msg + printed, trace__entry_str_size - printed, "%s(", sc->name);

	printed += syscall__scnprintf_args(sc, msg + printed, trace__entry_str_size - printed,
					   args, trace, thread);

	if (sc->is_exit) {
		/* No sys_exit will follow: print the whole line right away */
		if (!(trace->duration_filter || trace->summary_only || trace->min_stack)) {
			trace__fprintf_entry_head(trace, thread, 0, false, ttrace->entry_time, trace->output);
			fprintf(trace->output, "%-70s)\n", ttrace->entry_str);
		}
	} else {
		ttrace->entry_pending = true;
		/* See trace__vfs_getname & trace__sys_exit */
		ttrace->filename.pending_open = false;
	}

	/* Track the last thread seen, for trace__printf_interrupted_entry() */
	if (trace->current != thread) {
		thread__put(trace->current);
		trace->current = thread__get(thread);
	}
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1552
1553 static int trace__resolve_callchain(struct trace *trace, struct perf_evsel *evsel,
1554 struct perf_sample *sample,
1555 struct callchain_cursor *cursor)
1556 {
1557 struct addr_location al;
1558
1559 if (machine__resolve(trace->host, &al, sample) < 0 ||
1560 thread__resolve_callchain(al.thread, cursor, evsel, sample, NULL, NULL, trace->max_stack))
1561 return -1;
1562
1563 return 0;
1564 }
1565
/*
 * Print the callchain previously resolved into the global
 * callchain_cursor, showing symbol, dso and unresolved entries as raw
 * addresses.
 */
static int trace__fprintf_callchain(struct trace *trace, struct perf_sample *sample)
{
	/* TODO: user-configurable print_opts */
	const unsigned int print_opts = EVSEL__PRINT_SYM |
					EVSEL__PRINT_DSO |
					EVSEL__PRINT_UNKNOWN_AS_ADDR;

	return sample__fprintf_callchain(sample, 38, print_opts, &callchain_cursor, trace->output);
}
1575
/*
 * Handle raw_syscalls:sys_exit: compute the syscall duration, apply the
 * duration filter, resolve an optional callchain and print the return
 * value, completing (or re-printing as "continued") the entry line that
 * trace__sys_enter() formatted.
 *
 * Returns 0 on success, -1 when the syscall or thread state can't be
 * obtained.
 */
static int trace__sys_exit(struct trace *trace, struct perf_evsel *evsel,
			   union perf_event *event __maybe_unused,
			   struct perf_sample *sample)
{
	long ret;
	u64 duration = 0;
	bool duration_calculated = false;
	struct thread *thread;
	int id = perf_evsel__sc_tp_uint(evsel, id, sample), err = -1, callchain_ret = 0;
	struct syscall *sc = trace__syscall_info(trace, evsel, id);
	struct thread_trace *ttrace;

	if (sc == NULL)
		return -1;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (trace->summary)
		thread__update_stats(ttrace, id, sample);

	ret = perf_evsel__sc_tp_uint(evsel, ret, sample);

	/* Successful open(): associate the returned fd with the getname'd path */
	if (id == trace->open_id && ret >= 0 && ttrace->filename.pending_open) {
		trace__set_fd_pathname(thread, ret, ttrace->filename.name);
		ttrace->filename.pending_open = false;
		++trace->stats.vfs_getname;
	}

	/* entry_time == 0 means the matching sys_enter was never seen */
	if (ttrace->entry_time) {
		duration = sample->time - ttrace->entry_time;
		if (trace__filter_duration(trace, duration))
			goto out;
		duration_calculated = true;
	} else if (trace->duration_filter)
		goto out;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Chain shallower than the min-stack filter: skip */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	if (trace->summary_only)
		goto out;

	trace__fprintf_entry_head(trace, thread, duration, duration_calculated, ttrace->entry_time, trace->output);

	if (ttrace->entry_pending) {
		fprintf(trace->output, "%-70s", ttrace->entry_str);
	} else {
		/* The entry was already flushed by an interrupting event */
		fprintf(trace->output, " ... [");
		color_fprintf(trace->output, PERF_COLOR_YELLOW, "continued");
		fprintf(trace->output, "]: %s()", sc->name);
	}

	if (sc->fmt == NULL) {
signed_print:
		fprintf(trace->output, ") = %ld", ret);
	} else if (ret < 0 && (sc->fmt->errmsg || sc->fmt->errpid)) {
		char bf[STRERR_BUFSIZE];
		const char *emsg = str_error_r(-ret, bf, sizeof(bf)),
			   *e = audit_errno_to_name(-ret);

		/*
		 * NOTE(review): audit_errno_to_name() can return NULL for
		 * unknown errno values; passing NULL to %s is UB -- confirm
		 * the range of -ret reaching here.
		 */
		fprintf(trace->output, ") = -1 %s %s", e, emsg);
	} else if (ret == 0 && sc->fmt->timeout)
		fprintf(trace->output, ") = 0 Timeout");
	else if (sc->fmt->hexret)
		fprintf(trace->output, ") = %#lx", ret);
	else if (sc->fmt->errpid) {
		/* errpid: the return value is a pid, show the child's comm */
		struct thread *child = machine__find_thread(trace->host, ret, ret);

		if (child != NULL) {
			fprintf(trace->output, ") = %ld", ret);
			if (child->comm_set)
				fprintf(trace->output, " (%s)", thread__comm_str(child));
			thread__put(child);
		}
	} else
		goto signed_print;

	fputc('\n', trace->output);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	ttrace->entry_pending = false;
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1675
/*
 * Handle probe:vfs_getname: capture the pathname the kernel resolved
 * and splice it into the pending syscall entry line at the position
 * recorded by thread__set_filename_pos(). The name is also cached so a
 * successful open() can associate it with the returned fd (see
 * trace__sys_exit).
 */
static int trace__vfs_getname(struct trace *trace, struct perf_evsel *evsel,
			      union perf_event *event __maybe_unused,
			      struct perf_sample *sample)
{
	struct thread *thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
	struct thread_trace *ttrace;
	size_t filename_len, entry_str_len, to_move;
	ssize_t remaining_space;
	char *pos;
	const char *filename = perf_evsel__rawptr(evsel, sample, "pathname");

	if (!thread)
		goto out;

	ttrace = thread__priv(thread);
	if (!ttrace)
		goto out_put;

	filename_len = strlen(filename);
	if (filename_len == 0)
		goto out_put;

	/* Grow the cached name buffer if this path is longer than any before */
	if (ttrace->filename.namelen < filename_len) {
		char *f = realloc(ttrace->filename.name, filename_len + 1);

		if (f == NULL)
			goto out_put;

		ttrace->filename.namelen = filename_len;
		ttrace->filename.name = f;
	}

	strcpy(ttrace->filename.name, filename);
	ttrace->filename.pending_open = true;

	/* No insertion point recorded: nothing to splice into entry_str */
	if (!ttrace->filename.ptr)
		goto out_put;

	entry_str_len = strlen(ttrace->entry_str);
	remaining_space = trace__entry_str_size - entry_str_len - 1; /* \0 */
	if (remaining_space <= 0)
		goto out_put;

	/* Keep only the tail of the name if it doesn't fit */
	if (filename_len > (size_t)remaining_space) {
		filename += filename_len - remaining_space;
		filename_len = remaining_space;
	}

	/* Shift the rest of the entry string right, then insert the name */
	to_move = entry_str_len - ttrace->filename.entry_str_pos + 1; /* \0 */
	pos = ttrace->entry_str + ttrace->filename.entry_str_pos;
	memmove(pos + filename_len, pos, to_move);
	memcpy(pos, filename, filename_len);

	ttrace->filename.ptr = 0;
	ttrace->filename.entry_str_pos = 0;
out_put:
	thread__put(thread);
out:
	return 0;
}
1736
/*
 * Handle sched:sched_stat_runtime: accumulate on-CPU time for both the
 * thread and the whole session. If per-thread state can't be obtained
 * the raw event fields are dumped instead.
 */
static int trace__sched_stat_runtime(struct trace *trace, struct perf_evsel *evsel,
				     union perf_event *event __maybe_unused,
				     struct perf_sample *sample)
{
	u64 runtime = perf_evsel__intval(evsel, sample, "runtime");
	double runtime_ms = (double)runtime / NSEC_PER_MSEC;
	struct thread *thread = machine__findnew_thread(trace->host,
							sample->pid,
							sample->tid);
	struct thread_trace *ttrace = thread__trace(thread, trace->output);

	if (ttrace == NULL)
		goto out_dump;

	ttrace->runtime_ms += runtime_ms;
	trace->runtime_ms += runtime_ms;
out_put:
	thread__put(thread);
	return 0;

out_dump:
	fprintf(trace->output, "%s: comm=%s,pid=%u,runtime=%" PRIu64 ",vruntime=%" PRIu64 ")\n",
		evsel->name,
		perf_evsel__strval(evsel, sample, "comm"),
		(pid_t)perf_evsel__intval(evsel, sample, "pid"),
		runtime,
		perf_evsel__intval(evsel, sample, "vruntime"));
	goto out_put;
}
1766
1767 static void bpf_output__printer(enum binary_printer_ops op,
1768 unsigned int val, void *extra)
1769 {
1770 FILE *output = extra;
1771 unsigned char ch = (unsigned char)val;
1772
1773 switch (op) {
1774 case BINARY_PRINT_CHAR_DATA:
1775 fprintf(output, "%c", isprint(ch) ? ch : '.');
1776 break;
1777 case BINARY_PRINT_DATA_BEGIN:
1778 case BINARY_PRINT_LINE_BEGIN:
1779 case BINARY_PRINT_ADDR:
1780 case BINARY_PRINT_NUM_DATA:
1781 case BINARY_PRINT_NUM_PAD:
1782 case BINARY_PRINT_SEP:
1783 case BINARY_PRINT_CHAR_PAD:
1784 case BINARY_PRINT_LINE_END:
1785 case BINARY_PRINT_DATA_END:
1786 default:
1787 break;
1788 }
1789 }
1790
/*
 * Print a BPF output event's raw payload as characters, 8 bytes per
 * line, via bpf_output__printer().
 */
static void bpf_output__fprintf(struct trace *trace,
				struct perf_sample *sample)
{
	print_binary(sample->raw_data, sample->raw_size, 8,
		     bpf_output__printer, trace->output);
}
1797
/*
 * Generic handler for the other events the user asked for: prints
 * timestamp, event name and either the BPF output payload or the
 * tracepoint's formatted fields, plus an optional callchain.
 */
static int trace__event_handler(struct trace *trace, struct perf_evsel *evsel,
				union perf_event *event __maybe_unused,
				struct perf_sample *sample)
{
	int callchain_ret = 0;

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			/* Chain shallower than the min-stack filter: skip */
			if (callchain_cursor.nr < trace->min_stack)
				goto out;
			callchain_ret = 1;
		}
	}

	trace__printf_interrupted_entry(trace, sample);
	trace__fprintf_tstamp(trace, sample->time, trace->output);

	/* Keep column alignment with syscall lines, which have a duration */
	if (trace->trace_syscalls)
		fprintf(trace->output, "( ): ");

	fprintf(trace->output, "%s:", evsel->name);

	if (perf_evsel__is_bpf_output(evsel)) {
		bpf_output__fprintf(trace, sample);
	} else if (evsel->tp_format) {
		event_format__fprintf(evsel->tp_format, sample->cpu,
				      sample->raw_data, sample->raw_size,
				      trace->output);
	}

	fprintf(trace->output, ")\n");

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	return 0;
}
1838
1839 static void print_location(FILE *f, struct perf_sample *sample,
1840 struct addr_location *al,
1841 bool print_dso, bool print_sym)
1842 {
1843
1844 if ((verbose > 0 || print_dso) && al->map)
1845 fprintf(f, "%s@", al->map->dso->long_name);
1846
1847 if ((verbose > 0 || print_sym) && al->sym)
1848 fprintf(f, "%s+0x%" PRIx64, al->sym->name,
1849 al->addr - al->sym->start);
1850 else if (al->map)
1851 fprintf(f, "0x%" PRIx64, al->addr);
1852 else
1853 fprintf(f, "0x%" PRIx64, sample->addr);
1854 }
1855
/*
 * Handle page fault software events: count major/minor faults per
 * thread and, unless summary-only, print the faulting IP and target
 * address with their resolved locations, plus an optional callchain.
 */
static int trace__pgfault(struct trace *trace,
			  struct perf_evsel *evsel,
			  union perf_event *event __maybe_unused,
			  struct perf_sample *sample)
{
	struct thread *thread;
	struct addr_location al;
	char map_type = 'd';
	struct thread_trace *ttrace;
	int err = -1;
	int callchain_ret = 0;

	thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);

	if (sample->callchain) {
		callchain_ret = trace__resolve_callchain(trace, evsel, sample, &callchain_cursor);
		if (callchain_ret == 0) {
			if (callchain_cursor.nr < trace->min_stack)
				goto out_put;
			callchain_ret = 1;
		}
	}

	ttrace = thread__trace(thread, trace->output);
	if (ttrace == NULL)
		goto out_put;

	if (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ)
		ttrace->pfmaj++;
	else
		ttrace->pfmin++;

	if (trace->summary_only)
		goto out;

	thread__find_addr_location(thread, sample->cpumode, MAP__FUNCTION,
				   sample->ip, &al);

	trace__fprintf_entry_head(trace, thread, 0, true, sample->time, trace->output);

	fprintf(trace->output, "%sfault [",
		evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ?
		"maj" : "min");

	print_location(trace->output, sample, &al, false, true);

	fprintf(trace->output, "] => ");

	/* Faulted-on address: try data maps ('d') first, then code ('x') */
	thread__find_addr_location(thread, sample->cpumode, MAP__VARIABLE,
				   sample->addr, &al);

	if (!al.map) {
		thread__find_addr_location(thread, sample->cpumode,
					   MAP__FUNCTION, sample->addr, &al);

		if (al.map)
			map_type = 'x';
		else
			map_type = '?';
	}

	print_location(trace->output, sample, &al, true, false);

	fprintf(trace->output, " (%c%c)\n", map_type, al.level);

	if (callchain_ret > 0)
		trace__fprintf_callchain(trace, sample);
	else if (callchain_ret < 0)
		pr_err("Problem processing %s callchain, skipping...\n", perf_evsel__name(evsel));
out:
	err = 0;
out_put:
	thread__put(thread);
	return err;
}
1931
1932 static void trace__set_base_time(struct trace *trace,
1933 struct perf_evsel *evsel,
1934 struct perf_sample *sample)
1935 {
1936 /*
1937 * BPF events were not setting PERF_SAMPLE_TIME, so be more robust
1938 * and don't use sample->time unconditionally, we may end up having
1939 * some other event in the future without PERF_SAMPLE_TIME for good
1940 * reason, i.e. we may not be interested in its timestamps, just in
1941 * it taking place, picking some piece of information when it
1942 * appears in our event stream (vfs_getname comes to mind).
1943 */
1944 if (trace->base_time == 0 && !trace->full_time &&
1945 (evsel->attr.sample_type & PERF_SAMPLE_TIME))
1946 trace->base_time = sample->time;
1947 }
1948
1949 static int trace__process_sample(struct perf_tool *tool,
1950 union perf_event *event,
1951 struct perf_sample *sample,
1952 struct perf_evsel *evsel,
1953 struct machine *machine __maybe_unused)
1954 {
1955 struct trace *trace = container_of(tool, struct trace, tool);
1956 struct thread *thread;
1957 int err = 0;
1958
1959 tracepoint_handler handler = evsel->handler;
1960
1961 thread = machine__findnew_thread(trace->host, sample->pid, sample->tid);
1962 if (thread && thread__is_filtered(thread))
1963 goto out;
1964
1965 trace__set_base_time(trace, evsel, sample);
1966
1967 if (handler) {
1968 ++trace->nr_events;
1969 handler(trace, evsel, event, sample);
1970 }
1971 out:
1972 thread__put(thread);
1973 return err;
1974 }
1975
1976 static int trace__record(struct trace *trace, int argc, const char **argv)
1977 {
1978 unsigned int rec_argc, i, j;
1979 const char **rec_argv;
1980 const char * const record_args[] = {
1981 "record",
1982 "-R",
1983 "-m", "1024",
1984 "-c", "1",
1985 };
1986
1987 const char * const sc_args[] = { "-e", };
1988 unsigned int sc_args_nr = ARRAY_SIZE(sc_args);
1989 const char * const majpf_args[] = { "-e", "major-faults" };
1990 unsigned int majpf_args_nr = ARRAY_SIZE(majpf_args);
1991 const char * const minpf_args[] = { "-e", "minor-faults" };
1992 unsigned int minpf_args_nr = ARRAY_SIZE(minpf_args);
1993
1994 /* +1 is for the event string below */
1995 rec_argc = ARRAY_SIZE(record_args) + sc_args_nr + 1 +
1996 majpf_args_nr + minpf_args_nr + argc;
1997 rec_argv = calloc(rec_argc + 1, sizeof(char *));
1998
1999 if (rec_argv == NULL)
2000 return -ENOMEM;
2001
2002 j = 0;
2003 for (i = 0; i < ARRAY_SIZE(record_args); i++)
2004 rec_argv[j++] = record_args[i];
2005
2006 if (trace->trace_syscalls) {
2007 for (i = 0; i < sc_args_nr; i++)
2008 rec_argv[j++] = sc_args[i];
2009
2010 /* event string may be different for older kernels - e.g., RHEL6 */
2011 if (is_valid_tracepoint("raw_syscalls:sys_enter"))
2012 rec_argv[j++] = "raw_syscalls:sys_enter,raw_syscalls:sys_exit";
2013 else if (is_valid_tracepoint("syscalls:sys_enter"))
2014 rec_argv[j++] = "syscalls:sys_enter,syscalls:sys_exit";
2015 else {
2016 pr_err("Neither raw_syscalls nor syscalls events exist.\n");
2017 return -1;
2018 }
2019 }
2020
2021 if (trace->trace_pgfaults & TRACE_PFMAJ)
2022 for (i = 0; i < majpf_args_nr; i++)
2023 rec_argv[j++] = majpf_args[i];
2024
2025 if (trace->trace_pgfaults & TRACE_PFMIN)
2026 for (i = 0; i < minpf_args_nr; i++)
2027 rec_argv[j++] = minpf_args[i];
2028
2029 for (i = 0; i < (unsigned int)argc; i++)
2030 rec_argv[j++] = argv[i];
2031
2032 return cmd_record(j, rec_argv);
2033 }
2034
2035 static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp);
2036
2037 static bool perf_evlist__add_vfs_getname(struct perf_evlist *evlist)
2038 {
2039 struct perf_evsel *evsel = perf_evsel__newtp("probe", "vfs_getname");
2040
2041 if (IS_ERR(evsel))
2042 return false;
2043
2044 if (perf_evsel__field(evsel, "pathname") == NULL) {
2045 perf_evsel__delete(evsel);
2046 return false;
2047 }
2048
2049 evsel->handler = trace__vfs_getname;
2050 perf_evlist__add(evlist, evsel);
2051 return true;
2052 }
2053
2054 static struct perf_evsel *perf_evsel__new_pgfault(u64 config)
2055 {
2056 struct perf_evsel *evsel;
2057 struct perf_event_attr attr = {
2058 .type = PERF_TYPE_SOFTWARE,
2059 .mmap_data = 1,
2060 };
2061
2062 attr.config = config;
2063 attr.sample_period = 1;
2064
2065 event_attr_init(&attr);
2066
2067 evsel = perf_evsel__new(&attr);
2068 if (evsel)
2069 evsel->handler = trace__pgfault;
2070
2071 return evsel;
2072 }
2073
2074 static void trace__handle_event(struct trace *trace, union perf_event *event, struct perf_sample *sample)
2075 {
2076 const u32 type = event->header.type;
2077 struct perf_evsel *evsel;
2078
2079 if (type != PERF_RECORD_SAMPLE) {
2080 trace__process_event(trace, trace->host, event, sample);
2081 return;
2082 }
2083
2084 evsel = perf_evlist__id2evsel(trace->evlist, sample->id);
2085 if (evsel == NULL) {
2086 fprintf(trace->output, "Unknown tp ID %" PRIu64 ", skipping...\n", sample->id);
2087 return;
2088 }
2089
2090 trace__set_base_time(trace, evsel, sample);
2091
2092 if (evsel->attr.type == PERF_TYPE_TRACEPOINT &&
2093 sample->raw_data == NULL) {
2094 fprintf(trace->output, "%s sample with no payload for tid: %d, cpu %d, raw_size=%d, skipping...\n",
2095 perf_evsel__name(evsel), sample->tid,
2096 sample->cpu, sample->raw_size);
2097 } else {
2098 tracepoint_handler handler = evsel->handler;
2099 handler(trace, evsel, event, sample);
2100 }
2101 }
2102
/*
 * Create the raw_syscalls:sys_enter/sys_exit tracepoint evsels that drive the
 * strace-like output, wire up their handlers, and add them to the evlist.
 * The *_init_sc_tp_* calls look up the tracepoint fields ("args" pointer on
 * enter, "ret" on exit) that the handlers read on every sample.
 *
 * Returns 0 on success, -1 on failure (partially created evsels are freed).
 */
static int trace__add_syscall_newtp(struct trace *trace)
{
	int ret = -1;
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *sys_enter, *sys_exit;

	sys_enter = perf_evsel__syscall_newtp("sys_enter", trace__sys_enter);
	if (sys_enter == NULL)
		goto out;

	if (perf_evsel__init_sc_tp_ptr_field(sys_enter, args))
		goto out_delete_sys_enter;

	sys_exit = perf_evsel__syscall_newtp("sys_exit", trace__sys_exit);
	if (sys_exit == NULL)
		goto out_delete_sys_enter;

	if (perf_evsel__init_sc_tp_uint_field(sys_exit, ret))
		goto out_delete_sys_exit;

	perf_evlist__add(evlist, sys_enter);
	perf_evlist__add(evlist, sys_exit);

	if (callchain_param.enabled && !trace->kernel_syscallchains) {
		/*
		 * We're interested only in the user space callchain
		 * leading to the syscall, allow overriding that for
		 * debugging reasons using --kernel_syscall_callchains
		 */
		sys_exit->attr.exclude_callchain_kernel = 1;
	}

	/* Cache the evsels so filters/callchain config can find them later. */
	trace->syscalls.events.sys_enter = sys_enter;
	trace->syscalls.events.sys_exit = sys_exit;

	ret = 0;
out:
	return ret;

	/* Error unwind, in reverse order of creation; only reached via goto. */
out_delete_sys_exit:
	perf_evsel__delete_priv(sys_exit);
out_delete_sys_enter:
	perf_evsel__delete_priv(sys_enter);
	goto out;
}
2148
2149 static int trace__set_ev_qualifier_filter(struct trace *trace)
2150 {
2151 int err = -1;
2152 struct perf_evsel *sys_exit;
2153 char *filter = asprintf_expr_inout_ints("id", !trace->not_ev_qualifier,
2154 trace->ev_qualifier_ids.nr,
2155 trace->ev_qualifier_ids.entries);
2156
2157 if (filter == NULL)
2158 goto out_enomem;
2159
2160 if (!perf_evsel__append_tp_filter(trace->syscalls.events.sys_enter,
2161 filter)) {
2162 sys_exit = trace->syscalls.events.sys_exit;
2163 err = perf_evsel__append_tp_filter(sys_exit, filter);
2164 }
2165
2166 free(filter);
2167 out:
2168 return err;
2169 out_enomem:
2170 errno = ENOMEM;
2171 goto out;
2172 }
2173
/*
 * Live mode: build the evlist (syscall tracepoints, optional page-fault and
 * sched_stat_runtime events), mmap the ring buffers, optionally fork the
 * workload, then consume and dispatch events until interrupted or drained.
 *
 * Returns 0 on success or a negative error; diagnostics go to trace->output.
 */
static int trace__run(struct trace *trace, int argc, const char **argv)
{
	struct perf_evlist *evlist = trace->evlist;
	struct perf_evsel *evsel, *pgfault_maj = NULL, *pgfault_min = NULL;
	int err = -1, i;
	unsigned long before;
	const bool forks = argc > 0;
	bool draining = false;

	trace->live = true;

	if (trace->trace_syscalls && trace__add_syscall_newtp(trace))
		goto out_error_raw_syscalls;

	if (trace->trace_syscalls)
		trace->vfs_getname = perf_evlist__add_vfs_getname(evlist);

	if ((trace->trace_pgfaults & TRACE_PFMAJ)) {
		pgfault_maj = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MAJ);
		if (pgfault_maj == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_maj);
	}

	if ((trace->trace_pgfaults & TRACE_PFMIN)) {
		pgfault_min = perf_evsel__new_pgfault(PERF_COUNT_SW_PAGE_FAULTS_MIN);
		if (pgfault_min == NULL)
			goto out_error_mem;
		perf_evlist__add(evlist, pgfault_min);
	}

	if (trace->sched &&
	    perf_evlist__add_newtp(evlist, "sched", "sched_stat_runtime",
				   trace__sched_stat_runtime))
		goto out_error_sched_stat_runtime;

	err = perf_evlist__create_maps(evlist, &trace->opts.target);
	if (err < 0) {
		fprintf(trace->output, "Problems parsing the target to trace, check your options!\n");
		goto out_delete_evlist;
	}

	err = trace__symbols_init(trace, evlist);
	if (err < 0) {
		fprintf(trace->output, "Problems initializing symbol libraries!\n");
		goto out_delete_evlist;
	}

	perf_evlist__config(evlist, &trace->opts, NULL);

	if (callchain_param.enabled) {
		bool use_identifier = false;

		if (trace->syscalls.events.sys_exit) {
			perf_evsel__config_callchain(trace->syscalls.events.sys_exit,
						     &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_maj) {
			perf_evsel__config_callchain(pgfault_maj, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (pgfault_min) {
			perf_evsel__config_callchain(pgfault_min, &trace->opts, &callchain_param);
			use_identifier = true;
		}

		if (use_identifier) {
		       /*
			* Now we have evsels with different sample_ids, use
			* PERF_SAMPLE_IDENTIFIER to map from sample to evsel
			* from a fixed position in each ring buffer record.
			*
			* As of this the changeset introducing this comment, this
			* isn't strictly needed, as the fields that can come before
			* PERF_SAMPLE_ID are all used, but we'll probably disable
			* some of those for things like copying the payload of
			* pointer syscall arguments, and for vfs_getname we don't
			* need PERF_SAMPLE_ADDR and PERF_SAMPLE_IP, so do this
			* here as a warning we need to use PERF_SAMPLE_IDENTIFIER.
			*/
			perf_evlist__set_sample_bit(evlist, IDENTIFIER);
			perf_evlist__reset_sample_bit(evlist, ID);
		}
	}

	signal(SIGCHLD, sig_handler);
	signal(SIGINT, sig_handler);

	if (forks) {
		err = perf_evlist__prepare_workload(evlist, &trace->opts.target,
						    argv, false, NULL);
		if (err < 0) {
			fprintf(trace->output, "Couldn't run the workload!\n");
			goto out_delete_evlist;
		}
	}

	err = perf_evlist__open(evlist);
	if (err < 0)
		goto out_error_open;

	err = bpf__apply_obj_config();
	if (err) {
		char errbuf[BUFSIZ];

		bpf__strerror_apply_obj_config(err, errbuf, sizeof(errbuf));
		pr_err("ERROR: Apply config to BPF failed: %s\n",
			 errbuf);
		goto out_error_open;
	}

	/*
	 * Better not use !target__has_task() here because we need to cover the
	 * case where no threads were specified in the command line, but a
	 * workload was, and in that case we will fill in the thread_map when
	 * we fork the workload in perf_evlist__prepare_workload.
	 */
	if (trace->filter_pids.nr > 0)
		err = perf_evlist__set_filter_pids(evlist, trace->filter_pids.nr, trace->filter_pids.entries);
	else if (thread_map__pid(evlist->threads, 0) == -1)
		err = perf_evlist__set_filter_pid(evlist, getpid());

	if (err < 0)
		goto out_error_mem;

	if (trace->ev_qualifier_ids.nr > 0) {
		err = trace__set_ev_qualifier_filter(trace);
		if (err < 0)
			goto out_errno;

		pr_debug("event qualifier tracepoint filter: %s\n",
			 trace->syscalls.events.sys_exit->filter);
	}

	err = perf_evlist__apply_filters(evlist, &evsel);
	if (err < 0)
		goto out_error_apply_filters;

	err = perf_evlist__mmap(evlist, trace->opts.mmap_pages, false);
	if (err < 0)
		goto out_error_mmap;

	/* With --delay, events are enabled only after the workload starts. */
	if (!target__none(&trace->opts.target) && !trace->opts.initial_delay)
		perf_evlist__enable(evlist);

	if (forks)
		perf_evlist__start_workload(evlist);

	if (trace->opts.initial_delay) {
		usleep(trace->opts.initial_delay * 1000);
		perf_evlist__enable(evlist);
	}

	trace->multiple_threads = thread_map__pid(evlist->threads, 0) == -1 ||
				  evlist->threads->nr > 1 ||
				  perf_evlist__first(evlist)->attr.inherit;
again:
	before = trace->nr_events;

	/* Drain every mmap'ed ring buffer, dispatching each event. */
	for (i = 0; i < evlist->nr_mmaps; i++) {
		union perf_event *event;

		while ((event = perf_evlist__mmap_read(evlist, i)) != NULL) {
			struct perf_sample sample;

			++trace->nr_events;

			err = perf_evlist__parse_sample(evlist, event, &sample);
			if (err) {
				fprintf(trace->output, "Can't parse sample, err = %d, skipping...\n", err);
				goto next_event;
			}

			trace__handle_event(trace, event, &sample);
next_event:
			perf_evlist__mmap_consume(evlist, i);

			if (interrupted)
				goto out_disable;

			/*
			 * The workload/user is done: stop producing new events
			 * but keep consuming what is already in the buffers.
			 */
			if (done && !draining) {
				perf_evlist__disable(evlist);
				draining = true;
			}
		}
	}

	if (trace->nr_events == before) {
		int timeout = done ? 100 : -1;

		if (!draining && perf_evlist__poll(evlist, timeout) > 0) {
			if (perf_evlist__filter_pollfd(evlist, POLLERR | POLLHUP) == 0)
				draining = true;

			goto again;
		}
	} else {
		goto again;
	}

out_disable:
	thread__zput(trace->current);

	perf_evlist__disable(evlist);

	if (!err) {
		if (trace->summary)
			trace__fprintf_thread_summary(trace, trace->output);

		if (trace->show_tool_stats) {
			fprintf(trace->output, "Stats:\n "
					       " vfs_getname : %" PRIu64 "\n"
					       " proc_getname: %" PRIu64 "\n",
				trace->stats.vfs_getname,
				trace->stats.proc_getname);
		}
	}

out_delete_evlist:
	perf_evlist__delete(evlist);
	trace->evlist = NULL;
	trace->live = false;
	return err;
/*
 * Error reporting labels; the bare block below exists only to scope errbuf
 * to these strerror paths, which all funnel into out_delete_evlist.
 */
{
	char errbuf[BUFSIZ];

out_error_sched_stat_runtime:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "sched", "sched_stat_runtime");
	goto out_error;

out_error_raw_syscalls:
	tracing_path__strerror_open_tp(errno, errbuf, sizeof(errbuf), "raw_syscalls", "sys_(enter|exit)");
	goto out_error;

out_error_mmap:
	perf_evlist__strerror_mmap(evlist, errno, errbuf, sizeof(errbuf));
	goto out_error;

out_error_open:
	perf_evlist__strerror_open(evlist, errno, errbuf, sizeof(errbuf));

out_error:
	fprintf(trace->output, "%s\n", errbuf);
	goto out_delete_evlist;

out_error_apply_filters:
	fprintf(trace->output,
		"Failed to set filter \"%s\" on event %s with %d (%s)\n",
		evsel->filter, perf_evsel__name(evsel), errno,
		str_error_r(errno, errbuf, sizeof(errbuf)));
	goto out_delete_evlist;
}
out_error_mem:
	fprintf(trace->output, "Not enough memory to run!\n");
	goto out_delete_evlist;

out_errno:
	fprintf(trace->output, "errno=%d,%s\n", errno, strerror(errno));
	goto out_delete_evlist;
}
2437
/*
 * Replay mode ('perf trace -i perf.data'): process a previously recorded
 * session through the same syscall/pagefault handlers used in live mode.
 *
 * Returns 0 on success or a negative error.
 */
static int trace__replay(struct trace *trace)
{
	const struct perf_evsel_str_handler handlers[] = {
		{ "probe:vfs_getname", trace__vfs_getname, },
	};
	struct perf_data_file file = {
		.path  = input_name,
		.mode  = PERF_DATA_MODE_READ,
		.force = trace->force,
	};
	struct perf_session *session;
	struct perf_evsel *evsel;
	int err = -1;

	trace->tool.sample	  = trace__process_sample;
	trace->tool.mmap	  = perf_event__process_mmap;
	trace->tool.mmap2	  = perf_event__process_mmap2;
	trace->tool.comm	  = perf_event__process_comm;
	trace->tool.exit	  = perf_event__process_exit;
	trace->tool.fork	  = perf_event__process_fork;
	trace->tool.attr	  = perf_event__process_attr;
	trace->tool.tracing_data  = perf_event__process_tracing_data;
	trace->tool.build_id	  = perf_event__process_build_id;
	trace->tool.namespaces	  = perf_event__process_namespaces;

	trace->tool.ordered_events = true;
	trace->tool.ordering_requires_timestamps = true;

	/* add tid to output */
	trace->multiple_threads = true;

	session = perf_session__new(&file, false, &trace->tool);
	if (session == NULL)
		return -1;

	if (trace->opts.target.pid)
		symbol_conf.pid_list_str = strdup(trace->opts.target.pid);

	if (trace->opts.target.tid)
		symbol_conf.tid_list_str = strdup(trace->opts.target.tid);

	if (symbol__init(&session->header.env) < 0)
		goto out;

	trace->host = &session->machines.host;

	err = perf_session__set_tracepoints_handlers(session, handlers);
	if (err)
		goto out;

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_enter");
	/* older kernels have syscalls tp versus raw_syscalls */
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_enter");

	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_enter) < 0 ||
	     perf_evsel__init_sc_tp_ptr_field(evsel, args))) {
		pr_err("Error during initialize raw_syscalls:sys_enter event\n");
		goto out;
	}

	evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
						     "raw_syscalls:sys_exit");
	if (evsel == NULL)
		evsel = perf_evlist__find_tracepoint_by_name(session->evlist,
							     "syscalls:sys_exit");
	if (evsel &&
	    (perf_evsel__init_syscall_tp(evsel, trace__sys_exit) < 0 ||
	     perf_evsel__init_sc_tp_uint_field(evsel, ret))) {
		pr_err("Error during initialize raw_syscalls:sys_exit event\n");
		goto out;
	}

	/* Route any recorded page-fault software events to the pgfault handler. */
	evlist__for_each_entry(session->evlist, evsel) {
		if (evsel->attr.type == PERF_TYPE_SOFTWARE &&
		    (evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MAJ ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS_MIN ||
		     evsel->attr.config == PERF_COUNT_SW_PAGE_FAULTS))
			evsel->handler = trace__pgfault;
	}

	setup_pager();

	err = perf_session__process_events(session);
	if (err)
		pr_err("Failed to process events, error %d", err);

	else if (trace->summary)
		trace__fprintf_thread_summary(trace, trace->output);

out:
	perf_session__delete(session);

	return err;
}
2536
/* Print the banner that precedes the per-thread summaries. */
static size_t trace__fprintf_threads_header(FILE *fp)
{
	return fprintf(fp, "\n Summary of events:\n\n");
}
2545
/*
 * Re-sort the per-thread syscall stats rb-tree by total time spent, in
 * milliseconds, descending; entry->msecs caches n * avg so the comparison
 * in the first macro argument stays cheap.
 */
DEFINE_RESORT_RB(syscall_stats, a->msecs > b->msecs,
	struct stats *stats;
	double msecs;
	int syscall;
)
{
	struct int_node *source = rb_entry(nd, struct int_node, rb_node);
	struct stats *stats = source->priv;

	entry->syscall = source->i;
	entry->stats = stats;
	entry->msecs = stats ? (u64)stats->n * (avg_stats(stats) / NSEC_PER_MSEC) : 0;
}
2559
/*
 * Print one thread's syscall statistics table (calls, total, min, avg, max,
 * stddev), sorted by total time via the syscall_stats resort rb-tree above.
 * Returns the number of characters printed.
 */
static size_t thread__dump_stats(struct thread_trace *ttrace,
				 struct trace *trace, FILE *fp)
{
	size_t printed = 0;
	struct syscall *sc;
	struct rb_node *nd;
	DECLARE_RESORT_RB_INTLIST(syscall_stats, ttrace->syscall_stats);

	if (syscall_stats == NULL)
		return 0;

	printed += fprintf(fp, "\n");

	printed += fprintf(fp, "   syscall            calls    total       min       avg       max      stddev\n");
	printed += fprintf(fp, "                               (msec)    (msec)    (msec)    (msec)        (%%)\n");
	printed += fprintf(fp, "   --------------- -------- --------- --------- --------- ---------     ------\n");

	resort_rb__for_each_entry(nd, syscall_stats) {
		struct stats *stats = syscall_stats_entry->stats;
		if (stats) {
			/* All times are recorded in nsecs; print in msecs. */
			double min = (double)(stats->min) / NSEC_PER_MSEC;
			double max = (double)(stats->max) / NSEC_PER_MSEC;
			double avg = avg_stats(stats);
			double pct;
			u64 n = (u64) stats->n;

			/* Relative standard deviation, as a percentage of avg. */
			pct = avg ? 100.0 * stddev_stats(stats)/avg : 0.0;
			avg /= NSEC_PER_MSEC;

			sc = &trace->syscalls.table[syscall_stats_entry->syscall];
			printed += fprintf(fp, "   %-15s", sc->name);
			printed += fprintf(fp, " %8" PRIu64 " %9.3f %9.3f %9.3f",
					   n, syscall_stats_entry->msecs, min, avg);
			printed += fprintf(fp, " %9.3f %9.2f%%\n", max, pct);
		}
	}

	resort_rb__delete(syscall_stats);
	printed += fprintf(fp, "\n\n");

	return printed;
}
2602
2603 static size_t trace__fprintf_thread(FILE *fp, struct thread *thread, struct trace *trace)
2604 {
2605 size_t printed = 0;
2606 struct thread_trace *ttrace = thread__priv(thread);
2607 double ratio;
2608
2609 if (ttrace == NULL)
2610 return 0;
2611
2612 ratio = (double)ttrace->nr_events / trace->nr_events * 100.0;
2613
2614 printed += fprintf(fp, " %s (%d), ", thread__comm_str(thread), thread->tid);
2615 printed += fprintf(fp, "%lu events, ", ttrace->nr_events);
2616 printed += fprintf(fp, "%.1f%%", ratio);
2617 if (ttrace->pfmaj)
2618 printed += fprintf(fp, ", %lu majfaults", ttrace->pfmaj);
2619 if (ttrace->pfmin)
2620 printed += fprintf(fp, ", %lu minfaults", ttrace->pfmin);
2621 if (trace->sched)
2622 printed += fprintf(fp, ", %.3f msec\n", ttrace->runtime_ms);
2623 else if (fputc('\n', fp) != EOF)
2624 ++printed;
2625
2626 printed += thread__dump_stats(ttrace, trace, fp);
2627
2628 return printed;
2629 }
2630
2631 static unsigned long thread__nr_events(struct thread_trace *ttrace)
2632 {
2633 return ttrace ? ttrace->nr_events : 0;
2634 }
2635
/*
 * Re-sort the machine's threads rb-tree by event count (the '<' comparison
 * makes resort_rb__for_each_entry visit busier threads later).
 */
DEFINE_RESORT_RB(threads, (thread__nr_events(a->thread->priv) < thread__nr_events(b->thread->priv)),
	struct thread *thread;
)
{
	entry->thread = rb_entry(nd, struct thread, rb_node);
}
2642
/*
 * Print the "Summary of events" section: one entry per traced thread,
 * ordered by event count via the threads resort rb-tree above.
 * Returns the number of characters printed (0 if sorting failed).
 */
static size_t trace__fprintf_thread_summary(struct trace *trace, FILE *fp)
{
	DECLARE_RESORT_RB_MACHINE_THREADS(threads, trace->host);
	size_t printed = trace__fprintf_threads_header(fp);
	struct rb_node *nd;

	if (threads == NULL) {
		fprintf(fp, "%s", "Error sorting output by nr_events!\n");
		return 0;
	}

	resort_rb__for_each_entry(nd, threads)
		printed += trace__fprintf_thread(fp, threads_entry->thread, trace);

	resort_rb__delete(threads);

	return printed;
}
2661
2662 static int trace__set_duration(const struct option *opt, const char *str,
2663 int unset __maybe_unused)
2664 {
2665 struct trace *trace = opt->value;
2666
2667 trace->duration_filter = atof(str);
2668 return 0;
2669 }
2670
2671 static int trace__set_filter_pids(const struct option *opt, const char *str,
2672 int unset __maybe_unused)
2673 {
2674 int ret = -1;
2675 size_t i;
2676 struct trace *trace = opt->value;
2677 /*
2678 * FIXME: introduce a intarray class, plain parse csv and create a
2679 * { int nr, int entries[] } struct...
2680 */
2681 struct intlist *list = intlist__new(str);
2682
2683 if (list == NULL)
2684 return -1;
2685
2686 i = trace->filter_pids.nr = intlist__nr_entries(list) + 1;
2687 trace->filter_pids.entries = calloc(i, sizeof(pid_t));
2688
2689 if (trace->filter_pids.entries == NULL)
2690 goto out;
2691
2692 trace->filter_pids.entries[0] = getpid();
2693
2694 for (i = 1; i < trace->filter_pids.nr; ++i)
2695 trace->filter_pids.entries[i] = intlist__entry(list, i - 1)->i;
2696
2697 intlist__delete(list);
2698 ret = 0;
2699 out:
2700 return ret;
2701 }
2702
2703 static int trace__open_output(struct trace *trace, const char *filename)
2704 {
2705 struct stat st;
2706
2707 if (!stat(filename, &st) && st.st_size) {
2708 char oldname[PATH_MAX];
2709
2710 scnprintf(oldname, sizeof(oldname), "%s.old", filename);
2711 unlink(oldname);
2712 rename(filename, oldname);
2713 }
2714
2715 trace->output = fopen(filename, "w");
2716
2717 return trace->output == NULL ? -errno : 0;
2718 }
2719
2720 static int parse_pagefaults(const struct option *opt, const char *str,
2721 int unset __maybe_unused)
2722 {
2723 int *trace_pgfaults = opt->value;
2724
2725 if (strcmp(str, "all") == 0)
2726 *trace_pgfaults |= TRACE_PFMAJ | TRACE_PFMIN;
2727 else if (strcmp(str, "maj") == 0)
2728 *trace_pgfaults |= TRACE_PFMAJ;
2729 else if (strcmp(str, "min") == 0)
2730 *trace_pgfaults |= TRACE_PFMIN;
2731 else
2732 return -1;
2733
2734 return 0;
2735 }
2736
/* Install the same sample handler on every event in the list. */
static void evlist__set_evsel_handler(struct perf_evlist *evlist, void *handler)
{
	struct perf_evsel *evsel;

	evlist__for_each_entry(evlist, evsel)
		evsel->handler = handler;
}
2744
2745 /*
2746 * XXX: Hackish, just splitting the combined -e+--event (syscalls
2747 * (raw_syscalls:{sys_{enter,exit}} + events (tracepoints, HW, SW, etc) to use
2748 * existing facilities unchanged (trace->ev_qualifier + parse_options()).
2749 *
2750 * It'd be better to introduce a parse_options() variant that would return a
2751 * list with the terms it didn't match to an event...
2752 */
2753 static int trace__parse_events_option(const struct option *opt, const char *str,
2754 int unset __maybe_unused)
2755 {
2756 struct trace *trace = (struct trace *)opt->value;
2757 const char *s = str;
2758 char *sep = NULL, *lists[2] = { NULL, NULL, };
2759 int len = strlen(str), err = -1, list;
2760 char *strace_groups_dir = system_path(STRACE_GROUPS_DIR);
2761 char group_name[PATH_MAX];
2762
2763 if (strace_groups_dir == NULL)
2764 return -1;
2765
2766 if (*s == '!') {
2767 ++s;
2768 trace->not_ev_qualifier = true;
2769 }
2770
2771 while (1) {
2772 if ((sep = strchr(s, ',')) != NULL)
2773 *sep = '\0';
2774
2775 list = 0;
2776 if (syscalltbl__id(trace->sctbl, s) >= 0) {
2777 list = 1;
2778 } else {
2779 path__join(group_name, sizeof(group_name), strace_groups_dir, s);
2780 if (access(group_name, R_OK) == 0)
2781 list = 1;
2782 }
2783
2784 if (lists[list]) {
2785 sprintf(lists[list] + strlen(lists[list]), ",%s", s);
2786 } else {
2787 lists[list] = malloc(len);
2788 if (lists[list] == NULL)
2789 goto out;
2790 strcpy(lists[list], s);
2791 }
2792
2793 if (!sep)
2794 break;
2795
2796 *sep = ',';
2797 s = sep + 1;
2798 }
2799
2800 if (lists[1] != NULL) {
2801 struct strlist_config slist_config = {
2802 .dirname = strace_groups_dir,
2803 };
2804
2805 trace->ev_qualifier = strlist__new(lists[1], &slist_config);
2806 if (trace->ev_qualifier == NULL) {
2807 fputs("Not enough memory to parse event qualifier", trace->output);
2808 goto out;
2809 }
2810
2811 if (trace__validate_ev_qualifier(trace))
2812 goto out;
2813 }
2814
2815 err = 0;
2816
2817 if (lists[0]) {
2818 struct option o = OPT_CALLBACK('e', "event", &trace->evlist, "event",
2819 "event selector. use 'perf list' to list available events",
2820 parse_events_option);
2821 err = parse_events_option(&o, lists[0], 0);
2822 }
2823 out:
2824 if (sep)
2825 *sep = ',';
2826
2827 return err;
2828 }
2829
/*
 * Entry point for 'perf trace': parse options, set up defaults, then either
 * delegate to 'perf trace record', replay a perf.data file, or run live.
 */
int cmd_trace(int argc, const char **argv)
{
	const char *trace_usage[] = {
		"perf trace [<options>] [<command>]",
		"perf trace [<options>] -- <command> [<options>]",
		"perf trace record [<options>] [<command>]",
		"perf trace record [<options>] -- <command> [<options>]",
		NULL
	};
	struct trace trace = {
		.syscalls = {
			. max = -1,
		},
		.opts = {
			.target = {
				.uid	   = UINT_MAX,
				.uses_mmap = true,
			},
			.user_freq     = UINT_MAX,
			.user_interval = ULLONG_MAX,
			.no_buffering  = true,
			.mmap_pages    = UINT_MAX,
			.proc_map_timeout  = 500,
		},
		.output = stderr,
		.show_comm = true,
		.trace_syscalls = true,
		.kernel_syscallchains = false,
		.max_stack = UINT_MAX,
	};
	const char *output_name = NULL;
	const struct option trace_options[] = {
	OPT_CALLBACK('e', "event", &trace, "event",
		     "event/syscall selector. use 'perf list' to list available events",
		     trace__parse_events_option),
	OPT_BOOLEAN(0, "comm", &trace.show_comm,
		    "show the thread COMM next to its id"),
	OPT_BOOLEAN(0, "tool_stats", &trace.show_tool_stats, "show tool stats"),
	OPT_CALLBACK(0, "expr", &trace, "expr", "list of syscalls/events to trace",
		     trace__parse_events_option),
	OPT_STRING('o', "output", &output_name, "file", "output file name"),
	OPT_STRING('i', "input", &input_name, "file", "Analyze events in file"),
	OPT_STRING('p', "pid", &trace.opts.target.pid, "pid",
		    "trace events on existing process id"),
	OPT_STRING('t', "tid", &trace.opts.target.tid, "tid",
		    "trace events on existing thread id"),
	OPT_CALLBACK(0, "filter-pids", &trace, "CSV list of pids",
		     "pids to filter (by the kernel)", trace__set_filter_pids),
	OPT_BOOLEAN('a', "all-cpus", &trace.opts.target.system_wide,
		    "system-wide collection from all CPUs"),
	OPT_STRING('C', "cpu", &trace.opts.target.cpu_list, "cpu",
		    "list of cpus to monitor"),
	OPT_BOOLEAN(0, "no-inherit", &trace.opts.no_inherit,
		    "child tasks do not inherit counters"),
	OPT_CALLBACK('m', "mmap-pages", &trace.opts.mmap_pages, "pages",
		     "number of mmap data pages",
		     perf_evlist__parse_mmap_pages),
	OPT_STRING('u', "uid", &trace.opts.target.uid_str, "user",
		   "user to profile"),
	OPT_CALLBACK(0, "duration", &trace, "float",
		     "show only events with duration > N.M ms",
		     trace__set_duration),
	OPT_BOOLEAN(0, "sched", &trace.sched, "show blocking scheduler events"),
	OPT_INCR('v', "verbose", &verbose, "be more verbose"),
	OPT_BOOLEAN('T', "time", &trace.full_time,
		    "Show full timestamp, not time relative to first start"),
	OPT_BOOLEAN('s', "summary", &trace.summary_only,
		    "Show only syscall summary with statistics"),
	OPT_BOOLEAN('S', "with-summary", &trace.summary,
		    "Show all syscalls and summary with statistics"),
	OPT_CALLBACK_DEFAULT('F', "pf", &trace.trace_pgfaults, "all|maj|min",
		     "Trace pagefaults", parse_pagefaults, "maj"),
	OPT_BOOLEAN(0, "syscalls", &trace.trace_syscalls, "Trace syscalls"),
	OPT_BOOLEAN('f', "force", &trace.force, "don't complain, do it"),
	OPT_CALLBACK(0, "call-graph", &trace.opts,
		     "record_mode[,record_size]", record_callchain_help,
		     &record_parse_callchain_opt),
	OPT_BOOLEAN(0, "kernel-syscall-graph", &trace.kernel_syscallchains,
		    "Show the kernel callchains on the syscall exit path"),
	OPT_UINTEGER(0, "min-stack", &trace.min_stack,
		     "Set the minimum stack depth when parsing the callchain, "
		     "anything below the specified depth will be ignored."),
	OPT_UINTEGER(0, "max-stack", &trace.max_stack,
		     "Set the maximum stack depth when parsing the callchain, "
		     "anything beyond the specified depth will be ignored. "
		     "Default: kernel.perf_event_max_stack or " __stringify(PERF_MAX_STACK_DEPTH)),
	OPT_UINTEGER(0, "proc-map-timeout", &trace.opts.proc_map_timeout,
			"per thread proc mmap processing timeout in ms"),
	OPT_UINTEGER('D', "delay", &trace.opts.initial_delay,
		     "ms to wait before starting measurement after program "
		     "start"),
	OPT_END()
	};
	bool __maybe_unused max_stack_user_set = true;
	bool mmap_pages_user_set = true;
	const char * const trace_subcommands[] = { "record", NULL };
	int err;
	char bf[BUFSIZ];

	signal(SIGSEGV, sighandler_dump_stack);
	signal(SIGFPE, sighandler_dump_stack);

	trace.evlist = perf_evlist__new();
	trace.sctbl = syscalltbl__new();

	if (trace.evlist == NULL || trace.sctbl == NULL) {
		pr_err("Not enough memory to run!\n");
		err = -ENOMEM;
		goto out;
	}

	argc = parse_options_subcommand(argc, argv, trace_options, trace_subcommands,
				 trace_usage, PARSE_OPT_STOP_AT_NON_OPTION);

	err = bpf__setup_stdout(trace.evlist);
	if (err) {
		bpf__strerror_setup_stdout(trace.evlist, err, bf, sizeof(bf));
		pr_err("ERROR: Setup BPF stdout failed: %s\n", bf);
		goto out;
	}

	err = -1;

	if (trace.trace_pgfaults) {
		trace.opts.sample_address = true;
		trace.opts.sample_time = true;
	}

	/* UINT_MAX sentinels mean "the user did not set this option". */
	if (trace.opts.mmap_pages == UINT_MAX)
		mmap_pages_user_set = false;

	if (trace.max_stack == UINT_MAX) {
		trace.max_stack = input_name ? PERF_MAX_STACK_DEPTH : sysctl_perf_event_max_stack;
		max_stack_user_set = false;
	}

#ifdef HAVE_DWARF_UNWIND_SUPPORT
	if ((trace.min_stack || max_stack_user_set) && !callchain_param.enabled && trace.trace_syscalls)
		record_opts__parse_callchain(&trace.opts, &callchain_param, "dwarf", false);
#endif

	if (callchain_param.enabled) {
		if (!mmap_pages_user_set && geteuid() == 0)
			trace.opts.mmap_pages = perf_event_mlock_kb_in_pages() * 4;

		symbol_conf.use_callchain = true;
	}

	if (trace.evlist->nr_entries > 0)
		evlist__set_evsel_handler(trace.evlist, trace__event_handler);

	if ((argc >= 1) && (strcmp(argv[0], "record") == 0))
		return trace__record(&trace, argc-1, &argv[1]);

	/* summary_only implies summary option, but don't overwrite summary if set */
	if (trace.summary_only)
		trace.summary = trace.summary_only;

	if (!trace.trace_syscalls && !trace.trace_pgfaults &&
	    trace.evlist->nr_entries == 0 /* Was --events used? */) {
		pr_err("Please specify something to trace.\n");
		/* NOTE(review): returns without freeing evlist/sctbl - confirm if intentional. */
		return -1;
	}

	if (!trace.trace_syscalls && trace.ev_qualifier) {
		pr_err("The -e option can't be used with --no-syscalls.\n");
		goto out;
	}

	if (output_name != NULL) {
		err = trace__open_output(&trace, output_name);
		if (err < 0) {
			perror("failed to create output file");
			goto out;
		}
	}

	/* Cached so the open/openat beautifier can special-case it cheaply. */
	trace.open_id = syscalltbl__id(trace.sctbl, "open");

	err = target__validate(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	err = target__parse_uid(&trace.opts.target);
	if (err) {
		target__strerror(&trace.opts.target, err, bf, sizeof(bf));
		fprintf(trace.output, "%s", bf);
		goto out_close;
	}

	/* No workload and no target: trace the whole system. */
	if (!argc && target__none(&trace.opts.target))
		trace.opts.target.system_wide = true;

	if (input_name)
		err = trace__replay(&trace);
	else
		err = trace__run(&trace, argc, argv);

out_close:
	if (output_name != NULL)
		fclose(trace.output);
out:
	return err;
}