1 /*
2 * Performance events core code:
3 *
4 * Copyright (C) 2008 Thomas Gleixner <tglx@linutronix.de>
5 * Copyright (C) 2008-2011 Red Hat, Inc., Ingo Molnar
6 * Copyright (C) 2008-2011 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
7 * Copyright © 2009 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
8 *
9 * For licensing details see kernel-base/COPYING
10 */
11
12 #include <linux/fs.h>
13 #include <linux/mm.h>
14 #include <linux/cpu.h>
15 #include <linux/smp.h>
16 #include <linux/idr.h>
17 #include <linux/file.h>
18 #include <linux/poll.h>
19 #include <linux/slab.h>
20 #include <linux/hash.h>
21 #include <linux/tick.h>
22 #include <linux/sysfs.h>
23 #include <linux/dcache.h>
24 #include <linux/percpu.h>
25 #include <linux/ptrace.h>
26 #include <linux/reboot.h>
27 #include <linux/vmstat.h>
28 #include <linux/device.h>
29 #include <linux/export.h>
30 #include <linux/vmalloc.h>
31 #include <linux/hardirq.h>
32 #include <linux/rculist.h>
33 #include <linux/uaccess.h>
34 #include <linux/syscalls.h>
35 #include <linux/anon_inodes.h>
36 #include <linux/kernel_stat.h>
37 #include <linux/perf_event.h>
38 #include <linux/ftrace_event.h>
39 #include <linux/hw_breakpoint.h>
40 #include <linux/mm_types.h>
41 #include <linux/cgroup.h>
42
43 #include "internal.h"
44
45 #include <asm/irq_regs.h>
46
47 struct remote_function_call {
48 struct task_struct *p;
49 int (*func)(void *info);
50 void *info;
51 int ret;
52 };
53
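/*
 * IPI callback used by task_function_call() and cpu_function_call().
 *
 * When a target task was given, bail out and leave tfc->ret at -EAGAIN
 * if that task is no longer current on this CPU; the caller will retry.
 */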
54 static void remote_function(void *data)
55 {
56 struct remote_function_call *tfc = data;
57 struct task_struct *p = tfc->p;
58
59 if (p) {
60 tfc->ret = -EAGAIN;
61 if (task_cpu(p) != smp_processor_id() || !task_curr(p))
62 return;
63 }
64
65 tfc->ret = tfc->func(tfc->info);
66 }
67
68 /**
69 * task_function_call - call a function on the cpu on which a task runs
70 * @p: the task to evaluate
71 * @func: the function to be called
72 * @info: the function call argument
73 *
74 * Calls the function @func when the task is currently running. This might
75 * be on the current CPU, which just calls the function directly.
76 *
77 * returns: @func return value, or
78 * -ESRCH - when the process isn't running
79 * -EAGAIN - when the process moved away
80 */
81 static int
82 task_function_call(struct task_struct *p, int (*func) (void *info), void *info)
83 {
84 struct remote_function_call data = {
85 .p = p,
86 .func = func,
87 .info = info,
88 .ret = -ESRCH, /* No such (running) process */
89 };
90
91 if (task_curr(p))
92 smp_call_function_single(task_cpu(p), remote_function, &data, 1);
93
94 return data.ret;
95 }
96
97 /**
98 * cpu_function_call - call a function on the cpu
 * @cpu: the cpu on which to call the function
99 * @func: the function to be called
100 * @info: the function call argument
101 *
102 * Calls the function @func on the remote cpu.
103 *
104 * returns: @func return value or -ENXIO when the cpu is offline
105 */
106 static int cpu_function_call(int cpu, int (*func) (void *info), void *info)
107 {
108 struct remote_function_call data = {
109 .p = NULL,
110 .func = func,
111 .info = info,
112 .ret = -ENXIO, /* No such CPU */
113 };
114
115 smp_call_function_single(cpu, remote_function, &data, 1);
116
117 return data.ret;
118 }
119
120 #define PERF_FLAG_ALL (PERF_FLAG_FD_NO_GROUP |\
121 PERF_FLAG_FD_OUTPUT |\
122 PERF_FLAG_PID_CGROUP)
123
124 /*
125 * branch priv levels that need permission checks
126 */
127 #define PERF_SAMPLE_BRANCH_PERM_PLM \
128 (PERF_SAMPLE_BRANCH_KERNEL |\
129 PERF_SAMPLE_BRANCH_HV)
130
131 enum event_type_t {
132 EVENT_FLEXIBLE = 0x1,
133 EVENT_PINNED = 0x2,
134 EVENT_ALL = EVENT_FLEXIBLE | EVENT_PINNED,
135 };
136
137 /*
138 * perf_sched_events : >0 events exist
139 * perf_cgroup_events: >0 per-cpu cgroup events exist on this cpu
140 */
141 struct static_key_deferred perf_sched_events __read_mostly;
142 static DEFINE_PER_CPU(atomic_t, perf_cgroup_events);
143 static DEFINE_PER_CPU(atomic_t, perf_branch_stack_events);
144
145 static atomic_t nr_mmap_events __read_mostly;
146 static atomic_t nr_comm_events __read_mostly;
147 static atomic_t nr_task_events __read_mostly;
148
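/*
 * List of registered PMUs; writers hold pmus_lock, readers walk the list
 * under pmus_srcu (or RCU).
 */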
149 static LIST_HEAD(pmus);
150 static DEFINE_MUTEX(pmus_lock);
151 static struct srcu_struct pmus_srcu;
152
153 /*
154 * perf event paranoia level:
155 * -1 - not paranoid at all
156 * 0 - disallow raw tracepoint access for unpriv
157 * 1 - disallow cpu events for unpriv
158 * 2 - disallow kernel profiling for unpriv
159 */
160 int sysctl_perf_event_paranoid __read_mostly = 1;
161
162 /* Minimum for 512 kiB + 1 user control page */
163 int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */
164
165 /*
166 * max perf event sample rate
167 */
168 #define DEFAULT_MAX_SAMPLE_RATE 100000
169 #define DEFAULT_SAMPLE_PERIOD_NS (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
170 #define DEFAULT_CPU_TIME_MAX_PERCENT 25
171
172 int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
173
174 static int max_samples_per_tick __read_mostly = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
175 static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
176
177 static atomic_t perf_sample_allowed_ns __read_mostly =
178 ATOMIC_INIT(DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
179
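/*
 * Recompute the per-sample time budget: perf_sample_allowed_ns is the
 * fraction of each sample period (in ns) that perf may spend in its
 * handlers, as capped by sysctl_perf_cpu_time_max_percent.
 */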
180 void update_perf_cpu_limits(void)
181 {
182 u64 tmp = perf_sample_period_ns;
183
184 tmp *= sysctl_perf_cpu_time_max_percent;
185 do_div(tmp, 100);
186 atomic_set(&perf_sample_allowed_ns, tmp);
187 }
188
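/*
 * sysctl handler for kernel.perf_event_max_sample_rate: on a write,
 * recompute the per-tick sample budget, the sample period and the
 * derived CPU time limit.
 */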
189 int perf_proc_update_handler(struct ctl_table *table, int write,
190 void __user *buffer, size_t *lenp,
191 loff_t *ppos)
192 {
193 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
194
195 if (ret || !write)
196 return ret;
197
198 max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
199 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
200 update_perf_cpu_limits();
201
202 return 0;
203 }
204
205 int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
206
207 int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
208 void __user *buffer, size_t *lenp,
209 loff_t *ppos)
210 {
211 int ret = proc_dointvec(table, write, buffer, lenp, ppos);
212
213 if (ret || !write)
214 return ret;
215
216 update_perf_cpu_limits();
217
218 return 0;
219 }
220
221 /*
222 * perf samples are done in some very critical code paths (NMIs).
223 * If they take too much CPU time, the system can lock up and not
224 * get any real work done. This will drop the sample rate when
225 * we detect that events are taking too long.
226 */
227 #define NR_ACCUMULATED_SAMPLES 128
228 DEFINE_PER_CPU(u64, running_sample_length);
229
230 void perf_sample_event_took(u64 sample_len_ns)
231 {
232 u64 avg_local_sample_len;
233 u64 local_samples_len;
234
235 if (atomic_read(&perf_sample_allowed_ns) == 0)
236 return;
237
238 /* decay the counter by 1 average sample */
239 local_samples_len = __get_cpu_var(running_sample_length);
240 local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
241 local_samples_len += sample_len_ns;
242 __get_cpu_var(running_sample_length) = local_samples_len;
243
244 /*
245 * note: this will be biased artificially low until we have
246 * seen NR_ACCUMULATED_SAMPLES. Doing it this way keeps us
247 * from having to maintain a count.
248 */
249 avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
250
251 if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
252 return;
253
254 if (max_samples_per_tick <= 1)
255 return;
256
257 max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
258 sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
259 perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
260
261 printk_ratelimited(KERN_WARNING
262 "perf samples too long (%lld > %d), lowering "
263 "kernel.perf_event_max_sample_rate to %d\n",
264 avg_local_sample_len,
265 atomic_read(&perf_sample_allowed_ns),
266 sysctl_perf_event_sample_rate);
267
268 update_perf_cpu_limits();
269 }
270
271 static atomic64_t perf_event_id;
272
273 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
274 enum event_type_t event_type);
275
276 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
277 enum event_type_t event_type,
278 struct task_struct *task);
279
280 static void update_context_time(struct perf_event_context *ctx);
281 static u64 perf_event_time(struct perf_event *event);
282
283 void __weak perf_event_print_debug(void) { }
284
285 extern __weak const char *perf_pmu_name(void)
286 {
287 return "pmu";
288 }
289
290 static inline u64 perf_clock(void)
291 {
292 return local_clock();
293 }
294
295 static inline struct perf_cpu_context *
296 __get_cpu_context(struct perf_event_context *ctx)
297 {
298 return this_cpu_ptr(ctx->pmu->pmu_cpu_context);
299 }
300
301 static void perf_ctx_lock(struct perf_cpu_context *cpuctx,
302 struct perf_event_context *ctx)
303 {
304 raw_spin_lock(&cpuctx->ctx.lock);
305 if (ctx)
306 raw_spin_lock(&ctx->lock);
307 }
308
309 static void perf_ctx_unlock(struct perf_cpu_context *cpuctx,
310 struct perf_event_context *ctx)
311 {
312 if (ctx)
313 raw_spin_unlock(&ctx->lock);
314 raw_spin_unlock(&cpuctx->ctx.lock);
315 }
316
317 #ifdef CONFIG_CGROUP_PERF
318
319 /*
320 * perf_cgroup_info keeps track of time_enabled for a cgroup.
321 * This is a per-cpu dynamically allocated data structure.
322 */
323 struct perf_cgroup_info {
324 u64 time;
325 u64 timestamp;
326 };
327
328 struct perf_cgroup {
329 struct cgroup_subsys_state css;
330 struct perf_cgroup_info __percpu *info;
331 };
332
333 /*
334 * Must ensure cgroup is pinned (css_get) before calling
335 * this function. In other words, we cannot call this function
336 * if there is no cgroup event for the current CPU context.
337 */
338 static inline struct perf_cgroup *
339 perf_cgroup_from_task(struct task_struct *task)
340 {
341 return container_of(task_subsys_state(task, perf_subsys_id),
342 struct perf_cgroup, css);
343 }
344
345 static inline bool
346 perf_cgroup_match(struct perf_event *event)
347 {
348 struct perf_event_context *ctx = event->ctx;
349 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
350
351 /* @event doesn't care about cgroup */
352 if (!event->cgrp)
353 return true;
354
355 /* wants specific cgroup scope but @cpuctx isn't associated with any */
356 if (!cpuctx->cgrp)
357 return false;
358
359 /*
360 * Cgroup scoping is recursive. An event enabled for a cgroup is
361 * also enabled for all its descendant cgroups. If @cpuctx's
362 * cgroup is a descendant of @event's (the test covers identity
363 * case), it's a match.
364 */
365 return cgroup_is_descendant(cpuctx->cgrp->css.cgroup,
366 event->cgrp->css.cgroup);
367 }
368
369 static inline bool perf_tryget_cgroup(struct perf_event *event)
370 {
371 return css_tryget(&event->cgrp->css);
372 }
373
374 static inline void perf_put_cgroup(struct perf_event *event)
375 {
376 css_put(&event->cgrp->css);
377 }
378
379 static inline void perf_detach_cgroup(struct perf_event *event)
380 {
381 perf_put_cgroup(event);
382 event->cgrp = NULL;
383 }
384
385 static inline int is_cgroup_event(struct perf_event *event)
386 {
387 return event->cgrp != NULL;
388 }
389
390 static inline u64 perf_cgroup_event_time(struct perf_event *event)
391 {
392 struct perf_cgroup_info *t;
393
394 t = per_cpu_ptr(event->cgrp->info, event->cpu);
395 return t->time;
396 }
397
398 static inline void __update_cgrp_time(struct perf_cgroup *cgrp)
399 {
400 struct perf_cgroup_info *info;
401 u64 now;
402
403 now = perf_clock();
404
405 info = this_cpu_ptr(cgrp->info);
406
407 info->time += now - info->timestamp;
408 info->timestamp = now;
409 }
410
411 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
412 {
413 struct perf_cgroup *cgrp_out = cpuctx->cgrp;
414 if (cgrp_out)
415 __update_cgrp_time(cgrp_out);
416 }
417
418 static inline void update_cgrp_time_from_event(struct perf_event *event)
419 {
420 struct perf_cgroup *cgrp;
421
422 /*
423 * ensure we access cgroup data only when needed and
424 * when we know the cgroup is pinned (css_get)
425 */
426 if (!is_cgroup_event(event))
427 return;
428
429 cgrp = perf_cgroup_from_task(current);
430 /*
431 * Do not update time when cgroup is not active
432 */
433 if (cgrp == event->cgrp)
434 __update_cgrp_time(event->cgrp);
435 }
436
437 static inline void
438 perf_cgroup_set_timestamp(struct task_struct *task,
439 struct perf_event_context *ctx)
440 {
441 struct perf_cgroup *cgrp;
442 struct perf_cgroup_info *info;
443
444 /*
445 * ctx->lock held by caller
446 * ensure we do not access cgroup data
447 * unless we have the cgroup pinned (css_get)
448 */
449 if (!task || !ctx->nr_cgroups)
450 return;
451
452 cgrp = perf_cgroup_from_task(task);
453 info = this_cpu_ptr(cgrp->info);
454 info->timestamp = ctx->timestamp;
455 }
456
457 #define PERF_CGROUP_SWOUT 0x1 /* cgroup switch out every event */
458 #define PERF_CGROUP_SWIN 0x2 /* cgroup switch in events based on task */
459
460 /*
461 * reschedule events based on the cgroup constraint of task.
462 *
463 * mode SWOUT : schedule out everything
464 * mode SWIN : schedule in based on cgroup for next
465 */
466 void perf_cgroup_switch(struct task_struct *task, int mode)
467 {
468 struct perf_cpu_context *cpuctx;
469 struct pmu *pmu;
470 unsigned long flags;
471
472 /*
473 * disable interrupts to avoid getting nr_cgroups
474 * changes via __perf_event_disable(). Also
475 * avoids preemption.
476 */
477 local_irq_save(flags);
478
479 /*
480 * we reschedule only in the presence of cgroup
481 * constrained events.
482 */
483 rcu_read_lock();
484
485 list_for_each_entry_rcu(pmu, &pmus, entry) {
486 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
487 if (cpuctx->unique_pmu != pmu)
488 continue; /* ensure we process each cpuctx once */
489
490 /*
491 * perf_cgroup_events says at least one
492 * context on this CPU has cgroup events.
493 *
494 * ctx->nr_cgroups reports the number of cgroup
495 * events for a context.
496 */
497 if (cpuctx->ctx.nr_cgroups > 0) {
498 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
499 perf_pmu_disable(cpuctx->ctx.pmu);
500
501 if (mode & PERF_CGROUP_SWOUT) {
502 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
503 /*
504 * must not be done before ctxswout due
505 * to event_filter_match() in event_sched_out()
506 */
507 cpuctx->cgrp = NULL;
508 }
509
510 if (mode & PERF_CGROUP_SWIN) {
511 WARN_ON_ONCE(cpuctx->cgrp);
512 /*
513 * set cgrp before ctxsw in to allow
514 * event_filter_match() to not have to pass
515 * task around
516 */
517 cpuctx->cgrp = perf_cgroup_from_task(task);
518 cpu_ctx_sched_in(cpuctx, EVENT_ALL, task);
519 }
520 perf_pmu_enable(cpuctx->ctx.pmu);
521 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
522 }
523 }
524
525 rcu_read_unlock();
526
527 local_irq_restore(flags);
528 }
529
530 static inline void perf_cgroup_sched_out(struct task_struct *task,
531 struct task_struct *next)
532 {
533 struct perf_cgroup *cgrp1;
534 struct perf_cgroup *cgrp2 = NULL;
535
536 /*
537 * we come here when we know perf_cgroup_events > 0
538 */
539 cgrp1 = perf_cgroup_from_task(task);
540
541 /*
542 * next is NULL when called from perf_event_enable_on_exec()
543 * that will systematically cause a cgroup_switch()
544 */
545 if (next)
546 cgrp2 = perf_cgroup_from_task(next);
547
548 /*
549 * only schedule out current cgroup events if we know
550 * that we are switching to a different cgroup. Otherwise,
551 * do not touch the cgroup events.
552 */
553 if (cgrp1 != cgrp2)
554 perf_cgroup_switch(task, PERF_CGROUP_SWOUT);
555 }
556
557 static inline void perf_cgroup_sched_in(struct task_struct *prev,
558 struct task_struct *task)
559 {
560 struct perf_cgroup *cgrp1;
561 struct perf_cgroup *cgrp2 = NULL;
562
563 /*
564 * we come here when we know perf_cgroup_events > 0
565 */
566 cgrp1 = perf_cgroup_from_task(task);
567
568 /* prev can never be NULL */
569 cgrp2 = perf_cgroup_from_task(prev);
570
571 /*
572 * only need to schedule in cgroup events if we are changing
573 * cgroup during ctxsw. Cgroup events were not scheduled
574 * out during the previous ctxsw if that was not the case.
575 */
576 if (cgrp1 != cgrp2)
577 perf_cgroup_switch(task, PERF_CGROUP_SWIN);
578 }
579
580 static inline int perf_cgroup_connect(int fd, struct perf_event *event,
581 struct perf_event_attr *attr,
582 struct perf_event *group_leader)
583 {
584 struct perf_cgroup *cgrp;
585 struct cgroup_subsys_state *css;
586 struct fd f = fdget(fd);
587 int ret = 0;
588
589 if (!f.file)
590 return -EBADF;
591
592 css = cgroup_css_from_dir(f.file, perf_subsys_id);
593 if (IS_ERR(css)) {
594 ret = PTR_ERR(css);
595 goto out;
596 }
597
598 cgrp = container_of(css, struct perf_cgroup, css);
599 event->cgrp = cgrp;
600
601 /* must be done before we fput() the file */
602 if (!perf_tryget_cgroup(event)) {
603 event->cgrp = NULL;
604 ret = -ENOENT;
605 goto out;
606 }
607
608 /*
609 * all events in a group must monitor
610 * the same cgroup because a task belongs
611 * to only one perf cgroup at a time
612 */
613 if (group_leader && group_leader->cgrp != cgrp) {
614 perf_detach_cgroup(event);
615 ret = -EINVAL;
616 }
617 out:
618 fdput(f);
619 return ret;
620 }
621
622 static inline void
623 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
624 {
625 struct perf_cgroup_info *t;
626 t = per_cpu_ptr(event->cgrp->info, event->cpu);
627 event->shadow_ctx_time = now - t->timestamp;
628 }
629
630 static inline void
631 perf_cgroup_defer_enabled(struct perf_event *event)
632 {
633 /*
634 * when the current task's perf cgroup does not match
635 * the event's, we need to remember to call the
636 * perf_mark_enable() function the first time a task with
637 * a matching perf cgroup is scheduled in.
638 */
639 if (is_cgroup_event(event) && !perf_cgroup_match(event))
640 event->cgrp_defer_enabled = 1;
641 }
642
643 static inline void
644 perf_cgroup_mark_enabled(struct perf_event *event,
645 struct perf_event_context *ctx)
646 {
647 struct perf_event *sub;
648 u64 tstamp = perf_event_time(event);
649
650 if (!event->cgrp_defer_enabled)
651 return;
652
653 event->cgrp_defer_enabled = 0;
654
655 event->tstamp_enabled = tstamp - event->total_time_enabled;
656 list_for_each_entry(sub, &event->sibling_list, group_entry) {
657 if (sub->state >= PERF_EVENT_STATE_INACTIVE) {
658 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
659 sub->cgrp_defer_enabled = 0;
660 }
661 }
662 }
663 #else /* !CONFIG_CGROUP_PERF */
664
665 static inline bool
666 perf_cgroup_match(struct perf_event *event)
667 {
668 return true;
669 }
670
671 static inline void perf_detach_cgroup(struct perf_event *event)
672 {}
673
674 static inline int is_cgroup_event(struct perf_event *event)
675 {
676 return 0;
677 }
678
679 static inline u64 perf_cgroup_event_cgrp_time(struct perf_event *event)
680 {
681 return 0;
682 }
683
684 static inline void update_cgrp_time_from_event(struct perf_event *event)
685 {
686 }
687
688 static inline void update_cgrp_time_from_cpuctx(struct perf_cpu_context *cpuctx)
689 {
690 }
691
692 static inline void perf_cgroup_sched_out(struct task_struct *task,
693 struct task_struct *next)
694 {
695 }
696
697 static inline void perf_cgroup_sched_in(struct task_struct *prev,
698 struct task_struct *task)
699 {
700 }
701
702 static inline int perf_cgroup_connect(pid_t pid, struct perf_event *event,
703 struct perf_event_attr *attr,
704 struct perf_event *group_leader)
705 {
706 return -EINVAL;
707 }
708
709 static inline void
710 perf_cgroup_set_timestamp(struct task_struct *task,
711 struct perf_event_context *ctx)
712 {
713 }
714
715 void
716 perf_cgroup_switch(struct task_struct *task, struct task_struct *next)
717 {
718 }
719
720 static inline void
721 perf_cgroup_set_shadow_time(struct perf_event *event, u64 now)
722 {
723 }
724
725 static inline u64 perf_cgroup_event_time(struct perf_event *event)
726 {
727 return 0;
728 }
729
730 static inline void
731 perf_cgroup_defer_enabled(struct perf_event *event)
732 {
733 }
734
735 static inline void
736 perf_cgroup_mark_enabled(struct perf_event *event,
737 struct perf_event_context *ctx)
738 {
739 }
740 #endif
741
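/*
 * perf_pmu_disable()/perf_pmu_enable() nest: a per-cpu count ensures that
 * only the outermost disable and the matching final enable reach the PMU
 * callbacks.
 */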
742 void perf_pmu_disable(struct pmu *pmu)
743 {
744 int *count = this_cpu_ptr(pmu->pmu_disable_count);
745 if (!(*count)++)
746 pmu->pmu_disable(pmu);
747 }
748
749 void perf_pmu_enable(struct pmu *pmu)
750 {
751 int *count = this_cpu_ptr(pmu->pmu_disable_count);
752 if (!--(*count))
753 pmu->pmu_enable(pmu);
754 }
755
756 static DEFINE_PER_CPU(struct list_head, rotation_list);
757
758 /*
759 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
760 * because they're strictly cpu affine and rotate_start is called with IRQs
761 * disabled, while rotate_context is called from IRQ context.
762 */
763 static void perf_pmu_rotate_start(struct pmu *pmu)
764 {
765 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
766 struct list_head *head = &__get_cpu_var(rotation_list);
767
768 WARN_ON(!irqs_disabled());
769
770 if (list_empty(&cpuctx->rotation_list)) {
771 int was_empty = list_empty(head);
772 list_add(&cpuctx->rotation_list, head);
773 if (was_empty)
774 tick_nohz_full_kick();
775 }
776 }
777
778 static void get_ctx(struct perf_event_context *ctx)
779 {
780 WARN_ON(!atomic_inc_not_zero(&ctx->refcount));
781 }
782
783 static void put_ctx(struct perf_event_context *ctx)
784 {
785 if (atomic_dec_and_test(&ctx->refcount)) {
786 if (ctx->parent_ctx)
787 put_ctx(ctx->parent_ctx);
788 if (ctx->task)
789 put_task_struct(ctx->task);
790 kfree_rcu(ctx, rcu_head);
791 }
792 }
793
794 static void unclone_ctx(struct perf_event_context *ctx)
795 {
796 if (ctx->parent_ctx) {
797 put_ctx(ctx->parent_ctx);
798 ctx->parent_ctx = NULL;
799 }
800 }
801
802 static u32 perf_event_pid(struct perf_event *event, struct task_struct *p)
803 {
804 /*
805 * only top level events have the pid namespace they were created in
806 */
807 if (event->parent)
808 event = event->parent;
809
810 return task_tgid_nr_ns(p, event->ns);
811 }
812
813 static u32 perf_event_tid(struct perf_event *event, struct task_struct *p)
814 {
815 /*
816 * only top level events have the pid namespace they were created in
817 */
818 if (event->parent)
819 event = event->parent;
820
821 return task_pid_nr_ns(p, event->ns);
822 }
823
824 /*
825 * If we inherit events we want to return the parent event id
826 * to userspace.
827 */
828 static u64 primary_event_id(struct perf_event *event)
829 {
830 u64 id = event->id;
831
832 if (event->parent)
833 id = event->parent->id;
834
835 return id;
836 }
837
838 /*
839 * Get the perf_event_context for a task and lock it.
840 * This has to cope with the fact that until it is locked,
841 * the context could get moved to another task.
842 */
843 static struct perf_event_context *
844 perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
845 {
846 struct perf_event_context *ctx;
847
848 retry:
849 /*
850 * One of the few rules of preemptible RCU is that one cannot do
851 * rcu_read_unlock() while holding a scheduler (or nested) lock when
852 * part of the read side critical section was preemptible -- see
853 * rcu_read_unlock_special().
854 *
855 * Since ctx->lock nests under rq->lock we must ensure the entire read
856 * side critical section is non-preemptible.
857 */
858 preempt_disable();
859 rcu_read_lock();
860 ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
861 if (ctx) {
862 /*
863 * If this context is a clone of another, it might
864 * get swapped for another underneath us by
865 * perf_event_task_sched_out, though the
866 * rcu_read_lock() protects us from any context
867 * getting freed. Lock the context and check if it
868 * got swapped before we could get the lock, and retry
869 * if so. If we locked the right context, then it
870 * can't get swapped on us any more.
871 */
872 raw_spin_lock_irqsave(&ctx->lock, *flags);
873 if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
874 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
875 rcu_read_unlock();
876 preempt_enable();
877 goto retry;
878 }
879
880 if (!atomic_inc_not_zero(&ctx->refcount)) {
881 raw_spin_unlock_irqrestore(&ctx->lock, *flags);
882 ctx = NULL;
883 }
884 }
885 rcu_read_unlock();
886 preempt_enable();
887 return ctx;
888 }
889
890 /*
891 * Get the context for a task and increment its pin_count so it
892 * can't get swapped to another task. This also increments its
893 * reference count so that the context can't get freed.
894 */
895 static struct perf_event_context *
896 perf_pin_task_context(struct task_struct *task, int ctxn)
897 {
898 struct perf_event_context *ctx;
899 unsigned long flags;
900
901 ctx = perf_lock_task_context(task, ctxn, &flags);
902 if (ctx) {
903 ++ctx->pin_count;
904 raw_spin_unlock_irqrestore(&ctx->lock, flags);
905 }
906 return ctx;
907 }
908
909 static void perf_unpin_context(struct perf_event_context *ctx)
910 {
911 unsigned long flags;
912
913 raw_spin_lock_irqsave(&ctx->lock, flags);
914 --ctx->pin_count;
915 raw_spin_unlock_irqrestore(&ctx->lock, flags);
916 }
917
918 /*
919 * Update the record of the current time in a context.
920 */
921 static void update_context_time(struct perf_event_context *ctx)
922 {
923 u64 now = perf_clock();
924
925 ctx->time += now - ctx->timestamp;
926 ctx->timestamp = now;
927 }
928
929 static u64 perf_event_time(struct perf_event *event)
930 {
931 struct perf_event_context *ctx = event->ctx;
932
933 if (is_cgroup_event(event))
934 return perf_cgroup_event_time(event);
935
936 return ctx ? ctx->time : 0;
937 }
938
939 /*
940 * Update the total_time_enabled and total_time_running fields for an event.
941 * The caller of this function needs to hold the ctx->lock.
942 */
943 static void update_event_times(struct perf_event *event)
944 {
945 struct perf_event_context *ctx = event->ctx;
946 u64 run_end;
947
948 if (event->state < PERF_EVENT_STATE_INACTIVE ||
949 event->group_leader->state < PERF_EVENT_STATE_INACTIVE)
950 return;
951 /*
952 * in cgroup mode, time_enabled represents
953 * the time the event was enabled AND active
954 * tasks were in the monitored cgroup. This is
955 * independent of the activity of the context as
956 * there may be a mix of cgroup and non-cgroup events.
957 *
958 * That is why we treat cgroup events differently
959 * here.
960 */
961 if (is_cgroup_event(event))
962 run_end = perf_cgroup_event_time(event);
963 else if (ctx->is_active)
964 run_end = ctx->time;
965 else
966 run_end = event->tstamp_stopped;
967
968 event->total_time_enabled = run_end - event->tstamp_enabled;
969
970 if (event->state == PERF_EVENT_STATE_INACTIVE)
971 run_end = event->tstamp_stopped;
972 else
973 run_end = perf_event_time(event);
974
975 event->total_time_running = run_end - event->tstamp_running;
976
977 }
978
979 /*
980 * Update total_time_enabled and total_time_running for all events in a group.
981 */
982 static void update_group_times(struct perf_event *leader)
983 {
984 struct perf_event *event;
985
986 update_event_times(leader);
987 list_for_each_entry(event, &leader->sibling_list, group_entry)
988 update_event_times(event);
989 }
990
991 static struct list_head *
992 ctx_group_list(struct perf_event *event, struct perf_event_context *ctx)
993 {
994 if (event->attr.pinned)
995 return &ctx->pinned_groups;
996 else
997 return &ctx->flexible_groups;
998 }
999
1000 /*
1001 * Add an event to the lists for its context.
1002 * Must be called with ctx->mutex and ctx->lock held.
1003 */
1004 static void
1005 list_add_event(struct perf_event *event, struct perf_event_context *ctx)
1006 {
1007 WARN_ON_ONCE(event->attach_state & PERF_ATTACH_CONTEXT);
1008 event->attach_state |= PERF_ATTACH_CONTEXT;
1009
1010 /*
1011 * If we're a stand-alone event or group leader, we go to the context
1012 * list; group events are kept attached to the group so that
1013 * perf_group_detach can, at all times, locate all siblings.
1014 */
1015 if (event->group_leader == event) {
1016 struct list_head *list;
1017
1018 if (is_software_event(event))
1019 event->group_flags |= PERF_GROUP_SOFTWARE;
1020
1021 list = ctx_group_list(event, ctx);
1022 list_add_tail(&event->group_entry, list);
1023 }
1024
1025 if (is_cgroup_event(event))
1026 ctx->nr_cgroups++;
1027
1028 if (has_branch_stack(event))
1029 ctx->nr_branch_stack++;
1030
1031 list_add_rcu(&event->event_entry, &ctx->event_list);
1032 if (!ctx->nr_events)
1033 perf_pmu_rotate_start(ctx->pmu);
1034 ctx->nr_events++;
1035 if (event->attr.inherit_stat)
1036 ctx->nr_stat++;
1037 }
1038
1039 /*
1040 * Initialize event state based on the perf_event_attr::disabled.
1041 */
1042 static inline void perf_event__state_init(struct perf_event *event)
1043 {
1044 event->state = event->attr.disabled ? PERF_EVENT_STATE_OFF :
1045 PERF_EVENT_STATE_INACTIVE;
1046 }
1047
1048 /*
1049 * Called at perf_event creation and when events are attached/detached from a
1050 * group.
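 *
 * Computes the number of bytes a read() on this event returns for its
 * read_format: one u64 value per counter (nr counters when a group leader
 * is read with PERF_FORMAT_GROUP), plus the optional time_enabled,
 * time_running and id fields.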
1051 */
1052 static void perf_event__read_size(struct perf_event *event)
1053 {
1054 int entry = sizeof(u64); /* value */
1055 int size = 0;
1056 int nr = 1;
1057
1058 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
1059 size += sizeof(u64);
1060
1061 if (event->attr.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
1062 size += sizeof(u64);
1063
1064 if (event->attr.read_format & PERF_FORMAT_ID)
1065 entry += sizeof(u64);
1066
1067 if (event->attr.read_format & PERF_FORMAT_GROUP) {
1068 nr += event->group_leader->nr_siblings;
1069 size += sizeof(u64);
1070 }
1071
1072 size += entry * nr;
1073 event->read_size = size;
1074 }
1075
1076 static void perf_event__header_size(struct perf_event *event)
1077 {
1078 struct perf_sample_data *data;
1079 u64 sample_type = event->attr.sample_type;
1080 u16 size = 0;
1081
1082 perf_event__read_size(event);
1083
1084 if (sample_type & PERF_SAMPLE_IP)
1085 size += sizeof(data->ip);
1086
1087 if (sample_type & PERF_SAMPLE_ADDR)
1088 size += sizeof(data->addr);
1089
1090 if (sample_type & PERF_SAMPLE_PERIOD)
1091 size += sizeof(data->period);
1092
1093 if (sample_type & PERF_SAMPLE_WEIGHT)
1094 size += sizeof(data->weight);
1095
1096 if (sample_type & PERF_SAMPLE_READ)
1097 size += event->read_size;
1098
1099 if (sample_type & PERF_SAMPLE_DATA_SRC)
1100 size += sizeof(data->data_src.val);
1101
1102 event->header_size = size;
1103 }
1104
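/*
 * Size of the sample_id fields (tid, time, id, stream_id, cpu) that get
 * appended to every record when attr.sample_id_all is set.
 */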
1105 static void perf_event__id_header_size(struct perf_event *event)
1106 {
1107 struct perf_sample_data *data;
1108 u64 sample_type = event->attr.sample_type;
1109 u16 size = 0;
1110
1111 if (sample_type & PERF_SAMPLE_TID)
1112 size += sizeof(data->tid_entry);
1113
1114 if (sample_type & PERF_SAMPLE_TIME)
1115 size += sizeof(data->time);
1116
1117 if (sample_type & PERF_SAMPLE_ID)
1118 size += sizeof(data->id);
1119
1120 if (sample_type & PERF_SAMPLE_STREAM_ID)
1121 size += sizeof(data->stream_id);
1122
1123 if (sample_type & PERF_SAMPLE_CPU)
1124 size += sizeof(data->cpu_entry);
1125
1126 event->id_header_size = size;
1127 }
1128
1129 static void perf_group_attach(struct perf_event *event)
1130 {
1131 struct perf_event *group_leader = event->group_leader, *pos;
1132
1133 /*
1134 * We can have double attach due to group movement in perf_event_open.
1135 */
1136 if (event->attach_state & PERF_ATTACH_GROUP)
1137 return;
1138
1139 event->attach_state |= PERF_ATTACH_GROUP;
1140
1141 if (group_leader == event)
1142 return;
1143
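	/*
	 * A group stays PERF_GROUP_SOFTWARE only while every member is a
	 * software event; attaching a hardware event clears the flag.
	 */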
1144 if (group_leader->group_flags & PERF_GROUP_SOFTWARE &&
1145 !is_software_event(event))
1146 group_leader->group_flags &= ~PERF_GROUP_SOFTWARE;
1147
1148 list_add_tail(&event->group_entry, &group_leader->sibling_list);
1149 group_leader->nr_siblings++;
1150
1151 perf_event__header_size(group_leader);
1152
1153 list_for_each_entry(pos, &group_leader->sibling_list, group_entry)
1154 perf_event__header_size(pos);
1155 }
1156
1157 /*
1158 * Remove an event from the lists for its context.
1159 * Must be called with ctx->mutex and ctx->lock held.
1160 */
1161 static void
1162 list_del_event(struct perf_event *event, struct perf_event_context *ctx)
1163 {
1164 struct perf_cpu_context *cpuctx;
1165 /*
1166 * We can have double detach due to exit/hot-unplug + close.
1167 */
1168 if (!(event->attach_state & PERF_ATTACH_CONTEXT))
1169 return;
1170
1171 event->attach_state &= ~PERF_ATTACH_CONTEXT;
1172
1173 if (is_cgroup_event(event)) {
1174 ctx->nr_cgroups--;
1175 cpuctx = __get_cpu_context(ctx);
1176 /*
1177 * if there are no more cgroup events
1178 * then clear cgrp to avoid a stale pointer
1179 * in update_cgrp_time_from_cpuctx()
1180 */
1181 if (!ctx->nr_cgroups)
1182 cpuctx->cgrp = NULL;
1183 }
1184
1185 if (has_branch_stack(event))
1186 ctx->nr_branch_stack--;
1187
1188 ctx->nr_events--;
1189 if (event->attr.inherit_stat)
1190 ctx->nr_stat--;
1191
1192 list_del_rcu(&event->event_entry);
1193
1194 if (event->group_leader == event)
1195 list_del_init(&event->group_entry);
1196
1197 update_group_times(event);
1198
1199 /*
1200 * If event was in error state, then keep it
1201 * that way, otherwise bogus counts will be
1202 * returned on read(). The only way to get out
1203 * of error state is by explicit re-enabling
1204 * of the event
1205 */
1206 if (event->state > PERF_EVENT_STATE_OFF)
1207 event->state = PERF_EVENT_STATE_OFF;
1208 }
1209
1210 static void perf_group_detach(struct perf_event *event)
1211 {
1212 struct perf_event *sibling, *tmp;
1213 struct list_head *list = NULL;
1214
1215 /*
1216 * We can have double detach due to exit/hot-unplug + close.
1217 */
1218 if (!(event->attach_state & PERF_ATTACH_GROUP))
1219 return;
1220
1221 event->attach_state &= ~PERF_ATTACH_GROUP;
1222
1223 /*
1224 * If this is a sibling, remove it from its group.
1225 */
1226 if (event->group_leader != event) {
1227 list_del_init(&event->group_entry);
1228 event->group_leader->nr_siblings--;
1229 goto out;
1230 }
1231
1232 if (!list_empty(&event->group_entry))
1233 list = &event->group_entry;
1234
1235 /*
1236 * If this was a group event with sibling events then
1237 * upgrade the siblings to singleton events by adding them
1238 * to whatever list we are on.
1239 * If this isn't on a list, make sure we still remove the sibling's
1240 * group_entry from this sibling_list; otherwise, when that sibling
1241 * is later deallocated, it will try to remove itself from this
1242 * sibling_list, which may well have been deallocated already,
1243 * resulting in a use-after-free.
1244 */
1245 list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
1246 if (list)
1247 list_move_tail(&sibling->group_entry, list);
1248 else
1249 list_del_init(&sibling->group_entry);
1250 sibling->group_leader = sibling;
1251
1252 /* Inherit group flags from the previous leader */
1253 sibling->group_flags = event->group_flags;
1254 }
1255
1256 out:
1257 perf_event__header_size(event->group_leader);
1258
1259 list_for_each_entry(tmp, &event->group_leader->sibling_list, group_entry)
1260 perf_event__header_size(tmp);
1261 }
1262
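/*
 * An event matches the current CPU when it is not bound to a particular
 * CPU (event->cpu == -1) or is bound to this one, and its cgroup
 * constraint (if any) is satisfied.
 */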
1263 static inline int
1264 event_filter_match(struct perf_event *event)
1265 {
1266 return (event->cpu == -1 || event->cpu == smp_processor_id())
1267 && perf_cgroup_match(event);
1268 }
1269
1270 static void
1271 event_sched_out(struct perf_event *event,
1272 struct perf_cpu_context *cpuctx,
1273 struct perf_event_context *ctx)
1274 {
1275 u64 tstamp = perf_event_time(event);
1276 u64 delta;
1277 /*
1278 * An event which could not be activated because of
1279 * filter mismatch still needs to have its timings
1280 * maintained, otherwise bogus information is returned
1281 * via read() for time_enabled, time_running:
1282 */
1283 if (event->state == PERF_EVENT_STATE_INACTIVE
1284 && !event_filter_match(event)) {
1285 delta = tstamp - event->tstamp_stopped;
1286 event->tstamp_running += delta;
1287 event->tstamp_stopped = tstamp;
1288 }
1289
1290 if (event->state != PERF_EVENT_STATE_ACTIVE)
1291 return;
1292
1293 event->state = PERF_EVENT_STATE_INACTIVE;
1294 if (event->pending_disable) {
1295 event->pending_disable = 0;
1296 event->state = PERF_EVENT_STATE_OFF;
1297 }
1298 event->tstamp_stopped = tstamp;
1299 event->pmu->del(event, 0);
1300 event->oncpu = -1;
1301
1302 if (!is_software_event(event))
1303 cpuctx->active_oncpu--;
1304 ctx->nr_active--;
1305 if (event->attr.freq && event->attr.sample_freq)
1306 ctx->nr_freq--;
1307 if (event->attr.exclusive || !cpuctx->active_oncpu)
1308 cpuctx->exclusive = 0;
1309 }
1310
1311 static void
1312 group_sched_out(struct perf_event *group_event,
1313 struct perf_cpu_context *cpuctx,
1314 struct perf_event_context *ctx)
1315 {
1316 struct perf_event *event;
1317 int state = group_event->state;
1318
1319 event_sched_out(group_event, cpuctx, ctx);
1320
1321 /*
1322 * Schedule out siblings (if any):
1323 */
1324 list_for_each_entry(event, &group_event->sibling_list, group_entry)
1325 event_sched_out(event, cpuctx, ctx);
1326
1327 if (state == PERF_EVENT_STATE_ACTIVE && group_event->attr.exclusive)
1328 cpuctx->exclusive = 0;
1329 }
1330
1331 struct remove_event {
1332 struct perf_event *event;
1333 bool detach_group;
1334 };
1335
1336 /*
1337 * Cross CPU call to remove a performance event
1338 *
1339 * We disable the event on the hardware level first. After that we
1340 * remove it from the context list.
1341 */
1342 static int __perf_remove_from_context(void *info)
1343 {
1344 struct remove_event *re = info;
1345 struct perf_event *event = re->event;
1346 struct perf_event_context *ctx = event->ctx;
1347 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1348
1349 raw_spin_lock(&ctx->lock);
1350 event_sched_out(event, cpuctx, ctx);
1351 if (re->detach_group)
1352 perf_group_detach(event);
1353 list_del_event(event, ctx);
1354 if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
1355 ctx->is_active = 0;
1356 cpuctx->task_ctx = NULL;
1357 }
1358 raw_spin_unlock(&ctx->lock);
1359
1360 return 0;
1361 }
1362
1363
1364 /*
1365 * Remove the event from a task's (or a CPU's) list of events.
1366 *
1367 * CPU events are removed with a smp call. For task events we only
1368 * call when the task is on a CPU.
1369 *
1370 * If event->ctx is a cloned context, callers must make sure that
1371 * every task struct that event->ctx->task could possibly point to
1372 * remains valid. This is OK when called from perf_release since
1373 * that only calls us on the top-level context, which can't be a clone.
1374 * When called from perf_event_exit_task, it's OK because the
1375 * context has been detached from its task.
1376 */
1377 static void perf_remove_from_context(struct perf_event *event, bool detach_group)
1378 {
1379 struct perf_event_context *ctx = event->ctx;
1380 struct task_struct *task = ctx->task;
1381 struct remove_event re = {
1382 .event = event,
1383 .detach_group = detach_group,
1384 };
1385
1386 lockdep_assert_held(&ctx->mutex);
1387
1388 if (!task) {
1389 /*
1390 * Per cpu events are removed via an smp call and
1391 * the removal is always successful.
1392 */
1393 cpu_function_call(event->cpu, __perf_remove_from_context, &re);
1394 return;
1395 }
1396
1397 retry:
1398 if (!task_function_call(task, __perf_remove_from_context, &re))
1399 return;
1400
1401 raw_spin_lock_irq(&ctx->lock);
1402 /*
1403 * If we failed to find a running task, but find the context active now
1404 * that we've acquired the ctx->lock, retry.
1405 */
1406 if (ctx->is_active) {
1407 raw_spin_unlock_irq(&ctx->lock);
1408 /*
1409 * Reload the task pointer, it might have been changed by
1410 * a concurrent perf_event_context_sched_out().
1411 */
1412 task = ctx->task;
1413 goto retry;
1414 }
1415
1416 /*
1417 * Since the task isn't running, it's safe to remove the event; our
1418 * holding the ctx->lock ensures the task won't get scheduled in.
1419 */
1420 if (detach_group)
1421 perf_group_detach(event);
1422 list_del_event(event, ctx);
1423 raw_spin_unlock_irq(&ctx->lock);
1424 }
1425
1426 /*
1427 * Cross CPU call to disable a performance event
1428 */
1429 int __perf_event_disable(void *info)
1430 {
1431 struct perf_event *event = info;
1432 struct perf_event_context *ctx = event->ctx;
1433 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1434
1435 /*
1436 * If this is a per-task event, need to check whether this
1437 * event's task is the current task on this cpu.
1438 *
1439 * Can trigger due to concurrent perf_event_context_sched_out()
1440 * flipping contexts around.
1441 */
1442 if (ctx->task && cpuctx->task_ctx != ctx)
1443 return -EINVAL;
1444
1445 raw_spin_lock(&ctx->lock);
1446
1447 /*
1448 * If the event is on, turn it off.
1449 * If it is in error state, leave it in error state.
1450 */
1451 if (event->state >= PERF_EVENT_STATE_INACTIVE) {
1452 update_context_time(ctx);
1453 update_cgrp_time_from_event(event);
1454 update_group_times(event);
1455 if (event == event->group_leader)
1456 group_sched_out(event, cpuctx, ctx);
1457 else
1458 event_sched_out(event, cpuctx, ctx);
1459 event->state = PERF_EVENT_STATE_OFF;
1460 }
1461
1462 raw_spin_unlock(&ctx->lock);
1463
1464 return 0;
1465 }
1466
1467 /*
1468 * Disable an event.
1469 *
1470 * If event->ctx is a cloned context, callers must make sure that
1471 * every task struct that event->ctx->task could possibly point to
1472 * remains valid. This condition is satisfied when called through
1473 * perf_event_for_each_child or perf_event_for_each because they
1474 * hold the top-level event's child_mutex, so any descendant that
1475 * goes to exit will block in sync_child_event.
1476 * When called from perf_pending_event it's OK because event->ctx
1477 * is the current context on this CPU and preemption is disabled,
1478 * hence we can't get into perf_event_task_sched_out for this context.
1479 */
1480 void perf_event_disable(struct perf_event *event)
1481 {
1482 struct perf_event_context *ctx = event->ctx;
1483 struct task_struct *task = ctx->task;
1484
1485 if (!task) {
1486 /*
1487 * Disable the event on the cpu that it's on
1488 */
1489 cpu_function_call(event->cpu, __perf_event_disable, event);
1490 return;
1491 }
1492
1493 retry:
1494 if (!task_function_call(task, __perf_event_disable, event))
1495 return;
1496
1497 raw_spin_lock_irq(&ctx->lock);
1498 /*
1499 * If the event is still active, we need to retry the cross-call.
1500 */
1501 if (event->state == PERF_EVENT_STATE_ACTIVE) {
1502 raw_spin_unlock_irq(&ctx->lock);
1503 /*
1504 * Reload the task pointer, it might have been changed by
1505 * a concurrent perf_event_context_sched_out().
1506 */
1507 task = ctx->task;
1508 goto retry;
1509 }
1510
1511 /*
1512 * Since we have the lock this context can't be scheduled
1513 * in, so we can change the state safely.
1514 */
1515 if (event->state == PERF_EVENT_STATE_INACTIVE) {
1516 update_group_times(event);
1517 event->state = PERF_EVENT_STATE_OFF;
1518 }
1519 raw_spin_unlock_irq(&ctx->lock);
1520 }
1521 EXPORT_SYMBOL_GPL(perf_event_disable);
1522
1523 static void perf_set_shadow_time(struct perf_event *event,
1524 struct perf_event_context *ctx,
1525 u64 tstamp)
1526 {
1527 /*
1528 * use the correct time source for the time snapshot
1529 *
1530 * We could get by without this by leveraging the
1531 * fact that to get to this function, the caller
1532 * has most likely already called update_context_time()
1533 * and update_cgrp_time_xx() and thus both timestamps
1534 * are identical (or very close). Given that tstamp is
1535 * already adjusted for cgroup, we could say that:
1536 * tstamp - ctx->timestamp
1537 * is equivalent to
1538 * tstamp - cgrp->timestamp.
1539 *
1540 * Then, in perf_output_read(), the calculation would
1541 * work with no changes because:
1542 * - event is guaranteed scheduled in
1543 * - no scheduled out in between
1544 * - thus the timestamp would be the same
1545 *
1546 * But this is a bit hairy.
1547 *
1548 * So instead, we have an explicit cgroup call to remain
1549 * within the time source all along. We believe it
1550 * is cleaner and simpler to understand.
1551 */
1552 if (is_cgroup_event(event))
1553 perf_cgroup_set_shadow_time(event, tstamp);
1554 else
1555 event->shadow_ctx_time = tstamp - ctx->timestamp;
1556 }
1557
1558 #define MAX_INTERRUPTS (~0ULL)
1559
1560 static void perf_log_throttle(struct perf_event *event, int enable);
1561
1562 static int
1563 event_sched_in(struct perf_event *event,
1564 struct perf_cpu_context *cpuctx,
1565 struct perf_event_context *ctx)
1566 {
1567 u64 tstamp = perf_event_time(event);
1568
1569 if (event->state <= PERF_EVENT_STATE_OFF)
1570 return 0;
1571
1572 event->state = PERF_EVENT_STATE_ACTIVE;
1573 event->oncpu = smp_processor_id();
1574
1575 /*
1576 * Unthrottle events, since we scheduled we might have missed several
1577 * ticks already, also for a heavily scheduling task there is little
1578 * guarantee it'll get a tick in a timely manner.
1579 */
1580 if (unlikely(event->hw.interrupts == MAX_INTERRUPTS)) {
1581 perf_log_throttle(event, 1);
1582 event->hw.interrupts = 0;
1583 }
1584
1585 /*
1586 * The new state must be visible before we turn it on in the hardware:
1587 */
1588 smp_wmb();
1589
1590 if (event->pmu->add(event, PERF_EF_START)) {
1591 event->state = PERF_EVENT_STATE_INACTIVE;
1592 event->oncpu = -1;
1593 return -EAGAIN;
1594 }
1595
1596 event->tstamp_running += tstamp - event->tstamp_stopped;
1597
1598 perf_set_shadow_time(event, ctx, tstamp);
1599
1600 if (!is_software_event(event))
1601 cpuctx->active_oncpu++;
1602 ctx->nr_active++;
1603 if (event->attr.freq && event->attr.sample_freq)
1604 ctx->nr_freq++;
1605
1606 if (event->attr.exclusive)
1607 cpuctx->exclusive = 1;
1608
1609 return 0;
1610 }
1611
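/*
 * Schedule a whole group onto the PMU as one transaction: either every
 * member fits and commit_txn() succeeds, or the partially added members
 * are backed out again and -EAGAIN is returned.
 */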
1612 static int
1613 group_sched_in(struct perf_event *group_event,
1614 struct perf_cpu_context *cpuctx,
1615 struct perf_event_context *ctx)
1616 {
1617 struct perf_event *event, *partial_group = NULL;
1618 struct pmu *pmu = group_event->pmu;
1619 u64 now = ctx->time;
1620 bool simulate = false;
1621
1622 if (group_event->state == PERF_EVENT_STATE_OFF)
1623 return 0;
1624
1625 pmu->start_txn(pmu);
1626
1627 if (event_sched_in(group_event, cpuctx, ctx)) {
1628 pmu->cancel_txn(pmu);
1629 return -EAGAIN;
1630 }
1631
1632 /*
1633 * Schedule in siblings as one group (if any):
1634 */
1635 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1636 if (event_sched_in(event, cpuctx, ctx)) {
1637 partial_group = event;
1638 goto group_error;
1639 }
1640 }
1641
1642 if (!pmu->commit_txn(pmu))
1643 return 0;
1644
1645 group_error:
1646 /*
1647 * Groups can be scheduled in as one unit only, so undo any
1648 * partial group before returning:
1649 * The events up to the failed event are scheduled out normally,
1650 * tstamp_stopped will be updated.
1651 *
1652 * The failed events and the remaining siblings need to have
1653 * their timings updated as if they had gone through event_sched_in()
1654 * and event_sched_out(). This is required to get consistent timings
1655 * across the group. This also takes care of the case where the group
1656 * could never be scheduled by ensuring tstamp_stopped is set to mark
1657 * the time the event was actually stopped, such that time delta
1658 * calculation in update_event_times() is correct.
1659 */
1660 list_for_each_entry(event, &group_event->sibling_list, group_entry) {
1661 if (event == partial_group)
1662 simulate = true;
1663
1664 if (simulate) {
1665 event->tstamp_running += now - event->tstamp_stopped;
1666 event->tstamp_stopped = now;
1667 } else {
1668 event_sched_out(event, cpuctx, ctx);
1669 }
1670 }
1671 event_sched_out(group_event, cpuctx, ctx);
1672
1673 pmu->cancel_txn(pmu);
1674
1675 return -EAGAIN;
1676 }
1677
1678 /*
1679 * Work out whether we can put this event group on the CPU now.
1680 */
1681 static int group_can_go_on(struct perf_event *event,
1682 struct perf_cpu_context *cpuctx,
1683 int can_add_hw)
1684 {
1685 /*
1686 * Groups consisting entirely of software events can always go on.
1687 */
1688 if (event->group_flags & PERF_GROUP_SOFTWARE)
1689 return 1;
1690 /*
1691 * If an exclusive group is already on, no other hardware
1692 * events can go on.
1693 */
1694 if (cpuctx->exclusive)
1695 return 0;
1696 /*
1697 * If this group is exclusive and there are already
1698 * events on the CPU, it can't go on.
1699 */
1700 if (event->attr.exclusive && cpuctx->active_oncpu)
1701 return 0;
1702 /*
1703 * Otherwise, try to add it if all previous groups were able
1704 * to go on.
1705 */
1706 return can_add_hw;
1707 }
1708
1709 static void add_event_to_ctx(struct perf_event *event,
1710 struct perf_event_context *ctx)
1711 {
1712 u64 tstamp = perf_event_time(event);
1713
1714 list_add_event(event, ctx);
1715 perf_group_attach(event);
1716 event->tstamp_enabled = tstamp;
1717 event->tstamp_running = tstamp;
1718 event->tstamp_stopped = tstamp;
1719 }
1720
1721 static void task_ctx_sched_out(struct perf_event_context *ctx);
1722 static void
1723 ctx_sched_in(struct perf_event_context *ctx,
1724 struct perf_cpu_context *cpuctx,
1725 enum event_type_t event_type,
1726 struct task_struct *task);
1727
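/*
 * Order matters: cpu pinned, task pinned, cpu flexible, task flexible.
 * Pinned groups must get first pick of the PMU before flexible ones.
 */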
1728 static void perf_event_sched_in(struct perf_cpu_context *cpuctx,
1729 struct perf_event_context *ctx,
1730 struct task_struct *task)
1731 {
1732 cpu_ctx_sched_in(cpuctx, EVENT_PINNED, task);
1733 if (ctx)
1734 ctx_sched_in(ctx, cpuctx, EVENT_PINNED, task);
1735 cpu_ctx_sched_in(cpuctx, EVENT_FLEXIBLE, task);
1736 if (ctx)
1737 ctx_sched_in(ctx, cpuctx, EVENT_FLEXIBLE, task);
1738 }
1739
1740 /*
1741 * Cross CPU call to install and enable a performance event
1742 *
1743 * Must be called with ctx->mutex held
1744 */
1745 static int __perf_install_in_context(void *info)
1746 {
1747 struct perf_event *event = info;
1748 struct perf_event_context *ctx = event->ctx;
1749 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1750 struct perf_event_context *task_ctx = cpuctx->task_ctx;
1751 struct task_struct *task = current;
1752
1753 perf_ctx_lock(cpuctx, task_ctx);
1754 perf_pmu_disable(cpuctx->ctx.pmu);
1755
1756 /*
1757 * If there was an active task_ctx schedule it out.
1758 */
1759 if (task_ctx)
1760 task_ctx_sched_out(task_ctx);
1761
1762 /*
1763 * If the context we're installing events in is not the
1764 * active task_ctx, flip them.
1765 */
1766 if (ctx->task && task_ctx != ctx) {
1767 if (task_ctx)
1768 raw_spin_unlock(&task_ctx->lock);
1769 raw_spin_lock(&ctx->lock);
1770 task_ctx = ctx;
1771 }
1772
1773 if (task_ctx) {
1774 cpuctx->task_ctx = task_ctx;
1775 task = task_ctx->task;
1776 }
1777
1778 cpu_ctx_sched_out(cpuctx, EVENT_ALL);
1779
1780 update_context_time(ctx);
1781 /*
1782 * update cgrp time only if current cgrp
1783 * matches event->cgrp. Must be done before
1784 * calling add_event_to_ctx()
1785 */
1786 update_cgrp_time_from_event(event);
1787
1788 add_event_to_ctx(event, ctx);
1789
1790 /*
1791 * Schedule everything back in
1792 */
1793 perf_event_sched_in(cpuctx, task_ctx, task);
1794
1795 perf_pmu_enable(cpuctx->ctx.pmu);
1796 perf_ctx_unlock(cpuctx, task_ctx);
1797
1798 return 0;
1799 }
1800
1801 /*
1802 * Attach a performance event to a context
1803 *
1804 * First we add the event to the list with the hardware enable bit
1805 * in event->hw_config cleared.
1806 *
1807 * If the event is attached to a task which is on a CPU we use a smp
1808 * call to enable it in the task context. The task might have been
1809 * scheduled away, but we check this in the smp call again.
1810 */
1811 static void
1812 perf_install_in_context(struct perf_event_context *ctx,
1813 struct perf_event *event,
1814 int cpu)
1815 {
1816 struct task_struct *task = ctx->task;
1817
1818 lockdep_assert_held(&ctx->mutex);
1819
1820 event->ctx = ctx;
1821 if (event->cpu != -1)
1822 event->cpu = cpu;
1823
1824 if (!task) {
1825 /*
1826 * Per cpu events are installed via an smp call and
1827 * the install is always successful.
1828 */
1829 cpu_function_call(cpu, __perf_install_in_context, event);
1830 return;
1831 }
1832
1833 retry:
1834 if (!task_function_call(task, __perf_install_in_context, event))
1835 return;
1836
1837 raw_spin_lock_irq(&ctx->lock);
1838 /*
1839 * If we failed to find a running task, but find the context active now
1840 * that we've acquired the ctx->lock, retry.
1841 */
1842 if (ctx->is_active) {
1843 raw_spin_unlock_irq(&ctx->lock);
1844 /*
1845 * Reload the task pointer, it might have been changed by
1846 * a concurrent perf_event_context_sched_out().
1847 */
1848 task = ctx->task;
1849 goto retry;
1850 }
1851
1852 /*
1853 * Since the task isn't running, it's safe to add the event; our holding
1854 * the ctx->lock ensures the task won't get scheduled in.
1855 */
1856 add_event_to_ctx(event, ctx);
1857 raw_spin_unlock_irq(&ctx->lock);
1858 }
1859
1860 /*
1861 * Put an event into inactive state and update time fields.
1862 * Enabling the leader of a group effectively enables all
1863 * the group members that aren't explicitly disabled, so we
1864 * have to update their ->tstamp_enabled also.
1865 * Note: this works for group members as well as group leaders
1866 * since the non-leader members' sibling_lists will be empty.
1867 */
1868 static void __perf_event_mark_enabled(struct perf_event *event)
1869 {
1870 struct perf_event *sub;
1871 u64 tstamp = perf_event_time(event);
1872
1873 event->state = PERF_EVENT_STATE_INACTIVE;
1874 event->tstamp_enabled = tstamp - event->total_time_enabled;
1875 list_for_each_entry(sub, &event->sibling_list, group_entry) {
1876 if (sub->state >= PERF_EVENT_STATE_INACTIVE)
1877 sub->tstamp_enabled = tstamp - sub->total_time_enabled;
1878 }
1879 }
1880
1881 /*
1882 * Cross CPU call to enable a performance event
1883 */
1884 static int __perf_event_enable(void *info)
1885 {
1886 struct perf_event *event = info;
1887 struct perf_event_context *ctx = event->ctx;
1888 struct perf_event *leader = event->group_leader;
1889 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
1890 int err;
1891
1892 /*
1893 * There's a time window between 'ctx->is_active' check
1894 * in perf_event_enable function and this place having:
1895 * - IRQs on
1896 * - ctx->lock unlocked
1897 *
1898 * where the task could be killed and 'ctx' deactivated
1899 * by perf_event_exit_task.
1900 */
1901 if (!ctx->is_active)
1902 return -EINVAL;
1903
1904 raw_spin_lock(&ctx->lock);
1905 update_context_time(ctx);
1906
1907 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1908 goto unlock;
1909
1910 /*
1911 * set current task's cgroup time reference point
1912 */
1913 perf_cgroup_set_timestamp(current, ctx);
1914
1915 __perf_event_mark_enabled(event);
1916
1917 if (!event_filter_match(event)) {
1918 if (is_cgroup_event(event))
1919 perf_cgroup_defer_enabled(event);
1920 goto unlock;
1921 }
1922
1923 /*
1924 * If the event is in a group and isn't the group leader,
1925 * then don't put it on unless the group is on.
1926 */
1927 if (leader != event && leader->state != PERF_EVENT_STATE_ACTIVE)
1928 goto unlock;
1929
1930 if (!group_can_go_on(event, cpuctx, 1)) {
1931 err = -EEXIST;
1932 } else {
1933 if (event == leader)
1934 err = group_sched_in(event, cpuctx, ctx);
1935 else
1936 err = event_sched_in(event, cpuctx, ctx);
1937 }
1938
1939 if (err) {
1940 /*
1941 * If this event can't go on and it's part of a
1942 * group, then the whole group has to come off.
1943 */
1944 if (leader != event)
1945 group_sched_out(leader, cpuctx, ctx);
1946 if (leader->attr.pinned) {
1947 update_group_times(leader);
1948 leader->state = PERF_EVENT_STATE_ERROR;
1949 }
1950 }
1951
1952 unlock:
1953 raw_spin_unlock(&ctx->lock);
1954
1955 return 0;
1956 }
1957
1958 /*
1959 * Enable an event.
1960 *
1961 * If event->ctx is a cloned context, callers must make sure that
1962 * every task struct that event->ctx->task could possibly point to
1963 * remains valid. This condition is satisfied when called through
1964 * perf_event_for_each_child or perf_event_for_each as described
1965 * for perf_event_disable.
1966 */
1967 void perf_event_enable(struct perf_event *event)
1968 {
1969 struct perf_event_context *ctx = event->ctx;
1970 struct task_struct *task = ctx->task;
1971
1972 if (!task) {
1973 /*
1974 * Enable the event on the cpu that it's on
1975 */
1976 cpu_function_call(event->cpu, __perf_event_enable, event);
1977 return;
1978 }
1979
1980 raw_spin_lock_irq(&ctx->lock);
1981 if (event->state >= PERF_EVENT_STATE_INACTIVE)
1982 goto out;
1983
1984 /*
1985 * If the event is in error state, clear that first.
1986 * That way, if we see the event in error state below, we
1987 * know that it has gone back into error state, as distinct
1988 * from the task having been scheduled away before the
1989 * cross-call arrived.
1990 */
1991 if (event->state == PERF_EVENT_STATE_ERROR)
1992 event->state = PERF_EVENT_STATE_OFF;
1993
1994 retry:
1995 if (!ctx->is_active) {
1996 __perf_event_mark_enabled(event);
1997 goto out;
1998 }
1999
2000 raw_spin_unlock_irq(&ctx->lock);
2001
2002 if (!task_function_call(task, __perf_event_enable, event))
2003 return;
2004
2005 raw_spin_lock_irq(&ctx->lock);
2006
2007 /*
2008 * If the context is active and the event is still off,
2009 * we need to retry the cross-call.
2010 */
2011 if (ctx->is_active && event->state == PERF_EVENT_STATE_OFF) {
2012 /*
2013 * task could have been flipped by a concurrent
2014 * perf_event_context_sched_out()
2015 */
2016 task = ctx->task;
2017 goto retry;
2018 }
2019
2020 out:
2021 raw_spin_unlock_irq(&ctx->lock);
2022 }
2023 EXPORT_SYMBOL_GPL(perf_event_enable);
2024
2025 int perf_event_refresh(struct perf_event *event, int refresh)
2026 {
2027 /*
2028 * not supported on inherited events
2029 */
2030 if (event->attr.inherit || !is_sampling_event(event))
2031 return -EINVAL;
2032
2033 atomic_add(refresh, &event->event_limit);
2034 perf_event_enable(event);
2035
2036 return 0;
2037 }
2038 EXPORT_SYMBOL_GPL(perf_event_refresh);
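/*
 * Illustrative sketch (not definitive; assumes an fd obtained from
 * perf_event_open() and the <linux/perf_event.h> definitions): user
 * space reaches perf_event_refresh() through the PERF_EVENT_IOC_REFRESH
 * ioctl, arming the event for a number of further overflows before it
 * is disabled again:
 *
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	ioctl(fd, PERF_EVENT_IOC_REFRESH, 3);	three more overflows allowed
 *
 * Broadly, each overflow consumes one unit of event_limit; once it is
 * used up the event is disabled and the owner can be notified via
 * SIGIO/poll.
 */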
2039
2040 static void ctx_sched_out(struct perf_event_context *ctx,
2041 struct perf_cpu_context *cpuctx,
2042 enum event_type_t event_type)
2043 {
2044 struct perf_event *event;
2045 int is_active = ctx->is_active;
2046
2047 ctx->is_active &= ~event_type;
2048 if (likely(!ctx->nr_events))
2049 return;
2050
2051 update_context_time(ctx);
2052 update_cgrp_time_from_cpuctx(cpuctx);
2053 if (!ctx->nr_active)
2054 return;
2055
2056 perf_pmu_disable(ctx->pmu);
2057 if ((is_active & EVENT_PINNED) && (event_type & EVENT_PINNED)) {
2058 list_for_each_entry(event, &ctx->pinned_groups, group_entry)
2059 group_sched_out(event, cpuctx, ctx);
2060 }
2061
2062 if ((is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE)) {
2063 list_for_each_entry(event, &ctx->flexible_groups, group_entry)
2064 group_sched_out(event, cpuctx, ctx);
2065 }
2066 perf_pmu_enable(ctx->pmu);
2067 }
2068
2069 /*
2070 * Test whether two contexts are equivalent, i.e. whether they
2071 * have both been cloned from the same version of the same context
2072 * and they both have the same number of enabled events.
2073 * If the number of enabled events is the same, then the set
2074 * of enabled events should be the same, because these are both
2075 * inherited contexts, therefore we can't access individual events
2076 * in them directly with an fd; we can only enable/disable all
2077 * events via prctl, or enable/disable all events in a family
2078 * via ioctl, which will have the same effect on both contexts.
2079 */
2080 static int context_equiv(struct perf_event_context *ctx1,
2081 struct perf_event_context *ctx2)
2082 {
2083 return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx
2084 && ctx1->parent_gen == ctx2->parent_gen
2085 && !ctx1->pin_count && !ctx2->pin_count;
2086 }
2087
2088 static void __perf_event_sync_stat(struct perf_event *event,
2089 struct perf_event *next_event)
2090 {
2091 u64 value;
2092
2093 if (!event->attr.inherit_stat)
2094 return;
2095
2096 /*
2097	 * Update the event value; we cannot use perf_event_read()
2098	 * because we're in the middle of a context switch and have IRQs
2099	 * disabled, which upsets smp_call_function_single(). However,
2100	 * we know the event must be on the current CPU, therefore we
2101	 * don't need to use it.
2102 */
2103 switch (event->state) {
2104 case PERF_EVENT_STATE_ACTIVE:
2105 event->pmu->read(event);
2106 /* fall-through */
2107
2108 case PERF_EVENT_STATE_INACTIVE:
2109 update_event_times(event);
2110 break;
2111
2112 default:
2113 break;
2114 }
2115
2116 /*
2117 * In order to keep per-task stats reliable we need to flip the event
2118 * values when we flip the contexts.
2119 */
2120 value = local64_read(&next_event->count);
2121 value = local64_xchg(&event->count, value);
2122 local64_set(&next_event->count, value);
2123
2124 swap(event->total_time_enabled, next_event->total_time_enabled);
2125 swap(event->total_time_running, next_event->total_time_running);
2126
2127 /*
2128 * Since we swizzled the values, update the user visible data too.
2129 */
2130 perf_event_update_userpage(event);
2131 perf_event_update_userpage(next_event);
2132 }
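/*
 * Illustration: the xchg/set pair above is simply a 64-bit swap. If
 * the outgoing event has count = 100 and its clone in the incoming
 * context has count = 40, they hold 40 and 100 respectively afterwards,
 * so the per-task totals follow the task rather than whichever cloned
 * context it happens to be running with.
 */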
2133
2134 static void perf_event_sync_stat(struct perf_event_context *ctx,
2135 struct perf_event_context *next_ctx)
2136 {
2137 struct perf_event *event, *next_event;
2138
2139 if (!ctx->nr_stat)
2140 return;
2141
2142 update_context_time(ctx);
2143
2144 event = list_first_entry(&ctx->event_list,
2145 struct perf_event, event_entry);
2146
2147 next_event = list_first_entry(&next_ctx->event_list,
2148 struct perf_event, event_entry);
2149
2150 while (&event->event_entry != &ctx->event_list &&
2151 &next_event->event_entry != &next_ctx->event_list) {
2152
2153 __perf_event_sync_stat(event, next_event);
2154
2155 event = list_next_entry(event, event_entry);
2156 next_event = list_next_entry(next_event, event_entry);
2157 }
2158 }
2159
2160 static void perf_event_context_sched_out(struct task_struct *task, int ctxn,
2161 struct task_struct *next)
2162 {
2163 struct perf_event_context *ctx = task->perf_event_ctxp[ctxn];
2164 struct perf_event_context *next_ctx;
2165 struct perf_event_context *parent;
2166 struct perf_cpu_context *cpuctx;
2167 int do_switch = 1;
2168
2169 if (likely(!ctx))
2170 return;
2171
2172 cpuctx = __get_cpu_context(ctx);
2173 if (!cpuctx->task_ctx)
2174 return;
2175
2176 rcu_read_lock();
2177 parent = rcu_dereference(ctx->parent_ctx);
2178 next_ctx = next->perf_event_ctxp[ctxn];
2179 if (parent && next_ctx &&
2180 rcu_dereference(next_ctx->parent_ctx) == parent) {
2181 /*
2182 * Looks like the two contexts are clones, so we might be
2183 * able to optimize the context switch. We lock both
2184 * contexts and check that they are clones under the
2185 * lock (including re-checking that neither has been
2186 * uncloned in the meantime). It doesn't matter which
2187 * order we take the locks because no other cpu could
2188 * be trying to lock both of these tasks.
2189 */
2190 raw_spin_lock(&ctx->lock);
2191 raw_spin_lock_nested(&next_ctx->lock, SINGLE_DEPTH_NESTING);
2192 if (context_equiv(ctx, next_ctx)) {
2193 /*
2194 * XXX do we need a memory barrier of sorts
2195	 * wrt rcu_dereference() of perf_event_ctxp
2196 */
2197 task->perf_event_ctxp[ctxn] = next_ctx;
2198 next->perf_event_ctxp[ctxn] = ctx;
2199 ctx->task = next;
2200 next_ctx->task = task;
2201 do_switch = 0;
2202
2203 perf_event_sync_stat(ctx, next_ctx);
2204 }
2205 raw_spin_unlock(&next_ctx->lock);
2206 raw_spin_unlock(&ctx->lock);
2207 }
2208 rcu_read_unlock();
2209
2210 if (do_switch) {
2211 raw_spin_lock(&ctx->lock);
2212 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2213 cpuctx->task_ctx = NULL;
2214 raw_spin_unlock(&ctx->lock);
2215 }
2216 }
2217
2218 #define for_each_task_context_nr(ctxn) \
2219 for ((ctxn) = 0; (ctxn) < perf_nr_task_contexts; (ctxn)++)
2220
2221 /*
2222 * Called from scheduler to remove the events of the current task,
2223 * with interrupts disabled.
2224 *
2225 * We stop each event and update the event value in event->count.
2226 *
2227 * This does not protect us against NMI, but disable()
2228 * sets the disabled bit in the control field of event _before_
2229 * accessing the event control register. If a NMI hits, then it will
2230 * not restart the event.
2231 */
2232 void __perf_event_task_sched_out(struct task_struct *task,
2233 struct task_struct *next)
2234 {
2235 int ctxn;
2236
2237 for_each_task_context_nr(ctxn)
2238 perf_event_context_sched_out(task, ctxn, next);
2239
2240 /*
2241 * if cgroup events exist on this CPU, then we need
2242 * to check if we have to switch out PMU state.
2243	 * cgroup events are system-wide mode only
2244 */
2245 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2246 perf_cgroup_sched_out(task, next);
2247 }
2248
2249 static void task_ctx_sched_out(struct perf_event_context *ctx)
2250 {
2251 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2252
2253 if (!cpuctx->task_ctx)
2254 return;
2255
2256 if (WARN_ON_ONCE(ctx != cpuctx->task_ctx))
2257 return;
2258
2259 ctx_sched_out(ctx, cpuctx, EVENT_ALL);
2260 cpuctx->task_ctx = NULL;
2261 }
2262
2263 /*
2264 * Called with IRQs disabled
2265 */
2266 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
2267 enum event_type_t event_type)
2268 {
2269 ctx_sched_out(&cpuctx->ctx, cpuctx, event_type);
2270 }
2271
2272 static void
2273 ctx_pinned_sched_in(struct perf_event_context *ctx,
2274 struct perf_cpu_context *cpuctx)
2275 {
2276 struct perf_event *event;
2277
2278 list_for_each_entry(event, &ctx->pinned_groups, group_entry) {
2279 if (event->state <= PERF_EVENT_STATE_OFF)
2280 continue;
2281 if (!event_filter_match(event))
2282 continue;
2283
2284 /* may need to reset tstamp_enabled */
2285 if (is_cgroup_event(event))
2286 perf_cgroup_mark_enabled(event, ctx);
2287
2288 if (group_can_go_on(event, cpuctx, 1))
2289 group_sched_in(event, cpuctx, ctx);
2290
2291 /*
2292 * If this pinned group hasn't been scheduled,
2293 * put it in error state.
2294 */
2295 if (event->state == PERF_EVENT_STATE_INACTIVE) {
2296 update_group_times(event);
2297 event->state = PERF_EVENT_STATE_ERROR;
2298 }
2299 }
2300 }
2301
2302 static void
2303 ctx_flexible_sched_in(struct perf_event_context *ctx,
2304 struct perf_cpu_context *cpuctx)
2305 {
2306 struct perf_event *event;
2307 int can_add_hw = 1;
2308
2309 list_for_each_entry(event, &ctx->flexible_groups, group_entry) {
2310 /* Ignore events in OFF or ERROR state */
2311 if (event->state <= PERF_EVENT_STATE_OFF)
2312 continue;
2313 /*
2314 * Listen to the 'cpu' scheduling filter constraint
2315 * of events:
2316 */
2317 if (!event_filter_match(event))
2318 continue;
2319
2320 /* may need to reset tstamp_enabled */
2321 if (is_cgroup_event(event))
2322 perf_cgroup_mark_enabled(event, ctx);
2323
2324 if (group_can_go_on(event, cpuctx, can_add_hw)) {
2325 if (group_sched_in(event, cpuctx, ctx))
2326 can_add_hw = 0;
2327 }
2328 }
2329 }
2330
2331 static void
2332 ctx_sched_in(struct perf_event_context *ctx,
2333 struct perf_cpu_context *cpuctx,
2334 enum event_type_t event_type,
2335 struct task_struct *task)
2336 {
2337 u64 now;
2338 int is_active = ctx->is_active;
2339
2340 ctx->is_active |= event_type;
2341 if (likely(!ctx->nr_events))
2342 return;
2343
2344 now = perf_clock();
2345 ctx->timestamp = now;
2346 perf_cgroup_set_timestamp(task, ctx);
2347 /*
2348 * First go through the list and put on any pinned groups
2349 * in order to give them the best chance of going on.
2350 */
2351 if (!(is_active & EVENT_PINNED) && (event_type & EVENT_PINNED))
2352 ctx_pinned_sched_in(ctx, cpuctx);
2353
2354 /* Then walk through the lower prio flexible groups */
2355 if (!(is_active & EVENT_FLEXIBLE) && (event_type & EVENT_FLEXIBLE))
2356 ctx_flexible_sched_in(ctx, cpuctx);
2357 }
2358
2359 static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
2360 enum event_type_t event_type,
2361 struct task_struct *task)
2362 {
2363 struct perf_event_context *ctx = &cpuctx->ctx;
2364
2365 ctx_sched_in(ctx, cpuctx, event_type, task);
2366 }
2367
2368 static void perf_event_context_sched_in(struct perf_event_context *ctx,
2369 struct task_struct *task)
2370 {
2371 struct perf_cpu_context *cpuctx;
2372
2373 cpuctx = __get_cpu_context(ctx);
2374 if (cpuctx->task_ctx == ctx)
2375 return;
2376
2377 perf_ctx_lock(cpuctx, ctx);
2378 perf_pmu_disable(ctx->pmu);
2379 /*
2380 * We want to keep the following priority order:
2381 * cpu pinned (that don't need to move), task pinned,
2382 * cpu flexible, task flexible.
2383 */
2384 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2385
2386 if (ctx->nr_events)
2387 cpuctx->task_ctx = ctx;
2388
2389 perf_event_sched_in(cpuctx, cpuctx->task_ctx, task);
2390
2391 perf_pmu_enable(ctx->pmu);
2392 perf_ctx_unlock(cpuctx, ctx);
2393
2394 /*
2395 * Since these rotations are per-cpu, we need to ensure the
2396 * cpu-context we got scheduled on is actually rotating.
2397 */
2398 perf_pmu_rotate_start(ctx->pmu);
2399 }
2400
2401 /*
2402	 * When sampling the branch stack in system-wide mode, it may be necessary
2403 * to flush the stack on context switch. This happens when the branch
2404 * stack does not tag its entries with the pid of the current task.
2405 * Otherwise it becomes impossible to associate a branch entry with a
2406 * task. This ambiguity is more likely to appear when the branch stack
2407 * supports priv level filtering and the user sets it to monitor only
2408 * at the user level (which could be a useful measurement in system-wide
2409 * mode). In that case, the risk is high of having a branch stack with
2410	 * branches from multiple tasks. Flushing may mean dropping the existing
2411 * entries or stashing them somewhere in the PMU specific code layer.
2412 *
2413 * This function provides the context switch callback to the lower code
2414 * layer. It is invoked ONLY when there is at least one system-wide context
2415 * with at least one active event using taken branch sampling.
2416 */
2417 static void perf_branch_stack_sched_in(struct task_struct *prev,
2418 struct task_struct *task)
2419 {
2420 struct perf_cpu_context *cpuctx;
2421 struct pmu *pmu;
2422 unsigned long flags;
2423
2424 /* no need to flush branch stack if not changing task */
2425 if (prev == task)
2426 return;
2427
2428 local_irq_save(flags);
2429
2430 rcu_read_lock();
2431
2432 list_for_each_entry_rcu(pmu, &pmus, entry) {
2433 cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
2434
2435 /*
2436 * check if the context has at least one
2437 * event using PERF_SAMPLE_BRANCH_STACK
2438 */
2439 if (cpuctx->ctx.nr_branch_stack > 0
2440 && pmu->flush_branch_stack) {
2441
2442 pmu = cpuctx->ctx.pmu;
2443
2444 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2445
2446 perf_pmu_disable(pmu);
2447
2448 pmu->flush_branch_stack();
2449
2450 perf_pmu_enable(pmu);
2451
2452 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2453 }
2454 }
2455
2456 rcu_read_unlock();
2457
2458 local_irq_restore(flags);
2459 }
2460
2461 /*
2462 * Called from scheduler to add the events of the current task
2463 * with interrupts disabled.
2464 *
2465 * We restore the event value and then enable it.
2466 *
2467 * This does not protect us against NMI, but enable()
2468 * sets the enabled bit in the control field of event _before_
2469 * accessing the event control register. If a NMI hits, then it will
2470 * keep the event running.
2471 */
2472 void __perf_event_task_sched_in(struct task_struct *prev,
2473 struct task_struct *task)
2474 {
2475 struct perf_event_context *ctx;
2476 int ctxn;
2477
2478 for_each_task_context_nr(ctxn) {
2479 ctx = task->perf_event_ctxp[ctxn];
2480 if (likely(!ctx))
2481 continue;
2482
2483 perf_event_context_sched_in(ctx, task);
2484 }
2485 /*
2486 * if cgroup events exist on this CPU, then we need
2487 * to check if we have to switch in PMU state.
2488	 * cgroup events are system-wide mode only
2489 */
2490 if (atomic_read(&__get_cpu_var(perf_cgroup_events)))
2491 perf_cgroup_sched_in(prev, task);
2492
2493 /* check for system-wide branch_stack events */
2494 if (atomic_read(&__get_cpu_var(perf_branch_stack_events)))
2495 perf_branch_stack_sched_in(prev, task);
2496 }
2497
2498 static u64 perf_calculate_period(struct perf_event *event, u64 nsec, u64 count)
2499 {
2500 u64 frequency = event->attr.sample_freq;
2501 u64 sec = NSEC_PER_SEC;
2502 u64 divisor, dividend;
2503
2504 int count_fls, nsec_fls, frequency_fls, sec_fls;
2505
2506 count_fls = fls64(count);
2507 nsec_fls = fls64(nsec);
2508 frequency_fls = fls64(frequency);
2509 sec_fls = 30;
2510
2511 /*
2512 * We got @count in @nsec, with a target of sample_freq HZ
2513 * the target period becomes:
2514 *
2515 * @count * 10^9
2516 * period = -------------------
2517 * @nsec * sample_freq
2518 *
2519 */
2520
2521 /*
2522 * Reduce accuracy by one bit such that @a and @b converge
2523 * to a similar magnitude.
2524 */
2525 #define REDUCE_FLS(a, b) \
2526 do { \
2527 if (a##_fls > b##_fls) { \
2528 a >>= 1; \
2529 a##_fls--; \
2530 } else { \
2531 b >>= 1; \
2532 b##_fls--; \
2533 } \
2534 } while (0)
2535
2536 /*
2537 * Reduce accuracy until either term fits in a u64, then proceed with
2538 * the other, so that finally we can do a u64/u64 division.
2539 */
2540 while (count_fls + sec_fls > 64 && nsec_fls + frequency_fls > 64) {
2541 REDUCE_FLS(nsec, frequency);
2542 REDUCE_FLS(sec, count);
2543 }
2544
2545 if (count_fls + sec_fls > 64) {
2546 divisor = nsec * frequency;
2547
2548 while (count_fls + sec_fls > 64) {
2549 REDUCE_FLS(count, sec);
2550 divisor >>= 1;
2551 }
2552
2553 dividend = count * sec;
2554 } else {
2555 dividend = count * sec;
2556
2557 while (nsec_fls + frequency_fls > 64) {
2558 REDUCE_FLS(nsec, frequency);
2559 dividend >>= 1;
2560 }
2561
2562 divisor = nsec * frequency;
2563 }
2564
2565 if (!divisor)
2566 return dividend;
2567
2568 return div64_u64(dividend, divisor);
2569 }
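/*
 * Worked example for the formula above (illustrative numbers): with
 * attr.sample_freq = 1000 Hz, suppose the event counted
 * count = 2,000,000 over nsec = 10,000,000 ns (one 10ms tick). Then:
 *
 *	period = count * 10^9 / (nsec * sample_freq)
 *	       = 2e6 * 1e9 / (1e7 * 1e3)
 *	       = 200,000
 *
 * i.e. the event fires roughly 2e8 times per second, so sampling every
 * 200,000 events yields approximately the requested 1000 samples per
 * second. The REDUCE_FLS() dance exists only to keep the intermediate
 * products within 64 bits.
 */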
2570
2571 static DEFINE_PER_CPU(int, perf_throttled_count);
2572 static DEFINE_PER_CPU(u64, perf_throttled_seq);
2573
2574 static void perf_adjust_period(struct perf_event *event, u64 nsec, u64 count, bool disable)
2575 {
2576 struct hw_perf_event *hwc = &event->hw;
2577 s64 period, sample_period;
2578 s64 delta;
2579
2580 period = perf_calculate_period(event, nsec, count);
2581
2582 delta = (s64)(period - hwc->sample_period);
2583 delta = (delta + 7) / 8; /* low pass filter */
2584
2585 sample_period = hwc->sample_period + delta;
2586
2587 if (!sample_period)
2588 sample_period = 1;
2589
2590 hwc->sample_period = sample_period;
2591
2592 if (local64_read(&hwc->period_left) > 8*sample_period) {
2593 if (disable)
2594 event->pmu->stop(event, PERF_EF_UPDATE);
2595
2596 local64_set(&hwc->period_left, 0);
2597
2598 if (disable)
2599 event->pmu->start(event, PERF_EF_RELOAD);
2600 }
2601 }
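/*
 * Worked example for the low pass filter above (illustrative numbers):
 * "(delta + 7) / 8" applies roughly 1/8th of the correction per call,
 * so the period converges smoothly rather than oscillating. With
 * hwc->sample_period = 100,000 and a newly computed period of 180,000:
 *
 *	delta         = 180,000 - 100,000 = 80,000
 *	delta         = (80,000 + 7) / 8  = 10,000
 *	sample_period = 100,000 + 10,000  = 110,000
 *
 * Subsequent ticks keep closing the remaining gap geometrically.
 */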
2602
2603 /*
2604 * combine freq adjustment with unthrottling to avoid two passes over the
2605	 * events. At the same time, make sure that having freq events does not change
2606	 * the rate of unthrottling, as that would introduce bias.
2607 */
2608 static void perf_adjust_freq_unthr_context(struct perf_event_context *ctx,
2609 int needs_unthr)
2610 {
2611 struct perf_event *event;
2612 struct hw_perf_event *hwc;
2613 u64 now, period = TICK_NSEC;
2614 s64 delta;
2615
2616 /*
2617	 * we only need to iterate over all events if:
2618	 * - the context has events in frequency mode (needs freq adjust)
2619 * - there are events to unthrottle on this cpu
2620 */
2621 if (!(ctx->nr_freq || needs_unthr))
2622 return;
2623
2624 raw_spin_lock(&ctx->lock);
2625 perf_pmu_disable(ctx->pmu);
2626
2627 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
2628 if (event->state != PERF_EVENT_STATE_ACTIVE)
2629 continue;
2630
2631 if (!event_filter_match(event))
2632 continue;
2633
2634 hwc = &event->hw;
2635
2636 if (needs_unthr && hwc->interrupts == MAX_INTERRUPTS) {
2637 hwc->interrupts = 0;
2638 perf_log_throttle(event, 1);
2639 event->pmu->start(event, 0);
2640 }
2641
2642 if (!event->attr.freq || !event->attr.sample_freq)
2643 continue;
2644
2645 /*
2646 * stop the event and update event->count
2647 */
2648 event->pmu->stop(event, PERF_EF_UPDATE);
2649
2650 now = local64_read(&event->count);
2651 delta = now - hwc->freq_count_stamp;
2652 hwc->freq_count_stamp = now;
2653
2654 /*
2655	 * Restart the event; reload only if the value has changed.
2656	 * We have already stopped the event, so tell that to
2657	 * perf_adjust_period() to avoid stopping it twice.
2660 */
2661 if (delta > 0)
2662 perf_adjust_period(event, period, delta, false);
2663
2664 event->pmu->start(event, delta > 0 ? PERF_EF_RELOAD : 0);
2665 }
2666
2667 perf_pmu_enable(ctx->pmu);
2668 raw_spin_unlock(&ctx->lock);
2669 }
2670
2671 /*
2672 * Round-robin a context's events:
2673 */
2674 static void rotate_ctx(struct perf_event_context *ctx)
2675 {
2676 /*
2677 * Rotate the first entry last of non-pinned groups. Rotation might be
2678 * disabled by the inheritance code.
2679 */
2680 if (!ctx->rotate_disable)
2681 list_rotate_left(&ctx->flexible_groups);
2682 }
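/*
 * Illustration: list_rotate_left() moves the first flexible group to
 * the tail, so a group list [A, B, C] becomes [B, C, A]. Over
 * successive rotations every flexible group eventually gets first pick
 * of the hardware counters, which prevents starvation when the PMU is
 * over-committed.
 */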
2683
2684 /*
2685 * perf_pmu_rotate_start() and perf_rotate_context() are fully serialized
2686 * because they're strictly cpu affine and rotate_start is called with IRQs
2687 * disabled, while rotate_context is called from IRQ context.
2688 */
2689 static void perf_rotate_context(struct perf_cpu_context *cpuctx)
2690 {
2691 struct perf_event_context *ctx = NULL;
2692 int rotate = 0, remove = 1;
2693
2694 if (cpuctx->ctx.nr_events) {
2695 remove = 0;
2696 if (cpuctx->ctx.nr_events != cpuctx->ctx.nr_active)
2697 rotate = 1;
2698 }
2699
2700 ctx = cpuctx->task_ctx;
2701 if (ctx && ctx->nr_events) {
2702 remove = 0;
2703 if (ctx->nr_events != ctx->nr_active)
2704 rotate = 1;
2705 }
2706
2707 if (!rotate)
2708 goto done;
2709
2710 perf_ctx_lock(cpuctx, cpuctx->task_ctx);
2711 perf_pmu_disable(cpuctx->ctx.pmu);
2712
2713 cpu_ctx_sched_out(cpuctx, EVENT_FLEXIBLE);
2714 if (ctx)
2715 ctx_sched_out(ctx, cpuctx, EVENT_FLEXIBLE);
2716
2717 rotate_ctx(&cpuctx->ctx);
2718 if (ctx)
2719 rotate_ctx(ctx);
2720
2721 perf_event_sched_in(cpuctx, ctx, current);
2722
2723 perf_pmu_enable(cpuctx->ctx.pmu);
2724 perf_ctx_unlock(cpuctx, cpuctx->task_ctx);
2725 done:
2726 if (remove)
2727 list_del_init(&cpuctx->rotation_list);
2728 }
2729
2730 #ifdef CONFIG_NO_HZ_FULL
2731 bool perf_event_can_stop_tick(void)
2732 {
2733 if (list_empty(&__get_cpu_var(rotation_list)))
2734 return true;
2735 else
2736 return false;
2737 }
2738 #endif
2739
2740 void perf_event_task_tick(void)
2741 {
2742 struct list_head *head = &__get_cpu_var(rotation_list);
2743 struct perf_cpu_context *cpuctx, *tmp;
2744 struct perf_event_context *ctx;
2745 int throttled;
2746
2747 WARN_ON(!irqs_disabled());
2748
2749 __this_cpu_inc(perf_throttled_seq);
2750 throttled = __this_cpu_xchg(perf_throttled_count, 0);
2751
2752 list_for_each_entry_safe(cpuctx, tmp, head, rotation_list) {
2753 ctx = &cpuctx->ctx;
2754 perf_adjust_freq_unthr_context(ctx, throttled);
2755
2756 ctx = cpuctx->task_ctx;
2757 if (ctx)
2758 perf_adjust_freq_unthr_context(ctx, throttled);
2759
2760 if (cpuctx->jiffies_interval == 1 ||
2761 !(jiffies % cpuctx->jiffies_interval))
2762 perf_rotate_context(cpuctx);
2763 }
2764 }
2765
2766 static int event_enable_on_exec(struct perf_event *event,
2767 struct perf_event_context *ctx)
2768 {
2769 if (!event->attr.enable_on_exec)
2770 return 0;
2771
2772 event->attr.enable_on_exec = 0;
2773 if (event->state >= PERF_EVENT_STATE_INACTIVE)
2774 return 0;
2775
2776 __perf_event_mark_enabled(event);
2777
2778 return 1;
2779 }
2780
2781 /*
2782 * Enable all of a task's events that have been marked enable-on-exec.
2783 * This expects task == current.
2784 */
2785 static void perf_event_enable_on_exec(struct perf_event_context *ctx)
2786 {
2787 struct perf_event *event;
2788 unsigned long flags;
2789 int enabled = 0;
2790 int ret;
2791
2792 local_irq_save(flags);
2793 if (!ctx || !ctx->nr_events)
2794 goto out;
2795
2796 /*
2797 * We must ctxsw out cgroup events to avoid conflict
2798 * when invoking perf_task_event_sched_in() later on
2799 * in this function. Otherwise we end up trying to
2800 * ctxswin cgroup events which are already scheduled
2801 * in.
2802 */
2803 perf_cgroup_sched_out(current, NULL);
2804
2805 raw_spin_lock(&ctx->lock);
2806 task_ctx_sched_out(ctx);
2807
2808 list_for_each_entry(event, &ctx->event_list, event_entry) {
2809 ret = event_enable_on_exec(event, ctx);
2810 if (ret)
2811 enabled = 1;
2812 }
2813
2814 /*
2815 * Unclone this context if we enabled any event.
2816 */
2817 if (enabled)
2818 unclone_ctx(ctx);
2819
2820 raw_spin_unlock(&ctx->lock);
2821
2822 /*
2823 * Also calls ctxswin for cgroup events, if any:
2824 */
2825 perf_event_context_sched_in(ctx, ctx->task);
2826 out:
2827 local_irq_restore(flags);
2828 }
2829
2830 /*
2831 * Cross CPU call to read the hardware event
2832 */
2833 static void __perf_event_read(void *info)
2834 {
2835 struct perf_event *event = info;
2836 struct perf_event_context *ctx = event->ctx;
2837 struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
2838
2839 /*
2840 * If this is a task context, we need to check whether it is
2841 * the current task context of this cpu. If not it has been
2842 * scheduled out before the smp call arrived. In that case
2843 * event->count would have been updated to a recent sample
2844 * when the event was scheduled out.
2845 */
2846 if (ctx->task && cpuctx->task_ctx != ctx)
2847 return;
2848
2849 raw_spin_lock(&ctx->lock);
2850 if (ctx->is_active) {
2851 update_context_time(ctx);
2852 update_cgrp_time_from_event(event);
2853 }
2854 update_event_times(event);
2855 if (event->state == PERF_EVENT_STATE_ACTIVE)
2856 event->pmu->read(event);
2857 raw_spin_unlock(&ctx->lock);
2858 }
2859
2860 static inline u64 perf_event_count(struct perf_event *event)
2861 {
2862 return local64_read(&event->count) + atomic64_read(&event->child_count);
2863 }
2864
2865 static u64 perf_event_read(struct perf_event *event)
2866 {
2867 /*
2868 * If event is enabled and currently active on a CPU, update the
2869 * value in the event structure:
2870 */
2871 if (event->state == PERF_EVENT_STATE_ACTIVE) {
2872 smp_call_function_single(event->oncpu,
2873 __perf_event_read, event, 1);
2874 } else if (event->state == PERF_EVENT_STATE_INACTIVE) {
2875 struct perf_event_context *ctx = event->ctx;
2876 unsigned long flags;
2877
2878 raw_spin_lock_irqsave(&ctx->lock, flags);
2879 /*
2880 * may read while context is not active
2881	 * (e.g., thread is blocked); in that case
2882	 * we cannot update the context time
2883 */
2884 if (ctx->is_active) {
2885 update_context_time(ctx);
2886 update_cgrp_time_from_event(event);
2887 }
2888 update_event_times(event);
2889 raw_spin_unlock_irqrestore(&ctx->lock, flags);
2890 }
2891
2892 return perf_event_count(event);
2893 }
2894
2895 /*
2896 * Initialize the perf_event context in a task_struct:
2897 */
2898 static void __perf_event_init_context(struct perf_event_context *ctx)
2899 {
2900 raw_spin_lock_init(&ctx->lock);
2901 mutex_init(&ctx->mutex);
2902 INIT_LIST_HEAD(&ctx->pinned_groups);
2903 INIT_LIST_HEAD(&ctx->flexible_groups);
2904 INIT_LIST_HEAD(&ctx->event_list);
2905 atomic_set(&ctx->refcount, 1);
2906 }
2907
2908 static struct perf_event_context *
2909 alloc_perf_context(struct pmu *pmu, struct task_struct *task)
2910 {
2911 struct perf_event_context *ctx;
2912
2913 ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL);
2914 if (!ctx)
2915 return NULL;
2916
2917 __perf_event_init_context(ctx);
2918 if (task) {
2919 ctx->task = task;
2920 get_task_struct(task);
2921 }
2922 ctx->pmu = pmu;
2923
2924 return ctx;
2925 }
2926
2927 static struct task_struct *
2928 find_lively_task_by_vpid(pid_t vpid)
2929 {
2930 struct task_struct *task;
2931 int err;
2932
2933 rcu_read_lock();
2934 if (!vpid)
2935 task = current;
2936 else
2937 task = find_task_by_vpid(vpid);
2938 if (task)
2939 get_task_struct(task);
2940 rcu_read_unlock();
2941
2942 if (!task)
2943 return ERR_PTR(-ESRCH);
2944
2945 /* Reuse ptrace permission checks for now. */
2946 err = -EACCES;
2947 if (!ptrace_may_access(task, PTRACE_MODE_READ))
2948 goto errout;
2949
2950 return task;
2951 errout:
2952 put_task_struct(task);
2953 return ERR_PTR(err);
2954
2955 }
2956
2957 /*
2958 * Returns a matching context with refcount and pincount.
2959 */
2960 static struct perf_event_context *
2961 find_get_context(struct pmu *pmu, struct task_struct *task, int cpu)
2962 {
2963 struct perf_event_context *ctx;
2964 struct perf_cpu_context *cpuctx;
2965 unsigned long flags;
2966 int ctxn, err;
2967
2968 if (!task) {
2969 /* Must be root to operate on a CPU event: */
2970 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
2971 return ERR_PTR(-EACCES);
2972
2973 /*
2974	 * We could be clever and allow attaching an event to an
2975 * offline CPU and activate it when the CPU comes up, but
2976 * that's for later.
2977 */
2978 if (!cpu_online(cpu))
2979 return ERR_PTR(-ENODEV);
2980
2981 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
2982 ctx = &cpuctx->ctx;
2983 get_ctx(ctx);
2984 ++ctx->pin_count;
2985
2986 return ctx;
2987 }
2988
2989 err = -EINVAL;
2990 ctxn = pmu->task_ctx_nr;
2991 if (ctxn < 0)
2992 goto errout;
2993
2994 retry:
2995 ctx = perf_lock_task_context(task, ctxn, &flags);
2996 if (ctx) {
2997 unclone_ctx(ctx);
2998 ++ctx->pin_count;
2999 raw_spin_unlock_irqrestore(&ctx->lock, flags);
3000 } else {
3001 ctx = alloc_perf_context(pmu, task);
3002 err = -ENOMEM;
3003 if (!ctx)
3004 goto errout;
3005
3006 err = 0;
3007 mutex_lock(&task->perf_event_mutex);
3008 /*
3009	 * If it has already passed perf_event_exit_task(),
3010	 * we must see PF_EXITING; it takes this mutex too.
3011 */
3012 if (task->flags & PF_EXITING)
3013 err = -ESRCH;
3014 else if (task->perf_event_ctxp[ctxn])
3015 err = -EAGAIN;
3016 else {
3017 get_ctx(ctx);
3018 ++ctx->pin_count;
3019 rcu_assign_pointer(task->perf_event_ctxp[ctxn], ctx);
3020 }
3021 mutex_unlock(&task->perf_event_mutex);
3022
3023 if (unlikely(err)) {
3024 put_ctx(ctx);
3025
3026 if (err == -EAGAIN)
3027 goto retry;
3028 goto errout;
3029 }
3030 }
3031
3032 return ctx;
3033
3034 errout:
3035 return ERR_PTR(err);
3036 }
3037
3038 static void perf_event_free_filter(struct perf_event *event);
3039
3040 static void free_event_rcu(struct rcu_head *head)
3041 {
3042 struct perf_event *event;
3043
3044 event = container_of(head, struct perf_event, rcu_head);
3045 if (event->ns)
3046 put_pid_ns(event->ns);
3047 perf_event_free_filter(event);
3048 kfree(event);
3049 }
3050
3051 static void ring_buffer_put(struct ring_buffer *rb);
3052 static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
3053
3054 static void free_event(struct perf_event *event)
3055 {
3056 irq_work_sync(&event->pending);
3057
3058 if (!event->parent) {
3059 if (event->attach_state & PERF_ATTACH_TASK)
3060 static_key_slow_dec_deferred(&perf_sched_events);
3061 if (event->attr.mmap || event->attr.mmap_data)
3062 atomic_dec(&nr_mmap_events);
3063 if (event->attr.comm)
3064 atomic_dec(&nr_comm_events);
3065 if (event->attr.task)
3066 atomic_dec(&nr_task_events);
3067 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN)
3068 put_callchain_buffers();
3069 if (is_cgroup_event(event)) {
3070 atomic_dec(&per_cpu(perf_cgroup_events, event->cpu));
3071 static_key_slow_dec_deferred(&perf_sched_events);
3072 }
3073
3074 if (has_branch_stack(event)) {
3075 static_key_slow_dec_deferred(&perf_sched_events);
3076 /* is system-wide event */
3077 if (!(event->attach_state & PERF_ATTACH_TASK)) {
3078 atomic_dec(&per_cpu(perf_branch_stack_events,
3079 event->cpu));
3080 }
3081 }
3082 }
3083
3084 if (event->rb) {
3085 struct ring_buffer *rb;
3086
3087 /*
3088 * Can happen when we close an event with re-directed output.
3089 *
3090 * Since we have a 0 refcount, perf_mmap_close() will skip
3091 * over us; possibly making our ring_buffer_put() the last.
3092 */
3093 mutex_lock(&event->mmap_mutex);
3094 rb = event->rb;
3095 if (rb) {
3096 rcu_assign_pointer(event->rb, NULL);
3097 ring_buffer_detach(event, rb);
3098 ring_buffer_put(rb); /* could be last */
3099 }
3100 mutex_unlock(&event->mmap_mutex);
3101 }
3102
3103 if (is_cgroup_event(event))
3104 perf_detach_cgroup(event);
3105
3106 if (event->destroy)
3107 event->destroy(event);
3108
3109 if (event->ctx)
3110 put_ctx(event->ctx);
3111
3112 call_rcu(&event->rcu_head, free_event_rcu);
3113 }
3114
3115 int perf_event_release_kernel(struct perf_event *event)
3116 {
3117 struct perf_event_context *ctx = event->ctx;
3118
3119 WARN_ON_ONCE(ctx->parent_ctx);
3120 /*
3121 * There are two ways this annotation is useful:
3122 *
3123 * 1) there is a lock recursion from perf_event_exit_task
3124 * see the comment there.
3125 *
3126 * 2) there is a lock-inversion with mmap_sem through
3127 * perf_event_read_group(), which takes faults while
3128 * holding ctx->mutex, however this is called after
3129 * the last filedesc died, so there is no possibility
3130 * to trigger the AB-BA case.
3131 */
3132 mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
3133 perf_remove_from_context(event, true);
3134 mutex_unlock(&ctx->mutex);
3135
3136 free_event(event);
3137
3138 return 0;
3139 }
3140 EXPORT_SYMBOL_GPL(perf_event_release_kernel);
3141
3142 /*
3143 * Called when the last reference to the file is gone.
3144 */
3145 static void put_event(struct perf_event *event)
3146 {
3147 struct task_struct *owner;
3148
3149 if (!atomic_long_dec_and_test(&event->refcount))
3150 return;
3151
3152 rcu_read_lock();
3153 owner = ACCESS_ONCE(event->owner);
3154 /*
3155 * Matches the smp_wmb() in perf_event_exit_task(). If we observe
3156 * !owner it means the list deletion is complete and we can indeed
3157 * free this event, otherwise we need to serialize on
3158 * owner->perf_event_mutex.
3159 */
3160 smp_read_barrier_depends();
3161 if (owner) {
3162 /*
3163 * Since delayed_put_task_struct() also drops the last
3164 * task reference we can safely take a new reference
3165 * while holding the rcu_read_lock().
3166 */
3167 get_task_struct(owner);
3168 }
3169 rcu_read_unlock();
3170
3171 if (owner) {
3172 mutex_lock(&owner->perf_event_mutex);
3173 /*
3174 * We have to re-check the event->owner field, if it is cleared
3175 * we raced with perf_event_exit_task(), acquiring the mutex
3176 * ensured they're done, and we can proceed with freeing the
3177 * event.
3178 */
3179 if (event->owner)
3180 list_del_init(&event->owner_entry);
3181 mutex_unlock(&owner->perf_event_mutex);
3182 put_task_struct(owner);
3183 }
3184
3185 perf_event_release_kernel(event);
3186 }
3187
3188 static int perf_release(struct inode *inode, struct file *file)
3189 {
3190 put_event(file->private_data);
3191 return 0;
3192 }
3193
3194 u64 perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running)
3195 {
3196 struct perf_event *child;
3197 u64 total = 0;
3198
3199 *enabled = 0;
3200 *running = 0;
3201
3202 mutex_lock(&event->child_mutex);
3203 total += perf_event_read(event);
3204 *enabled += event->total_time_enabled +
3205 atomic64_read(&event->child_total_time_enabled);
3206 *running += event->total_time_running +
3207 atomic64_read(&event->child_total_time_running);
3208
3209 list_for_each_entry(child, &event->child_list, child_list) {
3210 total += perf_event_read(child);
3211 *enabled += child->total_time_enabled;
3212 *running += child->total_time_running;
3213 }
3214 mutex_unlock(&event->child_mutex);
3215
3216 return total;
3217 }
3218 EXPORT_SYMBOL_GPL(perf_event_read_value);
3219
3220 static int perf_event_read_group(struct perf_event *event,
3221 u64 read_format, char __user *buf)
3222 {
3223 struct perf_event *leader = event->group_leader, *sub;
3224 int n = 0, size = 0, ret = -EFAULT;
3225 struct perf_event_context *ctx = leader->ctx;
3226 u64 values[5];
3227 u64 count, enabled, running;
3228
3229 mutex_lock(&ctx->mutex);
3230 count = perf_event_read_value(leader, &enabled, &running);
3231
3232 values[n++] = 1 + leader->nr_siblings;
3233 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3234 values[n++] = enabled;
3235 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3236 values[n++] = running;
3237 values[n++] = count;
3238 if (read_format & PERF_FORMAT_ID)
3239 values[n++] = primary_event_id(leader);
3240
3241 size = n * sizeof(u64);
3242
3243 if (copy_to_user(buf, values, size))
3244 goto unlock;
3245
3246 ret = size;
3247
3248 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
3249 n = 0;
3250
3251 values[n++] = perf_event_read_value(sub, &enabled, &running);
3252 if (read_format & PERF_FORMAT_ID)
3253 values[n++] = primary_event_id(sub);
3254
3255 size = n * sizeof(u64);
3256
3257 if (copy_to_user(buf + ret, values, size)) {
3258 ret = -EFAULT;
3259 goto unlock;
3260 }
3261
3262 ret += size;
3263 }
3264 unlock:
3265 mutex_unlock(&ctx->mutex);
3266
3267 return ret;
3268 }
3269
3270 static int perf_event_read_one(struct perf_event *event,
3271 u64 read_format, char __user *buf)
3272 {
3273 u64 enabled, running;
3274 u64 values[4];
3275 int n = 0;
3276
3277 values[n++] = perf_event_read_value(event, &enabled, &running);
3278 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
3279 values[n++] = enabled;
3280 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
3281 values[n++] = running;
3282 if (read_format & PERF_FORMAT_ID)
3283 values[n++] = primary_event_id(event);
3284
3285 if (copy_to_user(buf, values, n * sizeof(u64)))
3286 return -EFAULT;
3287
3288 return n * sizeof(u64);
3289 }
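/*
 * Layout sketch of the read() buffer produced by the two helpers above
 * (a sequence of u64 values; bracketed fields depend on read_format):
 *
 *	without PERF_FORMAT_GROUP:
 *		{ value,
 *		  [time_enabled],	PERF_FORMAT_TOTAL_TIME_ENABLED
 *		  [time_running],	PERF_FORMAT_TOTAL_TIME_RUNNING
 *		  [id] }		PERF_FORMAT_ID
 *
 *	with PERF_FORMAT_GROUP (read on any group member):
 *		{ nr,
 *		  [time_enabled], [time_running],
 *		  value, [id],		the group leader
 *		  value, [id], ... }	one pair per sibling
 *
 * This is why perf_read_hw() below requires the caller's buffer to be
 * at least event->read_size bytes.
 */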
3290
3291 /*
3292 * Read the performance event - simple non blocking version for now
3293 */
3294 static ssize_t
3295 perf_read_hw(struct perf_event *event, char __user *buf, size_t count)
3296 {
3297 u64 read_format = event->attr.read_format;
3298 int ret;
3299
3300 /*
3301	 * Return end-of-file for a read on an event that is in
3302 * error state (i.e. because it was pinned but it couldn't be
3303 * scheduled on to the CPU at some point).
3304 */
3305 if (event->state == PERF_EVENT_STATE_ERROR)
3306 return 0;
3307
3308 if (count < event->read_size)
3309 return -ENOSPC;
3310
3311 WARN_ON_ONCE(event->ctx->parent_ctx);
3312 if (read_format & PERF_FORMAT_GROUP)
3313 ret = perf_event_read_group(event, read_format, buf);
3314 else
3315 ret = perf_event_read_one(event, read_format, buf);
3316
3317 return ret;
3318 }
3319
3320 static ssize_t
3321 perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos)
3322 {
3323 struct perf_event *event = file->private_data;
3324
3325 return perf_read_hw(event, buf, count);
3326 }
3327
3328 static unsigned int perf_poll(struct file *file, poll_table *wait)
3329 {
3330 struct perf_event *event = file->private_data;
3331 struct ring_buffer *rb;
3332 unsigned int events = POLL_HUP;
3333
3334 /*
3335 * Pin the event->rb by taking event->mmap_mutex; otherwise
3336 * perf_event_set_output() can swizzle our rb and make us miss wakeups.
3337 */
3338 mutex_lock(&event->mmap_mutex);
3339 rb = event->rb;
3340 if (rb)
3341 events = atomic_xchg(&rb->poll, 0);
3342 mutex_unlock(&event->mmap_mutex);
3343
3344 poll_wait(file, &event->waitq, wait);
3345
3346 return events;
3347 }
3348
3349 static void perf_event_reset(struct perf_event *event)
3350 {
3351 (void)perf_event_read(event);
3352 local64_set(&event->count, 0);
3353 perf_event_update_userpage(event);
3354 }
3355
3356 /*
3357 * Holding the top-level event's child_mutex means that any
3358 * descendant process that has inherited this event will block
3359 * in sync_child_event if it goes to exit, thus satisfying the
3360 * task existence requirements of perf_event_enable/disable.
3361 */
3362 static void perf_event_for_each_child(struct perf_event *event,
3363 void (*func)(struct perf_event *))
3364 {
3365 struct perf_event *child;
3366
3367 WARN_ON_ONCE(event->ctx->parent_ctx);
3368 mutex_lock(&event->child_mutex);
3369 func(event);
3370 list_for_each_entry(child, &event->child_list, child_list)
3371 func(child);
3372 mutex_unlock(&event->child_mutex);
3373 }
3374
3375 static void perf_event_for_each(struct perf_event *event,
3376 void (*func)(struct perf_event *))
3377 {
3378 struct perf_event_context *ctx = event->ctx;
3379 struct perf_event *sibling;
3380
3381 WARN_ON_ONCE(ctx->parent_ctx);
3382 mutex_lock(&ctx->mutex);
3383 event = event->group_leader;
3384
3385 perf_event_for_each_child(event, func);
3386 list_for_each_entry(sibling, &event->sibling_list, group_entry)
3387 perf_event_for_each_child(sibling, func);
3388 mutex_unlock(&ctx->mutex);
3389 }
3390
3391 static int perf_event_period(struct perf_event *event, u64 __user *arg)
3392 {
3393 struct perf_event_context *ctx = event->ctx;
3394 int ret = 0;
3395 u64 value;
3396
3397 if (!is_sampling_event(event))
3398 return -EINVAL;
3399
3400 if (copy_from_user(&value, arg, sizeof(value)))
3401 return -EFAULT;
3402
3403 if (!value)
3404 return -EINVAL;
3405
3406 raw_spin_lock_irq(&ctx->lock);
3407 if (event->attr.freq) {
3408 if (value > sysctl_perf_event_sample_rate) {
3409 ret = -EINVAL;
3410 goto unlock;
3411 }
3412
3413 event->attr.sample_freq = value;
3414 } else {
3415 event->attr.sample_period = value;
3416 event->hw.sample_period = value;
3417 }
3418 unlock:
3419 raw_spin_unlock_irq(&ctx->lock);
3420
3421 return ret;
3422 }
3423
3424 static const struct file_operations perf_fops;
3425
3426 static inline int perf_fget_light(int fd, struct fd *p)
3427 {
3428 struct fd f = fdget(fd);
3429 if (!f.file)
3430 return -EBADF;
3431
3432 if (f.file->f_op != &perf_fops) {
3433 fdput(f);
3434 return -EBADF;
3435 }
3436 *p = f;
3437 return 0;
3438 }
3439
3440 static int perf_event_set_output(struct perf_event *event,
3441 struct perf_event *output_event);
3442 static int perf_event_set_filter(struct perf_event *event, void __user *arg);
3443
3444 static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
3445 {
3446 struct perf_event *event = file->private_data;
3447 void (*func)(struct perf_event *);
3448 u32 flags = arg;
3449
3450 switch (cmd) {
3451 case PERF_EVENT_IOC_ENABLE:
3452 func = perf_event_enable;
3453 break;
3454 case PERF_EVENT_IOC_DISABLE:
3455 func = perf_event_disable;
3456 break;
3457 case PERF_EVENT_IOC_RESET:
3458 func = perf_event_reset;
3459 break;
3460
3461 case PERF_EVENT_IOC_REFRESH:
3462 return perf_event_refresh(event, arg);
3463
3464 case PERF_EVENT_IOC_PERIOD:
3465 return perf_event_period(event, (u64 __user *)arg);
3466
3467 case PERF_EVENT_IOC_SET_OUTPUT:
3468 {
3469 int ret;
3470 if (arg != -1) {
3471 struct perf_event *output_event;
3472 struct fd output;
3473 ret = perf_fget_light(arg, &output);
3474 if (ret)
3475 return ret;
3476 output_event = output.file->private_data;
3477 ret = perf_event_set_output(event, output_event);
3478 fdput(output);
3479 } else {
3480 ret = perf_event_set_output(event, NULL);
3481 }
3482 return ret;
3483 }
3484
3485 case PERF_EVENT_IOC_SET_FILTER:
3486 return perf_event_set_filter(event, (void __user *)arg);
3487
3488 default:
3489 return -ENOTTY;
3490 }
3491
3492 if (flags & PERF_IOC_FLAG_GROUP)
3493 perf_event_for_each(event, func);
3494 else
3495 perf_event_for_each_child(event, func);
3496
3497 return 0;
3498 }
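/*
 * Illustrative sketch (assuming an fd from perf_event_open()): the
 * ioctls above are typically driven from user space like this:
 *
 *	ioctl(fd, PERF_EVENT_IOC_RESET,   PERF_IOC_FLAG_GROUP);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE,  PERF_IOC_FLAG_GROUP);
 *	... run the workload ...
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);
 *
 * Passing PERF_IOC_FLAG_GROUP makes perf_event_for_each() walk the
 * whole sibling list; without it only this event (and its inherited
 * children) is affected via perf_event_for_each_child().
 */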
3499
3500 int perf_event_task_enable(void)
3501 {
3502 struct perf_event *event;
3503
3504 mutex_lock(&current->perf_event_mutex);
3505 list_for_each_entry(event, &current->perf_event_list, owner_entry)
3506 perf_event_for_each_child(event, perf_event_enable);
3507 mutex_unlock(&current->perf_event_mutex);
3508
3509 return 0;
3510 }
3511
3512 int perf_event_task_disable(void)
3513 {
3514 struct perf_event *event;
3515
3516 mutex_lock(&current->perf_event_mutex);
3517 list_for_each_entry(event, &current->perf_event_list, owner_entry)
3518 perf_event_for_each_child(event, perf_event_disable);
3519 mutex_unlock(&current->perf_event_mutex);
3520
3521 return 0;
3522 }
3523
3524 static int perf_event_index(struct perf_event *event)
3525 {
3526 if (event->hw.state & PERF_HES_STOPPED)
3527 return 0;
3528
3529 if (event->state != PERF_EVENT_STATE_ACTIVE)
3530 return 0;
3531
3532 return event->pmu->event_idx(event);
3533 }
3534
3535 static void calc_timer_values(struct perf_event *event,
3536 u64 *now,
3537 u64 *enabled,
3538 u64 *running)
3539 {
3540 u64 ctx_time;
3541
3542 *now = perf_clock();
3543 ctx_time = event->shadow_ctx_time + *now;
3544 *enabled = ctx_time - event->tstamp_enabled;
3545 *running = ctx_time - event->tstamp_running;
3546 }
3547
3548 void __weak arch_perf_update_userpage(struct perf_event_mmap_page *userpg, u64 now)
3549 {
3550 }
3551
3552 /*
3553 * Callers need to ensure there can be no nesting of this function, otherwise
3554	 * the seqlock logic goes bad. We cannot serialize this because the arch
3555 * code calls this from NMI context.
3556 */
3557 void perf_event_update_userpage(struct perf_event *event)
3558 {
3559 struct perf_event_mmap_page *userpg;
3560 struct ring_buffer *rb;
3561 u64 enabled, running, now;
3562
3563 rcu_read_lock();
3564 /*
3565 * compute total_time_enabled, total_time_running
3566 * based on snapshot values taken when the event
3567 * was last scheduled in.
3568 *
3569	 * we cannot simply call update_context_time()
3570	 * because of locking issues, as we can be called in
3571 * NMI context
3572 */
3573 calc_timer_values(event, &now, &enabled, &running);
3574 rb = rcu_dereference(event->rb);
3575 if (!rb)
3576 goto unlock;
3577
3578 userpg = rb->user_page;
3579
3580 /*
3581 * Disable preemption so as to not let the corresponding user-space
3582 * spin too long if we get preempted.
3583 */
3584 preempt_disable();
3585 ++userpg->lock;
3586 barrier();
3587 userpg->index = perf_event_index(event);
3588 userpg->offset = perf_event_count(event);
3589 if (userpg->index)
3590 userpg->offset -= local64_read(&event->hw.prev_count);
3591
3592 userpg->time_enabled = enabled +
3593 atomic64_read(&event->child_total_time_enabled);
3594
3595 userpg->time_running = running +
3596 atomic64_read(&event->child_total_time_running);
3597
3598 arch_perf_update_userpage(userpg, now);
3599
3600 barrier();
3601 ++userpg->lock;
3602 preempt_enable();
3603 unlock:
3604 rcu_read_unlock();
3605 }
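/*
 * Illustrative user-space counterpart (a sketch, not the only valid
 * reader; pc points at the mmap()ed struct perf_event_mmap_page and
 * rdpmc() stands in for an arch-specific direct counter read):
 *
 *	do {
 *		seq = pc->lock;
 *		barrier();
 *		idx = pc->index;
 *		count = pc->offset;
 *		if (idx)
 *			count += rdpmc(idx - 1);
 *		barrier();
 *	} while (pc->lock != seq);
 *
 * A changed ->lock value means the kernel updated the page concurrently
 * and the read must be retried.
 */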
3606
3607 static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
3608 {
3609 struct perf_event *event = vma->vm_file->private_data;
3610 struct ring_buffer *rb;
3611 int ret = VM_FAULT_SIGBUS;
3612
3613 if (vmf->flags & FAULT_FLAG_MKWRITE) {
3614 if (vmf->pgoff == 0)
3615 ret = 0;
3616 return ret;
3617 }
3618
3619 rcu_read_lock();
3620 rb = rcu_dereference(event->rb);
3621 if (!rb)
3622 goto unlock;
3623
3624 if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
3625 goto unlock;
3626
3627 vmf->page = perf_mmap_to_page(rb, vmf->pgoff);
3628 if (!vmf->page)
3629 goto unlock;
3630
3631 get_page(vmf->page);
3632 vmf->page->mapping = vma->vm_file->f_mapping;
3633 vmf->page->index = vmf->pgoff;
3634
3635 ret = 0;
3636 unlock:
3637 rcu_read_unlock();
3638
3639 return ret;
3640 }
3641
3642 static void ring_buffer_attach(struct perf_event *event,
3643 struct ring_buffer *rb)
3644 {
3645 unsigned long flags;
3646
3647 if (!list_empty(&event->rb_entry))
3648 return;
3649
3650 spin_lock_irqsave(&rb->event_lock, flags);
3651 if (list_empty(&event->rb_entry))
3652 list_add(&event->rb_entry, &rb->event_list);
3653 spin_unlock_irqrestore(&rb->event_lock, flags);
3654 }
3655
3656 static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
3657 {
3658 unsigned long flags;
3659
3660 if (list_empty(&event->rb_entry))
3661 return;
3662
3663 spin_lock_irqsave(&rb->event_lock, flags);
3664 list_del_init(&event->rb_entry);
3665 wake_up_all(&event->waitq);
3666 spin_unlock_irqrestore(&rb->event_lock, flags);
3667 }
3668
3669 static void ring_buffer_wakeup(struct perf_event *event)
3670 {
3671 struct ring_buffer *rb;
3672
3673 rcu_read_lock();
3674 rb = rcu_dereference(event->rb);
3675 if (rb) {
3676 list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
3677 wake_up_all(&event->waitq);
3678 }
3679 rcu_read_unlock();
3680 }
3681
3682 static void rb_free_rcu(struct rcu_head *rcu_head)
3683 {
3684 struct ring_buffer *rb;
3685
3686 rb = container_of(rcu_head, struct ring_buffer, rcu_head);
3687 rb_free(rb);
3688 }
3689
3690 static struct ring_buffer *ring_buffer_get(struct perf_event *event)
3691 {
3692 struct ring_buffer *rb;
3693
3694 rcu_read_lock();
3695 rb = rcu_dereference(event->rb);
3696 if (rb) {
3697 if (!atomic_inc_not_zero(&rb->refcount))
3698 rb = NULL;
3699 }
3700 rcu_read_unlock();
3701
3702 return rb;
3703 }
3704
3705 static void ring_buffer_put(struct ring_buffer *rb)
3706 {
3707 if (!atomic_dec_and_test(&rb->refcount))
3708 return;
3709
3710 WARN_ON_ONCE(!list_empty(&rb->event_list));
3711
3712 call_rcu(&rb->rcu_head, rb_free_rcu);
3713 }
3714
3715 static void perf_mmap_open(struct vm_area_struct *vma)
3716 {
3717 struct perf_event *event = vma->vm_file->private_data;
3718
3719 atomic_inc(&event->mmap_count);
3720 atomic_inc(&event->rb->mmap_count);
3721 }
3722
3723 /*
3724 * A buffer can be mmap()ed multiple times; either directly through the same
3725 * event, or through other events by use of perf_event_set_output().
3726 *
3727 * In order to undo the VM accounting done by perf_mmap() we need to destroy
3728 * the buffer here, where we still have a VM context. This means we need
3729 * to detach all events redirecting to us.
3730 */
3731 static void perf_mmap_close(struct vm_area_struct *vma)
3732 {
3733 struct perf_event *event = vma->vm_file->private_data;
3734
3735 struct ring_buffer *rb = event->rb;
3736 struct user_struct *mmap_user = rb->mmap_user;
3737 int mmap_locked = rb->mmap_locked;
3738 unsigned long size = perf_data_size(rb);
3739
3740 atomic_dec(&rb->mmap_count);
3741
3742 if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
3743 return;
3744
3745 /* Detach current event from the buffer. */
3746 rcu_assign_pointer(event->rb, NULL);
3747 ring_buffer_detach(event, rb);
3748 mutex_unlock(&event->mmap_mutex);
3749
3750 /* If there's still other mmap()s of this buffer, we're done. */
3751 if (atomic_read(&rb->mmap_count)) {
3752 ring_buffer_put(rb); /* can't be last */
3753 return;
3754 }
3755
3756 /*
3757 * No other mmap()s, detach from all other events that might redirect
3758 * into the now unreachable buffer. Somewhat complicated by the
3759 * fact that rb::event_lock otherwise nests inside mmap_mutex.
3760 */
3761 again:
3762 rcu_read_lock();
3763 list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
3764 if (!atomic_long_inc_not_zero(&event->refcount)) {
3765 /*
3766 * This event is en-route to free_event() which will
3767 * detach it and remove it from the list.
3768 */
3769 continue;
3770 }
3771 rcu_read_unlock();
3772
3773 mutex_lock(&event->mmap_mutex);
3774 /*
3775 * Check we didn't race with perf_event_set_output() which can
3776 * swizzle the rb from under us while we were waiting to
3777 * acquire mmap_mutex.
3778 *
3779	 * If we find a different rb, ignore this event; the next
3780	 * iteration will no longer find it on the list. We have to
3781 * still restart the iteration to make sure we're not now
3782 * iterating the wrong list.
3783 */
3784 if (event->rb == rb) {
3785 rcu_assign_pointer(event->rb, NULL);
3786 ring_buffer_detach(event, rb);
3787 ring_buffer_put(rb); /* can't be last, we still have one */
3788 }
3789 mutex_unlock(&event->mmap_mutex);
3790 put_event(event);
3791
3792 /*
3793 * Restart the iteration; either we're on the wrong list or
3794 * destroyed its integrity by doing a deletion.
3795 */
3796 goto again;
3797 }
3798 rcu_read_unlock();
3799
3800 /*
3801	 * It could be that there are still a few 0-ref events on the list; they'll
3802 * get cleaned up by free_event() -- they'll also still have their
3803 * ref on the rb and will free it whenever they are done with it.
3804 *
3805 * Aside from that, this buffer is 'fully' detached and unmapped,
3806 * undo the VM accounting.
3807 */
3808
3809 atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
3810 vma->vm_mm->pinned_vm -= mmap_locked;
3811 free_uid(mmap_user);
3812
3813 ring_buffer_put(rb); /* could be last */
3814 }
3815
3816 static const struct vm_operations_struct perf_mmap_vmops = {
3817 .open = perf_mmap_open,
3818 .close = perf_mmap_close,
3819 .fault = perf_mmap_fault,
3820 .page_mkwrite = perf_mmap_fault,
3821 };
3822
3823 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
3824 {
3825 struct perf_event *event = file->private_data;
3826 unsigned long user_locked, user_lock_limit;
3827 struct user_struct *user = current_user();
3828 unsigned long locked, lock_limit;
3829 struct ring_buffer *rb;
3830 unsigned long vma_size;
3831 unsigned long nr_pages;
3832 long user_extra, extra;
3833 int ret = 0, flags = 0;
3834
3835 /*
3836 * Don't allow mmap() of inherited per-task counters. This would
3837 * create a performance issue due to all children writing to the
3838 * same rb.
3839 */
3840 if (event->cpu == -1 && event->attr.inherit)
3841 return -EINVAL;
3842
3843 if (!(vma->vm_flags & VM_SHARED))
3844 return -EINVAL;
3845
3846 vma_size = vma->vm_end - vma->vm_start;
3847 nr_pages = (vma_size / PAGE_SIZE) - 1;
3848
3849 /*
3850 * If we have rb pages ensure they're a power-of-two number, so we
3851 * can do bitmasks instead of modulo.
3852 */
3853 if (nr_pages != 0 && !is_power_of_2(nr_pages))
3854 return -EINVAL;
3855
3856 if (vma_size != PAGE_SIZE * (1 + nr_pages))
3857 return -EINVAL;
3858
3859 if (vma->vm_pgoff != 0)
3860 return -EINVAL;
3861
3862 WARN_ON_ONCE(event->ctx->parent_ctx);
3863 again:
3864 mutex_lock(&event->mmap_mutex);
3865 if (event->rb) {
3866 if (event->rb->nr_pages != nr_pages) {
3867 ret = -EINVAL;
3868 goto unlock;
3869 }
3870
3871 if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
3872 /*
3873 * Raced against perf_mmap_close() through
3874 * perf_event_set_output(). Try again, hope for better
3875 * luck.
3876 */
3877 mutex_unlock(&event->mmap_mutex);
3878 goto again;
3879 }
3880
3881 goto unlock;
3882 }
3883
3884 user_extra = nr_pages + 1;
3885 user_lock_limit = sysctl_perf_event_mlock >> (PAGE_SHIFT - 10);
3886
3887 /*
3888 * Increase the limit linearly with more CPUs:
3889 */
3890 user_lock_limit *= num_online_cpus();
3891
3892 user_locked = atomic_long_read(&user->locked_vm) + user_extra;
3893
3894 extra = 0;
3895 if (user_locked > user_lock_limit)
3896 extra = user_locked - user_lock_limit;
3897
3898 lock_limit = rlimit(RLIMIT_MEMLOCK);
3899 lock_limit >>= PAGE_SHIFT;
3900 locked = vma->vm_mm->pinned_vm + extra;
3901
3902 if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
3903 !capable(CAP_IPC_LOCK)) {
3904 ret = -EPERM;
3905 goto unlock;
3906 }
3907
3908 WARN_ON(event->rb);
3909
3910 if (vma->vm_flags & VM_WRITE)
3911 flags |= RING_BUFFER_WRITABLE;
3912
3913 rb = rb_alloc(nr_pages,
3914 event->attr.watermark ? event->attr.wakeup_watermark : 0,
3915 event->cpu, flags);
3916
3917 if (!rb) {
3918 ret = -ENOMEM;
3919 goto unlock;
3920 }
3921
3922 atomic_set(&rb->mmap_count, 1);
3923 rb->mmap_locked = extra;
3924 rb->mmap_user = get_current_user();
3925
3926 atomic_long_add(user_extra, &user->locked_vm);
3927 vma->vm_mm->pinned_vm += extra;
3928
3929 ring_buffer_attach(event, rb);
3930 rcu_assign_pointer(event->rb, rb);
3931
3932 perf_event_update_userpage(event);
3933
3934 unlock:
3935 if (!ret)
3936 atomic_inc(&event->mmap_count);
3937 mutex_unlock(&event->mmap_mutex);
3938
3939 /*
3940 * Since pinned accounting is per vm we cannot allow fork() to copy our
3941 * vma.
3942 */
3943 vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
3944 vma->vm_ops = &perf_mmap_vmops;
3945
3946 return ret;
3947 }
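/*
 * Illustrative sketch (assuming an fd from perf_event_open()): the size
 * checks above mean user space maps one metadata page plus a
 * power-of-two number of data pages, starting at offset 0:
 *
 *	long ps    = sysconf(_SC_PAGESIZE);
 *	size_t len = (1 + 8) * ps;		8 data pages + user page
 *	void *base = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			  MAP_SHARED, fd, 0);
 *
 * Anything else (a non power-of-two number of data pages, a non-zero
 * pgoff, a private mapping, or a second mapping of the same event with
 * a different size) is rejected with -EINVAL.
 */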
3948
3949 static int perf_fasync(int fd, struct file *filp, int on)
3950 {
3951 struct inode *inode = file_inode(filp);
3952 struct perf_event *event = filp->private_data;
3953 int retval;
3954
3955 mutex_lock(&inode->i_mutex);
3956 retval = fasync_helper(fd, filp, on, &event->fasync);
3957 mutex_unlock(&inode->i_mutex);
3958
3959 if (retval < 0)
3960 return retval;
3961
3962 return 0;
3963 }
3964
3965 static const struct file_operations perf_fops = {
3966 .llseek = no_llseek,
3967 .release = perf_release,
3968 .read = perf_read,
3969 .poll = perf_poll,
3970 .unlocked_ioctl = perf_ioctl,
3971 .compat_ioctl = perf_ioctl,
3972 .mmap = perf_mmap,
3973 .fasync = perf_fasync,
3974 };
3975
3976 /*
3977 * Perf event wakeup
3978 *
3979 * If there's data, ensure we set the poll() state and publish everything
3980 * to user-space before waking everybody up.
3981 */
3982
3983 void perf_event_wakeup(struct perf_event *event)
3984 {
3985 ring_buffer_wakeup(event);
3986
3987 if (event->pending_kill) {
3988 kill_fasync(&event->fasync, SIGIO, event->pending_kill);
3989 event->pending_kill = 0;
3990 }
3991 }
3992
3993 static void perf_pending_event(struct irq_work *entry)
3994 {
3995 struct perf_event *event = container_of(entry,
3996 struct perf_event, pending);
3997
3998 if (event->pending_disable) {
3999 event->pending_disable = 0;
4000 __perf_event_disable(event);
4001 }
4002
4003 if (event->pending_wakeup) {
4004 event->pending_wakeup = 0;
4005 perf_event_wakeup(event);
4006 }
4007 }
4008
4009 /*
4010 * We assume there is only KVM supporting the callbacks.
4011 * Later on, we might change it to a list if there is
4012 * another virtualization implementation supporting the callbacks.
4013 */
4014 struct perf_guest_info_callbacks *perf_guest_cbs;
4015
4016 int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4017 {
4018 perf_guest_cbs = cbs;
4019 return 0;
4020 }
4021 EXPORT_SYMBOL_GPL(perf_register_guest_info_callbacks);
4022
4023 int perf_unregister_guest_info_callbacks(struct perf_guest_info_callbacks *cbs)
4024 {
4025 perf_guest_cbs = NULL;
4026 return 0;
4027 }
4028 EXPORT_SYMBOL_GPL(perf_unregister_guest_info_callbacks);
4029
4030 static void
4031 perf_output_sample_regs(struct perf_output_handle *handle,
4032 struct pt_regs *regs, u64 mask)
4033 {
4034 int bit;
4035
4036 for_each_set_bit(bit, (const unsigned long *) &mask,
4037 sizeof(mask) * BITS_PER_BYTE) {
4038 u64 val;
4039
4040 val = perf_reg_value(regs, bit);
4041 perf_output_put(handle, val);
4042 }
4043 }
4044
4045 static void perf_sample_regs_user(struct perf_regs_user *regs_user,
4046 struct pt_regs *regs)
4047 {
4048 if (!user_mode(regs)) {
4049 if (current->mm)
4050 regs = task_pt_regs(current);
4051 else
4052 regs = NULL;
4053 }
4054
4055 if (regs) {
4056 regs_user->regs = regs;
4057 regs_user->abi = perf_reg_abi(current);
4058 }
4059 }
4060
4061 /*
4062 * Get remaining task size from user stack pointer.
4063 *
4064 * It'd be better to take the stack vma map and limit this more
4065 * precisely, but there's no way to get it safely under interrupt,
4066 * so we use TASK_SIZE as the limit.
4067 */
4068 static u64 perf_ustack_task_size(struct pt_regs *regs)
4069 {
4070 unsigned long addr = perf_user_stack_pointer(regs);
4071
4072 if (!addr || addr >= TASK_SIZE)
4073 return 0;
4074
4075 return TASK_SIZE - addr;
4076 }
4077
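/*
 * Clamp the requested user stack dump size: first to the space left
 * on the user stack (TASK_SIZE - sp, see above), then so that the
 * whole sample still fits into the u16 header size.  Illustrative
 * case: if header_size is already close to USHRT_MAX, the u16
 * addition wraps and stack_size is cut back to
 * USHRT_MAX - header_size - sizeof(u64), rounded to a u64 multiple.
 */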
4078 static u16
4079 perf_sample_ustack_size(u16 stack_size, u16 header_size,
4080 struct pt_regs *regs)
4081 {
4082 u64 task_size;
4083
4084 /* No regs, no stack pointer, no dump. */
4085 if (!regs)
4086 return 0;
4087
4088 /*
4089 * Check whether the requested stack size fits into:
4090 * - TASK_SIZE
4091 * If it doesn't, we limit the size to TASK_SIZE.
4092 *
4093 * - the remaining sample size
4094 * If it doesn't, we shrink the stack size to fit
4095 * into the remaining sample size.
4096 */
4097
4098 task_size = min((u64) USHRT_MAX, perf_ustack_task_size(regs));
4099 stack_size = min(stack_size, (u16) task_size);
4100
4101 /* Current header size plus static size and dynamic size. */
4102 header_size += 2 * sizeof(u64);
4103
4104 /* Does the dump still fit into the u16 sample size? */
4105 if ((u16) (header_size + stack_size) < header_size) {
4106 /*
4107 * If we overflow the maximum size for the sample,
4108 * we customize the stack dump size to fit in.
4109 */
4110 stack_size = USHRT_MAX - header_size - sizeof(u64);
4111 stack_size = round_up(stack_size, sizeof(u64));
4112 }
4113
4114 return stack_size;
4115 }
4116
4117 static void
4118 perf_output_sample_ustack(struct perf_output_handle *handle, u64 dump_size,
4119 struct pt_regs *regs)
4120 {
4121 /* Case of a kernel thread, nothing to dump */
4122 if (!regs) {
4123 u64 size = 0;
4124 perf_output_put(handle, size);
4125 } else {
4126 unsigned long sp;
4127 unsigned int rem;
4128 u64 dyn_size;
4129
4130 /*
4131 * We dump:
4132 * static size
4133 * - the size requested by the user or the best one we can fit
4134 * into the sample max size
4135 * data
4136 * - user stack dump data
4137 * dynamic size
4138 * - the actual dumped size
4139 */
4140
4141 /* Static size. */
4142 perf_output_put(handle, dump_size);
4143
4144 /* Data. */
4145 sp = perf_user_stack_pointer(regs);
4146 rem = __output_copy_user(handle, (void *) sp, dump_size);
4147 dyn_size = dump_size - rem;
4148
4149 perf_output_skip(handle, rem);
4150
4151 /* Dynamic size. */
4152 perf_output_put(handle, dyn_size);
4153 }
4154 }
4155
4156 static void __perf_event_header__init_id(struct perf_event_header *header,
4157 struct perf_sample_data *data,
4158 struct perf_event *event)
4159 {
4160 u64 sample_type = event->attr.sample_type;
4161
4162 data->type = sample_type;
4163 header->size += event->id_header_size;
4164
4165 if (sample_type & PERF_SAMPLE_TID) {
4166 /* namespace issues */
4167 data->tid_entry.pid = perf_event_pid(event, current);
4168 data->tid_entry.tid = perf_event_tid(event, current);
4169 }
4170
4171 if (sample_type & PERF_SAMPLE_TIME)
4172 data->time = perf_clock();
4173
4174 if (sample_type & PERF_SAMPLE_ID)
4175 data->id = primary_event_id(event);
4176
4177 if (sample_type & PERF_SAMPLE_STREAM_ID)
4178 data->stream_id = event->id;
4179
4180 if (sample_type & PERF_SAMPLE_CPU) {
4181 data->cpu_entry.cpu = raw_smp_processor_id();
4182 data->cpu_entry.reserved = 0;
4183 }
4184 }
4185
4186 void perf_event_header__init_id(struct perf_event_header *header,
4187 struct perf_sample_data *data,
4188 struct perf_event *event)
4189 {
4190 if (event->attr.sample_id_all)
4191 __perf_event_header__init_id(header, data, event);
4192 }
4193
4194 static void __perf_event__output_id_sample(struct perf_output_handle *handle,
4195 struct perf_sample_data *data)
4196 {
4197 u64 sample_type = data->type;
4198
4199 if (sample_type & PERF_SAMPLE_TID)
4200 perf_output_put(handle, data->tid_entry);
4201
4202 if (sample_type & PERF_SAMPLE_TIME)
4203 perf_output_put(handle, data->time);
4204
4205 if (sample_type & PERF_SAMPLE_ID)
4206 perf_output_put(handle, data->id);
4207
4208 if (sample_type & PERF_SAMPLE_STREAM_ID)
4209 perf_output_put(handle, data->stream_id);
4210
4211 if (sample_type & PERF_SAMPLE_CPU)
4212 perf_output_put(handle, data->cpu_entry);
4213 }
4214
4215 void perf_event__output_id_sample(struct perf_event *event,
4216 struct perf_output_handle *handle,
4217 struct perf_sample_data *sample)
4218 {
4219 if (event->attr.sample_id_all)
4220 __perf_event__output_id_sample(handle, sample);
4221 }
4222
4223 static void perf_output_read_one(struct perf_output_handle *handle,
4224 struct perf_event *event,
4225 u64 enabled, u64 running)
4226 {
4227 u64 read_format = event->attr.read_format;
4228 u64 values[4];
4229 int n = 0;
4230
4231 values[n++] = perf_event_count(event);
4232 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) {
4233 values[n++] = enabled +
4234 atomic64_read(&event->child_total_time_enabled);
4235 }
4236 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) {
4237 values[n++] = running +
4238 atomic64_read(&event->child_total_time_running);
4239 }
4240 if (read_format & PERF_FORMAT_ID)
4241 values[n++] = primary_event_id(event);
4242
4243 __output_copy(handle, values, n * sizeof(u64));
4244 }
4245
4246 /*
4247 * XXX PERF_FORMAT_GROUP vs inherited events seems difficult.
4248 */
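/*
 * Group read layout emitted below:
 *   nr, [time_enabled], [time_running],
 *   leader value, [leader id],
 *   then value, [id] for each sibling.
 * The optional fields depend on read_format, mirroring
 * perf_output_read_one().
 */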
4249 static void perf_output_read_group(struct perf_output_handle *handle,
4250 struct perf_event *event,
4251 u64 enabled, u64 running)
4252 {
4253 struct perf_event *leader = event->group_leader, *sub;
4254 u64 read_format = event->attr.read_format;
4255 u64 values[5];
4256 int n = 0;
4257
4258 values[n++] = 1 + leader->nr_siblings;
4259
4260 if (read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
4261 values[n++] = enabled;
4262
4263 if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
4264 values[n++] = running;
4265
4266 if (leader != event)
4267 leader->pmu->read(leader);
4268
4269 values[n++] = perf_event_count(leader);
4270 if (read_format & PERF_FORMAT_ID)
4271 values[n++] = primary_event_id(leader);
4272
4273 __output_copy(handle, values, n * sizeof(u64));
4274
4275 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
4276 n = 0;
4277
4278 if (sub != event)
4279 sub->pmu->read(sub);
4280
4281 values[n++] = perf_event_count(sub);
4282 if (read_format & PERF_FORMAT_ID)
4283 values[n++] = primary_event_id(sub);
4284
4285 __output_copy(handle, values, n * sizeof(u64));
4286 }
4287 }
4288
4289 #define PERF_FORMAT_TOTAL_TIMES (PERF_FORMAT_TOTAL_TIME_ENABLED|\
4290 PERF_FORMAT_TOTAL_TIME_RUNNING)
4291
4292 static void perf_output_read(struct perf_output_handle *handle,
4293 struct perf_event *event)
4294 {
4295 u64 enabled = 0, running = 0, now;
4296 u64 read_format = event->attr.read_format;
4297
4298 /*
4299 * compute total_time_enabled, total_time_running
4300 * based on snapshot values taken when the event
4301 * was last scheduled in.
4302 *
4303 * we cannot simply call update_context_time()
4304 * because of locking issues, as we are called in
4305 * NMI context
4306 */
4307 if (read_format & PERF_FORMAT_TOTAL_TIMES)
4308 calc_timer_values(event, &now, &enabled, &running);
4309
4310 if (event->attr.read_format & PERF_FORMAT_GROUP)
4311 perf_output_read_group(handle, event, enabled, running);
4312 else
4313 perf_output_read_one(handle, event, enabled, running);
4314 }
4315
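/*
 * Emit the sample record in the order declared by data->type.  The
 * layout here has to stay in sync with perf_prepare_sample(), which
 * sized the record before the output handle was set up.
 */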
4316 void perf_output_sample(struct perf_output_handle *handle,
4317 struct perf_event_header *header,
4318 struct perf_sample_data *data,
4319 struct perf_event *event)
4320 {
4321 u64 sample_type = data->type;
4322
4323 perf_output_put(handle, *header);
4324
4325 if (sample_type & PERF_SAMPLE_IP)
4326 perf_output_put(handle, data->ip);
4327
4328 if (sample_type & PERF_SAMPLE_TID)
4329 perf_output_put(handle, data->tid_entry);
4330
4331 if (sample_type & PERF_SAMPLE_TIME)
4332 perf_output_put(handle, data->time);
4333
4334 if (sample_type & PERF_SAMPLE_ADDR)
4335 perf_output_put(handle, data->addr);
4336
4337 if (sample_type & PERF_SAMPLE_ID)
4338 perf_output_put(handle, data->id);
4339
4340 if (sample_type & PERF_SAMPLE_STREAM_ID)
4341 perf_output_put(handle, data->stream_id);
4342
4343 if (sample_type & PERF_SAMPLE_CPU)
4344 perf_output_put(handle, data->cpu_entry);
4345
4346 if (sample_type & PERF_SAMPLE_PERIOD)
4347 perf_output_put(handle, data->period);
4348
4349 if (sample_type & PERF_SAMPLE_READ)
4350 perf_output_read(handle, event);
4351
4352 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4353 if (data->callchain) {
4354 int size = 1;
4355
4356 if (data->callchain)
4357 size += data->callchain->nr;
4358
4359 size *= sizeof(u64);
4360
4361 __output_copy(handle, data->callchain, size);
4362 } else {
4363 u64 nr = 0;
4364 perf_output_put(handle, nr);
4365 }
4366 }
4367
4368 if (sample_type & PERF_SAMPLE_RAW) {
4369 if (data->raw) {
4370 perf_output_put(handle, data->raw->size);
4371 __output_copy(handle, data->raw->data,
4372 data->raw->size);
4373 } else {
4374 struct {
4375 u32 size;
4376 u32 data;
4377 } raw = {
4378 .size = sizeof(u32),
4379 .data = 0,
4380 };
4381 perf_output_put(handle, raw);
4382 }
4383 }
4384
4385 if (!event->attr.watermark) {
4386 int wakeup_events = event->attr.wakeup_events;
4387
4388 if (wakeup_events) {
4389 struct ring_buffer *rb = handle->rb;
4390 int events = local_inc_return(&rb->events);
4391
4392 if (events >= wakeup_events) {
4393 local_sub(wakeup_events, &rb->events);
4394 local_inc(&rb->wakeup);
4395 }
4396 }
4397 }
4398
4399 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4400 if (data->br_stack) {
4401 size_t size;
4402
4403 size = data->br_stack->nr
4404 * sizeof(struct perf_branch_entry);
4405
4406 perf_output_put(handle, data->br_stack->nr);
4407 perf_output_copy(handle, data->br_stack->entries, size);
4408 } else {
4409 /*
4410 * we always store at least the value of nr
4411 */
4412 u64 nr = 0;
4413 perf_output_put(handle, nr);
4414 }
4415 }
4416
4417 if (sample_type & PERF_SAMPLE_REGS_USER) {
4418 u64 abi = data->regs_user.abi;
4419
4420 /*
4421 * If there are no regs to dump, signal it with the
4422 * first u64 being zero (PERF_SAMPLE_REGS_ABI_NONE).
4423 */
4424 perf_output_put(handle, abi);
4425
4426 if (abi) {
4427 u64 mask = event->attr.sample_regs_user;
4428 perf_output_sample_regs(handle,
4429 data->regs_user.regs,
4430 mask);
4431 }
4432 }
4433
4434 if (sample_type & PERF_SAMPLE_STACK_USER)
4435 perf_output_sample_ustack(handle,
4436 data->stack_user_size,
4437 data->regs_user.regs);
4438
4439 if (sample_type & PERF_SAMPLE_WEIGHT)
4440 perf_output_put(handle, data->weight);
4441
4442 if (sample_type & PERF_SAMPLE_DATA_SRC)
4443 perf_output_put(handle, data->data_src.val);
4444 }
4445
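/*
 * Size the sample record before anything is copied out: start from
 * the static header/event size and grow header->size for every
 * variable-length section (callchain, raw data, branch stack, user
 * regs, user stack) so perf_output_begin() can reserve the whole
 * record in one go.
 */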
4446 void perf_prepare_sample(struct perf_event_header *header,
4447 struct perf_sample_data *data,
4448 struct perf_event *event,
4449 struct pt_regs *regs)
4450 {
4451 u64 sample_type = event->attr.sample_type;
4452
4453 header->type = PERF_RECORD_SAMPLE;
4454 header->size = sizeof(*header) + event->header_size;
4455
4456 header->misc = 0;
4457 header->misc |= perf_misc_flags(regs);
4458
4459 __perf_event_header__init_id(header, data, event);
4460
4461 if (sample_type & PERF_SAMPLE_IP)
4462 data->ip = perf_instruction_pointer(regs);
4463
4464 if (sample_type & PERF_SAMPLE_CALLCHAIN) {
4465 int size = 1;
4466
4467 data->callchain = perf_callchain(event, regs);
4468
4469 if (data->callchain)
4470 size += data->callchain->nr;
4471
4472 header->size += size * sizeof(u64);
4473 }
4474
4475 if (sample_type & PERF_SAMPLE_RAW) {
4476 int size = sizeof(u32);
4477
4478 if (data->raw)
4479 size += data->raw->size;
4480 else
4481 size += sizeof(u32);
4482
4483 WARN_ON_ONCE(size & (sizeof(u64)-1));
4484 header->size += size;
4485 }
4486
4487 if (sample_type & PERF_SAMPLE_BRANCH_STACK) {
4488 int size = sizeof(u64); /* nr */
4489 if (data->br_stack) {
4490 size += data->br_stack->nr
4491 * sizeof(struct perf_branch_entry);
4492 }
4493 header->size += size;
4494 }
4495
4496 if (sample_type & PERF_SAMPLE_REGS_USER) {
4497 /* regs dump ABI info */
4498 int size = sizeof(u64);
4499
4500 perf_sample_regs_user(&data->regs_user, regs);
4501
4502 if (data->regs_user.regs) {
4503 u64 mask = event->attr.sample_regs_user;
4504 size += hweight64(mask) * sizeof(u64);
4505 }
4506
4507 header->size += size;
4508 }
4509
4510 if (sample_type & PERF_SAMPLE_STACK_USER) {
4511 /*
4512 * Either the PERF_SAMPLE_STACK_USER bit needs to always be
4513 * processed last, or an additional check needs to be added
4514 * when a new sample type is introduced, because we could eat
4515 * up the rest of the sample size.
4516 */
4517 struct perf_regs_user *uregs = &data->regs_user;
4518 u16 stack_size = event->attr.sample_stack_user;
4519 u16 size = sizeof(u64);
4520
4521 if (!uregs->abi)
4522 perf_sample_regs_user(uregs, regs);
4523
4524 stack_size = perf_sample_ustack_size(stack_size, header->size,
4525 uregs->regs);
4526
4527 /*
4528 * If there is something to dump, add space for the dump
4529 * itself and for the field that tells the dynamic size,
4530 * which is how many bytes were actually dumped.
4531 */
4532 if (stack_size)
4533 size += sizeof(u64) + stack_size;
4534
4535 data->stack_user_size = stack_size;
4536 header->size += size;
4537 }
4538 }
4539
4540 static void perf_event_output(struct perf_event *event,
4541 struct perf_sample_data *data,
4542 struct pt_regs *regs)
4543 {
4544 struct perf_output_handle handle;
4545 struct perf_event_header header;
4546
4547 /* protect the callchain buffers */
4548 rcu_read_lock();
4549
4550 perf_prepare_sample(&header, data, event, regs);
4551
4552 if (perf_output_begin(&handle, event, header.size))
4553 goto exit;
4554
4555 perf_output_sample(&handle, &header, data, event);
4556
4557 perf_output_end(&handle);
4558
4559 exit:
4560 rcu_read_unlock();
4561 }
4562
4563 /*
4564 * read event_id
4565 */
4566
4567 struct perf_read_event {
4568 struct perf_event_header header;
4569
4570 u32 pid;
4571 u32 tid;
4572 };
4573
4574 static void
4575 perf_event_read_event(struct perf_event *event,
4576 struct task_struct *task)
4577 {
4578 struct perf_output_handle handle;
4579 struct perf_sample_data sample;
4580 struct perf_read_event read_event = {
4581 .header = {
4582 .type = PERF_RECORD_READ,
4583 .misc = 0,
4584 .size = sizeof(read_event) + event->read_size,
4585 },
4586 .pid = perf_event_pid(event, task),
4587 .tid = perf_event_tid(event, task),
4588 };
4589 int ret;
4590
4591 perf_event_header__init_id(&read_event.header, &sample, event);
4592 ret = perf_output_begin(&handle, event, read_event.header.size);
4593 if (ret)
4594 return;
4595
4596 perf_output_put(&handle, read_event);
4597 perf_output_read(&handle, event);
4598 perf_event__output_id_sample(event, &handle, &sample);
4599
4600 perf_output_end(&handle);
4601 }
4602
4603 typedef int (perf_event_aux_match_cb)(struct perf_event *event, void *data);
4604 typedef void (perf_event_aux_output_cb)(struct perf_event *event, void *data);
4605
4606 static void
4607 perf_event_aux_ctx(struct perf_event_context *ctx,
4608 perf_event_aux_match_cb match,
4609 perf_event_aux_output_cb output,
4610 void *data)
4611 {
4612 struct perf_event *event;
4613
4614 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
4615 if (event->state < PERF_EVENT_STATE_INACTIVE)
4616 continue;
4617 if (!event_filter_match(event))
4618 continue;
4619 if (match(event, data))
4620 output(event, data);
4621 }
4622 }
4623
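/*
 * Deliver a side-band event to every matching event: walk each PMU's
 * CPU context and, unless an explicit task context was handed in,
 * the current task's context for that PMU as well.  @task_ctx, when
 * set, is iterated separately after the PMU walk.
 */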
4624 static void
4625 perf_event_aux(perf_event_aux_match_cb match,
4626 perf_event_aux_output_cb output,
4627 void *data,
4628 struct perf_event_context *task_ctx)
4629 {
4630 struct perf_cpu_context *cpuctx;
4631 struct perf_event_context *ctx;
4632 struct pmu *pmu;
4633 int ctxn;
4634
4635 rcu_read_lock();
4636 list_for_each_entry_rcu(pmu, &pmus, entry) {
4637 cpuctx = get_cpu_ptr(pmu->pmu_cpu_context);
4638 if (cpuctx->unique_pmu != pmu)
4639 goto next;
4640 perf_event_aux_ctx(&cpuctx->ctx, match, output, data);
4641 if (task_ctx)
4642 goto next;
4643 ctxn = pmu->task_ctx_nr;
4644 if (ctxn < 0)
4645 goto next;
4646 ctx = rcu_dereference(current->perf_event_ctxp[ctxn]);
4647 if (ctx)
4648 perf_event_aux_ctx(ctx, match, output, data);
4649 next:
4650 put_cpu_ptr(pmu->pmu_cpu_context);
4651 }
4652
4653 if (task_ctx) {
4654 preempt_disable();
4655 perf_event_aux_ctx(task_ctx, match, output, data);
4656 preempt_enable();
4657 }
4658 rcu_read_unlock();
4659 }
4660
4661 /*
4662 * task tracking -- fork/exit
4663 *
4664 * enabled by: attr.comm | attr.mmap | attr.mmap_data | attr.task
4665 */
4666
4667 struct perf_task_event {
4668 struct task_struct *task;
4669 struct perf_event_context *task_ctx;
4670
4671 struct {
4672 struct perf_event_header header;
4673
4674 u32 pid;
4675 u32 ppid;
4676 u32 tid;
4677 u32 ptid;
4678 u64 time;
4679 } event_id;
4680 };
4681
4682 static void perf_event_task_output(struct perf_event *event,
4683 void *data)
4684 {
4685 struct perf_task_event *task_event = data;
4686 struct perf_output_handle handle;
4687 struct perf_sample_data sample;
4688 struct task_struct *task = task_event->task;
4689 int ret, size = task_event->event_id.header.size;
4690
4691 perf_event_header__init_id(&task_event->event_id.header, &sample, event);
4692
4693 ret = perf_output_begin(&handle, event,
4694 task_event->event_id.header.size);
4695 if (ret)
4696 goto out;
4697
4698 task_event->event_id.pid = perf_event_pid(event, task);
4699 task_event->event_id.ppid = perf_event_pid(event, current);
4700
4701 task_event->event_id.tid = perf_event_tid(event, task);
4702 task_event->event_id.ptid = perf_event_tid(event, current);
4703
4704 perf_output_put(&handle, task_event->event_id);
4705
4706 perf_event__output_id_sample(event, &handle, &sample);
4707
4708 perf_output_end(&handle);
4709 out:
4710 task_event->event_id.header.size = size;
4711 }
4712
4713 static int perf_event_task_match(struct perf_event *event,
4714 void *data __maybe_unused)
4715 {
4716 return event->attr.comm || event->attr.mmap ||
4717 event->attr.mmap_data || event->attr.task;
4718 }
4719
4720 static void perf_event_task(struct task_struct *task,
4721 struct perf_event_context *task_ctx,
4722 int new)
4723 {
4724 struct perf_task_event task_event;
4725
4726 if (!atomic_read(&nr_comm_events) &&
4727 !atomic_read(&nr_mmap_events) &&
4728 !atomic_read(&nr_task_events))
4729 return;
4730
4731 task_event = (struct perf_task_event){
4732 .task = task,
4733 .task_ctx = task_ctx,
4734 .event_id = {
4735 .header = {
4736 .type = new ? PERF_RECORD_FORK : PERF_RECORD_EXIT,
4737 .misc = 0,
4738 .size = sizeof(task_event.event_id),
4739 },
4740 /* .pid */
4741 /* .ppid */
4742 /* .tid */
4743 /* .ptid */
4744 .time = perf_clock(),
4745 },
4746 };
4747
4748 perf_event_aux(perf_event_task_match,
4749 perf_event_task_output,
4750 &task_event,
4751 task_ctx);
4752 }
4753
4754 void perf_event_fork(struct task_struct *task)
4755 {
4756 perf_event_task(task, NULL, 1);
4757 }
4758
4759 /*
4760 * comm tracking
4761 */
4762
4763 struct perf_comm_event {
4764 struct task_struct *task;
4765 char *comm;
4766 int comm_size;
4767
4768 struct {
4769 struct perf_event_header header;
4770
4771 u32 pid;
4772 u32 tid;
4773 } event_id;
4774 };
4775
4776 static void perf_event_comm_output(struct perf_event *event,
4777 void *data)
4778 {
4779 struct perf_comm_event *comm_event = data;
4780 struct perf_output_handle handle;
4781 struct perf_sample_data sample;
4782 int size = comm_event->event_id.header.size;
4783 int ret;
4784
4785 perf_event_header__init_id(&comm_event->event_id.header, &sample, event);
4786 ret = perf_output_begin(&handle, event,
4787 comm_event->event_id.header.size);
4788
4789 if (ret)
4790 goto out;
4791
4792 comm_event->event_id.pid = perf_event_pid(event, comm_event->task);
4793 comm_event->event_id.tid = perf_event_tid(event, comm_event->task);
4794
4795 perf_output_put(&handle, comm_event->event_id);
4796 __output_copy(&handle, comm_event->comm,
4797 comm_event->comm_size);
4798
4799 perf_event__output_id_sample(event, &handle, &sample);
4800
4801 perf_output_end(&handle);
4802 out:
4803 comm_event->event_id.header.size = size;
4804 }
4805
4806 static int perf_event_comm_match(struct perf_event *event,
4807 void *data __maybe_unused)
4808 {
4809 return event->attr.comm;
4810 }
4811
4812 static void perf_event_comm_event(struct perf_comm_event *comm_event)
4813 {
4814 char comm[TASK_COMM_LEN];
4815 unsigned int size;
4816
4817 memset(comm, 0, sizeof(comm));
4818 strlcpy(comm, comm_event->task->comm, sizeof(comm));
4819 size = ALIGN(strlen(comm)+1, sizeof(u64));
4820
4821 comm_event->comm = comm;
4822 comm_event->comm_size = size;
4823
4824 comm_event->event_id.header.size = sizeof(comm_event->event_id) + size;
4825
4826 perf_event_aux(perf_event_comm_match,
4827 perf_event_comm_output,
4828 comm_event,
4829 NULL);
4830 }
4831
4832 void perf_event_comm(struct task_struct *task)
4833 {
4834 struct perf_comm_event comm_event;
4835 struct perf_event_context *ctx;
4836 int ctxn;
4837
4838 rcu_read_lock();
4839 for_each_task_context_nr(ctxn) {
4840 ctx = task->perf_event_ctxp[ctxn];
4841 if (!ctx)
4842 continue;
4843
4844 perf_event_enable_on_exec(ctx);
4845 }
4846 rcu_read_unlock();
4847
4848 if (!atomic_read(&nr_comm_events))
4849 return;
4850
4851 comm_event = (struct perf_comm_event){
4852 .task = task,
4853 /* .comm */
4854 /* .comm_size */
4855 .event_id = {
4856 .header = {
4857 .type = PERF_RECORD_COMM,
4858 .misc = 0,
4859 /* .size */
4860 },
4861 /* .pid */
4862 /* .tid */
4863 },
4864 };
4865
4866 perf_event_comm_event(&comm_event);
4867 }
4868
4869 /*
4870 * mmap tracking
4871 */
4872
4873 struct perf_mmap_event {
4874 struct vm_area_struct *vma;
4875
4876 const char *file_name;
4877 int file_size;
4878
4879 struct {
4880 struct perf_event_header header;
4881
4882 u32 pid;
4883 u32 tid;
4884 u64 start;
4885 u64 len;
4886 u64 pgoff;
4887 } event_id;
4888 };
4889
4890 static void perf_event_mmap_output(struct perf_event *event,
4891 void *data)
4892 {
4893 struct perf_mmap_event *mmap_event = data;
4894 struct perf_output_handle handle;
4895 struct perf_sample_data sample;
4896 int size = mmap_event->event_id.header.size;
4897 int ret;
4898
4899 perf_event_header__init_id(&mmap_event->event_id.header, &sample, event);
4900 ret = perf_output_begin(&handle, event,
4901 mmap_event->event_id.header.size);
4902 if (ret)
4903 goto out;
4904
4905 mmap_event->event_id.pid = perf_event_pid(event, current);
4906 mmap_event->event_id.tid = perf_event_tid(event, current);
4907
4908 perf_output_put(&handle, mmap_event->event_id);
4909 __output_copy(&handle, mmap_event->file_name,
4910 mmap_event->file_size);
4911
4912 perf_event__output_id_sample(event, &handle, &sample);
4913
4914 perf_output_end(&handle);
4915 out:
4916 mmap_event->event_id.header.size = size;
4917 }
4918
4919 static int perf_event_mmap_match(struct perf_event *event,
4920 void *data)
4921 {
4922 struct perf_mmap_event *mmap_event = data;
4923 struct vm_area_struct *vma = mmap_event->vma;
4924 int executable = vma->vm_flags & VM_EXEC;
4925
4926 return (!executable && event->attr.mmap_data) ||
4927 (executable && event->attr.mmap);
4928 }
4929
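/*
 * Resolve the name reported for this mapping: d_path() for
 * file-backed vmas, otherwise arch_vma_name(), "[vdso]", "[heap]",
 * "[stack]" or "//anon" as appropriate, padded to a u64 boundary
 * before being sent out.
 */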
4930 static void perf_event_mmap_event(struct perf_mmap_event *mmap_event)
4931 {
4932 struct vm_area_struct *vma = mmap_event->vma;
4933 struct file *file = vma->vm_file;
4934 unsigned int size;
4935 char tmp[16];
4936 char *buf = NULL;
4937 const char *name;
4938
4939 memset(tmp, 0, sizeof(tmp));
4940
4941 if (file) {
4942 /*
4943 * d_path works from the end of the buffer backwards, so we
4944 * need to add enough zero bytes after the string to handle
4945 * the 64bit alignment we do later.
4946 */
4947 buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
4948 if (!buf) {
4949 name = strncpy(tmp, "//enomem", sizeof(tmp));
4950 goto got_name;
4951 }
4952 name = d_path(&file->f_path, buf, PATH_MAX);
4953 if (IS_ERR(name)) {
4954 name = strncpy(tmp, "//toolong", sizeof(tmp));
4955 goto got_name;
4956 }
4957 } else {
4958 if (arch_vma_name(mmap_event->vma)) {
4959 name = strncpy(tmp, arch_vma_name(mmap_event->vma),
4960 sizeof(tmp) - 1);
4961 tmp[sizeof(tmp) - 1] = '\0';
4962 goto got_name;
4963 }
4964
4965 if (!vma->vm_mm) {
4966 name = strncpy(tmp, "[vdso]", sizeof(tmp));
4967 goto got_name;
4968 } else if (vma->vm_start <= vma->vm_mm->start_brk &&
4969 vma->vm_end >= vma->vm_mm->brk) {
4970 name = strncpy(tmp, "[heap]", sizeof(tmp));
4971 goto got_name;
4972 } else if (vma->vm_start <= vma->vm_mm->start_stack &&
4973 vma->vm_end >= vma->vm_mm->start_stack) {
4974 name = strncpy(tmp, "[stack]", sizeof(tmp));
4975 goto got_name;
4976 }
4977
4978 name = strncpy(tmp, "//anon", sizeof(tmp));
4979 goto got_name;
4980 }
4981
4982 got_name:
4983 size = ALIGN(strlen(name)+1, sizeof(u64));
4984
4985 mmap_event->file_name = name;
4986 mmap_event->file_size = size;
4987
4988 if (!(vma->vm_flags & VM_EXEC))
4989 mmap_event->event_id.header.misc |= PERF_RECORD_MISC_MMAP_DATA;
4990
4991 mmap_event->event_id.header.size = sizeof(mmap_event->event_id) + size;
4992
4993 perf_event_aux(perf_event_mmap_match,
4994 perf_event_mmap_output,
4995 mmap_event,
4996 NULL);
4997
4998 kfree(buf);
4999 }
5000
5001 void perf_event_mmap(struct vm_area_struct *vma)
5002 {
5003 struct perf_mmap_event mmap_event;
5004
5005 if (!atomic_read(&nr_mmap_events))
5006 return;
5007
5008 mmap_event = (struct perf_mmap_event){
5009 .vma = vma,
5010 /* .file_name */
5011 /* .file_size */
5012 .event_id = {
5013 .header = {
5014 .type = PERF_RECORD_MMAP,
5015 .misc = PERF_RECORD_MISC_USER,
5016 /* .size */
5017 },
5018 /* .pid */
5019 /* .tid */
5020 .start = vma->vm_start,
5021 .len = vma->vm_end - vma->vm_start,
5022 .pgoff = (u64)vma->vm_pgoff << PAGE_SHIFT,
5023 },
5024 };
5025
5026 perf_event_mmap_event(&mmap_event);
5027 }
5028
5029 /*
5030 * IRQ throttle logging
5031 */
5032
5033 static void perf_log_throttle(struct perf_event *event, int enable)
5034 {
5035 struct perf_output_handle handle;
5036 struct perf_sample_data sample;
5037 int ret;
5038
5039 struct {
5040 struct perf_event_header header;
5041 u64 time;
5042 u64 id;
5043 u64 stream_id;
5044 } throttle_event = {
5045 .header = {
5046 .type = PERF_RECORD_THROTTLE,
5047 .misc = 0,
5048 .size = sizeof(throttle_event),
5049 },
5050 .time = perf_clock(),
5051 .id = primary_event_id(event),
5052 .stream_id = event->id,
5053 };
5054
5055 if (enable)
5056 throttle_event.header.type = PERF_RECORD_UNTHROTTLE;
5057
5058 perf_event_header__init_id(&throttle_event.header, &sample, event);
5059
5060 ret = perf_output_begin(&handle, event,
5061 throttle_event.header.size);
5062 if (ret)
5063 return;
5064
5065 perf_output_put(&handle, throttle_event);
5066 perf_event__output_id_sample(event, &handle, &sample);
5067 perf_output_end(&handle);
5068 }
5069
5070 /*
5071 * Generic event overflow handling, sampling.
5072 */
5073
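/*
 * Throttling: hwc->interrupts counts PMIs within the current
 * perf_throttled_seq window; once it reaches max_samples_per_tick
 * the event is marked MAX_INTERRUPTS, a PERF_RECORD_THROTTLE is
 * logged and the caller is told to stop sampling until a later
 * tick unthrottles the event.
 */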
5074 static int __perf_event_overflow(struct perf_event *event,
5075 int throttle, struct perf_sample_data *data,
5076 struct pt_regs *regs)
5077 {
5078 int events = atomic_read(&event->event_limit);
5079 struct hw_perf_event *hwc = &event->hw;
5080 u64 seq;
5081 int ret = 0;
5082
5083 /*
5084 * Non-sampling counters might still use the PMI to fold short
5085 * hardware counters; ignore those.
5086 */
5087 if (unlikely(!is_sampling_event(event)))
5088 return 0;
5089
5090 seq = __this_cpu_read(perf_throttled_seq);
5091 if (seq != hwc->interrupts_seq) {
5092 hwc->interrupts_seq = seq;
5093 hwc->interrupts = 1;
5094 } else {
5095 hwc->interrupts++;
5096 if (unlikely(throttle
5097 && hwc->interrupts >= max_samples_per_tick)) {
5098 __this_cpu_inc(perf_throttled_count);
5099 hwc->interrupts = MAX_INTERRUPTS;
5100 perf_log_throttle(event, 0);
5101 ret = 1;
5102 }
5103 }
5104
5105 if (event->attr.freq) {
5106 u64 now = perf_clock();
5107 s64 delta = now - hwc->freq_time_stamp;
5108
5109 hwc->freq_time_stamp = now;
5110
5111 if (delta > 0 && delta < 2*TICK_NSEC)
5112 perf_adjust_period(event, delta, hwc->last_period, true);
5113 }
5114
5115 /*
5116 * XXX event_limit might not quite work as expected on inherited
5117 * events
5118 */
5119
5120 event->pending_kill = POLL_IN;
5121 if (events && atomic_dec_and_test(&event->event_limit)) {
5122 ret = 1;
5123 event->pending_kill = POLL_HUP;
5124 event->pending_disable = 1;
5125 irq_work_queue(&event->pending);
5126 }
5127
5128 if (event->overflow_handler)
5129 event->overflow_handler(event, data, regs);
5130 else
5131 perf_event_output(event, data, regs);
5132
5133 if (event->fasync && event->pending_kill) {
5134 event->pending_wakeup = 1;
5135 irq_work_queue(&event->pending);
5136 }
5137
5138 return ret;
5139 }
5140
5141 int perf_event_overflow(struct perf_event *event,
5142 struct perf_sample_data *data,
5143 struct pt_regs *regs)
5144 {
5145 return __perf_event_overflow(event, 1, data, regs);
5146 }
5147
5148 /*
5149 * Generic software event infrastructure
5150 */
5151
5152 struct swevent_htable {
5153 struct swevent_hlist *swevent_hlist;
5154 struct mutex hlist_mutex;
5155 int hlist_refcount;
5156
5157 /* Recursion avoidance in each context */
5158 int recursion[PERF_NR_CONTEXTS];
5159 };
5160
5161 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
5162
5163 /*
5164 * We directly increment event->count and keep a second value in
5165 * event->hw.period_left to count intervals. This period counter
5166 * is kept in the range [-sample_period, 0] so that we can use the
5167 * sign as a trigger.
5168 */
5169
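/*
 * Worked example (illustrative numbers): with a period of 100 and
 * period_left == 250, nr = (100 + 250) / 100 = 3 overflows are
 * reported and period_left becomes 250 - 300 = -50, i.e. back into
 * the [-sample_period, 0] range described above.
 */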
5170 static u64 perf_swevent_set_period(struct perf_event *event)
5171 {
5172 struct hw_perf_event *hwc = &event->hw;
5173 u64 period = hwc->last_period;
5174 u64 nr, offset;
5175 s64 old, val;
5176
5177 hwc->last_period = hwc->sample_period;
5178
5179 again:
5180 old = val = local64_read(&hwc->period_left);
5181 if (val < 0)
5182 return 0;
5183
5184 nr = div64_u64(period + val, period);
5185 offset = nr * period;
5186 val -= offset;
5187 if (local64_cmpxchg(&hwc->period_left, old, val) != old)
5188 goto again;
5189
5190 return nr;
5191 }
5192
5193 static void perf_swevent_overflow(struct perf_event *event, u64 overflow,
5194 struct perf_sample_data *data,
5195 struct pt_regs *regs)
5196 {
5197 struct hw_perf_event *hwc = &event->hw;
5198 int throttle = 0;
5199
5200 if (!overflow)
5201 overflow = perf_swevent_set_period(event);
5202
5203 if (hwc->interrupts == MAX_INTERRUPTS)
5204 return;
5205
5206 for (; overflow; overflow--) {
5207 if (__perf_event_overflow(event, throttle,
5208 data, regs)) {
5209 /*
5210 * We inhibit the overflow from happening when
5211 * hwc->interrupts == MAX_INTERRUPTS.
5212 */
5213 break;
5214 }
5215 throttle = 1;
5216 }
5217 }
5218
5219 static void perf_swevent_event(struct perf_event *event, u64 nr,
5220 struct perf_sample_data *data,
5221 struct pt_regs *regs)
5222 {
5223 struct hw_perf_event *hwc = &event->hw;
5224
5225 local64_add(nr, &event->count);
5226
5227 if (!regs)
5228 return;
5229
5230 if (!is_sampling_event(event))
5231 return;
5232
5233 if ((event->attr.sample_type & PERF_SAMPLE_PERIOD) && !event->attr.freq) {
5234 data->period = nr;
5235 return perf_swevent_overflow(event, 1, data, regs);
5236 } else
5237 data->period = event->hw.last_period;
5238
5239 if (nr == 1 && hwc->sample_period == 1 && !event->attr.freq)
5240 return perf_swevent_overflow(event, 1, data, regs);
5241
5242 if (local64_add_negative(nr, &hwc->period_left))
5243 return;
5244
5245 perf_swevent_overflow(event, 0, data, regs);
5246 }
5247
5248 static int perf_exclude_event(struct perf_event *event,
5249 struct pt_regs *regs)
5250 {
5251 if (event->hw.state & PERF_HES_STOPPED)
5252 return 1;
5253
5254 if (regs) {
5255 if (event->attr.exclude_user && user_mode(regs))
5256 return 1;
5257
5258 if (event->attr.exclude_kernel && !user_mode(regs))
5259 return 1;
5260 }
5261
5262 return 0;
5263 }
5264
5265 static int perf_swevent_match(struct perf_event *event,
5266 enum perf_type_id type,
5267 u32 event_id,
5268 struct perf_sample_data *data,
5269 struct pt_regs *regs)
5270 {
5271 if (event->attr.type != type)
5272 return 0;
5273
5274 if (event->attr.config != event_id)
5275 return 0;
5276
5277 if (perf_exclude_event(event, regs))
5278 return 0;
5279
5280 return 1;
5281 }
5282
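/*
 * Hash the (type, event_id) pair into one of 1 << SWEVENT_HLIST_BITS
 * buckets: the type goes into the upper 32 bits of the key, the
 * config into the lower ones.
 */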
5283 static inline u64 swevent_hash(u64 type, u32 event_id)
5284 {
5285 u64 val = event_id | (type << 32);
5286
5287 return hash_64(val, SWEVENT_HLIST_BITS);
5288 }
5289
5290 static inline struct hlist_head *
5291 __find_swevent_head(struct swevent_hlist *hlist, u64 type, u32 event_id)
5292 {
5293 u64 hash = swevent_hash(type, event_id);
5294
5295 return &hlist->heads[hash];
5296 }
5297
5298 /* For the read side: events when they trigger */
5299 static inline struct hlist_head *
5300 find_swevent_head_rcu(struct swevent_htable *swhash, u64 type, u32 event_id)
5301 {
5302 struct swevent_hlist *hlist;
5303
5304 hlist = rcu_dereference(swhash->swevent_hlist);
5305 if (!hlist)
5306 return NULL;
5307
5308 return __find_swevent_head(hlist, type, event_id);
5309 }
5310
5311 /* For the event head insertion and removal in the hlist */
5312 static inline struct hlist_head *
5313 find_swevent_head(struct swevent_htable *swhash, struct perf_event *event)
5314 {
5315 struct swevent_hlist *hlist;
5316 u32 event_id = event->attr.config;
5317 u64 type = event->attr.type;
5318
5319 /*
5320 * Event scheduling is always serialized against hlist allocation
5321 * and release, which makes the protected version suitable here.
5322 * The context lock guarantees that.
5323 */
5324 hlist = rcu_dereference_protected(swhash->swevent_hlist,
5325 lockdep_is_held(&event->ctx->lock));
5326 if (!hlist)
5327 return NULL;
5328
5329 return __find_swevent_head(hlist, type, event_id);
5330 }
5331
5332 static void do_perf_sw_event(enum perf_type_id type, u32 event_id,
5333 u64 nr,
5334 struct perf_sample_data *data,
5335 struct pt_regs *regs)
5336 {
5337 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5338 struct perf_event *event;
5339 struct hlist_head *head;
5340
5341 rcu_read_lock();
5342 head = find_swevent_head_rcu(swhash, type, event_id);
5343 if (!head)
5344 goto end;
5345
5346 hlist_for_each_entry_rcu(event, head, hlist_entry) {
5347 if (perf_swevent_match(event, type, event_id, data, regs))
5348 perf_swevent_event(event, nr, data, regs);
5349 }
5350 end:
5351 rcu_read_unlock();
5352 }
5353
5354 int perf_swevent_get_recursion_context(void)
5355 {
5356 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5357
5358 return get_recursion_context(swhash->recursion);
5359 }
5360 EXPORT_SYMBOL_GPL(perf_swevent_get_recursion_context);
5361
5362 inline void perf_swevent_put_recursion_context(int rctx)
5363 {
5364 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5365
5366 put_recursion_context(swhash->recursion, rctx);
5367 }
5368
5369 void __perf_sw_event(u32 event_id, u64 nr, struct pt_regs *regs, u64 addr)
5370 {
5371 struct perf_sample_data data;
5372 int rctx;
5373
5374 preempt_disable_notrace();
5375 rctx = perf_swevent_get_recursion_context();
5376 if (rctx < 0)
5377 return;
5378
5379 perf_sample_data_init(&data, addr, 0);
5380
5381 do_perf_sw_event(PERF_TYPE_SOFTWARE, event_id, nr, &data, regs);
5382
5383 perf_swevent_put_recursion_context(rctx);
5384 preempt_enable_notrace();
5385 }
5386
5387 static void perf_swevent_read(struct perf_event *event)
5388 {
5389 }
5390
5391 static int perf_swevent_add(struct perf_event *event, int flags)
5392 {
5393 struct swevent_htable *swhash = &__get_cpu_var(swevent_htable);
5394 struct hw_perf_event *hwc = &event->hw;
5395 struct hlist_head *head;
5396
5397 if (is_sampling_event(event)) {
5398 hwc->last_period = hwc->sample_period;
5399 perf_swevent_set_period(event);
5400 }
5401
5402 hwc->state = !(flags & PERF_EF_START);
5403
5404 head = find_swevent_head(swhash, event);
5405 if (WARN_ON_ONCE(!head))
5406 return -EINVAL;
5407
5408 hlist_add_head_rcu(&event->hlist_entry, head);
5409
5410 return 0;
5411 }
5412
5413 static void perf_swevent_del(struct perf_event *event, int flags)
5414 {
5415 hlist_del_rcu(&event->hlist_entry);
5416 }
5417
5418 static void perf_swevent_start(struct perf_event *event, int flags)
5419 {
5420 event->hw.state = 0;
5421 }
5422
5423 static void perf_swevent_stop(struct perf_event *event, int flags)
5424 {
5425 event->hw.state = PERF_HES_STOPPED;
5426 }
5427
5428 /* Deref the hlist from the update side */
5429 static inline struct swevent_hlist *
5430 swevent_hlist_deref(struct swevent_htable *swhash)
5431 {
5432 return rcu_dereference_protected(swhash->swevent_hlist,
5433 lockdep_is_held(&swhash->hlist_mutex));
5434 }
5435
5436 static void swevent_hlist_release(struct swevent_htable *swhash)
5437 {
5438 struct swevent_hlist *hlist = swevent_hlist_deref(swhash);
5439
5440 if (!hlist)
5441 return;
5442
5443 rcu_assign_pointer(swhash->swevent_hlist, NULL);
5444 kfree_rcu(hlist, rcu_head);
5445 }
5446
5447 static void swevent_hlist_put_cpu(struct perf_event *event, int cpu)
5448 {
5449 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5450
5451 mutex_lock(&swhash->hlist_mutex);
5452
5453 if (!--swhash->hlist_refcount)
5454 swevent_hlist_release(swhash);
5455
5456 mutex_unlock(&swhash->hlist_mutex);
5457 }
5458
5459 static void swevent_hlist_put(struct perf_event *event)
5460 {
5461 int cpu;
5462
5463 if (event->cpu != -1) {
5464 swevent_hlist_put_cpu(event, event->cpu);
5465 return;
5466 }
5467
5468 for_each_possible_cpu(cpu)
5469 swevent_hlist_put_cpu(event, cpu);
5470 }
5471
5472 static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
5473 {
5474 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
5475 int err = 0;
5476
5477 mutex_lock(&swhash->hlist_mutex);
5478 if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
5479 struct swevent_hlist *hlist;
5480
5481 hlist = kzalloc(sizeof(*hlist), GFP_KERNEL);
5482 if (!hlist) {
5483 err = -ENOMEM;
5484 goto exit;
5485 }
5486 rcu_assign_pointer(swhash->swevent_hlist, hlist);
5487 }
5488 swhash->hlist_refcount++;
5489 exit:
5490 mutex_unlock(&swhash->hlist_mutex);
5491
5492 return err;
5493 }
5494
5495 static int swevent_hlist_get(struct perf_event *event)
5496 {
5497 int err;
5498 int cpu, failed_cpu;
5499
5500 if (event->cpu != -1)
5501 return swevent_hlist_get_cpu(event, event->cpu);
5502
5503 get_online_cpus();
5504 for_each_possible_cpu(cpu) {
5505 err = swevent_hlist_get_cpu(event, cpu);
5506 if (err) {
5507 failed_cpu = cpu;
5508 goto fail;
5509 }
5510 }
5511 put_online_cpus();
5512
5513 return 0;
5514 fail:
5515 for_each_possible_cpu(cpu) {
5516 if (cpu == failed_cpu)
5517 break;
5518 swevent_hlist_put_cpu(event, cpu);
5519 }
5520
5521 put_online_cpus();
5522 return err;
5523 }
5524
5525 struct static_key perf_swevent_enabled[PERF_COUNT_SW_MAX];
5526
5527 static void sw_perf_event_destroy(struct perf_event *event)
5528 {
5529 u64 event_id = event->attr.config;
5530
5531 WARN_ON(event->parent);
5532
5533 static_key_slow_dec(&perf_swevent_enabled[event_id]);
5534 swevent_hlist_put(event);
5535 }
5536
5537 static int perf_swevent_init(struct perf_event *event)
5538 {
5539 u64 event_id = event->attr.config;
5540
5541 if (event->attr.type != PERF_TYPE_SOFTWARE)
5542 return -ENOENT;
5543
5544 /*
5545 * no branch sampling for software events
5546 */
5547 if (has_branch_stack(event))
5548 return -EOPNOTSUPP;
5549
5550 switch (event_id) {
5551 case PERF_COUNT_SW_CPU_CLOCK:
5552 case PERF_COUNT_SW_TASK_CLOCK:
5553 return -ENOENT;
5554
5555 default:
5556 break;
5557 }
5558
5559 if (event_id >= PERF_COUNT_SW_MAX)
5560 return -ENOENT;
5561
5562 if (!event->parent) {
5563 int err;
5564
5565 err = swevent_hlist_get(event);
5566 if (err)
5567 return err;
5568
5569 static_key_slow_inc(&perf_swevent_enabled[event_id]);
5570 event->destroy = sw_perf_event_destroy;
5571 }
5572
5573 return 0;
5574 }
5575
5576 static int perf_swevent_event_idx(struct perf_event *event)
5577 {
5578 return 0;
5579 }
5580
5581 static struct pmu perf_swevent = {
5582 .task_ctx_nr = perf_sw_context,
5583
5584 .event_init = perf_swevent_init,
5585 .add = perf_swevent_add,
5586 .del = perf_swevent_del,
5587 .start = perf_swevent_start,
5588 .stop = perf_swevent_stop,
5589 .read = perf_swevent_read,
5590
5591 .event_idx = perf_swevent_event_idx,
5592 };
5593
5594 #ifdef CONFIG_EVENT_TRACING
5595
5596 static int perf_tp_filter_match(struct perf_event *event,
5597 struct perf_sample_data *data)
5598 {
5599 void *record = data->raw->data;
5600
5601 if (likely(!event->filter) || filter_match_preds(event->filter, record))
5602 return 1;
5603 return 0;
5604 }
5605
5606 static int perf_tp_event_match(struct perf_event *event,
5607 struct perf_sample_data *data,
5608 struct pt_regs *regs)
5609 {
5610 if (event->hw.state & PERF_HES_STOPPED)
5611 return 0;
5612 /*
5613 * All tracepoints are from kernel-space.
5614 */
5615 if (event->attr.exclude_kernel)
5616 return 0;
5617
5618 if (!perf_tp_filter_match(event, data))
5619 return 0;
5620
5621 return 1;
5622 }
5623
5624 void perf_tp_event(u64 addr, u64 count, void *record, int entry_size,
5625 struct pt_regs *regs, struct hlist_head *head, int rctx,
5626 struct task_struct *task)
5627 {
5628 struct perf_sample_data data;
5629 struct perf_event *event;
5630
5631 struct perf_raw_record raw = {
5632 .size = entry_size,
5633 .data = record,
5634 };
5635
5636 perf_sample_data_init(&data, addr, 0);
5637 data.raw = &raw;
5638
5639 hlist_for_each_entry_rcu(event, head, hlist_entry) {
5640 if (perf_tp_event_match(event, &data, regs))
5641 perf_swevent_event(event, count, &data, regs);
5642 }
5643
5644 /*
5645 * If a target task was specified, also iterate its context and
5646 * deliver this event there too.
5647 */
5648 if (task && task != current) {
5649 struct perf_event_context *ctx;
5650 struct trace_entry *entry = record;
5651
5652 rcu_read_lock();
5653 ctx = rcu_dereference(task->perf_event_ctxp[perf_sw_context]);
5654 if (!ctx)
5655 goto unlock;
5656
5657 list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
5658 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5659 continue;
5660 if (event->attr.config != entry->type)
5661 continue;
5662 if (perf_tp_event_match(event, &data, regs))
5663 perf_swevent_event(event, count, &data, regs);
5664 }
5665 unlock:
5666 rcu_read_unlock();
5667 }
5668
5669 perf_swevent_put_recursion_context(rctx);
5670 }
5671 EXPORT_SYMBOL_GPL(perf_tp_event);
5672
5673 static void tp_perf_event_destroy(struct perf_event *event)
5674 {
5675 perf_trace_destroy(event);
5676 }
5677
5678 static int perf_tp_event_init(struct perf_event *event)
5679 {
5680 int err;
5681
5682 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5683 return -ENOENT;
5684
5685 /*
5686 * no branch sampling for tracepoint events
5687 */
5688 if (has_branch_stack(event))
5689 return -EOPNOTSUPP;
5690
5691 err = perf_trace_init(event);
5692 if (err)
5693 return err;
5694
5695 event->destroy = tp_perf_event_destroy;
5696
5697 return 0;
5698 }
5699
5700 static struct pmu perf_tracepoint = {
5701 .task_ctx_nr = perf_sw_context,
5702
5703 .event_init = perf_tp_event_init,
5704 .add = perf_trace_add,
5705 .del = perf_trace_del,
5706 .start = perf_swevent_start,
5707 .stop = perf_swevent_stop,
5708 .read = perf_swevent_read,
5709
5710 .event_idx = perf_swevent_event_idx,
5711 };
5712
5713 static inline void perf_tp_register(void)
5714 {
5715 perf_pmu_register(&perf_tracepoint, "tracepoint", PERF_TYPE_TRACEPOINT);
5716 }
5717
5718 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5719 {
5720 char *filter_str;
5721 int ret;
5722
5723 if (event->attr.type != PERF_TYPE_TRACEPOINT)
5724 return -EINVAL;
5725
5726 filter_str = strndup_user(arg, PAGE_SIZE);
5727 if (IS_ERR(filter_str))
5728 return PTR_ERR(filter_str);
5729
5730 ret = ftrace_profile_set_filter(event, event->attr.config, filter_str);
5731
5732 kfree(filter_str);
5733 return ret;
5734 }
5735
5736 static void perf_event_free_filter(struct perf_event *event)
5737 {
5738 ftrace_profile_free_filter(event);
5739 }
5740
5741 #else
5742
5743 static inline void perf_tp_register(void)
5744 {
5745 }
5746
5747 static int perf_event_set_filter(struct perf_event *event, void __user *arg)
5748 {
5749 return -ENOENT;
5750 }
5751
5752 static void perf_event_free_filter(struct perf_event *event)
5753 {
5754 }
5755
5756 #endif /* CONFIG_EVENT_TRACING */
5757
5758 #ifdef CONFIG_HAVE_HW_BREAKPOINT
5759 void perf_bp_event(struct perf_event *bp, void *data)
5760 {
5761 struct perf_sample_data sample;
5762 struct pt_regs *regs = data;
5763
5764 perf_sample_data_init(&sample, bp->attr.bp_addr, 0);
5765
5766 if (!bp->hw.state && !perf_exclude_event(bp, regs))
5767 perf_swevent_event(bp, 1, &sample, regs);
5768 }
5769 #endif
5770
5771 /*
5772 * hrtimer based swevent callback
5773 */
5774
5775 static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
5776 {
5777 enum hrtimer_restart ret = HRTIMER_RESTART;
5778 struct perf_sample_data data;
5779 struct pt_regs *regs;
5780 struct perf_event *event;
5781 u64 period;
5782
5783 event = container_of(hrtimer, struct perf_event, hw.hrtimer);
5784
5785 if (event->state != PERF_EVENT_STATE_ACTIVE)
5786 return HRTIMER_NORESTART;
5787
5788 event->pmu->read(event);
5789
5790 perf_sample_data_init(&data, 0, event->hw.last_period);
5791 regs = get_irq_regs();
5792
5793 if (regs && !perf_exclude_event(event, regs)) {
5794 if (!(event->attr.exclude_idle && is_idle_task(current)))
5795 if (__perf_event_overflow(event, 1, &data, regs))
5796 ret = HRTIMER_NORESTART;
5797 }
5798
5799 period = max_t(u64, 10000, event->hw.sample_period);
5800 hrtimer_forward_now(hrtimer, ns_to_ktime(period));
5801
5802 return ret;
5803 }
5804
5805 static void perf_swevent_start_hrtimer(struct perf_event *event)
5806 {
5807 struct hw_perf_event *hwc = &event->hw;
5808 s64 period;
5809
5810 if (!is_sampling_event(event))
5811 return;
5812
5813 period = local64_read(&hwc->period_left);
5814 if (period) {
5815 if (period < 0)
5816 period = 10000;
5817
5818 local64_set(&hwc->period_left, 0);
5819 } else {
5820 period = max_t(u64, 10000, hwc->sample_period);
5821 }
5822 __hrtimer_start_range_ns(&hwc->hrtimer,
5823 ns_to_ktime(period), 0,
5824 HRTIMER_MODE_REL_PINNED, 0);
5825 }
5826
5827 static void perf_swevent_cancel_hrtimer(struct perf_event *event)
5828 {
5829 struct hw_perf_event *hwc = &event->hw;
5830
5831 if (is_sampling_event(event)) {
5832 ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
5833 local64_set(&hwc->period_left, ktime_to_ns(remaining));
5834
5835 hrtimer_cancel(&hwc->hrtimer);
5836 }
5837 }
5838
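/*
 * For frequency based sampling the hrtimer rate is fixed up front:
 * e.g. (illustrative) attr.sample_freq == 4000 becomes a period of
 * NSEC_PER_SEC / 4000 = 250000 ns, and attr.freq is cleared so the
 * generic period-adjustment machinery is skipped.
 */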
5839 static void perf_swevent_init_hrtimer(struct perf_event *event)
5840 {
5841 struct hw_perf_event *hwc = &event->hw;
5842
5843 if (!is_sampling_event(event))
5844 return;
5845
5846 hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
5847 hwc->hrtimer.function = perf_swevent_hrtimer;
5848
5849 /*
5850 * Since hrtimers have a fixed rate, we can do a static freq->period
5851 * mapping and avoid the whole period adjust feedback stuff.
5852 */
5853 if (event->attr.freq) {
5854 long freq = event->attr.sample_freq;
5855
5856 event->attr.sample_period = NSEC_PER_SEC / freq;
5857 hwc->sample_period = event->attr.sample_period;
5858 local64_set(&hwc->period_left, hwc->sample_period);
5859 hwc->last_period = hwc->sample_period;
5860 event->attr.freq = 0;
5861 }
5862 }
5863
5864 /*
5865 * Software event: cpu wall time clock
5866 */
5867
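/*
 * Fold the wall-clock time that passed since the previous snapshot
 * into the event count: swap the current local_clock() value into
 * hw.prev_count and add the delta.
 */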
5868 static void cpu_clock_event_update(struct perf_event *event)
5869 {
5870 s64 prev;
5871 u64 now;
5872
5873 now = local_clock();
5874 prev = local64_xchg(&event->hw.prev_count, now);
5875 local64_add(now - prev, &event->count);
5876 }
5877
5878 static void cpu_clock_event_start(struct perf_event *event, int flags)
5879 {
5880 local64_set(&event->hw.prev_count, local_clock());
5881 perf_swevent_start_hrtimer(event);
5882 }
5883
5884 static void cpu_clock_event_stop(struct perf_event *event, int flags)
5885 {
5886 perf_swevent_cancel_hrtimer(event);
5887 cpu_clock_event_update(event);
5888 }
5889
5890 static int cpu_clock_event_add(struct perf_event *event, int flags)
5891 {
5892 if (flags & PERF_EF_START)
5893 cpu_clock_event_start(event, flags);
5894
5895 return 0;
5896 }
5897
5898 static void cpu_clock_event_del(struct perf_event *event, int flags)
5899 {
5900 cpu_clock_event_stop(event, flags);
5901 }
5902
5903 static void cpu_clock_event_read(struct perf_event *event)
5904 {
5905 cpu_clock_event_update(event);
5906 }
5907
5908 static int cpu_clock_event_init(struct perf_event *event)
5909 {
5910 if (event->attr.type != PERF_TYPE_SOFTWARE)
5911 return -ENOENT;
5912
5913 if (event->attr.config != PERF_COUNT_SW_CPU_CLOCK)
5914 return -ENOENT;
5915
5916 /*
5917 * no branch sampling for software events
5918 */
5919 if (has_branch_stack(event))
5920 return -EOPNOTSUPP;
5921
5922 perf_swevent_init_hrtimer(event);
5923
5924 return 0;
5925 }
5926
5927 static struct pmu perf_cpu_clock = {
5928 .task_ctx_nr = perf_sw_context,
5929
5930 .event_init = cpu_clock_event_init,
5931 .add = cpu_clock_event_add,
5932 .del = cpu_clock_event_del,
5933 .start = cpu_clock_event_start,
5934 .stop = cpu_clock_event_stop,
5935 .read = cpu_clock_event_read,
5936
5937 .event_idx = perf_swevent_event_idx,
5938 };
5939
5940 /*
5941 * Software event: task time clock
5942 */
5943
5944 static void task_clock_event_update(struct perf_event *event, u64 now)
5945 {
5946 u64 prev;
5947 s64 delta;
5948
5949 prev = local64_xchg(&event->hw.prev_count, now);
5950 delta = now - prev;
5951 local64_add(delta, &event->count);
5952 }
5953
5954 static void task_clock_event_start(struct perf_event *event, int flags)
5955 {
5956 local64_set(&event->hw.prev_count, event->ctx->time);
5957 perf_swevent_start_hrtimer(event);
5958 }
5959
5960 static void task_clock_event_stop(struct perf_event *event, int flags)
5961 {
5962 perf_swevent_cancel_hrtimer(event);
5963 task_clock_event_update(event, event->ctx->time);
5964 }
5965
5966 static int task_clock_event_add(struct perf_event *event, int flags)
5967 {
5968 if (flags & PERF_EF_START)
5969 task_clock_event_start(event, flags);
5970
5971 return 0;
5972 }
5973
5974 static void task_clock_event_del(struct perf_event *event, int flags)
5975 {
5976 task_clock_event_stop(event, PERF_EF_UPDATE);
5977 }
5978
5979 static void task_clock_event_read(struct perf_event *event)
5980 {
5981 u64 now = perf_clock();
5982 u64 delta = now - event->ctx->timestamp;
5983 u64 time = event->ctx->time + delta;
5984
5985 task_clock_event_update(event, time);
5986 }
5987
5988 static int task_clock_event_init(struct perf_event *event)
5989 {
5990 if (event->attr.type != PERF_TYPE_SOFTWARE)
5991 return -ENOENT;
5992
5993 if (event->attr.config != PERF_COUNT_SW_TASK_CLOCK)
5994 return -ENOENT;
5995
5996 /*
5997 * no branch sampling for software events
5998 */
5999 if (has_branch_stack(event))
6000 return -EOPNOTSUPP;
6001
6002 perf_swevent_init_hrtimer(event);
6003
6004 return 0;
6005 }
6006
6007 static struct pmu perf_task_clock = {
6008 .task_ctx_nr = perf_sw_context,
6009
6010 .event_init = task_clock_event_init,
6011 .add = task_clock_event_add,
6012 .del = task_clock_event_del,
6013 .start = task_clock_event_start,
6014 .stop = task_clock_event_stop,
6015 .read = task_clock_event_read,
6016
6017 .event_idx = perf_swevent_event_idx,
6018 };
6019
6020 static void perf_pmu_nop_void(struct pmu *pmu)
6021 {
6022 }
6023
6024 static int perf_pmu_nop_int(struct pmu *pmu)
6025 {
6026 return 0;
6027 }
6028
6029 static void perf_pmu_start_txn(struct pmu *pmu)
6030 {
6031 perf_pmu_disable(pmu);
6032 }
6033
6034 static int perf_pmu_commit_txn(struct pmu *pmu)
6035 {
6036 perf_pmu_enable(pmu);
6037 return 0;
6038 }
6039
6040 static void perf_pmu_cancel_txn(struct pmu *pmu)
6041 {
6042 perf_pmu_enable(pmu);
6043 }
6044
6045 static int perf_event_idx_default(struct perf_event *event)
6046 {
6047 return event->hw.idx + 1;
6048 }
6049
6050 /*
6051 * Ensures all contexts with the same task_ctx_nr have the same
6052 * pmu_cpu_context too.
6053 */
6054 static void *find_pmu_context(int ctxn)
6055 {
6056 struct pmu *pmu;
6057
6058 if (ctxn < 0)
6059 return NULL;
6060
6061 list_for_each_entry(pmu, &pmus, entry) {
6062 if (pmu->task_ctx_nr == ctxn)
6063 return pmu->pmu_cpu_context;
6064 }
6065
6066 return NULL;
6067 }
6068
6069 static void update_pmu_context(struct pmu *pmu, struct pmu *old_pmu)
6070 {
6071 int cpu;
6072
6073 for_each_possible_cpu(cpu) {
6074 struct perf_cpu_context *cpuctx;
6075
6076 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6077
6078 if (cpuctx->unique_pmu == old_pmu)
6079 cpuctx->unique_pmu = pmu;
6080 }
6081 }
6082
6083 static void free_pmu_context(struct pmu *pmu)
6084 {
6085 struct pmu *i;
6086
6087 mutex_lock(&pmus_lock);
6088 /*
6089 * Like a real lame refcount.
6090 */
6091 list_for_each_entry(i, &pmus, entry) {
6092 if (i->pmu_cpu_context == pmu->pmu_cpu_context) {
6093 update_pmu_context(i, pmu);
6094 goto out;
6095 }
6096 }
6097
6098 free_percpu(pmu->pmu_cpu_context);
6099 out:
6100 mutex_unlock(&pmus_lock);
6101 }
6102 static struct idr pmu_idr;
6103
6104 static ssize_t
6105 type_show(struct device *dev, struct device_attribute *attr, char *page)
6106 {
6107 struct pmu *pmu = dev_get_drvdata(dev);
6108
6109 return snprintf(page, PAGE_SIZE-1, "%d\n", pmu->type);
6110 }
6111
6112 static struct device_attribute pmu_dev_attrs[] = {
6113 __ATTR_RO(type),
6114 __ATTR_NULL,
6115 };
6116
6117 static int pmu_bus_running;
6118 static struct bus_type pmu_bus = {
6119 .name = "event_source",
6120 .dev_attrs = pmu_dev_attrs,
6121 };
6122
6123 static void pmu_dev_release(struct device *dev)
6124 {
6125 kfree(dev);
6126 }
6127
6128 static int pmu_dev_alloc(struct pmu *pmu)
6129 {
6130 int ret = -ENOMEM;
6131
6132 pmu->dev = kzalloc(sizeof(struct device), GFP_KERNEL);
6133 if (!pmu->dev)
6134 goto out;
6135
6136 pmu->dev->groups = pmu->attr_groups;
6137 device_initialize(pmu->dev);
6138 ret = dev_set_name(pmu->dev, "%s", pmu->name);
6139 if (ret)
6140 goto free_dev;
6141
6142 dev_set_drvdata(pmu->dev, pmu);
6143 pmu->dev->bus = &pmu_bus;
6144 pmu->dev->release = pmu_dev_release;
6145 ret = device_add(pmu->dev);
6146 if (ret)
6147 goto free_dev;
6148
6149 out:
6150 return ret;
6151
6152 free_dev:
6153 put_device(pmu->dev);
6154 goto out;
6155 }
6156
6157 static struct lock_class_key cpuctx_mutex;
6158 static struct lock_class_key cpuctx_lock;
6159
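/*
 * Register a new PMU: allocate the per-cpu disable count, hand out a
 * dynamic type id through pmu_idr when the caller passed type < 0,
 * create the event_source sysfs device once the bus is up, and share
 * an existing per-cpu context (matching task_ctx_nr) or allocate a
 * fresh one.  Missing transaction / enable / event_idx callbacks are
 * filled in with the default stubs above.
 */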
6160 int perf_pmu_register(struct pmu *pmu, char *name, int type)
6161 {
6162 int cpu, ret;
6163
6164 mutex_lock(&pmus_lock);
6165 ret = -ENOMEM;
6166 pmu->pmu_disable_count = alloc_percpu(int);
6167 if (!pmu->pmu_disable_count)
6168 goto unlock;
6169
6170 pmu->type = -1;
6171 if (!name)
6172 goto skip_type;
6173 pmu->name = name;
6174
6175 if (type < 0) {
6176 type = idr_alloc(&pmu_idr, pmu, PERF_TYPE_MAX, 0, GFP_KERNEL);
6177 if (type < 0) {
6178 ret = type;
6179 goto free_pdc;
6180 }
6181 }
6182 pmu->type = type;
6183
6184 if (pmu_bus_running) {
6185 ret = pmu_dev_alloc(pmu);
6186 if (ret)
6187 goto free_idr;
6188 }
6189
6190 skip_type:
6191 pmu->pmu_cpu_context = find_pmu_context(pmu->task_ctx_nr);
6192 if (pmu->pmu_cpu_context)
6193 goto got_cpu_context;
6194
6195 ret = -ENOMEM;
6196 pmu->pmu_cpu_context = alloc_percpu(struct perf_cpu_context);
6197 if (!pmu->pmu_cpu_context)
6198 goto free_dev;
6199
6200 for_each_possible_cpu(cpu) {
6201 struct perf_cpu_context *cpuctx;
6202
6203 cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
6204 __perf_event_init_context(&cpuctx->ctx);
6205 lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex);
6206 lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock);
6207 cpuctx->ctx.type = cpu_context;
6208 cpuctx->ctx.pmu = pmu;
6209 cpuctx->jiffies_interval = 1;
6210 INIT_LIST_HEAD(&cpuctx->rotation_list);
6211 cpuctx->unique_pmu = pmu;
6212 }
6213
6214 got_cpu_context:
6215 if (!pmu->start_txn) {
6216 if (pmu->pmu_enable) {
6217 /*
6218 * If we have pmu_enable/pmu_disable calls, install
6219 * transaction stubs that use them to try to batch
6220 * hardware accesses.
6221 */
6222 pmu->start_txn = perf_pmu_start_txn;
6223 pmu->commit_txn = perf_pmu_commit_txn;
6224 pmu->cancel_txn = perf_pmu_cancel_txn;
6225 } else {
6226 pmu->start_txn = perf_pmu_nop_void;
6227 pmu->commit_txn = perf_pmu_nop_int;
6228 pmu->cancel_txn = perf_pmu_nop_void;
6229 }
6230 }
6231
6232 if (!pmu->pmu_enable) {
6233 pmu->pmu_enable = perf_pmu_nop_void;
6234 pmu->pmu_disable = perf_pmu_nop_void;
6235 }
6236
6237 if (!pmu->event_idx)
6238 pmu->event_idx = perf_event_idx_default;
6239
6240 list_add_rcu(&pmu->entry, &pmus);
6241 ret = 0;
6242 unlock:
6243 mutex_unlock(&pmus_lock);
6244
6245 return ret;
6246
6247 free_dev:
6248 device_del(pmu->dev);
6249 put_device(pmu->dev);
6250
6251 free_idr:
6252 if (pmu->type >= PERF_TYPE_MAX)
6253 idr_remove(&pmu_idr, pmu->type);
6254
6255 free_pdc:
6256 free_percpu(pmu->pmu_disable_count);
6257 goto unlock;
6258 }
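
/*
 * Minimal usage sketch (hypothetical driver code, not part of this file):
 * a software-style PMU only has to provide event_init/add/del/start/stop/
 * read; the transaction and enable/disable hooks left NULL are filled in
 * with the stub implementations above.
 *
 *	static struct pmu my_pmu = {
 *		.task_ctx_nr	= perf_sw_context,
 *		.event_init	= my_event_init,
 *		.add		= my_add,
 *		.del		= my_del,
 *		.start		= my_start,
 *		.stop		= my_stop,
 *		.read		= my_read,
 *	};
 *
 *	ret = perf_pmu_register(&my_pmu, "my_pmu", -1);
 *
 * Passing type == -1 requests a dynamic id from pmu_idr; passing a NULL
 * name skips the idr and sysfs registration entirely. All my_* names are
 * illustrative only.
 */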
6259
6260 void perf_pmu_unregister(struct pmu *pmu)
6261 {
6262 mutex_lock(&pmus_lock);
6263 list_del_rcu(&pmu->entry);
6264 mutex_unlock(&pmus_lock);
6265
6266 /*
6267 * We dereference the pmu list under both SRCU and regular RCU, so
6268 * synchronize against both of those.
6269 */
6270 synchronize_srcu(&pmus_srcu);
6271 synchronize_rcu();
6272
6273 free_percpu(pmu->pmu_disable_count);
6274 if (pmu->type >= PERF_TYPE_MAX)
6275 idr_remove(&pmu_idr, pmu->type);
6276 device_del(pmu->dev);
6277 put_device(pmu->dev);
6278 free_pmu_context(pmu);
6279 }
6280
6281 struct pmu *perf_init_event(struct perf_event *event)
6282 {
6283 struct pmu *pmu = NULL;
6284 int idx;
6285 int ret;
6286
6287 idx = srcu_read_lock(&pmus_srcu);
6288
6289 rcu_read_lock();
6290 pmu = idr_find(&pmu_idr, event->attr.type);
6291 rcu_read_unlock();
6292 if (pmu) {
6293 event->pmu = pmu;
6294 ret = pmu->event_init(event);
6295 if (ret)
6296 pmu = ERR_PTR(ret);
6297 goto unlock;
6298 }
6299
6300 list_for_each_entry_rcu(pmu, &pmus, entry) {
6301 event->pmu = pmu;
6302 ret = pmu->event_init(event);
6303 if (!ret)
6304 goto unlock;
6305
6306 if (ret != -ENOENT) {
6307 pmu = ERR_PTR(ret);
6308 goto unlock;
6309 }
6310 }
6311 pmu = ERR_PTR(-ENOENT);
6312 unlock:
6313 srcu_read_unlock(&pmus_srcu, idx);
6314
6315 return pmu;
6316 }
6317
6318 /*
6319 * Allocate and initialize an event structure
6320 */
6321 static struct perf_event *
6322 perf_event_alloc(struct perf_event_attr *attr, int cpu,
6323 struct task_struct *task,
6324 struct perf_event *group_leader,
6325 struct perf_event *parent_event,
6326 perf_overflow_handler_t overflow_handler,
6327 void *context)
6328 {
6329 struct pmu *pmu;
6330 struct perf_event *event;
6331 struct hw_perf_event *hwc;
6332 long err;
6333
6334 if ((unsigned)cpu >= nr_cpu_ids) {
6335 if (!task || cpu != -1)
6336 return ERR_PTR(-EINVAL);
6337 }
6338
6339 event = kzalloc(sizeof(*event), GFP_KERNEL);
6340 if (!event)
6341 return ERR_PTR(-ENOMEM);
6342
6343 /*
6344 * Single events are their own group leaders, with an
6345 * empty sibling list:
6346 */
6347 if (!group_leader)
6348 group_leader = event;
6349
6350 mutex_init(&event->child_mutex);
6351 INIT_LIST_HEAD(&event->child_list);
6352
6353 INIT_LIST_HEAD(&event->group_entry);
6354 INIT_LIST_HEAD(&event->event_entry);
6355 INIT_LIST_HEAD(&event->sibling_list);
6356 INIT_LIST_HEAD(&event->rb_entry);
6357
6358 init_waitqueue_head(&event->waitq);
6359 init_irq_work(&event->pending, perf_pending_event);
6360
6361 mutex_init(&event->mmap_mutex);
6362
6363 atomic_long_set(&event->refcount, 1);
6364 event->cpu = cpu;
6365 event->attr = *attr;
6366 event->group_leader = group_leader;
6367 event->pmu = NULL;
6368 event->oncpu = -1;
6369
6370 event->parent = parent_event;
6371
6372 event->ns = get_pid_ns(task_active_pid_ns(current));
6373 event->id = atomic64_inc_return(&perf_event_id);
6374
6375 event->state = PERF_EVENT_STATE_INACTIVE;
6376
6377 if (task) {
6378 event->attach_state = PERF_ATTACH_TASK;
6379
6380 if (attr->type == PERF_TYPE_TRACEPOINT)
6381 event->hw.tp_target = task;
6382 #ifdef CONFIG_HAVE_HW_BREAKPOINT
6383 /*
6384 * hw_breakpoint is a bit difficult here..
6385 */
6386 else if (attr->type == PERF_TYPE_BREAKPOINT)
6387 event->hw.bp_target = task;
6388 #endif
6389 }
6390
6391 if (!overflow_handler && parent_event) {
6392 overflow_handler = parent_event->overflow_handler;
6393 context = parent_event->overflow_handler_context;
6394 }
6395
6396 event->overflow_handler = overflow_handler;
6397 event->overflow_handler_context = context;
6398
6399 perf_event__state_init(event);
6400
6401 pmu = NULL;
6402
6403 hwc = &event->hw;
6404 hwc->sample_period = attr->sample_period;
6405 if (attr->freq && attr->sample_freq)
6406 hwc->sample_period = 1;
6407 hwc->last_period = hwc->sample_period;
6408
6409 local64_set(&hwc->period_left, hwc->sample_period);
6410
6411 /*
6412 * we currently do not support PERF_FORMAT_GROUP on inherited events
6413 */
6414 if (attr->inherit && (attr->read_format & PERF_FORMAT_GROUP))
6415 goto done;
6416
6417 pmu = perf_init_event(event);
6418
6419 done:
6420 err = 0;
6421 if (!pmu)
6422 err = -EINVAL;
6423 else if (IS_ERR(pmu))
6424 err = PTR_ERR(pmu);
6425
6426 if (err) {
6427 if (event->ns)
6428 put_pid_ns(event->ns);
6429 kfree(event);
6430 return ERR_PTR(err);
6431 }
6432
6433 if (!event->parent) {
6434 if (event->attach_state & PERF_ATTACH_TASK)
6435 static_key_slow_inc(&perf_sched_events.key);
6436 if (event->attr.mmap || event->attr.mmap_data)
6437 atomic_inc(&nr_mmap_events);
6438 if (event->attr.comm)
6439 atomic_inc(&nr_comm_events);
6440 if (event->attr.task)
6441 atomic_inc(&nr_task_events);
6442 if (event->attr.sample_type & PERF_SAMPLE_CALLCHAIN) {
6443 err = get_callchain_buffers();
6444 if (err) {
6445 free_event(event);
6446 return ERR_PTR(err);
6447 }
6448 }
6449 if (has_branch_stack(event)) {
6450 static_key_slow_inc(&perf_sched_events.key);
6451 if (!(event->attach_state & PERF_ATTACH_TASK))
6452 atomic_inc(&per_cpu(perf_branch_stack_events,
6453 event->cpu));
6454 }
6455 }
6456
6457 return event;
6458 }
6459
6460 static int perf_copy_attr(struct perf_event_attr __user *uattr,
6461 struct perf_event_attr *attr)
6462 {
6463 u32 size;
6464 int ret;
6465
6466 if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
6467 return -EFAULT;
6468
6469 /*
6470 * zero the full structure, so that a short copy leaves the rest zeroed.
6471 */
6472 memset(attr, 0, sizeof(*attr));
6473
6474 ret = get_user(size, &uattr->size);
6475 if (ret)
6476 return ret;
6477
6478 if (size > PAGE_SIZE) /* silly large */
6479 goto err_size;
6480
6481 if (!size) /* abi compat */
6482 size = PERF_ATTR_SIZE_VER0;
6483
6484 if (size < PERF_ATTR_SIZE_VER0)
6485 goto err_size;
6486
6487 /*
6488 * If we're handed a bigger struct than we know of,
6489 * ensure all the unknown bits are 0 - i.e. new
6490 * user-space does not rely on any kernel feature
6491 * extensions we don't know about yet.
6492 */
6493 if (size > sizeof(*attr)) {
6494 unsigned char __user *addr;
6495 unsigned char __user *end;
6496 unsigned char val;
6497
6498 addr = (void __user *)uattr + sizeof(*attr);
6499 end = (void __user *)uattr + size;
6500
6501 for (; addr < end; addr++) {
6502 ret = get_user(val, addr);
6503 if (ret)
6504 return ret;
6505 if (val)
6506 goto err_size;
6507 }
6508 size = sizeof(*attr);
6509 }
6510
6511 ret = copy_from_user(attr, uattr, size);
6512 if (ret)
6513 return -EFAULT;
6514
6515 if (attr->__reserved_1)
6516 return -EINVAL;
6517
6518 if (attr->sample_type & ~(PERF_SAMPLE_MAX-1))
6519 return -EINVAL;
6520
6521 if (attr->read_format & ~(PERF_FORMAT_MAX-1))
6522 return -EINVAL;
6523
6524 if (attr->sample_type & PERF_SAMPLE_BRANCH_STACK) {
6525 u64 mask = attr->branch_sample_type;
6526
6527 /* only using defined bits */
6528 if (mask & ~(PERF_SAMPLE_BRANCH_MAX-1))
6529 return -EINVAL;
6530
6531 /* at least one branch bit must be set */
6532 if (!(mask & ~PERF_SAMPLE_BRANCH_PLM_ALL))
6533 return -EINVAL;
6534
6535 /* kernel level capture: check permissions */
6536 if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
6537 && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6538 return -EACCES;
6539
6540 /* propagate priv level, when not set for branch */
6541 if (!(mask & PERF_SAMPLE_BRANCH_PLM_ALL)) {
6542
6543 /* exclude_kernel checked on syscall entry */
6544 if (!attr->exclude_kernel)
6545 mask |= PERF_SAMPLE_BRANCH_KERNEL;
6546
6547 if (!attr->exclude_user)
6548 mask |= PERF_SAMPLE_BRANCH_USER;
6549
6550 if (!attr->exclude_hv)
6551 mask |= PERF_SAMPLE_BRANCH_HV;
6552 /*
6553 * adjust user setting (for HW filter setup)
6554 */
6555 attr->branch_sample_type = mask;
6556 }
6557 }
6558
6559 if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
6560 ret = perf_reg_validate(attr->sample_regs_user);
6561 if (ret)
6562 return ret;
6563 }
6564
6565 if (attr->sample_type & PERF_SAMPLE_STACK_USER) {
6566 if (!arch_perf_have_user_stack_dump())
6567 return -ENOSYS;
6568
6569 /*
6570 * We have __u32 type for the size, but so far
6571 * we can only use __u16 as maximum due to the
6572 * __u16 sample size limit.
6573 */
6574 if (attr->sample_stack_user >= USHRT_MAX)
6575 ret = -EINVAL;
6576 else if (!IS_ALIGNED(attr->sample_stack_user, sizeof(u64)))
6577 ret = -EINVAL;
6578 }
6579
6580 out:
6581 return ret;
6582
6583 err_size:
6584 put_user(sizeof(*attr), &uattr->size);
6585 ret = -E2BIG;
6586 goto out;
6587 }
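
/*
 * User-space side of the versioned copy above, as an illustrative sketch:
 * an older binary passes a smaller attr.size and the memset() above leaves
 * the tail zeroed; a newer binary handing us a bigger struct must keep the
 * bits this kernel does not know about at zero, or it gets -E2BIG with the
 * size we do support written back into uattr->size.
 *
 *	struct perf_event_attr attr;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.size = sizeof(attr);
 *	attr.type = PERF_TYPE_HARDWARE;
 *	attr.config = PERF_COUNT_HW_CPU_CYCLES;
 *
 * A binary built against the v0 ABI would set attr.size to
 * PERF_ATTR_SIZE_VER0 instead.
 */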
6588
6589 static int
6590 perf_event_set_output(struct perf_event *event, struct perf_event *output_event)
6591 {
6592 struct ring_buffer *rb = NULL, *old_rb = NULL;
6593 int ret = -EINVAL;
6594
6595 if (!output_event)
6596 goto set;
6597
6598 /* don't allow circular references */
6599 if (event == output_event)
6600 goto out;
6601
6602 /*
6603 * Don't allow cross-cpu buffers
6604 */
6605 if (output_event->cpu != event->cpu)
6606 goto out;
6607
6608 /*
6609 * If it's not a per-cpu rb, it must be the same task.
6610 */
6611 if (output_event->cpu == -1 && output_event->ctx != event->ctx)
6612 goto out;
6613
6614 set:
6615 mutex_lock(&event->mmap_mutex);
6616 /* Can't redirect output if we've got an active mmap() */
6617 if (atomic_read(&event->mmap_count))
6618 goto unlock;
6619
6620 old_rb = event->rb;
6621
6622 if (output_event) {
6623 /* get the rb we want to redirect to */
6624 rb = ring_buffer_get(output_event);
6625 if (!rb)
6626 goto unlock;
6627 }
6628
6629 if (old_rb)
6630 ring_buffer_detach(event, old_rb);
6631
6632 if (rb)
6633 ring_buffer_attach(event, rb);
6634
6635 rcu_assign_pointer(event->rb, rb);
6636
6637 if (old_rb) {
6638 ring_buffer_put(old_rb);
6639 /*
6640 * Since we detached the old rb before attaching the new one,
6641 * we could have missed a wakeup.
6642 * Provide it now.
6643 */
6644 wake_up_all(&event->waitq);
6645 }
6646
6647 ret = 0;
6648 unlock:
6649 mutex_unlock(&event->mmap_mutex);
6650
6651 out:
6652 return ret;
6653 }
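
/*
 * Usage sketch (user space, hypothetical fds and helper names): redirect
 * the samples of event_fd into the ring buffer of leader_fd so that only
 * one mmap() is needed. This path is reached via the
 * PERF_EVENT_IOC_SET_OUTPUT ioctl and via PERF_FLAG_FD_OUTPUT in
 * sys_perf_event_open() below.
 *
 *	if (ioctl(event_fd, PERF_EVENT_IOC_SET_OUTPUT, leader_fd) == 0)
 *		read_samples_from(leader_fd_mmap);
 *
 * Both events must be bound to the same cpu (or, for cpu == -1, live in
 * the same task context), and the redirected event must not already have
 * an active mmap() of its own.
 */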
6654
6655 /**
6656 * sys_perf_event_open - open a performance event, associate it to a task/cpu
6657 *
6658 * @attr_uptr: event_id type attributes for monitoring/sampling
6659 * @pid: target pid
6660 * @cpu: target cpu
6661 * @group_fd: group leader event fd
 * @flags: perf event open flags
6662 */
6663 SYSCALL_DEFINE5(perf_event_open,
6664 struct perf_event_attr __user *, attr_uptr,
6665 pid_t, pid, int, cpu, int, group_fd, unsigned long, flags)
6666 {
6667 struct perf_event *group_leader = NULL, *output_event = NULL;
6668 struct perf_event *event, *sibling;
6669 struct perf_event_attr attr;
6670 struct perf_event_context *ctx;
6671 struct file *event_file = NULL;
6672 struct fd group = {NULL, 0};
6673 struct task_struct *task = NULL;
6674 struct pmu *pmu;
6675 int event_fd;
6676 int move_group = 0;
6677 int err;
6678
6679 /* for future expandability... */
6680 if (flags & ~PERF_FLAG_ALL)
6681 return -EINVAL;
6682
6683 err = perf_copy_attr(attr_uptr, &attr);
6684 if (err)
6685 return err;
6686
6687 if (!attr.exclude_kernel) {
6688 if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
6689 return -EACCES;
6690 }
6691
6692 if (attr.freq) {
6693 if (attr.sample_freq > sysctl_perf_event_sample_rate)
6694 return -EINVAL;
6695 } else {
6696 if (attr.sample_period & (1ULL << 63))
6697 return -EINVAL;
6698 }
6699
6700 /*
6701 * In cgroup mode, the pid argument is used to pass the fd
6702 * opened to the cgroup directory in cgroupfs. The cpu argument
6703 * designates the cpu on which to monitor threads from that
6704 * cgroup.
6705 */
6706 if ((flags & PERF_FLAG_PID_CGROUP) && (pid == -1 || cpu == -1))
6707 return -EINVAL;
6708
6709 event_fd = get_unused_fd();
6710 if (event_fd < 0)
6711 return event_fd;
6712
6713 if (group_fd != -1) {
6714 err = perf_fget_light(group_fd, &group);
6715 if (err)
6716 goto err_fd;
6717 group_leader = group.file->private_data;
6718 if (flags & PERF_FLAG_FD_OUTPUT)
6719 output_event = group_leader;
6720 if (flags & PERF_FLAG_FD_NO_GROUP)
6721 group_leader = NULL;
6722 }
6723
6724 if (pid != -1 && !(flags & PERF_FLAG_PID_CGROUP)) {
6725 task = find_lively_task_by_vpid(pid);
6726 if (IS_ERR(task)) {
6727 err = PTR_ERR(task);
6728 goto err_group_fd;
6729 }
6730 }
6731
6732 get_online_cpus();
6733
6734 event = perf_event_alloc(&attr, cpu, task, group_leader, NULL,
6735 NULL, NULL);
6736 if (IS_ERR(event)) {
6737 err = PTR_ERR(event);
6738 goto err_task;
6739 }
6740
6741 if (flags & PERF_FLAG_PID_CGROUP) {
6742 err = perf_cgroup_connect(pid, event, &attr, group_leader);
6743 if (err)
6744 goto err_alloc;
6745 /*
6746 * one more event:
6747 * - that has cgroup constraint on event->cpu
6748 * - that may need work on context switch
6749 */
6750 atomic_inc(&per_cpu(perf_cgroup_events, event->cpu));
6751 static_key_slow_inc(&perf_sched_events.key);
6752 }
6753
6754 /*
6755 * Special case software events and allow them to be part of
6756 * any hardware group.
6757 */
6758 pmu = event->pmu;
6759
6760 if (group_leader &&
6761 (is_software_event(event) != is_software_event(group_leader))) {
6762 if (is_software_event(event)) {
6763 /*
6764 * event and group_leader are not both software events;
6765 * since event is a software event, group_leader is not.
6766 *
6767 * Allow the addition of software events to !software
6768 * groups, this is safe because software events never
6769 * fail to schedule.
6770 */
6771 pmu = group_leader->pmu;
6772 } else if (is_software_event(group_leader) &&
6773 (group_leader->group_flags & PERF_GROUP_SOFTWARE)) {
6774 /*
6775 * In case the group is a pure software group, and we
6776 * try to add a hardware event, move the whole group to
6777 * the hardware context.
6778 */
6779 move_group = 1;
6780 }
6781 }
6782
6783 /*
6784 * Get the target context (task or percpu):
6785 */
6786 ctx = find_get_context(pmu, task, event->cpu);
6787 if (IS_ERR(ctx)) {
6788 err = PTR_ERR(ctx);
6789 goto err_alloc;
6790 }
6791
6792 if (task) {
6793 put_task_struct(task);
6794 task = NULL;
6795 }
6796
6797 /*
6798 * Look up the group leader (we will attach this event to it):
6799 */
6800 if (group_leader) {
6801 err = -EINVAL;
6802
6803 /*
6804 * Do not allow a recursive hierarchy (this new sibling
6805 * becoming part of another group-sibling):
6806 */
6807 if (group_leader->group_leader != group_leader)
6808 goto err_context;
6809 /*
6810 * Do not allow attaching to a group in a different
6811 * task or CPU context:
6812 */
6813 if (move_group) {
6814 if (group_leader->ctx->type != ctx->type)
6815 goto err_context;
6816 } else {
6817 if (group_leader->ctx != ctx)
6818 goto err_context;
6819 }
6820
6821 /*
6822 * Only a group leader can be exclusive or pinned
6823 */
6824 if (attr.exclusive || attr.pinned)
6825 goto err_context;
6826 }
6827
6828 if (output_event) {
6829 err = perf_event_set_output(event, output_event);
6830 if (err)
6831 goto err_context;
6832 }
6833
6834 event_file = anon_inode_getfile("[perf_event]", &perf_fops, event, O_RDWR);
6835 if (IS_ERR(event_file)) {
6836 err = PTR_ERR(event_file);
6837 goto err_context;
6838 }
6839
6840 if (move_group) {
6841 struct perf_event_context *gctx = group_leader->ctx;
6842
6843 mutex_lock(&gctx->mutex);
6844 perf_remove_from_context(group_leader, false);
6845
6846 /*
6847 * Removing from the context ends up with a disabled
6848 * event. What we want here is an event in the initial
6849 * startup state, ready to be added into the new context.
6850 */
6851 perf_event__state_init(group_leader);
6852 list_for_each_entry(sibling, &group_leader->sibling_list,
6853 group_entry) {
6854 perf_remove_from_context(sibling, false);
6855 perf_event__state_init(sibling);
6856 put_ctx(gctx);
6857 }
6858 mutex_unlock(&gctx->mutex);
6859 put_ctx(gctx);
6860 }
6861
6862 WARN_ON_ONCE(ctx->parent_ctx);
6863 mutex_lock(&ctx->mutex);
6864
6865 if (move_group) {
6866 synchronize_rcu();
6867 perf_install_in_context(ctx, group_leader, event->cpu);
6868 get_ctx(ctx);
6869 list_for_each_entry(sibling, &group_leader->sibling_list,
6870 group_entry) {
6871 perf_install_in_context(ctx, sibling, event->cpu);
6872 get_ctx(ctx);
6873 }
6874 }
6875
6876 perf_install_in_context(ctx, event, event->cpu);
6877 ++ctx->generation;
6878 perf_unpin_context(ctx);
6879 mutex_unlock(&ctx->mutex);
6880
6881 put_online_cpus();
6882
6883 event->owner = current;
6884
6885 mutex_lock(&current->perf_event_mutex);
6886 list_add_tail(&event->owner_entry, &current->perf_event_list);
6887 mutex_unlock(&current->perf_event_mutex);
6888
6889 /*
6890 * Precalculate sample_data sizes
6891 */
6892 perf_event__header_size(event);
6893 perf_event__id_header_size(event);
6894
6895 /*
6896 * Drop the reference on the group_event after placing the
6897 * new event on the sibling_list. This ensures destruction
6898 * of the group leader will find the pointer to itself in
6899 * perf_group_detach().
6900 */
6901 fdput(group);
6902 fd_install(event_fd, event_file);
6903 return event_fd;
6904
6905 err_context:
6906 perf_unpin_context(ctx);
6907 put_ctx(ctx);
6908 err_alloc:
6909 free_event(event);
6910 err_task:
6911 put_online_cpus();
6912 if (task)
6913 put_task_struct(task);
6914 err_group_fd:
6915 fdput(group);
6916 err_fd:
6917 put_unused_fd(event_fd);
6918 return err;
6919 }
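
/*
 * Illustrative user-space sketch (not compiled here). glibc provides no
 * wrapper for this syscall, so it is invoked via syscall(2); the counter
 * below measures user-level instructions of the calling task on any cpu
 * (pid == 0, cpu == -1):
 *
 *	struct perf_event_attr attr;
 *	long long count;
 *	int fd;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.size = sizeof(attr);
 *	attr.type = PERF_TYPE_HARDWARE;
 *	attr.config = PERF_COUNT_HW_INSTRUCTIONS;
 *	attr.disabled = 1;
 *	attr.exclude_kernel = 1;
 *
 *	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
 *	ioctl(fd, PERF_EVENT_IOC_RESET, 0);
 *	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
 *	run_workload();
 *	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);
 *	read(fd, &count, sizeof(count));
 *
 * run_workload() stands in for whatever is being measured. See the cgroup
 * comment above for how PERF_FLAG_PID_CGROUP reinterprets the pid slot.
 */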
6920
6921 /**
6922 * perf_event_create_kernel_counter
6923 *
6924 * @attr: attributes of the counter to create
6925 * @cpu: cpu to which the counter is bound
6926 * @task: task to profile (NULL for percpu)
 * @overflow_handler: callback to trigger when we hit the event
 * @context: context passed to the overflow handler
6927 */
6928 struct perf_event *
6929 perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu,
6930 struct task_struct *task,
6931 perf_overflow_handler_t overflow_handler,
6932 void *context)
6933 {
6934 struct perf_event_context *ctx;
6935 struct perf_event *event;
6936 int err;
6937
6938 /*
6939 * Get the target context (task or percpu):
6940 */
6941
6942 event = perf_event_alloc(attr, cpu, task, NULL, NULL,
6943 overflow_handler, context);
6944 if (IS_ERR(event)) {
6945 err = PTR_ERR(event);
6946 goto err;
6947 }
6948
6949 ctx = find_get_context(event->pmu, task, cpu);
6950 if (IS_ERR(ctx)) {
6951 err = PTR_ERR(ctx);
6952 goto err_free;
6953 }
6954
6955 WARN_ON_ONCE(ctx->parent_ctx);
6956 mutex_lock(&ctx->mutex);
6957 perf_install_in_context(ctx, event, cpu);
6958 ++ctx->generation;
6959 perf_unpin_context(ctx);
6960 mutex_unlock(&ctx->mutex);
6961
6962 return event;
6963
6964 err_free:
6965 free_event(event);
6966 err:
6967 return ERR_PTR(err);
6968 }
6969 EXPORT_SYMBOL_GPL(perf_event_create_kernel_counter);
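
/*
 * In-kernel usage sketch (hypothetical caller, similar in spirit to the
 * hw_breakpoint and watchdog users): create a cpu-bound hardware counter
 * with an overflow callback, and later tear it down with
 * perf_event_release_kernel().
 *
 *	static void my_overflow(struct perf_event *event,
 *				struct perf_sample_data *data,
 *				struct pt_regs *regs)
 *	{
 *		(react to the counter crossing its sample_period)
 *	}
 *
 *	attr.type = PERF_TYPE_HARDWARE;
 *	attr.config = PERF_COUNT_HW_CPU_CYCLES;
 *	attr.size = sizeof(attr);
 *	attr.sample_period = period;
 *
 *	event = perf_event_create_kernel_counter(&attr, cpu, NULL,
 *						 my_overflow, NULL);
 *	if (IS_ERR(event))
 *		return PTR_ERR(event);
 *
 * my_overflow, period and cpu are illustrative names only.
 */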
6970
6971 void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
6972 {
6973 struct perf_event_context *src_ctx;
6974 struct perf_event_context *dst_ctx;
6975 struct perf_event *event, *tmp;
6976 LIST_HEAD(events);
6977
6978 src_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, src_cpu)->ctx;
6979 dst_ctx = &per_cpu_ptr(pmu->pmu_cpu_context, dst_cpu)->ctx;
6980
6981 mutex_lock(&src_ctx->mutex);
6982 list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
6983 event_entry) {
6984 perf_remove_from_context(event, false);
6985 put_ctx(src_ctx);
6986 list_add(&event->event_entry, &events);
6987 }
6988 mutex_unlock(&src_ctx->mutex);
6989
6990 synchronize_rcu();
6991
6992 mutex_lock(&dst_ctx->mutex);
6993 list_for_each_entry_safe(event, tmp, &events, event_entry) {
6994 list_del(&event->event_entry);
6995 if (event->state >= PERF_EVENT_STATE_OFF)
6996 event->state = PERF_EVENT_STATE_INACTIVE;
6997 perf_install_in_context(dst_ctx, event, dst_cpu);
6998 get_ctx(dst_ctx);
6999 }
7000 mutex_unlock(&dst_ctx->mutex);
7001 }
7002 EXPORT_SYMBOL_GPL(perf_pmu_migrate_context);
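
/*
 * Usage sketch (hypothetical uncore-style driver): when the cpu that a
 * shared, per-package PMU is being driven from goes offline, its hotplug
 * notifier can hand the events over to a surviving sibling, e.g.:
 *
 *	target = cpumask_any_but(topology_core_cpumask(cpu), cpu);
 *	if (target < nr_cpu_ids)
 *		perf_pmu_migrate_context(box_pmu, cpu, target);
 *
 * box_pmu and target are illustrative names; the real policy for picking
 * the destination cpu is up to the driver.
 */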
7003
7004 static void sync_child_event(struct perf_event *child_event,
7005 struct task_struct *child)
7006 {
7007 struct perf_event *parent_event = child_event->parent;
7008 u64 child_val;
7009
7010 if (child_event->attr.inherit_stat)
7011 perf_event_read_event(child_event, child);
7012
7013 child_val = perf_event_count(child_event);
7014
7015 /*
7016 * Add back the child's count to the parent's count:
7017 */
7018 atomic64_add(child_val, &parent_event->child_count);
7019 atomic64_add(child_event->total_time_enabled,
7020 &parent_event->child_total_time_enabled);
7021 atomic64_add(child_event->total_time_running,
7022 &parent_event->child_total_time_running);
7023
7024 /*
7025 * Remove this event from the parent's list
7026 */
7027 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
7028 mutex_lock(&parent_event->child_mutex);
7029 list_del_init(&child_event->child_list);
7030 mutex_unlock(&parent_event->child_mutex);
7031
7032 /*
7033 * Release the parent event, if this was the last
7034 * reference to it.
7035 */
7036 put_event(parent_event);
7037 }
7038
7039 static void
7040 __perf_event_exit_task(struct perf_event *child_event,
7041 struct perf_event_context *child_ctx,
7042 struct task_struct *child)
7043 {
7044 perf_remove_from_context(child_event, !!child_event->parent);
7045
7046 /*
7047 * It can happen that the parent exits first, and has events
7048 * that are still around due to the child reference. These
7049 * events need to be zapped.
7050 */
7051 if (child_event->parent) {
7052 sync_child_event(child_event, child);
7053 free_event(child_event);
7054 }
7055 }
7056
7057 static void perf_event_exit_task_context(struct task_struct *child, int ctxn)
7058 {
7059 struct perf_event *child_event, *tmp;
7060 struct perf_event_context *child_ctx;
7061 unsigned long flags;
7062
7063 if (likely(!child->perf_event_ctxp[ctxn])) {
7064 perf_event_task(child, NULL, 0);
7065 return;
7066 }
7067
7068 local_irq_save(flags);
7069 /*
7070 * We can't reschedule here because interrupts are disabled,
7071 * and either child is current or it is a task that can't be
7072 * scheduled, so we are now safe from rescheduling changing
7073 * our context.
7074 */
7075 child_ctx = rcu_dereference_raw(child->perf_event_ctxp[ctxn]);
7076
7077 /*
7078 * Take the context lock here so that if find_get_context is
7079 * reading child->perf_event_ctxp, we wait until it has
7080 * incremented the context's refcount before we do put_ctx below.
7081 */
7082 raw_spin_lock(&child_ctx->lock);
7083 task_ctx_sched_out(child_ctx);
7084 child->perf_event_ctxp[ctxn] = NULL;
7085 /*
7086 * If this context is a clone; unclone it so it can't get
7087 * swapped to another process while we're removing all
7088 * the events from it.
7089 */
7090 unclone_ctx(child_ctx);
7091 update_context_time(child_ctx);
7092 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7093
7094 /*
7095 * Report the task dead after unscheduling the events so that we
7096 * won't get any samples after PERF_RECORD_EXIT. We can however still
7097 * get a few PERF_RECORD_READ events.
7098 */
7099 perf_event_task(child, child_ctx, 0);
7100
7101 /*
7102 * We can recurse on the same lock type through:
7103 *
7104 * __perf_event_exit_task()
7105 * sync_child_event()
7106 * put_event()
7107 * mutex_lock(&ctx->mutex)
7108 *
7109 * But since it's the parent context it won't be the same instance.
7110 */
7111 mutex_lock(&child_ctx->mutex);
7112
7113 again:
7114 list_for_each_entry_safe(child_event, tmp, &child_ctx->pinned_groups,
7115 group_entry)
7116 __perf_event_exit_task(child_event, child_ctx, child);
7117
7118 list_for_each_entry_safe(child_event, tmp, &child_ctx->flexible_groups,
7119 group_entry)
7120 __perf_event_exit_task(child_event, child_ctx, child);
7121
7122 /*
7123 * If the last event was a group event, it will have appended all
7124 * its siblings to the list, but we obtained 'tmp' before that, so it
7125 * will still point to the list head terminating the iteration.
7126 */
7127 if (!list_empty(&child_ctx->pinned_groups) ||
7128 !list_empty(&child_ctx->flexible_groups))
7129 goto again;
7130
7131 mutex_unlock(&child_ctx->mutex);
7132
7133 put_ctx(child_ctx);
7134 }
7135
7136 /*
7137 * When a child task exits, feed back event values to parent events.
7138 */
7139 void perf_event_exit_task(struct task_struct *child)
7140 {
7141 struct perf_event *event, *tmp;
7142 int ctxn;
7143
7144 mutex_lock(&child->perf_event_mutex);
7145 list_for_each_entry_safe(event, tmp, &child->perf_event_list,
7146 owner_entry) {
7147 list_del_init(&event->owner_entry);
7148
7149 /*
7150 * Ensure the list deletion is visible before we clear
7151 * the owner; this closes a race against perf_release(), where
7152 * we need to serialize on the owner->perf_event_mutex.
7153 */
7154 smp_wmb();
7155 event->owner = NULL;
7156 }
7157 mutex_unlock(&child->perf_event_mutex);
7158
7159 for_each_task_context_nr(ctxn)
7160 perf_event_exit_task_context(child, ctxn);
7161 }
7162
7163 static void perf_free_event(struct perf_event *event,
7164 struct perf_event_context *ctx)
7165 {
7166 struct perf_event *parent = event->parent;
7167
7168 if (WARN_ON_ONCE(!parent))
7169 return;
7170
7171 mutex_lock(&parent->child_mutex);
7172 list_del_init(&event->child_list);
7173 mutex_unlock(&parent->child_mutex);
7174
7175 put_event(parent);
7176
7177 perf_group_detach(event);
7178 list_del_event(event, ctx);
7179 free_event(event);
7180 }
7181
7182 /*
7183 * Free an unexposed, unused context as created by inheritance by
7184 * perf_event_init_task below, used by fork() in case of failure.
7185 */
7186 void perf_event_free_task(struct task_struct *task)
7187 {
7188 struct perf_event_context *ctx;
7189 struct perf_event *event, *tmp;
7190 int ctxn;
7191
7192 for_each_task_context_nr(ctxn) {
7193 ctx = task->perf_event_ctxp[ctxn];
7194 if (!ctx)
7195 continue;
7196
7197 mutex_lock(&ctx->mutex);
7198 again:
7199 list_for_each_entry_safe(event, tmp, &ctx->pinned_groups,
7200 group_entry)
7201 perf_free_event(event, ctx);
7202
7203 list_for_each_entry_safe(event, tmp, &ctx->flexible_groups,
7204 group_entry)
7205 perf_free_event(event, ctx);
7206
7207 if (!list_empty(&ctx->pinned_groups) ||
7208 !list_empty(&ctx->flexible_groups))
7209 goto again;
7210
7211 mutex_unlock(&ctx->mutex);
7212
7213 put_ctx(ctx);
7214 }
7215 }
7216
7217 void perf_event_delayed_put(struct task_struct *task)
7218 {
7219 int ctxn;
7220
7221 for_each_task_context_nr(ctxn)
7222 WARN_ON_ONCE(task->perf_event_ctxp[ctxn]);
7223 }
7224
7225 /*
7226 * inherit an event from parent task to child task:
7227 */
7228 static struct perf_event *
7229 inherit_event(struct perf_event *parent_event,
7230 struct task_struct *parent,
7231 struct perf_event_context *parent_ctx,
7232 struct task_struct *child,
7233 struct perf_event *group_leader,
7234 struct perf_event_context *child_ctx)
7235 {
7236 struct perf_event *child_event;
7237 unsigned long flags;
7238
7239 /*
7240 * Instead of creating recursive hierarchies of events,
7241 * we link inherited events back to the original parent,
7242 * which is guaranteed to have a filp that we use as the
7243 * reference count:
7244 */
7245 if (parent_event->parent)
7246 parent_event = parent_event->parent;
7247
7248 child_event = perf_event_alloc(&parent_event->attr,
7249 parent_event->cpu,
7250 child,
7251 group_leader, parent_event,
7252 NULL, NULL);
7253 if (IS_ERR(child_event))
7254 return child_event;
7255
7256 if (!atomic_long_inc_not_zero(&parent_event->refcount)) {
7257 free_event(child_event);
7258 return NULL;
7259 }
7260
7261 get_ctx(child_ctx);
7262
7263 /*
7264 * Make the child state follow the state of the parent event,
7265 * not its attr.disabled bit. We hold the parent's mutex,
7266 * so we won't race with perf_event_{en, dis}able_family.
7267 */
7268 if (parent_event->state >= PERF_EVENT_STATE_INACTIVE)
7269 child_event->state = PERF_EVENT_STATE_INACTIVE;
7270 else
7271 child_event->state = PERF_EVENT_STATE_OFF;
7272
7273 if (parent_event->attr.freq) {
7274 u64 sample_period = parent_event->hw.sample_period;
7275 struct hw_perf_event *hwc = &child_event->hw;
7276
7277 hwc->sample_period = sample_period;
7278 hwc->last_period = sample_period;
7279
7280 local64_set(&hwc->period_left, sample_period);
7281 }
7282
7283 child_event->ctx = child_ctx;
7284 child_event->overflow_handler = parent_event->overflow_handler;
7285 child_event->overflow_handler_context
7286 = parent_event->overflow_handler_context;
7287
7288 /*
7289 * Precalculate sample_data sizes
7290 */
7291 perf_event__header_size(child_event);
7292 perf_event__id_header_size(child_event);
7293
7294 /*
7295 * Link it up in the child's context:
7296 */
7297 raw_spin_lock_irqsave(&child_ctx->lock, flags);
7298 add_event_to_ctx(child_event, child_ctx);
7299 raw_spin_unlock_irqrestore(&child_ctx->lock, flags);
7300
7301 /*
7302 * Link this into the parent event's child list
7303 */
7304 WARN_ON_ONCE(parent_event->ctx->parent_ctx);
7305 mutex_lock(&parent_event->child_mutex);
7306 list_add_tail(&child_event->child_list, &parent_event->child_list);
7307 mutex_unlock(&parent_event->child_mutex);
7308
7309 return child_event;
7310 }
7311
7312 static int inherit_group(struct perf_event *parent_event,
7313 struct task_struct *parent,
7314 struct perf_event_context *parent_ctx,
7315 struct task_struct *child,
7316 struct perf_event_context *child_ctx)
7317 {
7318 struct perf_event *leader;
7319 struct perf_event *sub;
7320 struct perf_event *child_ctr;
7321
7322 leader = inherit_event(parent_event, parent, parent_ctx,
7323 child, NULL, child_ctx);
7324 if (IS_ERR(leader))
7325 return PTR_ERR(leader);
7326 list_for_each_entry(sub, &parent_event->sibling_list, group_entry) {
7327 child_ctr = inherit_event(sub, parent, parent_ctx,
7328 child, leader, child_ctx);
7329 if (IS_ERR(child_ctr))
7330 return PTR_ERR(child_ctr);
7331 }
7332 return 0;
7333 }
7334
7335 static int
7336 inherit_task_group(struct perf_event *event, struct task_struct *parent,
7337 struct perf_event_context *parent_ctx,
7338 struct task_struct *child, int ctxn,
7339 int *inherited_all)
7340 {
7341 int ret;
7342 struct perf_event_context *child_ctx;
7343
7344 if (!event->attr.inherit) {
7345 *inherited_all = 0;
7346 return 0;
7347 }
7348
7349 child_ctx = child->perf_event_ctxp[ctxn];
7350 if (!child_ctx) {
7351 /*
7352 * This is executed from the parent task context, so
7353 * inherit events that have been marked for cloning.
7354 * First allocate and initialize a context for the
7355 * child.
7356 */
7357
7358 child_ctx = alloc_perf_context(parent_ctx->pmu, child);
7359 if (!child_ctx)
7360 return -ENOMEM;
7361
7362 child->perf_event_ctxp[ctxn] = child_ctx;
7363 }
7364
7365 ret = inherit_group(event, parent, parent_ctx,
7366 child, child_ctx);
7367
7368 if (ret)
7369 *inherited_all = 0;
7370
7371 return ret;
7372 }
7373
7374 /*
7375 * Initialize the perf_event context in task_struct
7376 */
7377 int perf_event_init_context(struct task_struct *child, int ctxn)
7378 {
7379 struct perf_event_context *child_ctx, *parent_ctx;
7380 struct perf_event_context *cloned_ctx;
7381 struct perf_event *event;
7382 struct task_struct *parent = current;
7383 int inherited_all = 1;
7384 unsigned long flags;
7385 int ret = 0;
7386
7387 if (likely(!parent->perf_event_ctxp[ctxn]))
7388 return 0;
7389
7390 /*
7391 * If the parent's context is a clone, pin it so it won't get
7392 * swapped under us.
7393 */
7394 parent_ctx = perf_pin_task_context(parent, ctxn);
7395
7396 /*
7397 * No need to check if parent_ctx != NULL here; since we saw
7398 * it non-NULL earlier, the only reason for it to become NULL
7399 * is if we exit, and since we're currently in the middle of
7400 * a fork we can't be exiting at the same time.
7401 */
7402
7403 /*
7404 * Lock the parent list. No need to lock the child - not PID
7405 * hashed yet and not running, so nobody can access it.
7406 */
7407 mutex_lock(&parent_ctx->mutex);
7408
7409 /*
7410 * We don't have to disable NMIs - we are only looking at
7411 * the list, not manipulating it:
7412 */
7413 list_for_each_entry(event, &parent_ctx->pinned_groups, group_entry) {
7414 ret = inherit_task_group(event, parent, parent_ctx,
7415 child, ctxn, &inherited_all);
7416 if (ret)
7417 break;
7418 }
7419
7420 /*
7421 * We can't hold ctx->lock when iterating the ->flexible_groups list due
7422 * to allocations, but we need to prevent rotation because
7423 * rotate_ctx() will change the list from interrupt context.
7424 */
7425 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7426 parent_ctx->rotate_disable = 1;
7427 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7428
7429 list_for_each_entry(event, &parent_ctx->flexible_groups, group_entry) {
7430 ret = inherit_task_group(event, parent, parent_ctx,
7431 child, ctxn, &inherited_all);
7432 if (ret)
7433 break;
7434 }
7435
7436 raw_spin_lock_irqsave(&parent_ctx->lock, flags);
7437 parent_ctx->rotate_disable = 0;
7438
7439 child_ctx = child->perf_event_ctxp[ctxn];
7440
7441 if (child_ctx && inherited_all) {
7442 /*
7443 * Mark the child context as a clone of the parent
7444 * context, or of whatever the parent is a clone of.
7445 *
7446 * Note that if the parent is a clone, the holding of
7447 * parent_ctx->lock prevents it from being uncloned.
7448 */
7449 cloned_ctx = parent_ctx->parent_ctx;
7450 if (cloned_ctx) {
7451 child_ctx->parent_ctx = cloned_ctx;
7452 child_ctx->parent_gen = parent_ctx->parent_gen;
7453 } else {
7454 child_ctx->parent_ctx = parent_ctx;
7455 child_ctx->parent_gen = parent_ctx->generation;
7456 }
7457 get_ctx(child_ctx->parent_ctx);
7458 }
7459
7460 raw_spin_unlock_irqrestore(&parent_ctx->lock, flags);
7461 mutex_unlock(&parent_ctx->mutex);
7462
7463 perf_unpin_context(parent_ctx);
7464 put_ctx(parent_ctx);
7465
7466 return ret;
7467 }
7468
7469 /*
7470 * Initialize the perf_event context in task_struct
7471 */
7472 int perf_event_init_task(struct task_struct *child)
7473 {
7474 int ctxn, ret;
7475
7476 memset(child->perf_event_ctxp, 0, sizeof(child->perf_event_ctxp));
7477 mutex_init(&child->perf_event_mutex);
7478 INIT_LIST_HEAD(&child->perf_event_list);
7479
7480 for_each_task_context_nr(ctxn) {
7481 ret = perf_event_init_context(child, ctxn);
7482 if (ret)
7483 return ret;
7484 }
7485
7486 return 0;
7487 }
7488
7489 static void __init perf_event_init_all_cpus(void)
7490 {
7491 struct swevent_htable *swhash;
7492 int cpu;
7493
7494 for_each_possible_cpu(cpu) {
7495 swhash = &per_cpu(swevent_htable, cpu);
7496 mutex_init(&swhash->hlist_mutex);
7497 INIT_LIST_HEAD(&per_cpu(rotation_list, cpu));
7498 }
7499 }
7500
7501 static void __cpuinit perf_event_init_cpu(int cpu)
7502 {
7503 struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
7504
7505 mutex_lock(&swhash->hlist_mutex);
7506 if (swhash->hlist_refcount > 0) {
7507 struct swevent_hlist *hlist;
7508
7509 hlist = kzalloc_node(sizeof(*hlist), GFP_KERNEL, cpu_to_node(cpu));
7510 WARN_ON(!hlist);
7511 rcu_assign_pointer(swhash->swevent_hlist, hlist);
7512 }
7513 mutex_unlock(&swhash->hlist_mutex);
7514 }
7515
7516 #if defined CONFIG_HOTPLUG_CPU || defined CONFIG_KEXEC
7517 static void perf_pmu_rotate_stop(struct pmu *pmu)
7518 {
7519 struct perf_cpu_context *cpuctx = this_cpu_ptr(pmu->pmu_cpu_context);
7520
7521 WARN_ON(!irqs_disabled());
7522
7523 list_del_init(&cpuctx->rotation_list);
7524 }
7525
7526 static void __perf_event_exit_context(void *__info)
7527 {
7528 struct remove_event re = { .detach_group = false };
7529 struct perf_event_context *ctx = __info;
7530
7531 perf_pmu_rotate_stop(ctx->pmu);
7532
7533 rcu_read_lock();
7534 list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
7535 __perf_remove_from_context(&re);
7536 rcu_read_unlock();
7537 }
7538
7539 static void perf_event_exit_cpu_context(int cpu)
7540 {
7541 struct perf_event_context *ctx;
7542 struct pmu *pmu;
7543 int idx;
7544
7545 idx = srcu_read_lock(&pmus_srcu);
7546 list_for_each_entry_rcu(pmu, &pmus, entry) {
7547 ctx = &per_cpu_ptr(pmu->pmu_cpu_context, cpu)->ctx;
7548
7549 mutex_lock(&ctx->mutex);
7550 smp_call_function_single(cpu, __perf_event_exit_context, ctx, 1);
7551 mutex_unlock(&ctx->mutex);
7552 }
7553 srcu_read_unlock(&pmus_srcu, idx);
7554 }
7555
7556 static void perf_event_exit_cpu(int cpu)
7557 {
7558 perf_event_exit_cpu_context(cpu);
7559 }
7560 #else
7561 static inline void perf_event_exit_cpu(int cpu) { }
7562 #endif
7563
7564 static int
7565 perf_reboot(struct notifier_block *notifier, unsigned long val, void *v)
7566 {
7567 int cpu;
7568
7569 for_each_online_cpu(cpu)
7570 perf_event_exit_cpu(cpu);
7571
7572 return NOTIFY_OK;
7573 }
7574
7575 /*
7576 * Run the perf reboot notifier at the very last possible moment so that
7577 * the generic watchdog code runs as long as possible.
7578 */
7579 static struct notifier_block perf_reboot_notifier = {
7580 .notifier_call = perf_reboot,
7581 .priority = INT_MIN,
7582 };
7583
7584 static int __cpuinit
7585 perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu)
7586 {
7587 unsigned int cpu = (long)hcpu;
7588
7589 switch (action & ~CPU_TASKS_FROZEN) {
7590
7591 case CPU_UP_PREPARE:
7592 case CPU_DOWN_FAILED:
7593 perf_event_init_cpu(cpu);
7594 break;
7595
7596 case CPU_UP_CANCELED:
7597 case CPU_DOWN_PREPARE:
7598 perf_event_exit_cpu(cpu);
7599 break;
7600
7601 default:
7602 break;
7603 }
7604
7605 return NOTIFY_OK;
7606 }
7607
7608 void __init perf_event_init(void)
7609 {
7610 int ret;
7611
7612 idr_init(&pmu_idr);
7613
7614 perf_event_init_all_cpus();
7615 init_srcu_struct(&pmus_srcu);
7616 perf_pmu_register(&perf_swevent, "software", PERF_TYPE_SOFTWARE);
7617 perf_pmu_register(&perf_cpu_clock, NULL, -1);
7618 perf_pmu_register(&perf_task_clock, NULL, -1);
7619 perf_tp_register();
7620 perf_cpu_notifier(perf_cpu_notify);
7621 register_reboot_notifier(&perf_reboot_notifier);
7622
7623 ret = init_hw_breakpoint();
7624 WARN(ret, "hw_breakpoint initialization failed with: %d", ret);
7625
7626 /* do not patch jump label more than once per second */
7627 jump_label_rate_limit(&perf_sched_events, HZ);
7628
7629 /*
7630 * Build time assertion that we keep the data_head at the intended
7631 * location. IOW, validate that we got the __reserved[] size right.
7632 */
7633 BUILD_BUG_ON((offsetof(struct perf_event_mmap_page, data_head))
7634 != 1024);
7635 }
7636
7637 static int __init perf_event_sysfs_init(void)
7638 {
7639 struct pmu *pmu;
7640 int ret;
7641
7642 mutex_lock(&pmus_lock);
7643
7644 ret = bus_register(&pmu_bus);
7645 if (ret)
7646 goto unlock;
7647
7648 list_for_each_entry(pmu, &pmus, entry) {
7649 if (!pmu->name || pmu->type < 0)
7650 continue;
7651
7652 ret = pmu_dev_alloc(pmu);
7653 WARN(ret, "Failed to register pmu: %s, reason %d\n", pmu->name, ret);
7654 }
7655 pmu_bus_running = 1;
7656 ret = 0;
7657
7658 unlock:
7659 mutex_unlock(&pmus_lock);
7660
7661 return ret;
7662 }
7663 device_initcall(perf_event_sysfs_init);
7664
7665 #ifdef CONFIG_CGROUP_PERF
7666 static struct cgroup_subsys_state *perf_cgroup_css_alloc(struct cgroup *cont)
7667 {
7668 struct perf_cgroup *jc;
7669
7670 jc = kzalloc(sizeof(*jc), GFP_KERNEL);
7671 if (!jc)
7672 return ERR_PTR(-ENOMEM);
7673
7674 jc->info = alloc_percpu(struct perf_cgroup_info);
7675 if (!jc->info) {
7676 kfree(jc);
7677 return ERR_PTR(-ENOMEM);
7678 }
7679
7680 return &jc->css;
7681 }
7682
7683 static void perf_cgroup_css_free(struct cgroup *cont)
7684 {
7685 struct perf_cgroup *jc;
7686 jc = container_of(cgroup_subsys_state(cont, perf_subsys_id),
7687 struct perf_cgroup, css);
7688 free_percpu(jc->info);
7689 kfree(jc);
7690 }
7691
7692 static int __perf_cgroup_move(void *info)
7693 {
7694 struct task_struct *task = info;
7695 perf_cgroup_switch(task, PERF_CGROUP_SWOUT | PERF_CGROUP_SWIN);
7696 return 0;
7697 }
7698
7699 static void perf_cgroup_attach(struct cgroup *cgrp, struct cgroup_taskset *tset)
7700 {
7701 struct task_struct *task;
7702
7703 cgroup_taskset_for_each(task, cgrp, tset)
7704 task_function_call(task, __perf_cgroup_move, task);
7705 }
7706
7707 static void perf_cgroup_exit(struct cgroup *cgrp, struct cgroup *old_cgrp,
7708 struct task_struct *task)
7709 {
7710 /*
7711 * cgroup_exit() is called in the copy_process() failure path.
7712 * Ignore this case since the task hasn't run yet; this avoids
7713 * trying to poke a half-freed task state from generic code.
7714 */
7715 if (!(task->flags & PF_EXITING))
7716 return;
7717
7718 task_function_call(task, __perf_cgroup_move, task);
7719 }
7720
7721 struct cgroup_subsys perf_subsys = {
7722 .name = "perf_event",
7723 .subsys_id = perf_subsys_id,
7724 .css_alloc = perf_cgroup_css_alloc,
7725 .css_free = perf_cgroup_css_free,
7726 .exit = perf_cgroup_exit,
7727 .attach = perf_cgroup_attach,
7728 };
7729 #endif /* CONFIG_CGROUP_PERF */
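
/*
 * Illustrative user-space sketch for the cgroup mode wired up above
 * (paths and names are examples only): the pid argument carries an open
 * fd of a perf_event cgroup directory and a real cpu number is required,
 * as checked in sys_perf_event_open().
 *
 *	int cgrp_fd = open("/sys/fs/cgroup/perf_event/mygroup", O_RDONLY);
 *
 *	fd = syscall(__NR_perf_event_open, &attr, cgrp_fd, cpu,
 *		     -1, PERF_FLAG_PID_CGROUP);
 */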