perf: Fix fasync handling on inherited events
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b391907d53520cb4a126bc8e3e0a2d5a64ade54d..d9b0aad17dbf73a288d4bedcb8c7f2ae5bb8d982 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -39,6 +39,7 @@
 #include <linux/hw_breakpoint.h>
 #include <linux/mm_types.h>
 #include <linux/cgroup.h>
+#include <linux/compat.h>
 
 #include "internal.h"
 
@@ -165,25 +166,109 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
 /*
  * max perf event sample rate
  */
-#define DEFAULT_MAX_SAMPLE_RATE 100000
-int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
-static int max_samples_per_tick __read_mostly =
-       DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+#define DEFAULT_MAX_SAMPLE_RATE                100000
+#define DEFAULT_SAMPLE_PERIOD_NS       (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT   25
+
+int sysctl_perf_event_sample_rate __read_mostly        = DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly  = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
+
+static atomic_t perf_sample_allowed_ns __read_mostly =
+       ATOMIC_INIT( DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+
+void update_perf_cpu_limits(void)
+{
+       u64 tmp = perf_sample_period_ns;
+
+       tmp *= sysctl_perf_cpu_time_max_percent;
+       do_div(tmp, 100);
+       atomic_set(&perf_sample_allowed_ns, tmp);
+}
 
 int perf_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
-       int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
        if (ret || !write)
                return ret;
 
        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+       perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+       update_perf_cpu_limits();
+
+       return 0;
+}
+
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+                               void __user *buffer, size_t *lenp,
+                               loff_t *ppos)
+{
+       int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+       if (ret || !write)
+               return ret;
+
+       update_perf_cpu_limits();
 
        return 0;
 }
 
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done.  This will drop the sample rate when
+ * we detect that events are taking too long.
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+DEFINE_PER_CPU(u64, running_sample_length);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+       u64 avg_local_sample_len;
+       u64 local_samples_len;
+
+       if (atomic_read(&perf_sample_allowed_ns) == 0)
+               return;
+
+       /* decay the counter by 1 average sample */
+       local_samples_len = __get_cpu_var(running_sample_length);
+       local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+       local_samples_len += sample_len_ns;
+       __get_cpu_var(running_sample_length) = local_samples_len;
+
+       /*
+        * note: this will be biased artificially low until we have
+        * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
+        * from having to maintain a count.
+        */
+       avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+       if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+               return;
+
+       if (max_samples_per_tick <= 1)
+               return;
+
+       max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+       sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+       perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+       printk_ratelimited(KERN_WARNING
+                       "perf samples too long (%lld > %d), lowering "
+                       "kernel.perf_event_max_sample_rate to %d\n",
+                       avg_local_sample_len,
+                       atomic_read(&perf_sample_allowed_ns),
+                       sysctl_perf_event_sample_rate);
+
+       update_perf_cpu_limits();
+}
+
 static atomic64_t perf_event_id;
 
 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
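
The hunk above makes sampling self-throttling: perf_sample_event_took() keeps a per-CPU running length that decays by one average sample on every update, and once the resulting average exceeds perf_sample_allowed_ns it halves max_samples_per_tick and lowers kernel.perf_event_max_sample_rate accordingly. Below is a minimal userspace sketch of just that arithmetic, with made-up sample lengths and an invented allowed_ns budget; it is not kernel code.

/* Decaying-average check modelled on perf_sample_event_took() above. */
#include <stdio.h>
#include <stdint.h>

#define NR_ACCUMULATED_SAMPLES 128

int main(void)
{
        uint64_t running_len = 0;   /* stands in for the per-CPU running_sample_length */
        uint64_t allowed_ns = 2500; /* stands in for perf_sample_allowed_ns */
        uint64_t sample_ns[] = { 1000, 1200, 400000, 390000, 1100 };

        for (unsigned int i = 0; i < sizeof(sample_ns) / sizeof(sample_ns[0]); i++) {
                /* decay the counter by one average sample, then add the new one */
                running_len -= running_len / NR_ACCUMULATED_SAMPLES;
                running_len += sample_ns[i];

                uint64_t avg = running_len / NR_ACCUMULATED_SAMPLES;
                printf("sample %u: avg %llu ns -> %s\n", i,
                       (unsigned long long)avg,
                       avg > allowed_ns ? "throttle: halve max_samples_per_tick" : "ok");
        }
        return 0;
}

Each sample only contributes 1/NR_ACCUMULATED_SAMPLES of its length to the average, so the rate drops on sustained or very large overruns rather than on a single marginal one, and the average stays biased low until roughly 128 samples have been accumulated, as the comment in the hunk notes.
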
@@ -761,8 +846,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 {
        struct perf_event_context *ctx;
 
-       rcu_read_lock();
 retry:
+       /*
+        * One of the few rules of preemptible RCU is that one cannot do
+        * rcu_read_unlock() while holding a scheduler (or nested) lock when
+        * part of the read side critical section was preemptible -- see
+        * rcu_read_unlock_special().
+        *
+        * Since ctx->lock nests under rq->lock we must ensure the entire read
+        * side critical section is non-preemptible.
+        */
+       preempt_disable();
+       rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
        if (ctx) {
                /*
@@ -778,6 +873,8 @@ retry:
                raw_spin_lock_irqsave(&ctx->lock, *flags);
                if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
                        raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+                       rcu_read_unlock();
+                       preempt_enable();
                        goto retry;
                }
 
@@ -787,6 +884,7 @@ retry:
                }
        }
        rcu_read_unlock();
+       preempt_enable();
        return ctx;
 }
 
@@ -1224,6 +1322,11 @@ group_sched_out(struct perf_event *group_event,
                cpuctx->exclusive = 0;
 }
 
+struct remove_event {
+       struct perf_event *event;
+       bool detach_group;
+};
+
 /*
  * Cross CPU call to remove a performance event
  *
@@ -1232,12 +1335,15 @@ group_sched_out(struct perf_event *group_event,
  */
 static int __perf_remove_from_context(void *info)
 {
-       struct perf_event *event = info;
+       struct remove_event *re = info;
+       struct perf_event *event = re->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
        raw_spin_lock(&ctx->lock);
        event_sched_out(event, cpuctx, ctx);
+       if (re->detach_group)
+               perf_group_detach(event);
        list_del_event(event, ctx);
        if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
                ctx->is_active = 0;
@@ -1262,10 +1368,14 @@ static int __perf_remove_from_context(void *info)
  * When called from perf_event_exit_task, it's OK because the
  * context has been detached from its task.
  */
-static void perf_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
+       struct remove_event re = {
+               .event = event,
+               .detach_group = detach_group,
+       };
 
        lockdep_assert_held(&ctx->mutex);
 
@@ -1274,12 +1384,12 @@ static void perf_remove_from_context(struct perf_event *event)
                 * Per cpu events are removed via an smp call and
                 * the removal is always successful.
                 */
-               cpu_function_call(event->cpu, __perf_remove_from_context, event);
+               cpu_function_call(event->cpu, __perf_remove_from_context, &re);
                return;
        }
 
 retry:
-       if (!task_function_call(task, __perf_remove_from_context, event))
+       if (!task_function_call(task, __perf_remove_from_context, &re))
                return;
 
        raw_spin_lock_irq(&ctx->lock);
@@ -1289,6 +1399,11 @@ retry:
         */
        if (ctx->is_active) {
                raw_spin_unlock_irq(&ctx->lock);
+               /*
+                * Reload the task pointer, it might have been changed by
+                * a concurrent perf_event_context_sched_out().
+                */
+               task = ctx->task;
                goto retry;
        }
 
@@ -1296,6 +1411,8 @@ retry:
         * Since the task isn't running, it's safe to remove the event; holding
         * the ctx->lock ensures the task won't get scheduled in.
         */
+       if (detach_group)
+               perf_group_detach(event);
        list_del_event(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
 }
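
perf_remove_from_context() reaches __perf_remove_from_context() through cpu_function_call() or task_function_call(), both of which hand the callback a single void *, so the new detach_group flag has to travel alongside the event inside struct remove_event. A tiny userspace analogue of that bundling pattern, with invented names:

/* Pass several arguments through the single void * a callback accepts. */
#include <stdio.h>
#include <stdbool.h>

struct remove_args {                    /* invented stand-in for struct remove_event */
        const char *event_name;
        bool detach_group;
};

static int remove_cb(void *info)        /* same shape as an smp-call callback */
{
        struct remove_args *re = info;

        printf("removing %s%s\n", re->event_name,
               re->detach_group ? " (detaching group)" : "");
        return 0;
}

int main(void)
{
        struct remove_args re = { .event_name = "cycles", .detach_group = true };

        return remove_cb(&re);
}
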
@@ -1718,6 +1835,11 @@ retry:
         */
        if (ctx->is_active) {
                raw_spin_unlock_irq(&ctx->lock);
+               /*
+                * Reload the task pointer, it might have been changed by
+                * a concurrent perf_event_context_sched_out().
+                */
+               task = ctx->task;
                goto retry;
        }
 
@@ -1761,7 +1883,16 @@ static int __perf_event_enable(void *info)
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        int err;
 
-       if (WARN_ON_ONCE(!ctx->is_active))
+       /*
+        * There's a time window between the 'ctx->is_active' check
+        * in perf_event_enable() and this point, during which:
+        *   - IRQs are on
+        *   - ctx->lock is unlocked
+        *
+        * so the task could be killed and 'ctx' deactivated
+        * by perf_event_exit_task().
+        */
+       if (!ctx->is_active)
                return -EINVAL;
 
        raw_spin_lock(&ctx->lock);
@@ -1994,9 +2125,6 @@ static void __perf_event_sync_stat(struct perf_event *event,
        perf_event_update_userpage(next_event);
 }
 
-#define list_next_entry(pos, member) \
-       list_entry(pos->member.next, typeof(*pos), member)
-
 static void perf_event_sync_stat(struct perf_event_context *ctx,
                                   struct perf_event_context *next_ctx)
 {
@@ -2996,10 +3124,7 @@ int perf_event_release_kernel(struct perf_event *event)
         *     to trigger the AB-BA case.
         */
        mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
-       raw_spin_lock_irq(&ctx->lock);
-       perf_group_detach(event);
-       raw_spin_unlock_irq(&ctx->lock);
-       perf_remove_from_context(event);
+       perf_remove_from_context(event, true);
        mutex_unlock(&ctx->mutex);
 
        free_event(event);
@@ -3366,6 +3491,25 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
        return 0;
 }
 
+#ifdef CONFIG_COMPAT
+static long perf_compat_ioctl(struct file *file, unsigned int cmd,
+                               unsigned long arg)
+{
+       switch (_IOC_NR(cmd)) {
+       case _IOC_NR(PERF_EVENT_IOC_SET_FILTER):
+               /* Fix up pointer size (usually 4 -> 8 in the 32-on-64-bit case) */
+               if (_IOC_SIZE(cmd) == sizeof(compat_uptr_t)) {
+                       cmd &= ~IOCSIZE_MASK;
+                       cmd |= sizeof(void *) << IOCSIZE_SHIFT;
+               }
+               break;
+       }
+       return perf_ioctl(file, cmd, arg);
+}
+#else
+# define perf_compat_ioctl NULL
+#endif
+
 int perf_event_task_enable(void)
 {
        struct perf_event *event;
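
perf_compat_ioctl() only needs to rewrite the size field of the command number: a 32-bit task encodes PERF_EVENT_IOC_SET_FILTER with a 4-byte pointer argument, so the 64-bit kernel's switch over the full cmd value would otherwise never match. The userspace sketch below rebuilds that 32-bit encoding and applies the same fix-up; it assumes a 64-bit Linux build, where sizeof(compat_uptr_t) would be 4 and sizeof(void *) is 8.

/* Rebuild the compat encoding of PERF_EVENT_IOC_SET_FILTER and fix it up. */
#include <stdio.h>
#include <linux/ioctl.h>
#include <linux/perf_event.h>

int main(void)
{
        /* what a 32-bit task passes: same direction/type/nr, but a 4-byte pointer */
        unsigned int cmd = _IOC(_IOC_WRITE, _IOC_TYPE(PERF_EVENT_IOC_SET_FILTER),
                                _IOC_NR(PERF_EVENT_IOC_SET_FILTER), 4);
        unsigned int orig = cmd;

        if (_IOC_SIZE(cmd) == 4) {              /* sizeof(compat_uptr_t) on 64-bit */
                cmd &= ~IOCSIZE_MASK;
                cmd |= sizeof(void *) << IOCSIZE_SHIFT;
        }

        printf("compat 0x%x -> fixed 0x%x, native 0x%x\n",
               orig, cmd, (unsigned int)PERF_EVENT_IOC_SET_FILTER);
        return 0;
}

After the fix-up the value equals the native encoding, so the unchanged perf_ioctl() switch handles it.
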
@@ -3837,7 +3981,7 @@ static const struct file_operations perf_fops = {
        .read                   = perf_read,
        .poll                   = perf_poll,
        .unlocked_ioctl         = perf_ioctl,
-       .compat_ioctl           = perf_ioctl,
+       .compat_ioctl           = perf_compat_ioctl,
        .mmap                   = perf_mmap,
        .fasync                 = perf_fasync,
 };
@@ -3849,12 +3993,20 @@ static const struct file_operations perf_fops = {
  * to user-space before waking everybody up.
  */
 
+static inline struct fasync_struct **perf_event_fasync(struct perf_event *event)
+{
+       /* only the parent has fasync state */
+       if (event->parent)
+               event = event->parent;
+       return &event->fasync;
+}
+
 void perf_event_wakeup(struct perf_event *event)
 {
        ring_buffer_wakeup(event);
 
        if (event->pending_kill) {
-               kill_fasync(&event->fasync, SIGIO, event->pending_kill);
+               kill_fasync(perf_event_fasync(event), SIGIO, event->pending_kill);
                event->pending_kill = 0;
        }
 }
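
perf_event_fasync() exists because SIGIO delivery is configured on the file descriptor returned by perf_event_open(), and only the parent event owns that file; events inherited into forked children have no fd of their own, so their overflow notifications must go through the parent's fasync state. A minimal userspace view of the setup this serves, with arbitrary attr values and most error handling omitted:

/* Request SIGIO on a perf fd; inherited child events notify the same owner. */
#define _GNU_SOURCE
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <fcntl.h>
#include <signal.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

static void on_sigio(int sig)
{
        (void)sig;
        write(1, "SIGIO\n", 6);                 /* async-signal-safe */
}

int main(void)
{
        struct perf_event_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size = sizeof(attr);
        attr.type = PERF_TYPE_SOFTWARE;
        attr.config = PERF_COUNT_SW_TASK_CLOCK;
        attr.sample_period = 100 * 1000 * 1000; /* overflow every 100ms of task time */
        attr.inherit = 1;                       /* children get fd-less copies */

        int fd = syscall(__NR_perf_event_open, &attr, 0 /* this task */,
                         -1 /* any cpu */, -1 /* no group */, 0);
        if (fd < 0) {
                perror("perf_event_open");
                return 1;
        }

        signal(SIGIO, on_sigio);
        fcntl(fd, F_SETOWN, getpid());                    /* who receives SIGIO */
        fcntl(fd, F_SETFL, fcntl(fd, F_GETFL) | O_ASYNC); /* arm fasync delivery */

        /* A fork()ed child would run against its inherited, fd-less event;
         * with the change above its overflows still raise SIGIO via this fd. */
        for (volatile unsigned long spin = 0; spin < 500000000UL; spin++)
                ;                               /* burn task time to force overflows */
        return 0;
}
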
@@ -3863,6 +4015,13 @@ static void perf_pending_event(struct irq_work *entry)
 {
        struct perf_event *event = container_of(entry,
                        struct perf_event, pending);
+       int rctx;
+
+       rctx = perf_swevent_get_recursion_context();
+       /*
+        * If we 'fail' here, that's OK, it means recursion is already disabled
+        * and we won't recurse 'further'.
+        */
 
        if (event->pending_disable) {
                event->pending_disable = 0;
@@ -3873,6 +4032,9 @@ static void perf_pending_event(struct irq_work *entry)
                event->pending_wakeup = 0;
                perf_event_wakeup(event);
        }
+
+       if (rctx >= 0)
+               perf_swevent_put_recursion_context(rctx);
 }
 
 /*
@@ -4999,7 +5161,7 @@ static int __perf_event_overflow(struct perf_event *event,
        else
                perf_event_output(event, data, regs);
 
-       if (event->fasync && event->pending_kill) {
+       if (*perf_event_fasync(event) && event->pending_kill) {
                event->pending_wakeup = 1;
                irq_work_queue(&event->pending);
        }
@@ -5025,6 +5187,9 @@ struct swevent_htable {
 
        /* Recursion avoidance in each contexts */
        int                             recursion[PERF_NR_CONTEXTS];
+
+       /* Keeps track of cpu being initialized/exited */
+       bool                            online;
 };
 
 static DEFINE_PER_CPU(struct swevent_htable, swevent_htable);
@@ -5271,8 +5436,14 @@ static int perf_swevent_add(struct perf_event *event, int flags)
        hwc->state = !(flags & PERF_EF_START);
 
        head = find_swevent_head(swhash, event);
-       if (WARN_ON_ONCE(!head))
+       if (!head) {
+               /*
+                * We can race with cpu hotplug code. Do not
+                * WARN if the cpu just got unplugged.
+                */
+               WARN_ON_ONCE(swhash->online);
                return -EINVAL;
+       }
 
        hlist_add_head_rcu(&event->hlist_entry, head);
 
@@ -6562,6 +6733,9 @@ SYSCALL_DEFINE5(perf_event_open,
        if (attr.freq) {
                if (attr.sample_freq > sysctl_perf_event_sample_rate)
                        return -EINVAL;
+       } else {
+               if (attr.sample_period & (1ULL << 63))
+                       return -EINVAL;
        }
 
        /*
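
The added check rejects non-frequency sample_period values that have bit 63 set: such a period looks negative to the signed 64-bit arithmetic the period later goes through, so it is refused up front. A short illustration of the predicate (the candidate values are arbitrary):

/* The same sign-bit test applied to two candidate periods. */
#include <stdio.h>
#include <stdint.h>

static int sample_period_ok(uint64_t period)
{
        return !(period & (1ULL << 63));
}

int main(void)
{
        /* prints "1 0": a normal period passes, an all-ones period is rejected */
        printf("%d %d\n", sample_period_ok(100000), sample_period_ok(~0ULL));
        return 0;
}
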
@@ -6708,7 +6882,7 @@ SYSCALL_DEFINE5(perf_event_open,
                struct perf_event_context *gctx = group_leader->ctx;
 
                mutex_lock(&gctx->mutex);
-               perf_remove_from_context(group_leader);
+               perf_remove_from_context(group_leader, false);
 
                /*
                 * Removing from the context ends up with disabled
@@ -6718,7 +6892,7 @@ SYSCALL_DEFINE5(perf_event_open,
                perf_event__state_init(group_leader);
                list_for_each_entry(sibling, &group_leader->sibling_list,
                                    group_entry) {
-                       perf_remove_from_context(sibling);
+                       perf_remove_from_context(sibling, false);
                        perf_event__state_init(sibling);
                        put_ctx(gctx);
                }
@@ -6731,11 +6905,11 @@ SYSCALL_DEFINE5(perf_event_open,
 
        if (move_group) {
                synchronize_rcu();
-               perf_install_in_context(ctx, group_leader, event->cpu);
+               perf_install_in_context(ctx, group_leader, group_leader->cpu);
                get_ctx(ctx);
                list_for_each_entry(sibling, &group_leader->sibling_list,
                                    group_entry) {
-                       perf_install_in_context(ctx, sibling, event->cpu);
+                       perf_install_in_context(ctx, sibling, sibling->cpu);
                        get_ctx(ctx);
                }
        }
@@ -6848,7 +7022,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
        mutex_lock(&src_ctx->mutex);
        list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
                                 event_entry) {
-               perf_remove_from_context(event);
+               perf_remove_from_context(event, false);
                put_ctx(src_ctx);
                list_add(&event->event_entry, &events);
        }
@@ -6908,13 +7082,7 @@ __perf_event_exit_task(struct perf_event *child_event,
                         struct perf_event_context *child_ctx,
                         struct task_struct *child)
 {
-       if (child_event->parent) {
-               raw_spin_lock_irq(&child_ctx->lock);
-               perf_group_detach(child_event);
-               raw_spin_unlock_irq(&child_ctx->lock);
-       }
-
-       perf_remove_from_context(child_event);
+       perf_remove_from_context(child_event, !!child_event->parent);
 
        /*
         * It can happen that the parent exits first, and has events
@@ -7228,7 +7396,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
                 * child.
                 */
 
-               child_ctx = alloc_perf_context(event->pmu, child);
+               child_ctx = alloc_perf_context(parent_ctx->pmu, child);
                if (!child_ctx)
                        return -ENOMEM;
 
@@ -7352,8 +7520,10 @@ int perf_event_init_task(struct task_struct *child)
 
        for_each_task_context_nr(ctxn) {
                ret = perf_event_init_context(child, ctxn);
-               if (ret)
+               if (ret) {
+                       perf_event_free_task(child);
                        return ret;
+               }
        }
 
        return 0;
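
perf_event_init_task() now calls perf_event_free_task() when a later context fails to initialise, so the contexts already set up by earlier loop iterations are torn down instead of leaked. The shape of that error path in a small self-contained sketch with invented names:

/* Undo partially completed per-context setup when a later step fails. */
#include <stdio.h>
#include <stdlib.h>

#define NR_CTX 2

static void *init_ctx(int n)
{
        if (n == 1)                     /* simulate the second context failing */
                return NULL;
        return calloc(1, 16);
}

static int init_task_contexts(void *ctx[NR_CTX])
{
        for (int n = 0; n < NR_CTX; n++) {
                ctx[n] = init_ctx(n);
                if (!ctx[n]) {
                        /* analogue of perf_event_free_task(): free earlier contexts */
                        for (int k = 0; k < n; k++) {
                                free(ctx[k]);
                                ctx[k] = NULL;
                        }
                        return -1;
                }
        }
        return 0;
}

int main(void)
{
        void *ctx[NR_CTX] = { NULL };

        printf("init %s\n", init_task_contexts(ctx) ? "failed, cleaned up" : "ok");
        return 0;
}
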
@@ -7376,6 +7546,7 @@ static void __cpuinit perf_event_init_cpu(int cpu)
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
        mutex_lock(&swhash->hlist_mutex);
+       swhash->online = true;
        if (swhash->hlist_refcount > 0) {
                struct swevent_hlist *hlist;
 
@@ -7398,15 +7569,15 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
 
 static void __perf_event_exit_context(void *__info)
 {
+       struct remove_event re = { .detach_group = false };
        struct perf_event_context *ctx = __info;
-       struct perf_event *event, *tmp;
 
        perf_pmu_rotate_stop(ctx->pmu);
 
-       list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-               __perf_remove_from_context(event);
-       list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
-               __perf_remove_from_context(event);
+       rcu_read_lock();
+       list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
+               __perf_remove_from_context(&re);
+       rcu_read_unlock();
 }
 
 static void perf_event_exit_cpu_context(int cpu)
@@ -7430,11 +7601,12 @@ static void perf_event_exit_cpu(int cpu)
 {
        struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
 
+       perf_event_exit_cpu_context(cpu);
+
        mutex_lock(&swhash->hlist_mutex);
+       swhash->online = false;
        swevent_hlist_release(swhash);
        mutex_unlock(&swhash->hlist_mutex);
-
-       perf_event_exit_cpu_context(cpu);
 }
 #else
 static inline void perf_event_exit_cpu(int cpu) { }