Merge tag 'v3.10.56' into update
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 9dc297faf7c01b68cbb1a7a24444b99c300cb8a3..cd17762037819d63dee7c1424abf56a989834aa9 100644
@@ -165,25 +165,109 @@ int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free'
 /*
  * max perf event sample rate
  */
-#define DEFAULT_MAX_SAMPLE_RATE 100000
-int sysctl_perf_event_sample_rate __read_mostly = DEFAULT_MAX_SAMPLE_RATE;
-static int max_samples_per_tick __read_mostly =
-       DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+#define DEFAULT_MAX_SAMPLE_RATE                100000
+#define DEFAULT_SAMPLE_PERIOD_NS       (NSEC_PER_SEC / DEFAULT_MAX_SAMPLE_RATE)
+#define DEFAULT_CPU_TIME_MAX_PERCENT   25
+
+int sysctl_perf_event_sample_rate __read_mostly        = DEFAULT_MAX_SAMPLE_RATE;
+
+static int max_samples_per_tick __read_mostly  = DIV_ROUND_UP(DEFAULT_MAX_SAMPLE_RATE, HZ);
+static int perf_sample_period_ns __read_mostly = DEFAULT_SAMPLE_PERIOD_NS;
+
+static atomic_t perf_sample_allowed_ns __read_mostly =
+       ATOMIC_INIT(DEFAULT_SAMPLE_PERIOD_NS * DEFAULT_CPU_TIME_MAX_PERCENT / 100);
+
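+/*
+ * Recompute the per-sample CPU time budget.  With the defaults above:
+ *   perf_sample_period_ns  = NSEC_PER_SEC / 100000 = 10000 ns
+ *   perf_sample_allowed_ns = 10000 * 25 / 100      =  2500 ns
+ * i.e. a sample may consume at most 25% of the interval between two
+ * samples before perf_sample_event_took() starts cutting the sample rate.
+ */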
+void update_perf_cpu_limits(void)
+{
+       u64 tmp = perf_sample_period_ns;
+
+       tmp *= sysctl_perf_cpu_time_max_percent;
+       do_div(tmp, 100);
+       atomic_set(&perf_sample_allowed_ns, tmp);
+}
 
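+/*
+ * Sysctl handler for kernel.perf_event_max_sample_rate: on a successful
+ * write, re-derive max_samples_per_tick, the sample period in ns and the
+ * allowed CPU time per sample.
+ */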
 int perf_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
 {
-       int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+       int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 
        if (ret || !write)
                return ret;
 
        max_samples_per_tick = DIV_ROUND_UP(sysctl_perf_event_sample_rate, HZ);
+       perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+       update_perf_cpu_limits();
+
+       return 0;
+}
+
+int sysctl_perf_cpu_time_max_percent __read_mostly = DEFAULT_CPU_TIME_MAX_PERCENT;
+
+int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
+                               void __user *buffer, size_t *lenp,
+                               loff_t *ppos)
+{
+       int ret = proc_dointvec(table, write, buffer, lenp, ppos);
+
+       if (ret || !write)
+               return ret;
+
+       update_perf_cpu_limits();
 
        return 0;
 }
 
+/*
+ * perf samples are done in some very critical code paths (NMIs).
+ * If they take too much CPU time, the system can lock up and not
+ * get any real work done.  This will drop the sample rate when
+ * we detect that events are taking too long.
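+ *
+ * Example with HZ=100 and the 100000 Hz default: max_samples_per_tick
+ * starts at DIV_ROUND_UP(100000, 100) = 1000.  Each time the running
+ * average of sample time exceeds perf_sample_allowed_ns, the limit is
+ * halved (1000 -> 500 -> 250 -> ...) and perf_event_max_sample_rate is
+ * lowered to max_samples_per_tick * HZ, bottoming out at one sample
+ * per tick.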
+ */
+#define NR_ACCUMULATED_SAMPLES 128
+DEFINE_PER_CPU(u64, running_sample_length);
+
+void perf_sample_event_took(u64 sample_len_ns)
+{
+       u64 avg_local_sample_len;
+       u64 local_samples_len;
+
+       if (atomic_read(&perf_sample_allowed_ns) == 0)
+               return;
+
+       /* decay the counter by 1 average sample */
+       local_samples_len = __get_cpu_var(running_sample_length);
+       local_samples_len -= local_samples_len/NR_ACCUMULATED_SAMPLES;
+       local_samples_len += sample_len_ns;
+       __get_cpu_var(running_sample_length) = local_samples_len;
+
+       /*
+        * note: this will be biased artificially low until we have
+        * seen NR_ACCUMULATED_SAMPLES.  Doing it this way keeps us
+        * from having to maintain a count.
+        */
+       avg_local_sample_len = local_samples_len/NR_ACCUMULATED_SAMPLES;
+
+       if (avg_local_sample_len <= atomic_read(&perf_sample_allowed_ns))
+               return;
+
+       if (max_samples_per_tick <= 1)
+               return;
+
+       max_samples_per_tick = DIV_ROUND_UP(max_samples_per_tick, 2);
+       sysctl_perf_event_sample_rate = max_samples_per_tick * HZ;
+       perf_sample_period_ns = NSEC_PER_SEC / sysctl_perf_event_sample_rate;
+
+       printk_ratelimited(KERN_WARNING
+                       "perf samples too long (%lld > %d), lowering "
+                       "kernel.perf_event_max_sample_rate to %d\n",
+                       avg_local_sample_len,
+                       atomic_read(&perf_sample_allowed_ns),
+                       sysctl_perf_event_sample_rate);
+
+       update_perf_cpu_limits();
+}
+
 static atomic64_t perf_event_id;
 
 static void cpu_ctx_sched_out(struct perf_cpu_context *cpuctx,
@@ -196,9 +280,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx,
 static void update_context_time(struct perf_event_context *ctx);
 static u64 perf_event_time(struct perf_event *event);
 
-static void ring_buffer_attach(struct perf_event *event,
-                              struct ring_buffer *rb);
-
 void __weak perf_event_print_debug(void)       { }
 
 extern __weak const char *perf_pmu_name(void)
@@ -764,8 +845,18 @@ perf_lock_task_context(struct task_struct *task, int ctxn, unsigned long *flags)
 {
        struct perf_event_context *ctx;
 
-       rcu_read_lock();
 retry:
+       /*
+        * One of the few rules of preemptible RCU is that one cannot do
+        * rcu_read_unlock() while holding a scheduler (or nested) lock when
+        * part of the read side critical section was preemptible -- see
+        * rcu_read_unlock_special().
+        *
+        * Since ctx->lock nests under rq->lock we must ensure the entire read
+        * side critical section is non-preemptible.
+        */
+       preempt_disable();
+       rcu_read_lock();
        ctx = rcu_dereference(task->perf_event_ctxp[ctxn]);
        if (ctx) {
                /*
@@ -781,6 +872,8 @@ retry:
                raw_spin_lock_irqsave(&ctx->lock, *flags);
                if (ctx != rcu_dereference(task->perf_event_ctxp[ctxn])) {
                        raw_spin_unlock_irqrestore(&ctx->lock, *flags);
+                       rcu_read_unlock();
+                       preempt_enable();
                        goto retry;
                }
 
@@ -790,6 +883,7 @@ retry:
                }
        }
        rcu_read_unlock();
+       preempt_enable();
        return ctx;
 }
 
@@ -1142,10 +1236,17 @@ static void perf_group_detach(struct perf_event *event)
         * If this was a group event with sibling events then
         * upgrade the siblings to singleton events by adding them
         * to whatever list we are on.
+        * If this isn't on a list, make sure we still remove the sibling's
+        * group_entry from this sibling_list; otherwise, when that sibling
+        * is later deallocated, it will try to remove itself from this
+        * sibling_list, which may well have been deallocated already,
+        * resulting in a use-after-free.
         */
        list_for_each_entry_safe(sibling, tmp, &event->sibling_list, group_entry) {
                if (list)
                        list_move_tail(&sibling->group_entry, list);
+               else
+                       list_del_init(&sibling->group_entry);
                sibling->group_leader = sibling;
 
                /* Inherit group flags from the previous leader */
@@ -1227,6 +1328,11 @@ group_sched_out(struct perf_event *group_event,
                cpuctx->exclusive = 0;
 }
 
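+/*
+ * Argument bundle for the cross-CPU __perf_remove_from_context() call:
+ * the event to remove plus whether the caller also wants it detached
+ * from its group while ctx->lock is held on the target CPU.
+ */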
+struct remove_event {
+       struct perf_event *event;
+       bool detach_group;
+};
+
 /*
  * Cross CPU call to remove a performance event
  *
@@ -1235,12 +1341,15 @@ group_sched_out(struct perf_event *group_event,
  */
 static int __perf_remove_from_context(void *info)
 {
-       struct perf_event *event = info;
+       struct remove_event *re = info;
+       struct perf_event *event = re->event;
        struct perf_event_context *ctx = event->ctx;
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
 
        raw_spin_lock(&ctx->lock);
        event_sched_out(event, cpuctx, ctx);
+       if (re->detach_group)
+               perf_group_detach(event);
        list_del_event(event, ctx);
        if (!ctx->nr_events && cpuctx->task_ctx == ctx) {
                ctx->is_active = 0;
@@ -1265,10 +1374,14 @@ static int __perf_remove_from_context(void *info)
  * When called from perf_event_exit_task, it's OK because the
  * context has been detached from its task.
  */
-static void perf_remove_from_context(struct perf_event *event)
+static void perf_remove_from_context(struct perf_event *event, bool detach_group)
 {
        struct perf_event_context *ctx = event->ctx;
        struct task_struct *task = ctx->task;
+       struct remove_event re = {
+               .event = event,
+               .detach_group = detach_group,
+       };
 
        lockdep_assert_held(&ctx->mutex);
 
@@ -1277,12 +1390,12 @@ static void perf_remove_from_context(struct perf_event *event)
                 * Per cpu events are removed via an smp call and
                 * the removal is always successful.
                 */
-               cpu_function_call(event->cpu, __perf_remove_from_context, event);
+               cpu_function_call(event->cpu, __perf_remove_from_context, &re);
                return;
        }
 
 retry:
-       if (!task_function_call(task, __perf_remove_from_context, event))
+       if (!task_function_call(task, __perf_remove_from_context, &re))
                return;
 
        raw_spin_lock_irq(&ctx->lock);
@@ -1292,6 +1405,11 @@ retry:
         */
        if (ctx->is_active) {
                raw_spin_unlock_irq(&ctx->lock);
+               /*
+                * Reload the task pointer, it might have been changed by
+                * a concurrent perf_event_context_sched_out().
+                */
+               task = ctx->task;
                goto retry;
        }
 
@@ -1299,6 +1417,8 @@ retry:
         * Since the task isn't running, it's safe to remove the event; our
         * holding the ctx->lock ensures the task won't get scheduled in.
         */
+       if (detach_group)
+               perf_group_detach(event);
        list_del_event(event, ctx);
        raw_spin_unlock_irq(&ctx->lock);
 }
@@ -1721,6 +1841,11 @@ retry:
         */
        if (ctx->is_active) {
                raw_spin_unlock_irq(&ctx->lock);
+               /*
+                * Reload the task pointer, it might have been changed by
+                * a concurrent perf_event_context_sched_out().
+                */
+               task = ctx->task;
                goto retry;
        }
 
@@ -1764,7 +1889,16 @@ static int __perf_event_enable(void *info)
        struct perf_cpu_context *cpuctx = __get_cpu_context(ctx);
        int err;
 
-       if (WARN_ON_ONCE(!ctx->is_active))
+       /*
+        * There's a time window between the 'ctx->is_active' check
+        * in perf_event_enable() and this place having:
+        *   - IRQs on
+        *   - ctx->lock unlocked
+        *
+        * where the task could be killed and 'ctx' deactivated
+        * by perf_event_exit_task.
+        */
+       if (!ctx->is_active)
                return -EINVAL;
 
        raw_spin_lock(&ctx->lock);
@@ -1997,9 +2131,6 @@ static void __perf_event_sync_stat(struct perf_event *event,
        perf_event_update_userpage(next_event);
 }
 
-#define list_next_entry(pos, member) \
-       list_entry(pos->member.next, typeof(*pos), member)
-
 static void perf_event_sync_stat(struct perf_event_context *ctx,
                                   struct perf_event_context *next_ctx)
 {
@@ -2918,6 +3049,7 @@ static void free_event_rcu(struct rcu_head *head)
 }
 
 static void ring_buffer_put(struct ring_buffer *rb);
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb);
 
 static void free_event(struct perf_event *event)
 {
@@ -2942,15 +3074,30 @@ static void free_event(struct perf_event *event)
                if (has_branch_stack(event)) {
                        static_key_slow_dec_deferred(&perf_sched_events);
                        /* is system-wide event */
-                       if (!(event->attach_state & PERF_ATTACH_TASK))
+                       if (!(event->attach_state & PERF_ATTACH_TASK)) {
                                atomic_dec(&per_cpu(perf_branch_stack_events,
                                                    event->cpu));
+                       }
                }
        }
 
        if (event->rb) {
-               ring_buffer_put(event->rb);
-               event->rb = NULL;
+               struct ring_buffer *rb;
+
+               /*
+                * Can happen when we close an event with re-directed output.
+                *
+                * Since we have a 0 refcount, perf_mmap_close() will skip
+                * over us; possibly making our ring_buffer_put() the last.
+                */
+               mutex_lock(&event->mmap_mutex);
+               rb = event->rb;
+               if (rb) {
+                       rcu_assign_pointer(event->rb, NULL);
+                       ring_buffer_detach(event, rb);
+                       ring_buffer_put(rb); /* could be last */
+               }
+               mutex_unlock(&event->mmap_mutex);
        }
 
        if (is_cgroup_event(event))
@@ -2983,10 +3130,7 @@ int perf_event_release_kernel(struct perf_event *event)
         *     to trigger the AB-BA case.
         */
        mutex_lock_nested(&ctx->mutex, SINGLE_DEPTH_NESTING);
-       raw_spin_lock_irq(&ctx->lock);
-       perf_group_detach(event);
-       raw_spin_unlock_irq(&ctx->lock);
-       perf_remove_from_context(event);
+       perf_remove_from_context(event, true);
        mutex_unlock(&ctx->mutex);
 
        free_event(event);
@@ -3188,30 +3332,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait)
        unsigned int events = POLL_HUP;
 
        /*
-        * Race between perf_event_set_output() and perf_poll(): perf_poll()
-        * grabs the rb reference but perf_event_set_output() overrides it.
-        * Here is the timeline for two threads T1, T2:
-        * t0: T1, rb = rcu_dereference(event->rb)
-        * t1: T2, old_rb = event->rb
-        * t2: T2, event->rb = new rb
-        * t3: T2, ring_buffer_detach(old_rb)
-        * t4: T1, ring_buffer_attach(rb1)
-        * t5: T1, poll_wait(event->waitq)
-        *
-        * To avoid this problem, we grab mmap_mutex in perf_poll()
-        * thereby ensuring that the assignment of the new ring buffer
-        * and the detachment of the old buffer appear atomic to perf_poll()
+        * Pin the event->rb by taking event->mmap_mutex; otherwise
+        * perf_event_set_output() can swizzle our rb and make us miss wakeups.
         */
        mutex_lock(&event->mmap_mutex);
-
-       rcu_read_lock();
-       rb = rcu_dereference(event->rb);
-       if (rb) {
-               ring_buffer_attach(event, rb);
+       rb = event->rb;
+       if (rb)
                events = atomic_xchg(&rb->poll, 0);
-       }
-       rcu_read_unlock();
-
        mutex_unlock(&event->mmap_mutex);
 
        poll_wait(file, &event->waitq, wait);
@@ -3521,16 +3648,12 @@ static void ring_buffer_attach(struct perf_event *event,
                return;
 
        spin_lock_irqsave(&rb->event_lock, flags);
-       if (!list_empty(&event->rb_entry))
-               goto unlock;
-
-       list_add(&event->rb_entry, &rb->event_list);
-unlock:
+       if (list_empty(&event->rb_entry))
+               list_add(&event->rb_entry, &rb->event_list);
        spin_unlock_irqrestore(&rb->event_lock, flags);
 }
 
-static void ring_buffer_detach(struct perf_event *event,
-                              struct ring_buffer *rb)
+static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb)
 {
        unsigned long flags;
 
@@ -3549,13 +3672,10 @@ static void ring_buffer_wakeup(struct perf_event *event)
 
        rcu_read_lock();
        rb = rcu_dereference(event->rb);
-       if (!rb)
-               goto unlock;
-
-       list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
-               wake_up_all(&event->waitq);
-
-unlock:
+       if (rb) {
+               list_for_each_entry_rcu(event, &rb->event_list, rb_entry)
+                       wake_up_all(&event->waitq);
+       }
        rcu_read_unlock();
 }
 
@@ -3584,18 +3704,10 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event)
 
 static void ring_buffer_put(struct ring_buffer *rb)
 {
-       struct perf_event *event, *n;
-       unsigned long flags;
-
        if (!atomic_dec_and_test(&rb->refcount))
                return;
 
-       spin_lock_irqsave(&rb->event_lock, flags);
-       list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) {
-               list_del_init(&event->rb_entry);
-               wake_up_all(&event->waitq);
-       }
-       spin_unlock_irqrestore(&rb->event_lock, flags);
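+       /*
+        * Every event holding a reference detaches itself in free_event()
+        * or perf_mmap_close() before it drops that reference, so by the
+        * time the last reference goes away the list must be empty.
+        */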
+       WARN_ON_ONCE(!list_empty(&rb->event_list));
 
        call_rcu(&rb->rcu_head, rb_free_rcu);
 }
@@ -3605,26 +3717,100 @@ static void perf_mmap_open(struct vm_area_struct *vma)
        struct perf_event *event = vma->vm_file->private_data;
 
        atomic_inc(&event->mmap_count);
+       atomic_inc(&event->rb->mmap_count);
 }
 
+/*
+ * A buffer can be mmap()ed multiple times; either directly through the same
+ * event, or through other events by use of perf_event_set_output().
+ *
+ * In order to undo the VM accounting done by perf_mmap() we need to destroy
+ * the buffer here, where we still have a VM context. This means we need
+ * to detach all events redirecting to us.
+ */
 static void perf_mmap_close(struct vm_area_struct *vma)
 {
        struct perf_event *event = vma->vm_file->private_data;
 
-       if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
-               unsigned long size = perf_data_size(event->rb);
-               struct user_struct *user = event->mmap_user;
-               struct ring_buffer *rb = event->rb;
+       struct ring_buffer *rb = event->rb;
+       struct user_struct *mmap_user = rb->mmap_user;
+       int mmap_locked = rb->mmap_locked;
+       unsigned long size = perf_data_size(rb);
+
+       atomic_dec(&rb->mmap_count);
+
+       if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex))
+               return;
+
+       /* Detach current event from the buffer. */
+       rcu_assign_pointer(event->rb, NULL);
+       ring_buffer_detach(event, rb);
+       mutex_unlock(&event->mmap_mutex);
+
+       /* If there's still other mmap()s of this buffer, we're done. */
+       if (atomic_read(&rb->mmap_count)) {
+               ring_buffer_put(rb); /* can't be last */
+               return;
+       }
 
-               atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
-               vma->vm_mm->pinned_vm -= event->mmap_locked;
-               rcu_assign_pointer(event->rb, NULL);
-               ring_buffer_detach(event, rb);
+       /*
+        * No other mmap()s, detach from all other events that might redirect
+        * into the now unreachable buffer. Somewhat complicated by the
+        * fact that rb::event_lock otherwise nests inside mmap_mutex.
+        */
+again:
+       rcu_read_lock();
+       list_for_each_entry_rcu(event, &rb->event_list, rb_entry) {
+               if (!atomic_long_inc_not_zero(&event->refcount)) {
+                       /*
+                        * This event is en-route to free_event() which will
+                        * detach it and remove it from the list.
+                        */
+                       continue;
+               }
+               rcu_read_unlock();
+
+               mutex_lock(&event->mmap_mutex);
+               /*
+                * Check we didn't race with perf_event_set_output() which can
+                * swizzle the rb from under us while we were waiting to
+                * acquire mmap_mutex.
+                *
+                * If we find a different rb, ignore this event; a later
+                * iteration will no longer find it on the list. We have to
+                * still restart the iteration to make sure we're not now
+                * iterating the wrong list.
+                */
+               if (event->rb == rb) {
+                       rcu_assign_pointer(event->rb, NULL);
+                       ring_buffer_detach(event, rb);
+                       ring_buffer_put(rb); /* can't be last, we still have one */
+               }
                mutex_unlock(&event->mmap_mutex);
+               put_event(event);
 
-               ring_buffer_put(rb);
-               free_uid(user);
+               /*
+                * Restart the iteration; either we're on the wrong list or
+                * destroyed its integrity by doing a deletion.
+                */
+               goto again;
        }
+       rcu_read_unlock();
+
+       /*
+        * It could be there's still a few 0-ref events on the list; they'll
+        * get cleaned up by free_event() -- they'll also still have their
+        * ref on the rb and will free it whenever they are done with it.
+        *
+        * Aside from that, this buffer is 'fully' detached and unmapped,
+        * undo the VM accounting.
+        */
+
+       atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm);
+       vma->vm_mm->pinned_vm -= mmap_locked;
+       free_uid(mmap_user);
+
+       ring_buffer_put(rb); /* could be last */
 }
 
 static const struct vm_operations_struct perf_mmap_vmops = {
@@ -3674,12 +3860,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                return -EINVAL;
 
        WARN_ON_ONCE(event->ctx->parent_ctx);
+again:
        mutex_lock(&event->mmap_mutex);
        if (event->rb) {
-               if (event->rb->nr_pages == nr_pages)
-                       atomic_inc(&event->rb->refcount);
-               else
+               if (event->rb->nr_pages != nr_pages) {
                        ret = -EINVAL;
+                       goto unlock;
+               }
+
+               if (!atomic_inc_not_zero(&event->rb->mmap_count)) {
+                       /*
+                        * Raced against perf_mmap_close() through
+                        * perf_event_set_output(). Try again, hope for better
+                        * luck.
+                        */
+                       mutex_unlock(&event->mmap_mutex);
+                       goto again;
+               }
+
                goto unlock;
        }
 
@@ -3720,12 +3918,16 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                ret = -ENOMEM;
                goto unlock;
        }
-       rcu_assign_pointer(event->rb, rb);
+
+       atomic_set(&rb->mmap_count, 1);
+       rb->mmap_locked = extra;
+       rb->mmap_user = get_current_user();
 
        atomic_long_add(user_extra, &user->locked_vm);
-       event->mmap_locked = extra;
-       event->mmap_user = get_current_user();
-       vma->vm_mm->pinned_vm += event->mmap_locked;
+       vma->vm_mm->pinned_vm += extra;
+
+       ring_buffer_attach(event, rb);
+       rcu_assign_pointer(event->rb, rb);
 
        perf_event_update_userpage(event);
 
@@ -3734,7 +3936,11 @@ unlock:
                atomic_inc(&event->mmap_count);
        mutex_unlock(&event->mmap_mutex);
 
-       vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP;
+       /*
+        * Since pinned accounting is per vm we cannot allow fork() to copy our
+        * vma.
+        */
+       vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP;
        vma->vm_ops = &perf_mmap_vmops;
 
        return ret;
@@ -5269,7 +5475,6 @@ static int swevent_hlist_get_cpu(struct perf_event *event, int cpu)
        int err = 0;
 
        mutex_lock(&swhash->hlist_mutex);
-
        if (!swevent_hlist_deref(swhash) && cpu_online(cpu)) {
                struct swevent_hlist *hlist;
 
@@ -6412,6 +6617,8 @@ set:
        if (atomic_read(&event->mmap_count))
                goto unlock;
 
+       old_rb = event->rb;
+
        if (output_event) {
                /* get the rb we want to redirect to */
                rb = ring_buffer_get(output_event);
@@ -6419,16 +6626,28 @@ set:
                        goto unlock;
        }
 
-       old_rb = event->rb;
-       rcu_assign_pointer(event->rb, rb);
        if (old_rb)
                ring_buffer_detach(event, old_rb);
+
+       if (rb)
+               ring_buffer_attach(event, rb);
+
+       rcu_assign_pointer(event->rb, rb);
+
+       if (old_rb) {
+               ring_buffer_put(old_rb);
+               /*
+                * Because we detached the old rb before attaching the new
+                * one, we could have missed a wakeup.
+                * Provide it now.
+                */
+               wake_up_all(&event->waitq);
+       }
+
        ret = 0;
 unlock:
        mutex_unlock(&event->mmap_mutex);
 
-       if (old_rb)
-               ring_buffer_put(old_rb);
 out:
        return ret;
 }
@@ -6473,6 +6692,9 @@ SYSCALL_DEFINE5(perf_event_open,
        if (attr.freq) {
                if (attr.sample_freq > sysctl_perf_event_sample_rate)
                        return -EINVAL;
+       } else {
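+               /*
+                * sample_period feeds signed 64-bit arithmetic later on; a
+                * value with bit 63 set would go negative there, so reject
+                * it outright.
+                */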
+               if (attr.sample_period & (1ULL << 63))
+                       return -EINVAL;
        }
 
        /*
@@ -6619,7 +6841,7 @@ SYSCALL_DEFINE5(perf_event_open,
                struct perf_event_context *gctx = group_leader->ctx;
 
                mutex_lock(&gctx->mutex);
-               perf_remove_from_context(group_leader);
+               perf_remove_from_context(group_leader, false);
 
                /*
                 * Removing from the context ends up with disabled
@@ -6629,7 +6851,7 @@ SYSCALL_DEFINE5(perf_event_open,
                perf_event__state_init(group_leader);
                list_for_each_entry(sibling, &group_leader->sibling_list,
                                    group_entry) {
-                       perf_remove_from_context(sibling);
+                       perf_remove_from_context(sibling, false);
                        perf_event__state_init(sibling);
                        put_ctx(gctx);
                }
@@ -6759,7 +6981,7 @@ void perf_pmu_migrate_context(struct pmu *pmu, int src_cpu, int dst_cpu)
        mutex_lock(&src_ctx->mutex);
        list_for_each_entry_safe(event, tmp, &src_ctx->event_list,
                                 event_entry) {
-               perf_remove_from_context(event);
+               perf_remove_from_context(event, false);
                put_ctx(src_ctx);
                list_add(&event->event_entry, &events);
        }
@@ -6819,13 +7041,7 @@ __perf_event_exit_task(struct perf_event *child_event,
                         struct perf_event_context *child_ctx,
                         struct task_struct *child)
 {
-       if (child_event->parent) {
-               raw_spin_lock_irq(&child_ctx->lock);
-               perf_group_detach(child_event);
-               raw_spin_unlock_irq(&child_ctx->lock);
-       }
-
-       perf_remove_from_context(child_event);
+       perf_remove_from_context(child_event, !!child_event->parent);
 
        /*
         * It can happen that the parent exits first, and has events
@@ -7139,7 +7355,7 @@ inherit_task_group(struct perf_event *event, struct task_struct *parent,
                 * child.
                 */
 
-               child_ctx = alloc_perf_context(event->pmu, child);
+               child_ctx = alloc_perf_context(parent_ctx->pmu, child);
                if (!child_ctx)
                        return -ENOMEM;
 
@@ -7309,15 +7525,15 @@ static void perf_pmu_rotate_stop(struct pmu *pmu)
 
 static void __perf_event_exit_context(void *__info)
 {
+       struct remove_event re = { .detach_group = false };
        struct perf_event_context *ctx = __info;
-       struct perf_event *event, *tmp;
 
        perf_pmu_rotate_stop(ctx->pmu);
 
-       list_for_each_entry_safe(event, tmp, &ctx->pinned_groups, group_entry)
-               __perf_remove_from_context(event);
-       list_for_each_entry_safe(event, tmp, &ctx->flexible_groups, group_entry)
-               __perf_remove_from_context(event);
+       rcu_read_lock();
+       list_for_each_entry_rcu(re.event, &ctx->event_list, event_entry)
+               __perf_remove_from_context(&re);
+       rcu_read_unlock();
 }
 
 static void perf_event_exit_cpu_context(int cpu)
@@ -7339,12 +7555,6 @@ static void perf_event_exit_cpu_context(int cpu)
 
 static void perf_event_exit_cpu(int cpu)
 {
-       struct swevent_htable *swhash = &per_cpu(swevent_htable, cpu);
-
-       mutex_lock(&swhash->hlist_mutex);
-       swevent_hlist_release(swhash);
-       mutex_unlock(&swhash->hlist_mutex);
-
        perf_event_exit_cpu_context(cpu);
 }
 #else