perf_counter: fix counter freeing logic
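
Move the context's nr_counters accounting into list_add_counter() and
list_del_counter() so that every list manipulation keeps the count in
sync, and use list_del_counter() when tearing down a task's counters in
__perf_counter_exit_task().  Since removing a group leader splices its
siblings back onto the context's counter_list, drop the explicit
sibling sync/free loop and instead re-scan the list in
perf_counter_exit_task() until it is empty.

The diff also carries the following related changes:

 - account mmap()ed buffer pages per user rather than per counter:
   sysctl_perf_counter_mlock becomes a 512 KB per-user 'free' allowance
   tracked in user->locked_vm; only pages beyond it are charged to the
   mm's locked_vm and checked against RLIMIT_MEMLOCK.

 - frequency based sampling: when hw_event.freq is set, derive
   hw.irq_period from hw_event.irq_freq at counter creation, count
   interrupts in perf_counter_overflow(), and let perf_adjust_freq()
   re-estimate the period from the observed rate on every tick.
   For illustration (assumed numbers, HZ=1000): a counter asking for
   irq_freq=1000 that took 2 interrupts in the last tick with
   irq_period=500000 gives period = 1000 * 2 * 500000 / 1000 = 1000000,
   so delta = (1 + 1000000 - 500000) >> 1 = 250000 and the new
   irq_period becomes 750000, i.e. halfway towards the estimate.

 - enforce a minimum 10000 ns period for the software clock counters'
   hrtimer at enable and re-arm time instead of rewriting
   hw_event.irq_period in sw_perf_counter_init().

 - use perf_misc_flags() and perf_instruction_pointer() when filling
   in the sample header.
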
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0173738dd548eaa07e729c0658759f00c15b8694..7af16d1c480fbf0c19d1c9ad740e8e68c2516007 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -45,7 +45,7 @@ static atomic_t nr_munmap_tracking __read_mostly;
 static atomic_t nr_comm_tracking __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
-int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */
+int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
 
 /*
  * Lock for (sysadmin-configurable) counter reservations:
@@ -115,6 +115,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
        }
 
        list_add_rcu(&counter->event_entry, &ctx->event_list);
+       ctx->nr_counters++;
 }
 
 static void
@@ -122,6 +123,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
        struct perf_counter *sibling, *tmp;
 
+       ctx->nr_counters--;
+
        list_del_init(&counter->list_entry);
        list_del_rcu(&counter->event_entry);
 
@@ -209,7 +212,6 @@ static void __perf_counter_remove_from_context(void *info)
        counter_sched_out(counter, cpuctx, ctx);
 
        counter->task = NULL;
-       ctx->nr_counters--;
 
        /*
         * Protect the list operation against NMI by disabling the
@@ -276,7 +278,6 @@ retry:
         * succeed.
         */
        if (!list_empty(&counter->list_entry)) {
-               ctx->nr_counters--;
                list_del_counter(counter, ctx);
                counter->task = NULL;
        }
@@ -544,7 +545,6 @@ static void add_counter_to_ctx(struct perf_counter *counter,
                               struct perf_counter_context *ctx)
 {
        list_add_counter(counter, ctx);
-       ctx->nr_counters++;
        counter->prev_state = PERF_COUNTER_STATE_OFF;
        counter->tstamp_enabled = ctx->time;
        counter->tstamp_running = ctx->time;
@@ -1046,6 +1046,38 @@ int perf_counter_task_enable(void)
        return 0;
 }
 
+void perf_adjust_freq(struct perf_counter_context *ctx)
+{
+       struct perf_counter *counter;
+       u64 irq_period;
+       u64 events, period;
+       s64 delta;
+
+       spin_lock(&ctx->lock);
+       list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+               if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+                       continue;
+
+               if (!counter->hw_event.freq || !counter->hw_event.irq_freq)
+                       continue;
+
+               events = HZ * counter->hw.interrupts * counter->hw.irq_period;
+               period = div64_u64(events, counter->hw_event.irq_freq);
+
+               delta = (s64)(1 + period - counter->hw.irq_period);
+               delta >>= 1;
+
+               irq_period = counter->hw.irq_period + delta;
+
+               if (!irq_period)
+                       irq_period = 1;
+
+               counter->hw.irq_period = irq_period;
+               counter->hw.interrupts = 0;
+       }
+       spin_unlock(&ctx->lock);
+}
+
 /*
  * Round-robin a context's counters:
  */
@@ -1081,6 +1113,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
        cpuctx = &per_cpu(perf_cpu_context, cpu);
        ctx = &curr->perf_counter_ctx;
 
+       perf_adjust_freq(&cpuctx->ctx);
+       perf_adjust_freq(ctx);
+
        perf_counter_cpu_sched_out(cpuctx);
        __perf_counter_task_sched_out(ctx);
 
@@ -1522,6 +1557,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 
        if (atomic_dec_and_mutex_lock(&counter->mmap_count,
                                      &counter->mmap_mutex)) {
+               struct user_struct *user = current_user();
+
+               atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
                vma->vm_mm->locked_vm -= counter->data->nr_locked;
                perf_mmap_data_free(counter);
                mutex_unlock(&counter->mmap_mutex);
@@ -1537,11 +1575,13 @@ static struct vm_operations_struct perf_mmap_vmops = {
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct perf_counter *counter = file->private_data;
+       struct user_struct *user = current_user();
        unsigned long vma_size;
        unsigned long nr_pages;
+       unsigned long user_locked, user_lock_limit;
        unsigned long locked, lock_limit;
+       long user_extra, extra;
        int ret = 0;
-       long extra;
 
        if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
                return -EINVAL;
@@ -1569,15 +1609,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                goto unlock;
        }
 
-       extra = nr_pages /* + 1 only account the data pages */;
-       extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
-       if (extra < 0)
-               extra = 0;
+       user_extra = nr_pages + 1;
+       user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+       user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 
-       locked = vma->vm_mm->locked_vm + extra;
+       extra = 0;
+       if (user_locked > user_lock_limit)
+               extra = user_locked - user_lock_limit;
 
        lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
        lock_limit >>= PAGE_SHIFT;
+       locked = vma->vm_mm->locked_vm + extra;
 
        if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
                ret = -EPERM;
@@ -1590,6 +1632,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                goto unlock;
 
        atomic_set(&counter->mmap_count, 1);
+       atomic_long_add(user_extra, &user->locked_vm);
        vma->vm_mm->locked_vm += extra;
        counter->data->nr_locked = extra;
 unlock:
@@ -1999,11 +2042,10 @@ static void perf_counter_output(struct perf_counter *counter,
        header.size = sizeof(header);
 
        header.misc = PERF_EVENT_MISC_OVERFLOW;
-       header.misc |= user_mode(regs) ?
-               PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
+       header.misc |= perf_misc_flags(regs);
 
        if (record_type & PERF_RECORD_IP) {
-               ip = instruction_pointer(regs);
+               ip = perf_instruction_pointer(regs);
                header.type |= PERF_RECORD_IP;
                header.size += sizeof(ip);
        }
@@ -2374,6 +2416,8 @@ int perf_counter_overflow(struct perf_counter *counter,
        int events = atomic_read(&counter->event_limit);
        int ret = 0;
 
+       counter->hw.interrupts++;
+
        /*
         * XXX event_limit might not quite work as expected on inherited
         * counters
@@ -2442,6 +2486,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
        enum hrtimer_restart ret = HRTIMER_RESTART;
        struct perf_counter *counter;
        struct pt_regs *regs;
+       u64 period;
 
        counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
        counter->pmu->read(counter);
@@ -2460,7 +2505,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
                        ret = HRTIMER_NORESTART;
        }
 
-       hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
+       period = max_t(u64, 10000, counter->hw.irq_period);
+       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
 
        return ret;
 }
@@ -2621,8 +2667,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hwc->hrtimer.function = perf_swcounter_hrtimer;
        if (hwc->irq_period) {
+               u64 period = max_t(u64, 10000, hwc->irq_period);
                __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(hwc->irq_period), 0,
+                               ns_to_ktime(period), 0,
                                HRTIMER_MODE_REL, 0);
        }
 
@@ -2671,8 +2718,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hwc->hrtimer.function = perf_swcounter_hrtimer;
        if (hwc->irq_period) {
+               u64 period = max_t(u64, 10000, hwc->irq_period);
                __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(hwc->irq_period), 0,
+                               ns_to_ktime(period), 0,
                                HRTIMER_MODE_REL, 0);
        }
 
@@ -2803,9 +2851,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 
 static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 {
-       struct perf_counter_hw_event *hw_event = &counter->hw_event;
        const struct pmu *pmu = NULL;
-       struct hw_perf_counter *hwc = &counter->hw;
 
        /*
         * Software counters (currently) can't in general distinguish
@@ -2818,8 +2864,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
        case PERF_COUNT_CPU_CLOCK:
                pmu = &perf_ops_cpu_clock;
 
-               if (hw_event->irq_period && hw_event->irq_period < 10000)
-                       hw_event->irq_period = 10000;
                break;
        case PERF_COUNT_TASK_CLOCK:
                /*
@@ -2831,8 +2875,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
                else
                        pmu = &perf_ops_cpu_clock;
 
-               if (hw_event->irq_period && hw_event->irq_period < 10000)
-                       hw_event->irq_period = 10000;
                break;
        case PERF_COUNT_PAGE_FAULTS:
        case PERF_COUNT_PAGE_FAULTS_MIN:
@@ -2846,9 +2888,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
                break;
        }
 
-       if (pmu)
-               hwc->irq_period = hw_event->irq_period;
-
        return pmu;
 }
 
@@ -2864,6 +2903,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 {
        const struct pmu *pmu;
        struct perf_counter *counter;
+       struct hw_perf_counter *hwc;
        long err;
 
        counter = kzalloc(sizeof(*counter), gfpflags);
@@ -2899,6 +2939,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
        pmu = NULL;
 
+       hwc = &counter->hw;
+       if (hw_event->freq && hw_event->irq_freq)
+               hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq);
+       else
+               hwc->irq_period = hw_event->irq_period;
+
        /*
         * we currently do not support PERF_RECORD_GROUP on inherited counters
         */
@@ -3160,9 +3206,8 @@ static int inherit_group(struct perf_counter *parent_counter,
 static void sync_child_counter(struct perf_counter *child_counter,
                               struct perf_counter *parent_counter)
 {
-       u64 parent_val, child_val;
+       u64 child_val;
 
-       parent_val = atomic64_read(&parent_counter->count);
        child_val = atomic64_read(&child_counter->count);
 
        /*
@@ -3194,7 +3239,6 @@ __perf_counter_exit_task(struct task_struct *child,
                         struct perf_counter_context *child_ctx)
 {
        struct perf_counter *parent_counter;
-       struct perf_counter *sub, *tmp;
 
        /*
         * If we do not self-reap then we have to wait for the
@@ -3206,8 +3250,8 @@ __perf_counter_exit_task(struct task_struct *child,
         */
        if (child != current) {
                wait_task_inactive(child, 0);
-               list_del_init(&child_counter->list_entry);
                update_counter_times(child_counter);
+               list_del_counter(child_counter, child_ctx);
        } else {
                struct perf_cpu_context *cpuctx;
                unsigned long flags;
@@ -3226,9 +3270,7 @@ __perf_counter_exit_task(struct task_struct *child,
                group_sched_out(child_counter, cpuctx, child_ctx);
                update_counter_times(child_counter);
 
-               list_del_init(&child_counter->list_entry);
-
-               child_ctx->nr_counters--;
+               list_del_counter(child_counter, child_ctx);
 
                perf_enable();
                local_irq_restore(flags);
@@ -3242,13 +3284,6 @@ __perf_counter_exit_task(struct task_struct *child,
         */
        if (parent_counter) {
                sync_child_counter(child_counter, parent_counter);
-               list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
-                                        list_entry) {
-                       if (sub->parent) {
-                               sync_child_counter(sub, sub->parent);
-                               free_counter(sub);
-                       }
-               }
                free_counter(child_counter);
        }
 }
@@ -3264,14 +3299,25 @@ void perf_counter_exit_task(struct task_struct *child)
        struct perf_counter *child_counter, *tmp;
        struct perf_counter_context *child_ctx;
 
+       WARN_ON_ONCE(child != current);
+
        child_ctx = &child->perf_counter_ctx;
 
        if (likely(!child_ctx->nr_counters))
                return;
 
+again:
        list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
                                 list_entry)
                __perf_counter_exit_task(child, child_counter, child_ctx);
+
+       /*
+        * If the last counter was a group counter, it will have appended all
+        * its siblings to the list, but we obtained 'tmp' before that which
+        * will still point to the list head terminating the iteration.
+        */
+       if (!list_empty(&child_ctx->counter_list))
+               goto again;
 }
 
 /*