perf_counter: fix counter freeing logic
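
Move the context's nr_counters accounting into list_add_counter() and
list_del_counter() so that every list manipulation keeps the count in
sync, and use list_del_counter() when tearing down a task's counters in
__perf_counter_exit_task().  Since removing a group leader splices its
siblings back onto the context's counter_list, drop the explicit
sibling sync/free loop and instead re-scan the list in
perf_counter_exit_task() until it is empty.

The diff also carries the following related changes:

 - account mmap()ed buffer pages per user rather than per counter:
   sysctl_perf_counter_mlock becomes a 512 KB per-user 'free' allowance
   tracked in user->locked_vm; only pages beyond it are charged to the
   mm's locked_vm and checked against RLIMIT_MEMLOCK.

 - frequency based sampling: when hw_event.freq is set, derive
   hw.irq_period from hw_event.irq_freq at counter creation, count
   interrupts in perf_counter_overflow(), and let perf_adjust_freq()
   re-estimate the period from the observed rate on every tick.
   For illustration (assumed numbers, HZ=1000): a counter asking for
   irq_freq=1000 that took 2 interrupts in the last tick with
   irq_period=500000 gives period = 1000 * 2 * 500000 / 1000 = 1000000,
   so delta = (1 + 1000000 - 500000) >> 1 = 250000 and the new
   irq_period becomes 750000, i.e. halfway towards the estimate.

 - enforce a minimum 10000 ns period for the software clock counters'
   hrtimer at enable and re-arm time instead of rewriting
   hw_event.irq_period in sw_perf_counter_init().

 - use perf_misc_flags() and perf_instruction_pointer() when filling
   in the sample header.
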
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 0173738dd548eaa07e729c0658759f00c15b8694..7af16d1c480fbf0c19d1c9ad740e8e68c2516007 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -45,7 +45,7 @@ static atomic_t nr_munmap_tracking __read_mostly;
 static atomic_t nr_comm_tracking __read_mostly;
 
 int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */
-int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */
+int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */
 
 /*
  * Lock for (sysadmin-configurable) counter reservations:
@@ -115,6 +115,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
        }
 
        list_add_rcu(&counter->event_entry, &ctx->event_list);
+       ctx->nr_counters++;
 }
 
 static void
@@ -122,6 +123,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx)
 {
        struct perf_counter *sibling, *tmp;
 
+       ctx->nr_counters--;
+
        list_del_init(&counter->list_entry);
        list_del_rcu(&counter->event_entry);
 
@@ -209,7 +212,6 @@ static void __perf_counter_remove_from_context(void *info)
        counter_sched_out(counter, cpuctx, ctx);
 
        counter->task = NULL;
-       ctx->nr_counters--;
 
        /*
         * Protect the list operation against NMI by disabling the
@@ -276,7 +278,6 @@ retry:
         * succeed.
         */
        if (!list_empty(&counter->list_entry)) {
-               ctx->nr_counters--;
                list_del_counter(counter, ctx);
                counter->task = NULL;
        }
@@ -544,7 +545,6 @@ static void add_counter_to_ctx(struct perf_counter *counter,
                               struct perf_counter_context *ctx)
 {
        list_add_counter(counter, ctx);
-       ctx->nr_counters++;
        counter->prev_state = PERF_COUNTER_STATE_OFF;
        counter->tstamp_enabled = ctx->time;
        counter->tstamp_running = ctx->time;
@@ -1046,6 +1046,38 @@ int perf_counter_task_enable(void)
        return 0;
 }
 
+void perf_adjust_freq(struct perf_counter_context *ctx)
+{
+       struct perf_counter *counter;
+       u64 irq_period;
+       u64 events, period;
+       s64 delta;
+
+       spin_lock(&ctx->lock);
+       list_for_each_entry(counter, &ctx->counter_list, list_entry) {
+               if (counter->state != PERF_COUNTER_STATE_ACTIVE)
+                       continue;
+
+               if (!counter->hw_event.freq || !counter->hw_event.irq_freq)
+                       continue;
+
+               events = HZ * counter->hw.interrupts * counter->hw.irq_period;
+               period = div64_u64(events, counter->hw_event.irq_freq);
+
+               delta = (s64)(1 + period - counter->hw.irq_period);
+               delta >>= 1;
+
+               irq_period = counter->hw.irq_period + delta;
+
+               if (!irq_period)
+                       irq_period = 1;
+
+               counter->hw.irq_period = irq_period;
+               counter->hw.interrupts = 0;
+       }
+       spin_unlock(&ctx->lock);
+}
+
 /*
  * Round-robin a context's counters:
  */
@@ -1081,6 +1113,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
        cpuctx = &per_cpu(perf_cpu_context, cpu);
        ctx = &curr->perf_counter_ctx;
 
+       perf_adjust_freq(&cpuctx->ctx);
+       perf_adjust_freq(ctx);
+
        perf_counter_cpu_sched_out(cpuctx);
        __perf_counter_task_sched_out(ctx);
 
@@ -1522,6 +1557,9 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 
        if (atomic_dec_and_mutex_lock(&counter->mmap_count,
                                      &counter->mmap_mutex)) {
+               struct user_struct *user = current_user();
+
+               atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm);
                vma->vm_mm->locked_vm -= counter->data->nr_locked;
                perf_mmap_data_free(counter);
                mutex_unlock(&counter->mmap_mutex);
@@ -1537,11 +1575,13 @@ static struct vm_operations_struct perf_mmap_vmops = {
 static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 {
        struct perf_counter *counter = file->private_data;
+       struct user_struct *user = current_user();
        unsigned long vma_size;
        unsigned long nr_pages;
+       unsigned long user_locked, user_lock_limit;
        unsigned long locked, lock_limit;
+       long user_extra, extra;
        int ret = 0;
-       long extra;
 
        if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE))
                return -EINVAL;
@@ -1569,15 +1609,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                goto unlock;
        }
 
-       extra = nr_pages /* + 1 only account the data pages */;
-       extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
-       if (extra < 0)
-               extra = 0;
+       user_extra = nr_pages + 1;
+       user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10);
+       user_locked = atomic_long_read(&user->locked_vm) + user_extra;
 
-       locked = vma->vm_mm->locked_vm + extra;
+       extra = 0;
+       if (user_locked > user_lock_limit)
+               extra = user_locked - user_lock_limit;
 
        lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur;
        lock_limit >>= PAGE_SHIFT;
+       locked = vma->vm_mm->locked_vm + extra;
 
        if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
                ret = -EPERM;
@@ -1590,6 +1632,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
                goto unlock;
 
        atomic_set(&counter->mmap_count, 1);
+       atomic_long_add(user_extra, &user->locked_vm);
        vma->vm_mm->locked_vm += extra;
        counter->data->nr_locked = extra;
 unlock:
@@ -1999,11 +2042,10 @@ static void perf_counter_output(struct perf_counter *counter,
        header.size = sizeof(header);
 
        header.misc = PERF_EVENT_MISC_OVERFLOW;
-       header.misc |= user_mode(regs) ?
-               PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL;
+       header.misc |= perf_misc_flags(regs);
 
        if (record_type & PERF_RECORD_IP) {
-               ip = instruction_pointer(regs);
+               ip = perf_instruction_pointer(regs);
                header.type |= PERF_RECORD_IP;
                header.size += sizeof(ip);
        }
@@ -2374,6 +2416,8 @@ int perf_counter_overflow(struct perf_counter *counter,
        int events = atomic_read(&counter->event_limit);
        int ret = 0;
 
+       counter->hw.interrupts++;
+
        /*
         * XXX event_limit might not quite work as expected on inherited
         * counters
@@ -2442,6 +2486,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
        enum hrtimer_restart ret = HRTIMER_RESTART;
        struct perf_counter *counter;
        struct pt_regs *regs;
+       u64 period;
 
        counter = container_of(hrtimer, struct perf_counter, hw.hrtimer);
        counter->pmu->read(counter);
@@ -2460,7 +2505,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
                        ret = HRTIMER_NORESTART;
        }
 
-       hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period));
+       period = max_t(u64, 10000, counter->hw.irq_period);
+       hrtimer_forward_now(hrtimer, ns_to_ktime(period));
 
        return ret;
 }
@@ -2621,8 +2667,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter)
        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hwc->hrtimer.function = perf_swcounter_hrtimer;
        if (hwc->irq_period) {
+               u64 period = max_t(u64, 10000, hwc->irq_period);
                __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(hwc->irq_period), 0,
+                               ns_to_ktime(period), 0,
                                HRTIMER_MODE_REL, 0);
        }
 
@@ -2671,8 +2718,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter)
        hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hwc->hrtimer.function = perf_swcounter_hrtimer;
        if (hwc->irq_period) {
+               u64 period = max_t(u64, 10000, hwc->irq_period);
                __hrtimer_start_range_ns(&hwc->hrtimer,
-                               ns_to_ktime(hwc->irq_period), 0,
+                               ns_to_ktime(period), 0,
                                HRTIMER_MODE_REL, 0);
        }
 
@@ -2803,9 +2851,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 
 static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
 {
-       struct perf_counter_hw_event *hw_event = &counter->hw_event;
        const struct pmu *pmu = NULL;
-       struct hw_perf_counter *hwc = &counter->hw;
 
        /*
         * Software counters (currently) can't in general distinguish
@@ -2818,8 +2864,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
        case PERF_COUNT_CPU_CLOCK:
                pmu = &perf_ops_cpu_clock;
 
-               if (hw_event->irq_period && hw_event->irq_period < 10000)
-                       hw_event->irq_period = 10000;
                break;
        case PERF_COUNT_TASK_CLOCK:
                /*
@@ -2831,8 +2875,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
                else
                        pmu = &perf_ops_cpu_clock;
 
-               if (hw_event->irq_period && hw_event->irq_period < 10000)
-                       hw_event->irq_period = 10000;
                break;
        case PERF_COUNT_PAGE_FAULTS:
        case PERF_COUNT_PAGE_FAULTS_MIN:
@@ -2846,9 +2888,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter)
                break;
        }
 
-       if (pmu)
-               hwc->irq_period = hw_event->irq_period;
-
        return pmu;
 }
 
@@ -2864,6 +2903,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 {
        const struct pmu *pmu;
        struct perf_counter *counter;
+       struct hw_perf_counter *hwc;
        long err;
 
        counter = kzalloc(sizeof(*counter), gfpflags);
@@ -2899,6 +2939,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
        pmu = NULL;
 
+       hwc = &counter->hw;
+       if (hw_event->freq && hw_event->irq_freq)
+               hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq);
+       else
+               hwc->irq_period = hw_event->irq_period;
+
        /*
         * we currently do not support PERF_RECORD_GROUP on inherited counters
         */
@@ -3160,9 +3206,8 @@ static int inherit_group(struct perf_counter *parent_counter,
 static void sync_child_counter(struct perf_counter *child_counter,
                               struct perf_counter *parent_counter)
 {
-       u64 parent_val, child_val;
+       u64 child_val;
 
-       parent_val = atomic64_read(&parent_counter->count);
        child_val = atomic64_read(&child_counter->count);
 
        /*
@@ -3194,7 +3239,6 @@ __perf_counter_exit_task(struct task_struct *child,
                         struct perf_counter_context *child_ctx)
 {
        struct perf_counter *parent_counter;
-       struct perf_counter *sub, *tmp;
 
        /*
         * If we do not self-reap then we have to wait for the
@@ -3206,8 +3250,8 @@ __perf_counter_exit_task(struct task_struct *child,
         */
        if (child != current) {
                wait_task_inactive(child, 0);
-               list_del_init(&child_counter->list_entry);
                update_counter_times(child_counter);
+               list_del_counter(child_counter, child_ctx);
        } else {
                struct perf_cpu_context *cpuctx;
                unsigned long flags;
@@ -3226,9 +3270,7 @@ __perf_counter_exit_task(struct task_struct *child,
                group_sched_out(child_counter, cpuctx, child_ctx);
                update_counter_times(child_counter);
 
-               list_del_init(&child_counter->list_entry);
-
-               child_ctx->nr_counters--;
+               list_del_counter(child_counter, child_ctx);
 
                perf_enable();
                local_irq_restore(flags);
@@ -3242,13 +3284,6 @@ __perf_counter_exit_task(struct task_struct *child,
         */
        if (parent_counter) {
                sync_child_counter(child_counter, parent_counter);
-               list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list,
-                                        list_entry) {
-                       if (sub->parent) {
-                               sync_child_counter(sub, sub->parent);
-                               free_counter(sub);
-                       }
-               }
                free_counter(child_counter);
        }
 }
@@ -3264,14 +3299,25 @@ void perf_counter_exit_task(struct task_struct *child)
        struct perf_counter *child_counter, *tmp;
        struct perf_counter_context *child_ctx;
 
+       WARN_ON_ONCE(child != current);
+
        child_ctx = &child->perf_counter_ctx;
 
        if (likely(!child_ctx->nr_counters))
                return;
 
+again:
        list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list,
                                 list_entry)
                __perf_counter_exit_task(child, child_counter, child_ctx);
+
+       /*
+        * If the last counter was a group counter, it will have appended all
+        * its siblings to the list, but we obtained 'tmp' before that which
+        * will still point to the list head terminating the iteration.
+        */
+       if (!list_empty(&child_ctx->counter_list))
+               goto again;
 }
 
 /*