perfcounters: generalize the counter scheduler
author Ingo Molnar <mingo@elte.hu>
Sun, 21 Dec 2008 13:43:25 +0000 (14:43 +0100)
committer Ingo Molnar <mingo@elte.hu>
Tue, 23 Dec 2008 11:45:23 +0000 (12:45 +0100)
Impact: clean up and refactor code

Refactor the counter scheduler: separate out the sched-in/sched-out helpers and
introduce a counter-rotation function as well.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
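
For reference, this is the reworked tick path as assembled from the hunks below:
the task context is scheduled out, its counter list is rotated, and it is
scheduled back in again, with the equivalent per-CPU rotation kept behind a
rotate_percpu switch that is compiled out for now (the comments here are
editorial annotations, not part of the patch itself):

void perf_counter_task_tick(struct task_struct *curr, int cpu)
{
	struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
	struct perf_counter_context *ctx = &curr->perf_counter_ctx;
	const int rotate_percpu = 0;

	/* Deschedule the current counters (per-CPU ones only if rotating them): */
	if (rotate_percpu)
		perf_counter_cpu_sched_out(cpuctx);
	perf_counter_task_sched_out(curr, cpu);

	/* Rotate the first entry of each context list to the end: */
	if (rotate_percpu)
		rotate_ctx(&cpuctx->ctx);
	rotate_ctx(ctx);

	/* Schedule the (now reordered) counters back in: */
	if (rotate_percpu)
		perf_counter_cpu_sched_in(cpuctx, cpu);
	perf_counter_task_sched_in(curr, cpu);
}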
kernel/perf_counter.c

index 48e1dbcdc1cd9c72ccd04aa8dc416884bd584196..d7a79f321b1c3c497d346c4a6b944780aac21c2e 100644
@@ -111,11 +111,12 @@ static void __perf_counter_remove_from_context(void *info)
        spin_lock(&ctx->lock);
 
        if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
-               counter->hw_ops->disable(counter);
                counter->state = PERF_COUNTER_STATE_INACTIVE;
+               counter->hw_ops->disable(counter);
                ctx->nr_active--;
                cpuctx->active_oncpu--;
                counter->task = NULL;
+               counter->oncpu = -1;
        }
        ctx->nr_counters--;
 
@@ -192,8 +193,36 @@ retry:
        spin_unlock_irq(&ctx->lock);
 }
 
+static int
+counter_sched_in(struct perf_counter *counter,
+                struct perf_cpu_context *cpuctx,
+                struct perf_counter_context *ctx,
+                int cpu)
+{
+       if (counter->state == PERF_COUNTER_STATE_OFF)
+               return 0;
+
+       counter->state = PERF_COUNTER_STATE_ACTIVE;
+       counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
+       /*
+        * The new state must be visible before we turn it on in the hardware:
+        */
+       smp_wmb();
+
+       if (counter->hw_ops->enable(counter)) {
+               counter->state = PERF_COUNTER_STATE_INACTIVE;
+               counter->oncpu = -1;
+               return -EAGAIN;
+       }
+
+       cpuctx->active_oncpu++;
+       ctx->nr_active++;
+
+       return 0;
+}
+
 /*
- * Cross CPU call to install and enable a preformance counter
+ * Cross CPU call to install and enable a performance counter
  */
 static void __perf_install_in_context(void *info)
 {
@@ -220,22 +249,17 @@ static void __perf_install_in_context(void *info)
         * counters on a global level. NOP for non NMI based counters.
         */
        perf_flags = hw_perf_save_disable();
-       list_add_counter(counter, ctx);
-       hw_perf_restore(perf_flags);
 
+       list_add_counter(counter, ctx);
        ctx->nr_counters++;
 
-       if (cpuctx->active_oncpu < perf_max_counters) {
-               counter->state = PERF_COUNTER_STATE_ACTIVE;
-               counter->oncpu = cpu;
-               ctx->nr_active++;
-               cpuctx->active_oncpu++;
-               counter->hw_ops->enable(counter);
-       }
+       counter_sched_in(counter, cpuctx, ctx, cpu);
 
        if (!ctx->task && cpuctx->max_pertask)
                cpuctx->max_pertask--;
 
+       hw_perf_restore(perf_flags);
+
        spin_unlock(&ctx->lock);
        curr_rq_unlock_irq_restore(&flags);
 }
@@ -302,8 +326,8 @@ counter_sched_out(struct perf_counter *counter,
        if (counter->state != PERF_COUNTER_STATE_ACTIVE)
                return;
 
-       counter->hw_ops->disable(counter);
        counter->state = PERF_COUNTER_STATE_INACTIVE;
+       counter->hw_ops->disable(counter);
        counter->oncpu = -1;
 
        cpuctx->active_oncpu--;
@@ -326,6 +350,22 @@ group_sched_out(struct perf_counter *group_counter,
                counter_sched_out(counter, cpuctx, ctx);
 }
 
+void __perf_counter_sched_out(struct perf_counter_context *ctx,
+                             struct perf_cpu_context *cpuctx)
+{
+       struct perf_counter *counter;
+
+       if (likely(!ctx->nr_counters))
+               return;
+
+       spin_lock(&ctx->lock);
+       if (ctx->nr_active) {
+               list_for_each_entry(counter, &ctx->counter_list, list_entry)
+                       group_sched_out(counter, cpuctx, ctx);
+       }
+       spin_unlock(&ctx->lock);
+}
+
 /*
  * Called from scheduler to remove the counters of the current task,
  * with interrupts disabled.
@@ -341,39 +381,18 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu)
 {
        struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
        struct perf_counter_context *ctx = &task->perf_counter_ctx;
-       struct perf_counter *counter;
 
        if (likely(!cpuctx->task_ctx))
                return;
 
-       spin_lock(&ctx->lock);
-       if (ctx->nr_active) {
-               list_for_each_entry(counter, &ctx->counter_list, list_entry)
-                       group_sched_out(counter, cpuctx, ctx);
-       }
-       spin_unlock(&ctx->lock);
+       __perf_counter_sched_out(ctx, cpuctx);
+
        cpuctx->task_ctx = NULL;
 }
 
-static int
-counter_sched_in(struct perf_counter *counter,
-                struct perf_cpu_context *cpuctx,
-                struct perf_counter_context *ctx,
-                int cpu)
+static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx)
 {
-       if (counter->state == PERF_COUNTER_STATE_OFF)
-               return 0;
-
-       if (counter->hw_ops->enable(counter))
-               return -EAGAIN;
-
-       counter->state = PERF_COUNTER_STATE_ACTIVE;
-       counter->oncpu = cpu;   /* TODO: put 'cpu' into cpuctx->cpu */
-
-       cpuctx->active_oncpu++;
-       ctx->nr_active++;
-
-       return 0;
+       __perf_counter_sched_out(&cpuctx->ctx, cpuctx);
 }
 
 static int
@@ -416,21 +435,10 @@ group_error:
        return -EAGAIN;
 }
 
-/*
- * Called from scheduler to add the counters of the current task
- * with interrupts disabled.
- *
- * We restore the counter value and then enable it.
- *
- * This does not protect us against NMI, but enable()
- * sets the enabled bit in the control field of counter _before_
- * accessing the counter control register. If a NMI hits, then it will
- * keep the counter running.
- */
-void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+static void
+__perf_counter_sched_in(struct perf_counter_context *ctx,
+                       struct perf_cpu_context *cpuctx, int cpu)
 {
-       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
-       struct perf_counter_context *ctx = &task->perf_counter_ctx;
        struct perf_counter *counter;
 
        if (likely(!ctx->nr_counters))
@@ -453,10 +461,35 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu)
                        break;
        }
        spin_unlock(&ctx->lock);
+}
 
+/*
+ * Called from scheduler to add the counters of the current task
+ * with interrupts disabled.
+ *
+ * We restore the counter value and then enable it.
+ *
+ * This does not protect us against NMI, but enable()
+ * sets the enabled bit in the control field of counter _before_
+ * accessing the counter control register. If a NMI hits, then it will
+ * keep the counter running.
+ */
+void perf_counter_task_sched_in(struct task_struct *task, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_counter_context *ctx = &task->perf_counter_ctx;
+
+       __perf_counter_sched_in(ctx, cpuctx, cpu);
        cpuctx->task_ctx = ctx;
 }
 
+static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
+{
+       struct perf_counter_context *ctx = &cpuctx->ctx;
+
+       __perf_counter_sched_in(ctx, cpuctx, cpu);
+}
+
 int perf_counter_task_disable(void)
 {
        struct task_struct *curr = current;
@@ -514,6 +547,8 @@ int perf_counter_task_enable(void)
        /* force the update of the task clock: */
        __task_delta_exec(curr, 1);
 
+       perf_counter_task_sched_out(curr, cpu);
+
        spin_lock(&ctx->lock);
 
        /*
@@ -538,19 +573,18 @@ int perf_counter_task_enable(void)
        return 0;
 }
 
-void perf_counter_task_tick(struct task_struct *curr, int cpu)
+/*
+ * Round-robin a context's counters:
+ */
+static void rotate_ctx(struct perf_counter_context *ctx)
 {
-       struct perf_counter_context *ctx = &curr->perf_counter_ctx;
        struct perf_counter *counter;
        u64 perf_flags;
 
-       if (likely(!ctx->nr_counters))
+       if (!ctx->nr_counters)
                return;
 
-       perf_counter_task_sched_out(curr, cpu);
-
        spin_lock(&ctx->lock);
-
        /*
         * Rotate the first entry last (works just fine for group counters too):
         */
@@ -563,7 +597,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu)
        hw_perf_restore(perf_flags);
 
        spin_unlock(&ctx->lock);
+}
+
+void perf_counter_task_tick(struct task_struct *curr, int cpu)
+{
+       struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu);
+       struct perf_counter_context *ctx = &curr->perf_counter_ctx;
+       const int rotate_percpu = 0;
+
+       if (rotate_percpu)
+               perf_counter_cpu_sched_out(cpuctx);
+       perf_counter_task_sched_out(curr, cpu);
 
+       if (rotate_percpu)
+               rotate_ctx(&cpuctx->ctx);
+       rotate_ctx(ctx);
+
+       if (rotate_percpu)
+               perf_counter_cpu_sched_in(cpuctx, cpu);
        perf_counter_task_sched_in(curr, cpu);
 }
 
@@ -905,8 +956,6 @@ static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update)
        struct task_struct *curr = counter->task;
        u64 delta;
 
-       WARN_ON_ONCE(counter->task != current);
-
        delta = __task_delta_exec(curr, update);
 
        return curr->se.sum_exec_runtime + delta;
@@ -1160,6 +1209,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
        counter->group_leader           = group_leader;
        counter->hw_ops                 = NULL;
 
+       counter->state = PERF_COUNTER_STATE_INACTIVE;
        if (hw_event->disabled)
                counter->state = PERF_COUNTER_STATE_OFF;
 
@@ -1331,35 +1381,49 @@ __perf_counter_exit_task(struct task_struct *child,
 {
        struct perf_counter *parent_counter;
        u64 parent_val, child_val;
-       unsigned long flags;
-       u64 perf_flags;
 
        /*
-        * Disable and unlink this counter.
-        *
-        * Be careful about zapping the list - IRQ/NMI context
-        * could still be processing it:
+        * If we do not self-reap then we have to wait for the
+        * child task to unschedule (it will happen for sure),
+        * so that its counter is at its final count. (This
+        * condition triggers rarely - child tasks usually get
+        * off their CPU before the parent has a chance to
+        * get this far into the reaping action)
         */
-       curr_rq_lock_irq_save(&flags);
-       perf_flags = hw_perf_save_disable();
-
-       if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+       if (child != current) {
+               wait_task_inactive(child, 0);
+               list_del_init(&child_counter->list_entry);
+       } else {
                struct perf_cpu_context *cpuctx;
+               unsigned long flags;
+               u64 perf_flags;
+
+               /*
+                * Disable and unlink this counter.
+                *
+                * Be careful about zapping the list - IRQ/NMI context
+                * could still be processing it:
+                */
+               curr_rq_lock_irq_save(&flags);
+               perf_flags = hw_perf_save_disable();
 
                cpuctx = &__get_cpu_var(perf_cpu_context);
 
-               child_counter->hw_ops->disable(child_counter);
-               child_counter->state = PERF_COUNTER_STATE_INACTIVE;
-               child_counter->oncpu = -1;
+               if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) {
+                       child_counter->state = PERF_COUNTER_STATE_INACTIVE;
+                       child_counter->hw_ops->disable(child_counter);
+                       cpuctx->active_oncpu--;
+                       child_ctx->nr_active--;
+                       child_counter->oncpu = -1;
+               }
 
-               cpuctx->active_oncpu--;
-               child_ctx->nr_active--;
-       }
+               list_del_init(&child_counter->list_entry);
 
-       list_del_init(&child_counter->list_entry);
+               child_ctx->nr_counters--;
 
-       hw_perf_restore(perf_flags);
-       curr_rq_unlock_irq_restore(&flags);
+               hw_perf_restore(perf_flags);
+               curr_rq_unlock_irq_restore(&flags);
+       }
 
        parent_counter = child_counter->parent;
        /*