#define PERF_COUNTER_EVENT_SHIFT 0
#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT)
+/*
+ * Bits that can be set in hw_event.read_format to request that
+ * reads on the counter should return the indicated quantities,
+ * in increasing order of bit value, after the counter value.
+ *
+ * The values are distinct flag bits and are tested independently with
+ * a bitwise AND by the read path, so either or both may be requested.
+ * Each requested quantity is returned as one additional u64.
+ */
+enum perf_counter_read_format {
+ PERF_FORMAT_TOTAL_TIME_ENABLED = 1, /* append total time enabled (own total plus children's) */
+ PERF_FORMAT_TOTAL_TIME_RUNNING = 2, /* append total time running (own total plus children's) */
+};
+
/*
* Hardware event to monitor via a performance monitoring counter:
*/
enum perf_counter_active_state prev_state;
atomic64_t count;
+ /*
+ * These are the total time in nanoseconds that the counter
+ * has been enabled (i.e. eligible to run, and the task has
+ * been scheduled in, if this is a per-task counter)
+ * and running (scheduled onto the CPU), respectively.
+ *
+ * They are computed from tstamp_enabled, tstamp_running and
+ * tstamp_stopped when the counter is in INACTIVE or ACTIVE state.
+ */
+ u64 total_time_enabled;
+ u64 total_time_running;
+
+ /*
+ * These are timestamps used for computing total_time_enabled
+ * and total_time_running when the counter is in INACTIVE or
+ * ACTIVE state, measured in nanoseconds from an arbitrary point
+ * in time.
+ * tstamp_enabled: the notional time when the counter was enabled
+ * tstamp_running: the notional time when the counter was scheduled on
+ * tstamp_stopped: in INACTIVE state, the notional time when the
+ * counter was scheduled off.
+ */
+ u64 tstamp_enabled;
+ u64 tstamp_running;
+ u64 tstamp_stopped;
+
struct perf_counter_hw_event hw_event;
struct hw_perf_counter hw;
struct perf_counter *parent;
struct list_head child_list;
+ /*
+ * These accumulate total time (in nanoseconds) that children
+ * counters have been enabled and running, respectively.
+ */
+ atomic64_t child_total_time_enabled;
+ atomic64_t child_total_time_running;
+
/*
* Protect attach/detach and child_list:
*/
int nr_active;
int is_active;
struct task_struct *task;
+
+ /*
+ * time_now is the current time in nanoseconds since an arbitrary
+ * point in the past. For per-task counters, this is based on the
+ * task clock, and for per-cpu counters it is based on the cpu clock.
+ * time_lost is an offset from the task/cpu clock, used to make it
+ * appear that time only passes while the context is scheduled in.
+ */
+ u64 time_now;
+ u64 time_lost;
#endif
};
return;
counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->tstamp_stopped = ctx->time_now;
counter->hw_ops->disable(counter);
counter->oncpu = -1;
spin_unlock_irq(&ctx->lock);
}
+/*
+ * Get the current time for this context.
+ * If this is a task context, we use the task's task clock,
+ * or for a per-cpu context, we use the cpu clock.
+ *
+ * @ctx: context whose clock is read; ctx->task == NULL selects the
+ * per-cpu path.
+ * @update: forwarded unchanged to __task_delta_exec(); not used on
+ * the per-cpu path.
+ *
+ * Returns cpu_clock() for the executing CPU when the context has no
+ * task, otherwise the task's se.sum_exec_runtime plus the delta
+ * reported by __task_delta_exec(). The result is the raw clock value:
+ * ctx->time_lost is not subtracted here.
+ */
+static u64 get_context_time(struct perf_counter_context *ctx, int update)
+{
+ struct task_struct *curr = ctx->task;
+
+ if (!curr) /* no task attached: per-cpu context */
+ return cpu_clock(smp_processor_id());
+
+ return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime;
+}
+
+/*
+ * Update the record of the current time in a context.
+ *
+ * Stores the context's clock reading minus ctx->time_lost into
+ * ctx->time_now. @update is forwarded to get_context_time().
+ */
+static void update_context_time(struct perf_counter_context *ctx, int update)
+{
+ ctx->time_now = get_context_time(ctx, update) - ctx->time_lost;
+}
+
+/*
+ * Update the total_time_enabled and total_time_running fields for a counter.
+ *
+ * Both totals are recomputed from the counter's timestamps relative to
+ * ctx->time_now. This function uses ctx->time_now as it stands and does
+ * not refresh it.
+ *
+ * Counters whose state is below PERF_COUNTER_STATE_INACTIVE are left
+ * untouched, so their previously computed totals are preserved.
+ */
+static void update_counter_times(struct perf_counter *counter)
+{
+ struct perf_counter_context *ctx = counter->ctx;
+ u64 run_end;
+
+ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+ counter->total_time_enabled = ctx->time_now -
+ counter->tstamp_enabled;
+ if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+ run_end = counter->tstamp_stopped; /* scheduled off: running time ends at the recorded stop time */
+ else
+ run_end = ctx->time_now; /* any state above INACTIVE: running time extends to the context's current time */
+ counter->total_time_running = run_end - counter->tstamp_running;
+ }
+}
+
+/*
+ * Update total_time_enabled and total_time_running for all counters in a group.
+ *
+ * Applies update_counter_times() to @leader first and then to every
+ * counter linked on leader->sibling_list.
+ */
+static void update_group_times(struct perf_counter *leader)
+{
+ struct perf_counter *counter;
+
+ update_counter_times(leader);
+ list_for_each_entry(counter, &leader->sibling_list, list_entry)
+ update_counter_times(counter);
+}
+
/*
* Cross CPU call to disable a performance counter
*/
* If it is in error state, leave it in error state.
*/
if (counter->state >= PERF_COUNTER_STATE_INACTIVE) {
+ update_context_time(ctx, 1);
+ update_counter_times(counter);
if (counter == counter->group_leader)
group_sched_out(counter, cpuctx, ctx);
else
* Since we have the lock this context can't be scheduled
* in, so we can change the state safely.
*/
- if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+ if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+ update_counter_times(counter);
counter->state = PERF_COUNTER_STATE_OFF;
+ }
spin_unlock_irq(&ctx->lock);
}
return -EAGAIN;
}
+ counter->tstamp_running += ctx->time_now - counter->tstamp_stopped;
+
if (!is_software_counter(counter))
cpuctx->active_oncpu++;
ctx->nr_active++;
return can_add_hw;
}
+static void add_counter_to_ctx(struct perf_counter *counter,
+ struct perf_counter_context *ctx)
+{ /* Attach counter to ctx and start its time accounting at ctx->time_now. */
+ list_add_counter(counter, ctx);
+ ctx->nr_counters++; /* keep the context's counter tally in step with the list */
+ counter->prev_state = PERF_COUNTER_STATE_OFF;
+ counter->tstamp_enabled = ctx->time_now; /* all three timestamps start at the context's */
+ counter->tstamp_running = ctx->time_now; /* current time, so the enabled and running totals */
+ counter->tstamp_stopped = ctx->time_now; /* derived from them both start from zero */
+}
+
/*
* Cross CPU call to install and enable a performance counter
*/
curr_rq_lock_irq_save(&flags);
spin_lock(&ctx->lock);
+ update_context_time(ctx, 1);
/*
* Protect the list operation against NMI by disabling the
*/
perf_flags = hw_perf_save_disable();
- list_add_counter(counter, ctx);
- ctx->nr_counters++;
- counter->prev_state = PERF_COUNTER_STATE_OFF;
+ add_counter_to_ctx(counter, ctx);
/*
* Don't put the counter on if it is disabled or if
*/
if (leader != counter)
group_sched_out(leader, cpuctx, ctx);
- if (leader->hw_event.pinned)
+ if (leader->hw_event.pinned) {
+ update_group_times(leader);
leader->state = PERF_COUNTER_STATE_ERROR;
+ }
}
if (!err && !ctx->task && cpuctx->max_pertask)
* can add the counter safely, if it the call above did not
* succeed.
*/
- if (list_empty(&counter->list_entry)) {
- list_add_counter(counter, ctx);
- ctx->nr_counters++;
- }
+ if (list_empty(&counter->list_entry))
+ add_counter_to_ctx(counter, ctx);
spin_unlock_irq(&ctx->lock);
}
curr_rq_lock_irq_save(&flags);
spin_lock(&ctx->lock);
+ update_context_time(ctx, 1);
counter->prev_state = counter->state;
if (counter->state >= PERF_COUNTER_STATE_INACTIVE)
goto unlock;
counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled;
/*
* If the counter is in a group and isn't the group leader,
*/
if (leader != counter)
group_sched_out(leader, cpuctx, ctx);
- if (leader->hw_event.pinned)
+ if (leader->hw_event.pinned) {
+ update_group_times(leader);
leader->state = PERF_COUNTER_STATE_ERROR;
+ }
}
unlock:
* Since we have the lock this context can't be scheduled
* in, so we can change the state safely.
*/
- if (counter->state == PERF_COUNTER_STATE_OFF)
+ if (counter->state == PERF_COUNTER_STATE_OFF) {
counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->tstamp_enabled = ctx->time_now -
+ counter->total_time_enabled;
+ }
out:
spin_unlock_irq(&ctx->lock);
}
ctx->is_active = 0;
if (likely(!ctx->nr_counters))
goto out;
+ update_context_time(ctx, 0);
flags = hw_perf_save_disable();
if (ctx->nr_active) {
if (likely(!ctx->nr_counters))
goto out;
+ /*
+ * Add any time since the last sched_out to the lost time
+ * so it doesn't get included in the total_time_enabled and
+ * total_time_running measures for counters in the context.
+ */
+ ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now;
+
flags = hw_perf_save_disable();
/*
* If this pinned group hasn't been scheduled,
* put it in error state.
*/
- if (counter->state == PERF_COUNTER_STATE_INACTIVE)
+ if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+ update_group_times(counter);
counter->state = PERF_COUNTER_STATE_ERROR;
+ }
}
list_for_each_entry(counter, &ctx->counter_list, list_entry) {
perf_flags = hw_perf_save_disable();
list_for_each_entry(counter, &ctx->counter_list, list_entry) {
- if (counter->state != PERF_COUNTER_STATE_ERROR)
+ if (counter->state != PERF_COUNTER_STATE_ERROR) {
+ update_group_times(counter);
counter->state = PERF_COUNTER_STATE_OFF;
+ }
}
hw_perf_restore(perf_flags);
if (counter->state > PERF_COUNTER_STATE_OFF)
continue;
counter->state = PERF_COUNTER_STATE_INACTIVE;
+ counter->tstamp_enabled = ctx->time_now -
+ counter->total_time_enabled;
counter->hw_event.disabled = 0;
}
hw_perf_restore(perf_flags);
static void __read(void *info)
{ /* Invoked through smp_call_function_single() on counter->oncpu; info is the perf_counter to read. */
struct perf_counter *counter = info;
+ struct perf_counter_context *ctx = counter->ctx;
unsigned long flags;
curr_rq_lock_irq_save(&flags);
+ if (ctx->is_active) /* only refresh ctx->time_now while the context is active */
+ update_context_time(ctx, 1);
counter->hw_ops->read(counter);
+ update_counter_times(counter); /* recompute enabled/running totals against the current ctx->time_now */
curr_rq_unlock_irq_restore(&flags);
}
if (counter->state == PERF_COUNTER_STATE_ACTIVE) {
smp_call_function_single(counter->oncpu,
__read, counter, 1);
+ } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) {
+ update_counter_times(counter);
}
return atomic64_read(&counter->count);
static ssize_t
perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
{
- u64 cntval;
-
- if (count < sizeof(cntval))
- return -EINVAL;
+ u64 values[3];
+ int n;
/*
* Return end-of-file for a read on a counter that is in
return 0;
mutex_lock(&counter->mutex);
- cntval = perf_counter_read(counter);
+ values[0] = perf_counter_read(counter);
+ n = 1;
+ if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED)
+ values[n++] = counter->total_time_enabled +
+ atomic64_read(&counter->child_total_time_enabled);
+ if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING)
+ values[n++] = counter->total_time_running +
+ atomic64_read(&counter->child_total_time_running);
mutex_unlock(&counter->mutex);
- return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval);
+ if (count < n * sizeof(u64))
+ return -EINVAL;
+ count = n * sizeof(u64);
+
+ if (copy_to_user(buf, values, count))
+ return -EFAULT;
+
+ return count;
}
static ssize_t
* Link it up in the child's context:
*/
child_counter->task = child;
- list_add_counter(child_counter, child_ctx);
- child_ctx->nr_counters++;
+ add_counter_to_ctx(child_counter, child_ctx);
child_counter->parent = parent_counter;
/*
* Add back the child's count to the parent's count:
*/
atomic64_add(child_val, &parent_counter->count);
+ atomic64_add(child_counter->total_time_enabled,
+ &parent_counter->child_total_time_enabled);
+ atomic64_add(child_counter->total_time_running,
+ &parent_counter->child_total_time_running);
/*
* Remove this counter from the parent's list
if (child != current) {
wait_task_inactive(child, 0);
list_del_init(&child_counter->list_entry);
+ update_counter_times(child_counter);
} else {
struct perf_cpu_context *cpuctx;
unsigned long flags;
cpuctx = &__get_cpu_var(perf_cpu_context);
group_sched_out(child_counter, cpuctx, child_ctx);
+ update_counter_times(child_counter);
list_del_init(&child_counter->list_entry);