X-Git-Url: https://git.stricted.de/?a=blobdiff_plain;f=kernel%2Fperf_counter.c;h=7af16d1c480fbf0c19d1c9ad740e8e68c2516007;hb=33b2fb303fe7f6b08bbb32f708e67b96eaa94a7a;hp=7669afe82cc7c6b62aeebde009d0d782ed479f75;hpb=63e35b25d6b5c3136d22ef249dbbf96716aa08bf;p=GitHub%2Fmt8127%2Fandroid_kernel_alcatel_ttab.git diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7669afe82cc7..7af16d1c480f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1,9 +1,10 @@ /* * Performance counter core code * - * Copyright(C) 2008 Thomas Gleixner - * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar - * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING */ @@ -25,6 +26,7 @@ #include #include #include +#include #include @@ -37,22 +39,30 @@ int perf_max_counters __read_mostly = 1; static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; +static atomic_t nr_counters __read_mostly; +static atomic_t nr_mmap_tracking __read_mostly; +static atomic_t nr_munmap_tracking __read_mostly; +static atomic_t nr_comm_tracking __read_mostly; + +int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ +int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ + /* - * Mutex for (sysadmin-configurable) counter reservations: + * Lock for (sysadmin-configurable) counter reservations: */ -static DEFINE_MUTEX(perf_resource_mutex); +static DEFINE_SPINLOCK(perf_resource_lock); /* * Architecture provided APIs - weak aliases: */ -extern __weak const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { return NULL; } -u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { barrier(); } +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + void __weak hw_perf_counter_setup(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, @@ -63,6 +73,30 @@ int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } +static DEFINE_PER_CPU(int, disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -73,7 +107,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) * add it straight to the context's counter list, or to the group * leader's sibling list: */ - if (counter->group_leader == counter) + if (group_leader == counter) list_add_tail(&counter->list_entry, &ctx->counter_list); else { list_add_tail(&counter->list_entry, &group_leader->sibling_list); @@ -81,6 +115,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) } list_add_rcu(&counter->event_entry, &ctx->event_list); + ctx->nr_counters++; } static void @@ -88,6 +123,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { struct perf_counter *sibling, *tmp; + ctx->nr_counters--; + list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -116,7 +153,8 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->hw_ops->disable(counter); + counter->tstamp_stopped = ctx->time; + counter->pmu->disable(counter); counter->oncpu = -1; if (!is_software_counter(counter)) @@ -160,7 +198,6 @@ static void __perf_counter_remove_from_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; unsigned long flags; - u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -170,21 +207,19 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); counter_sched_out(counter, cpuctx, ctx); counter->task = NULL; - ctx->nr_counters--; /* * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_del_counter(counter, ctx); - hw_perf_restore(perf_flags); + perf_enable(); if (!ctx->task) { /* @@ -196,8 +231,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } @@ -244,13 +278,61 @@ retry: * succeed. */ if (!list_empty(&counter->list_entry)) { - ctx->nr_counters--; list_del_counter(counter, ctx); counter->task = NULL; } spin_unlock_irq(&ctx->lock); } +static inline u64 perf_clock(void) +{ + return cpu_clock(smp_processor_id()); +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_counter_context *ctx) +{ + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; +} + +/* + * Update the total_time_enabled and total_time_running fields for a counter. + */ +static void update_counter_times(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + u64 run_end; + + if (counter->state < PERF_COUNTER_STATE_INACTIVE) + return; + + counter->total_time_enabled = ctx->time - counter->tstamp_enabled; + + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + run_end = counter->tstamp_stopped; + else + run_end = ctx->time; + + counter->total_time_running = run_end - counter->tstamp_running; +} + +/* + * Update total_time_enabled and total_time_running for all counters in a group. + */ +static void update_group_times(struct perf_counter *leader) +{ + struct perf_counter *counter; + + update_counter_times(leader); + list_for_each_entry(counter, &leader->sibling_list, list_entry) + update_counter_times(counter); +} + /* * Cross CPU call to disable a performance counter */ @@ -268,14 +350,15 @@ static void __perf_counter_disable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); /* * If the counter is on, turn it off. * If it is in error state, leave it in error state. */ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + update_context_time(ctx); + update_counter_times(counter); if (counter == counter->group_leader) group_sched_out(counter, cpuctx, ctx); else @@ -283,8 +366,7 @@ static void __perf_counter_disable(void *info) counter->state = PERF_COUNTER_STATE_OFF; } - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -320,30 +402,14 @@ static void perf_counter_disable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + } spin_unlock_irq(&ctx->lock); } -/* - * Disable a counter and all its children. - */ -static void perf_counter_disable_family(struct perf_counter *counter) -{ - struct perf_counter *child; - - perf_counter_disable(counter); - - /* - * Lock the mutex to protect the list of children - */ - mutex_lock(&counter->mutex); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_disable(child); - mutex_unlock(&counter->mutex); -} - static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, @@ -360,12 +426,14 @@ counter_sched_in(struct perf_counter *counter, */ smp_wmb(); - if (counter->hw_ops->enable(counter)) { + if (counter->pmu->enable(counter)) { counter->state = PERF_COUNTER_STATE_INACTIVE; counter->oncpu = -1; return -EAGAIN; } + counter->tstamp_running += ctx->time - counter->tstamp_stopped; + if (!is_software_counter(counter)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -376,6 +444,54 @@ counter_sched_in(struct perf_counter *counter, return 0; } +static int +group_sched_in(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + struct perf_counter *counter, *partial_group; + int ret; + + if (group_counter->state == PERF_COUNTER_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? ret : 0; + + group_counter->prev_state = group_counter->state; + if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) + return -EAGAIN; + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + counter->prev_state = counter->state; + if (counter_sched_in(counter, cpuctx, ctx, cpu)) { + partial_group = counter; + goto group_error; + } + } + + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + if (counter == partial_group) + break; + counter_sched_out(counter, cpuctx, ctx); + } + counter_sched_out(group_counter, cpuctx, ctx); + + return -EAGAIN; +} + /* * Return 1 for a group consisting entirely of software counters, * 0 if the group contains any hardware counters. @@ -425,6 +541,16 @@ static int group_can_go_on(struct perf_counter *counter, return can_add_hw; } +static void add_counter_to_ctx(struct perf_counter *counter, + struct perf_counter_context *ctx) +{ + list_add_counter(counter, ctx); + counter->prev_state = PERF_COUNTER_STATE_OFF; + counter->tstamp_enabled = ctx->time; + counter->tstamp_running = ctx->time; + counter->tstamp_stopped = ctx->time; +} + /* * Cross CPU call to install and enable a performance counter */ @@ -436,7 +562,6 @@ static void __perf_install_in_context(void *info) struct perf_counter *leader = counter->group_leader; int cpu = smp_processor_id(); unsigned long flags; - u64 perf_flags; int err; /* @@ -447,18 +572,16 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); + update_context_time(ctx); /* * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_save_disable(); + perf_disable(); - list_add_counter(counter, ctx); - ctx->nr_counters++; - counter->prev_state = PERF_COUNTER_STATE_OFF; + add_counter_to_ctx(counter, ctx); /* * Don't put the counter on if it is disabled or if @@ -486,18 +609,19 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) + if (leader->hw_event.pinned) { + update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; + } } if (!err && !ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; unlock: - hw_perf_restore(perf_flags); + perf_enable(); - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -548,10 +672,8 @@ retry: * can add the counter safely, if it the call above did not * succeed. */ - if (list_empty(&counter->list_entry)) { - list_add_counter(counter, ctx); - ctx->nr_counters++; - } + if (list_empty(&counter->list_entry)) + add_counter_to_ctx(counter, ctx); spin_unlock_irq(&ctx->lock); } @@ -574,13 +696,14 @@ static void __perf_counter_enable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); + update_context_time(ctx); counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time - counter->total_time_enabled; /* * If the counter is in a group and isn't the group leader, @@ -589,11 +712,18 @@ static void __perf_counter_enable(void *info) if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) goto unlock; - if (!group_can_go_on(counter, cpuctx, 1)) + if (!group_can_go_on(counter, cpuctx, 1)) { err = -EEXIST; - else - err = counter_sched_in(counter, cpuctx, ctx, - smp_processor_id()); + } else { + perf_disable(); + if (counter == leader) + err = group_sched_in(counter, cpuctx, ctx, + smp_processor_id()); + else + err = counter_sched_in(counter, cpuctx, ctx, + smp_processor_id()); + perf_enable(); + } if (err) { /* @@ -602,13 +732,14 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) + if (leader->hw_event.pinned) { + update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; + } } unlock: - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -659,47 +790,46 @@ static void perf_counter_enable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. */ - if (counter->state == PERF_COUNTER_STATE_OFF) + if (counter->state == PERF_COUNTER_STATE_OFF) { counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; + } out: spin_unlock_irq(&ctx->lock); } -/* - * Enable a counter and all its children. - */ -static void perf_counter_enable_family(struct perf_counter *counter) +static int perf_counter_refresh(struct perf_counter *counter, int refresh) { - struct perf_counter *child; + /* + * not supported on inherited counters + */ + if (counter->hw_event.inherit) + return -EINVAL; + atomic_add(refresh, &counter->event_limit); perf_counter_enable(counter); - /* - * Lock the mutex to protect the list of children - */ - mutex_lock(&counter->mutex); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_enable(child); - mutex_unlock(&counter->mutex); + return 0; } void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { struct perf_counter *counter; - u64 flags; spin_lock(&ctx->lock); ctx->is_active = 0; if (likely(!ctx->nr_counters)) goto out; + update_context_time(ctx); - flags = hw_perf_save_disable(); + perf_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } - hw_perf_restore(flags); + perf_enable(); out: spin_unlock(&ctx->lock); } @@ -724,64 +854,26 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) if (likely(!cpuctx->task_ctx)) return; + update_context_time(ctx); + regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } -static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) +static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) { - __perf_counter_sched_out(&cpuctx->ctx, cpuctx); + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + + __perf_counter_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; } -static int -group_sched_in(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) +static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) { - struct perf_counter *counter, *partial_group; - int ret; - - if (group_counter->state == PERF_COUNTER_STATE_OFF) - return 0; - - ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); - if (ret) - return ret < 0 ? ret : 0; - - group_counter->prev_state = group_counter->state; - if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) - return -EAGAIN; - - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter->prev_state = counter->state; - if (counter_sched_in(counter, cpuctx, ctx, cpu)) { - partial_group = counter; - goto group_error; - } - } - - return 0; - -group_error: - /* - * Groups can be scheduled in as one unit only, so undo any - * partial group before returning: - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter == partial_group) - break; - counter_sched_out(counter, cpuctx, ctx); - } - counter_sched_out(group_counter, cpuctx, ctx); - - return -EAGAIN; + __perf_counter_sched_out(&cpuctx->ctx, cpuctx); } static void @@ -789,7 +881,6 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { struct perf_counter *counter; - u64 flags; int can_add_hw = 1; spin_lock(&ctx->lock); @@ -797,7 +888,9 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (likely(!ctx->nr_counters)) goto out; - flags = hw_perf_save_disable(); + ctx->timestamp = perf_clock(); + + perf_disable(); /* * First go through the list and put on any pinned groups @@ -817,8 +910,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * If this pinned group hasn't been scheduled, * put it in error state. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_group_times(counter); counter->state = PERF_COUNTER_STATE_ERROR; + } } list_for_each_entry(counter, &ctx->counter_list, list_entry) { @@ -842,7 +937,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, can_add_hw = 0; } } - hw_perf_restore(flags); + perf_enable(); out: spin_unlock(&ctx->lock); } @@ -880,37 +975,31 @@ int perf_counter_task_disable(void) struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; unsigned long flags; - u64 perf_flags; - int cpu; if (likely(!ctx->nr_counters)) return 0; - curr_rq_lock_irq_save(&flags); - cpu = smp_processor_id(); - - /* force the update of the task clock: */ - __task_delta_exec(curr, 1); + local_irq_save(flags); - perf_counter_task_sched_out(curr, cpu); + __perf_counter_task_sched_out(ctx); spin_lock(&ctx->lock); /* * Disable all the counters: */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ERROR) + if (counter->state != PERF_COUNTER_STATE_ERROR) { + update_group_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + } } - hw_perf_restore(perf_flags); + perf_enable(); - spin_unlock(&ctx->lock); - - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); return 0; } @@ -921,51 +1010,80 @@ int perf_counter_task_enable(void) struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; unsigned long flags; - u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); cpu = smp_processor_id(); - /* force the update of the task clock: */ - __task_delta_exec(curr, 1); - - perf_counter_task_sched_out(curr, cpu); + __perf_counter_task_sched_out(ctx); spin_lock(&ctx->lock); /* * Disable all the counters: */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; counter->hw_event.disabled = 0; } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock(&ctx->lock); perf_counter_task_sched_in(curr, cpu); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); return 0; } +void perf_adjust_freq(struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + u64 irq_period; + u64 events, period; + s64 delta; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + continue; + + if (!counter->hw_event.freq || !counter->hw_event.irq_freq) + continue; + + events = HZ * counter->hw.interrupts * counter->hw.irq_period; + period = div64_u64(events, counter->hw_event.irq_freq); + + delta = (s64)(1 + period - counter->hw.irq_period); + delta >>= 1; + + irq_period = counter->hw.irq_period + delta; + + if (!irq_period) + irq_period = 1; + + counter->hw.irq_period = irq_period; + counter->hw.interrupts = 0; + } + spin_unlock(&ctx->lock); +} + /* * Round-robin a context's counters: */ static void rotate_ctx(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 perf_flags; if (!ctx->nr_counters) return; @@ -974,32 +1092,37 @@ static void rotate_ctx(struct perf_counter_context *ctx) /* * Rotate the first entry last (works just fine for group counters too): */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_move_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock(&ctx->lock); } void perf_counter_task_tick(struct task_struct *curr, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &curr->perf_counter_ctx; - const int rotate_percpu = 0; + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + + if (!atomic_read(&nr_counters)) + return; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &curr->perf_counter_ctx; + + perf_adjust_freq(&cpuctx->ctx); + perf_adjust_freq(ctx); - if (rotate_percpu) - perf_counter_cpu_sched_out(cpuctx); - perf_counter_task_sched_out(curr, cpu); + perf_counter_cpu_sched_out(cpuctx); + __perf_counter_task_sched_out(ctx); - if (rotate_percpu) - rotate_ctx(&cpuctx->ctx); + rotate_ctx(&cpuctx->ctx); rotate_ctx(ctx); - if (rotate_percpu) - perf_counter_cpu_sched_in(cpuctx, cpu); + perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); } @@ -1009,11 +1132,15 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) static void __read(void *info) { struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; unsigned long flags; - curr_rq_lock_irq_save(&flags); - counter->hw_ops->read(counter); - curr_rq_unlock_irq_restore(&flags); + local_irq_save(flags); + if (ctx->is_active) + update_context_time(ctx); + counter->pmu->read(counter); + update_counter_times(counter); + local_irq_restore(flags); } static u64 perf_counter_read(struct perf_counter *counter) @@ -1025,6 +1152,8 @@ static u64 perf_counter_read(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, __read, counter, 1); + } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); } return atomic64_read(&counter->count); @@ -1047,7 +1176,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) */ if (cpu != -1) { /* Must be root to operate on a CPU counter: */ - if (!capable(CAP_SYS_ADMIN)) + if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); if (cpu < 0 || cpu > num_possible_cpus()) @@ -1099,8 +1228,20 @@ static void free_counter_rcu(struct rcu_head *head) kfree(counter); } +static void perf_pending_sync(struct perf_counter *counter); + static void free_counter(struct perf_counter *counter) { + perf_pending_sync(counter); + + atomic_dec(&nr_counters); + if (counter->hw_event.mmap) + atomic_dec(&nr_mmap_tracking); + if (counter->hw_event.munmap) + atomic_dec(&nr_munmap_tracking); + if (counter->hw_event.comm) + atomic_dec(&nr_comm_tracking); + if (counter->destroy) counter->destroy(counter); @@ -1137,10 +1278,8 @@ static int perf_release(struct inode *inode, struct file *file) static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 cntval; - - if (count < sizeof(cntval)) - return -EINVAL; + u64 values[3]; + int n; /* * Return end-of-file for a read on a counter that is in @@ -1151,10 +1290,24 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return 0; mutex_lock(&counter->mutex); - cntval = perf_counter_read(counter); + values[0] = perf_counter_read(counter); + n = 1; + if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); mutex_unlock(&counter->mutex); - return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); + if (count < n * sizeof(u64)) + return -EINVAL; + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; } static ssize_t @@ -1169,14 +1322,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; struct perf_mmap_data *data; - unsigned int events; + unsigned int events = POLL_HUP; rcu_read_lock(); data = rcu_dereference(counter->data); if (data) - events = atomic_xchg(&data->wakeup, 0); - else - events = POLL_HUP; + events = atomic_xchg(&data->poll, 0); rcu_read_unlock(); poll_wait(file, &counter->waitq, wait); @@ -1184,28 +1335,99 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } -static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +static void perf_counter_reset(struct perf_counter *counter) { - struct perf_counter *counter = file->private_data; - int err = 0; + (void)perf_counter_read(counter); + atomic64_set(&counter->count, 0); + perf_counter_update_userpage(counter); +} + +static void perf_counter_for_each_sibling(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *sibling; + + spin_lock_irq(&ctx->lock); + counter = counter->group_leader; + + func(counter); + list_for_each_entry(sibling, &counter->sibling_list, list_entry) + func(sibling); + spin_unlock_irq(&ctx->lock); +} + +static void perf_counter_for_each_child(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter *child; + + mutex_lock(&counter->mutex); + func(counter); + list_for_each_entry(child, &counter->child_list, child_list) + func(child); + mutex_unlock(&counter->mutex); +} + +static void perf_counter_for_each(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter *child; + + mutex_lock(&counter->mutex); + perf_counter_for_each_sibling(counter, func); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_for_each_sibling(child, func); + mutex_unlock(&counter->mutex); +} + +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_counter *counter = file->private_data; + void (*func)(struct perf_counter *); + u32 flags = arg; switch (cmd) { case PERF_COUNTER_IOC_ENABLE: - perf_counter_enable_family(counter); + func = perf_counter_enable; break; case PERF_COUNTER_IOC_DISABLE: - perf_counter_disable_family(counter); + func = perf_counter_disable; + break; + case PERF_COUNTER_IOC_RESET: + func = perf_counter_reset; break; + + case PERF_COUNTER_IOC_REFRESH: + return perf_counter_refresh(counter, arg); default: - err = -ENOTTY; + return -ENOTTY; } - return err; + + if (flags & PERF_IOC_FLAG_GROUP) + perf_counter_for_each(counter, func); + else + perf_counter_for_each_child(counter, func); + + return 0; } -static void __perf_counter_update_userpage(struct perf_counter *counter, - struct perf_mmap_data *data) +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_counter_update_userpage(struct perf_counter *counter) { - struct perf_counter_mmap_page *userpg = data->user_page; + struct perf_mmap_data *data; + struct perf_counter_mmap_page *userpg; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; + + userpg = data->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -1213,26 +1435,16 @@ static void __perf_counter_update_userpage(struct perf_counter *counter, */ preempt_disable(); ++userpg->lock; - smp_wmb(); + barrier(); userpg->index = counter->hw.idx; userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); - userpg->data_head = atomic_read(&data->head); - smp_wmb(); + barrier(); ++userpg->lock; preempt_enable(); -} - -void perf_counter_update_userpage(struct perf_counter *counter) -{ - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) - __perf_counter_update_userpage(counter, data); +unlock: rcu_read_unlock(); } @@ -1291,6 +1503,7 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) } data->nr_pages = nr_pages; + atomic_set(&data->lock, -1); rcu_assign_pointer(counter->data, data); @@ -1344,13 +1557,17 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { + struct user_struct *user = current_user(); + + atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); + vma->vm_mm->locked_vm -= counter->data->nr_locked; perf_mmap_data_free(counter); mutex_unlock(&counter->mmap_mutex); } } static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, + .open = perf_mmap_open, .close = perf_mmap_close, .fault = perf_mmap_fault, }; @@ -1358,9 +1575,12 @@ static struct vm_operations_struct perf_mmap_vmops = { static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; + struct user_struct *user = current_user(); unsigned long vma_size; unsigned long nr_pages; + unsigned long user_locked, user_lock_limit; unsigned long locked, lock_limit; + long user_extra, extra; int ret = 0; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) @@ -1369,7 +1589,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) vma_size = vma->vm_end - vma->vm_start; nr_pages = (vma_size / PAGE_SIZE) - 1; - if (nr_pages == 0 || !is_power_of_2(nr_pages)) + /* + * If we have data pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; if (vma_size != PAGE_SIZE * (1 + nr_pages)) @@ -1378,24 +1602,40 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_pgoff != 0) return -EINVAL; - locked = vma_size >> PAGE_SHIFT; - locked += vma->vm_mm->locked_vm; + mutex_lock(&counter->mmap_mutex); + if (atomic_inc_not_zero(&counter->mmap_count)) { + if (nr_pages != counter->data->nr_pages) + ret = -EINVAL; + goto unlock; + } + + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + user_locked = atomic_long_read(&user->locked_vm) + user_extra; + + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) - return -EPERM; - - mutex_lock(&counter->mmap_mutex); - if (atomic_inc_not_zero(&counter->mmap_count)) - goto out; + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } WARN_ON(counter->data); ret = perf_mmap_data_alloc(counter, nr_pages); - if (!ret) - atomic_set(&counter->mmap_count, 1); -out: + if (ret) + goto unlock; + + atomic_set(&counter->mmap_count, 1); + atomic_long_add(user_extra, &user->locked_vm); + vma->vm_mm->locked_vm += extra; + counter->data->nr_locked = extra; +unlock: mutex_unlock(&counter->mmap_mutex); vma->vm_flags &= ~VM_MAYWRITE; @@ -1405,6 +1645,22 @@ out: return ret; } +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct perf_counter *counter = filp->private_data; + struct inode *inode = filp->f_path.dentry->d_inode; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &counter->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, @@ -1412,8 +1668,143 @@ static const struct file_operations perf_fops = { .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_ioctl, .mmap = perf_mmap, + .fasync = perf_fasync, }; +/* + * Perf counter wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_counter_wakeup(struct perf_counter *counter) +{ + wake_up_all(&counter->waitq); + + if (counter->pending_kill) { + kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); + counter->pending_kill = 0; + } +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +static void perf_pending_counter(struct perf_pending_entry *entry) +{ + struct perf_counter *counter = container_of(entry, + struct perf_counter, pending); + + if (counter->pending_disable) { + counter->pending_disable = 0; + perf_counter_disable(counter); + } + + if (counter->pending_wakeup) { + counter->pending_wakeup = 0; + perf_counter_wakeup(counter); + } +} + +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) +{ + struct perf_pending_entry **head; + + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) + return; + + entry->func = func; + + head = &get_cpu_var(perf_pending_head); + + do { + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); + + set_perf_counter_pending(); + + put_cpu_var(perf_pending_head); +} + +static int __perf_pending_run(void) +{ + struct perf_pending_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; + + list = list->next; + + func = entry->func; + entry->next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + func(entry); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_counter *counter) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return counter->pending.next == NULL; +} + +static void perf_pending_sync(struct perf_counter *counter) +{ + wait_event(counter->waitq, perf_not_pending(counter)); +} + +void perf_counter_do_pending(void) +{ + __perf_pending_run(); +} + +/* + * Callchain support -- arch specific + */ + +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + /* * Output */ @@ -1423,36 +1814,144 @@ struct perf_output_handle { struct perf_mmap_data *data; unsigned int offset; unsigned int head; - int wakeup; + int nmi; + int overflow; + int locked; + unsigned long flags; }; +static void perf_output_wakeup(struct perf_output_handle *handle) +{ + atomic_set(&handle->data->poll, POLL_IN); + + if (handle->nmi) { + handle->counter->pending_wakeup = 1; + perf_pending_queue(&handle->counter->pending, + perf_pending_counter); + } else + perf_counter_wakeup(handle->counter); +} + +/* + * Curious locking construct. + * + * We need to ensure a later event doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. + */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int head, cpu; + + data->done_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_xchg(&data->done_head, 0))) + data->user_page->data_head = head; + + /* + * NMI can happen here, which means we can miss a done_head update. + */ + + cpu = atomic_xchg(&data->lock, -1); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_read(&data->done_head))) { + /* + * Since we had it locked, we can lock it again. + */ + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) + cpu_relax(); + + goto again; + } + + if (atomic_xchg(&data->wakeup, 0)) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + static int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size) + struct perf_counter *counter, unsigned int size, + int nmi, int overflow) { struct perf_mmap_data *data; unsigned int offset, head; + /* + * For inherited counters we send all the output towards the parent. + */ + if (counter->parent) + counter = counter->parent; + rcu_read_lock(); data = rcu_dereference(counter->data); if (!data) goto out; + handle->data = data; + handle->counter = counter; + handle->nmi = nmi; + handle->overflow = overflow; + if (!data->nr_pages) - goto out; + goto fail; + + perf_output_lock(handle); do { offset = head = atomic_read(&data->head); head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); - handle->counter = counter; - handle->data = data; handle->offset = offset; handle->head = head; - handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); + + if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) + atomic_set(&data->wakeup, 1); return 0; +fail: + perf_output_wakeup(handle); out: rcu_read_unlock(); @@ -1488,110 +1987,456 @@ static void perf_output_copy(struct perf_output_handle *handle, handle->offset = offset; - WARN_ON_ONCE(handle->offset > handle->head); + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0); } #define perf_output_put(handle, x) \ perf_output_copy((handle), &(x), sizeof(x)) -static void perf_output_end(struct perf_output_handle *handle, int nmi) +static void perf_output_end(struct perf_output_handle *handle) { - if (handle->wakeup) { - (void)atomic_xchg(&handle->data->wakeup, POLL_IN); - __perf_counter_update_userpage(handle->counter, handle->data); - if (nmi) { - handle->counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&handle->counter->waitq); + struct perf_counter *counter = handle->counter; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = counter->hw_event.wakeup_events; + + if (handle->overflow && wakeup_events) { + int events = atomic_inc_return(&data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &data->events); + atomic_set(&data->wakeup, 1); + } } + + perf_output_unlock(handle); rcu_read_unlock(); } -static int perf_output_write(struct perf_counter *counter, int nmi, - void *buf, ssize_t size) +static void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs, u64 addr) { - struct perf_output_handle handle; int ret; + u64 record_type = counter->hw_event.record_type; + struct perf_output_handle handle; + struct perf_event_header header; + u64 ip; + struct { + u32 pid, tid; + } tid_entry; + struct { + u64 event; + u64 counter; + } group_entry; + struct perf_callchain_entry *callchain = NULL; + int callchain_size = 0; + u64 time; + struct { + u32 cpu, reserved; + } cpu_entry; + + header.type = 0; + header.size = sizeof(header); + + header.misc = PERF_EVENT_MISC_OVERFLOW; + header.misc |= perf_misc_flags(regs); + + if (record_type & PERF_RECORD_IP) { + ip = perf_instruction_pointer(regs); + header.type |= PERF_RECORD_IP; + header.size += sizeof(ip); + } + + if (record_type & PERF_RECORD_TID) { + /* namespace issues */ + tid_entry.pid = current->group_leader->pid; + tid_entry.tid = current->pid; + + header.type |= PERF_RECORD_TID; + header.size += sizeof(tid_entry); + } + + if (record_type & PERF_RECORD_TIME) { + /* + * Maybe do better on x86 and provide cpu_clock_nmi() + */ + time = sched_clock(); + + header.type |= PERF_RECORD_TIME; + header.size += sizeof(u64); + } + + if (record_type & PERF_RECORD_ADDR) { + header.type |= PERF_RECORD_ADDR; + header.size += sizeof(u64); + } + + if (record_type & PERF_RECORD_CONFIG) { + header.type |= PERF_RECORD_CONFIG; + header.size += sizeof(u64); + } + + if (record_type & PERF_RECORD_CPU) { + header.type |= PERF_RECORD_CPU; + header.size += sizeof(cpu_entry); + + cpu_entry.cpu = raw_smp_processor_id(); + } + + if (record_type & PERF_RECORD_GROUP) { + header.type |= PERF_RECORD_GROUP; + header.size += sizeof(u64) + + counter->nr_siblings * sizeof(group_entry); + } + + if (record_type & PERF_RECORD_CALLCHAIN) { + callchain = perf_callchain(regs); - ret = perf_output_begin(&handle, counter, size); + if (callchain) { + callchain_size = (1 + callchain->nr) * sizeof(u64); + + header.type |= PERF_RECORD_CALLCHAIN; + header.size += callchain_size; + } + } + + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) - goto out; + return; - perf_output_copy(&handle, buf, size); - perf_output_end(&handle, nmi); + perf_output_put(&handle, header); -out: - return ret; + if (record_type & PERF_RECORD_IP) + perf_output_put(&handle, ip); + + if (record_type & PERF_RECORD_TID) + perf_output_put(&handle, tid_entry); + + if (record_type & PERF_RECORD_TIME) + perf_output_put(&handle, time); + + if (record_type & PERF_RECORD_ADDR) + perf_output_put(&handle, addr); + + if (record_type & PERF_RECORD_CONFIG) + perf_output_put(&handle, counter->hw_event.config); + + if (record_type & PERF_RECORD_CPU) + perf_output_put(&handle, cpu_entry); + + /* + * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. + */ + if (record_type & PERF_RECORD_GROUP) { + struct perf_counter *leader, *sub; + u64 nr = counter->nr_siblings; + + perf_output_put(&handle, nr); + + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->pmu->read(sub); + + group_entry.event = sub->hw_event.config; + group_entry.counter = atomic64_read(&sub->count); + + perf_output_put(&handle, group_entry); + } + } + + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); + + perf_output_end(&handle); } -static void perf_output_simple(struct perf_counter *counter, - int nmi, struct pt_regs *regs) -{ +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + struct { - struct perf_event_header header; - u64 ip; + struct perf_event_header header; + + u32 pid; + u32 tid; } event; +}; - event.header.type = PERF_EVENT_IP; - event.header.size = sizeof(event); - event.ip = instruction_pointer(regs); +static void perf_counter_comm_output(struct perf_counter *counter, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; - perf_output_write(counter, nmi, &event, sizeof(event)); + perf_output_put(&handle, comm_event->event); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); } -static void perf_output_group(struct perf_counter *counter, int nmi) +static int perf_counter_comm_match(struct perf_counter *counter, + struct perf_comm_event *comm_event) { - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_counter *leader, *sub; + if (counter->hw_event.comm && + comm_event->event.header.type == PERF_EVENT_COMM) + return 1; + + return 0; +} + +static void perf_counter_comm_ctx(struct perf_counter_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_comm_match(counter, comm_event)) + perf_counter_comm_output(counter, comm_event); + } + rcu_read_unlock(); +} + +static void perf_counter_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; unsigned int size; + char *comm = comm_event->task->comm; + + size = ALIGN(strlen(comm)+1, sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event.header.size = sizeof(comm_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + perf_counter_comm_ctx(¤t->perf_counter_ctx, comm_event); +} + +void perf_counter_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event; + + if (!atomic_read(&nr_comm_tracking)) + return; + + comm_event = (struct perf_comm_event){ + .task = task, + .event = { + .header = { .type = PERF_EVENT_COMM, }, + .pid = task->group_leader->pid, + .tid = task->pid, + }, + }; + + perf_counter_comm_event(&comm_event); +} + +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct file *file; + char *file_name; + int file_size; + struct { - u64 event; - u64 counter; - } entry; - int ret; + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event; +}; - size = sizeof(header) + counter->nr_siblings * sizeof(entry); +static void perf_counter_mmap_output(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size, 0, 0); - ret = perf_output_begin(&handle, counter, size); if (ret) return; - header.type = PERF_EVENT_GROUP; - header.size = size; + perf_output_put(&handle, mmap_event->event); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle); +} - perf_output_put(&handle, header); +static int perf_counter_mmap_match(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + if (counter->hw_event.mmap && + mmap_event->event.header.type == PERF_EVENT_MMAP) + return 1; + + if (counter->hw_event.munmap && + mmap_event->event.header.type == PERF_EVENT_MUNMAP) + return 1; + + return 0; +} - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->hw_ops->read(sub); +static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_counter *counter; - entry.event = sub->hw_event.config; - entry.counter = atomic64_read(&sub->count); + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; - perf_output_put(&handle, entry); + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_mmap_match(counter, mmap_event)) + perf_counter_mmap_output(counter, mmap_event); } + rcu_read_unlock(); +} + +static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct file *file = mmap_event->file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + char *name; + + if (file) { + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = d_path(&file->f_path, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name)+1, sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; - perf_output_end(&handle, nmi); + mmap_event->event.header.size = sizeof(mmap_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + + kfree(buf); } -void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) +void perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) { - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_tracking)) return; - case PERF_RECORD_IRQ: - perf_output_simple(counter, nmi, regs); - break; + mmap_event = (struct perf_mmap_event){ + .file = file, + .event = { + .header = { .type = PERF_EVENT_MMAP, }, + .pid = current->group_leader->pid, + .tid = current->pid, + .start = addr, + .len = len, + .pgoff = pgoff, + }, + }; - case PERF_RECORD_GROUP: - perf_output_group(counter, nmi); - break; + perf_counter_mmap_event(&mmap_event); +} + +void perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) +{ + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_munmap_tracking)) + return; + + mmap_event = (struct perf_mmap_event){ + .file = file, + .event = { + .header = { .type = PERF_EVENT_MUNMAP, }, + .pid = current->group_leader->pid, + .tid = current->pid, + .start = addr, + .len = len, + .pgoff = pgoff, + }, + }; + + perf_counter_mmap_event(&mmap_event); +} + +/* + * Generic counter overflow handling. + */ + +int perf_counter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs, u64 addr) +{ + int events = atomic_read(&counter->event_limit); + int ret = 0; + + counter->hw.interrupts++; + + /* + * XXX event_limit might not quite work as expected on inherited + * counters + */ + + counter->pending_kill = POLL_IN; + if (events && atomic_dec_and_test(&counter->event_limit)) { + ret = 1; + counter->pending_kill = POLL_HUP; + if (nmi) { + counter->pending_disable = 1; + perf_pending_queue(&counter->pending, + perf_pending_counter); + } else + perf_counter_disable(counter); } + + perf_counter_output(counter, nmi, regs, addr); + return ret; } /* @@ -1638,11 +2483,13 @@ static void perf_swcounter_set_period(struct perf_counter *counter) static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { + enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_counter *counter; struct pt_regs *regs; + u64 period; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->hw_ops->read(counter); + counter->pmu->read(counter); regs = get_irq_regs(); /* @@ -1653,20 +2500,26 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) !counter->hw_event.exclude_user) regs = task_pt_regs(current); - if (regs) - perf_counter_output(counter, 0, regs); + if (regs) { + if (perf_counter_overflow(counter, 0, regs, 0)) + ret = HRTIMER_NORESTART; + } - hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); + period = max_t(u64, 10000, counter->hw.irq_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - return HRTIMER_RESTART; + return ret; } static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs, addr)) + /* soft-disable the counter */ + ; + } static int perf_swcounter_match(struct perf_counter *counter, @@ -1695,16 +2548,17 @@ static int perf_swcounter_match(struct perf_counter *counter, } static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int neg = atomic64_add_negative(nr, &counter->hw.count); if (counter->hw.irq_period && !neg) - perf_swcounter_overflow(counter, nmi, regs); + perf_swcounter_overflow(counter, nmi, regs, addr); } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs) + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_counter *counter; @@ -1714,7 +2568,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { if (perf_swcounter_match(counter, type, event, regs)) - perf_swcounter_add(counter, nr, nmi, regs); + perf_swcounter_add(counter, nr, nmi, regs, addr); } rcu_read_unlock(); } @@ -1734,7 +2588,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) } static void __perf_swcounter_event(enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs) + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); int *recursion = perf_swcounter_recursion_context(cpuctx); @@ -1745,10 +2600,11 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, (*recursion)++; barrier(); - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, + nr, nmi, regs, addr); if (cpuctx->task_ctx) { perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, - nr, nmi, regs); + nr, nmi, regs, addr); } barrier(); @@ -1758,9 +2614,10 @@ out: put_cpu_var(perf_cpu_context); } -void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) +void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); + __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); } static void perf_swcounter_read(struct perf_counter *counter) @@ -1779,7 +2636,7 @@ static void perf_swcounter_disable(struct perf_counter *counter) perf_swcounter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_generic = { +static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, @@ -1810,8 +2667,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { + u64 period = max_t(u64, 10000, hwc->irq_period); __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(hwc->irq_period), 0, + ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); } @@ -1829,7 +2687,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) cpu_clock_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_clock = { +static const struct pmu perf_ops_cpu_clock = { .enable = cpu_clock_perf_counter_enable, .disable = cpu_clock_perf_counter_disable, .read = cpu_clock_perf_counter_read, @@ -1839,43 +2697,30 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { * Software counter: task time clock */ -/* - * Called from within the scheduler: - */ -static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) -{ - struct task_struct *curr = counter->task; - u64 delta; - - delta = __task_delta_exec(curr, update); - - return curr->se.sum_exec_runtime + delta; -} - static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) { u64 prev; s64 delta; - prev = atomic64_read(&counter->hw.prev_count); - - atomic64_set(&counter->hw.prev_count, now); - + prev = atomic64_xchg(&counter->hw.prev_count, now); delta = now - prev; - atomic64_add(delta, &counter->count); } static int task_clock_perf_counter_enable(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; + u64 now; + + now = counter->ctx->time; - atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0)); + atomic64_set(&hwc->prev_count, now); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { + u64 period = max_t(u64, 10000, hwc->irq_period); __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(hwc->irq_period), 0, + ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); } @@ -1885,17 +2730,27 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static void task_clock_perf_counter_disable(struct perf_counter *counter) { hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter, - task_clock_perf_counter_val(counter, 0)); + task_clock_perf_counter_update(counter, counter->ctx->time); + } static void task_clock_perf_counter_read(struct perf_counter *counter) { - task_clock_perf_counter_update(counter, - task_clock_perf_counter_val(counter, 1)); + u64 time; + + if (!in_nmi()) { + update_context_time(counter->ctx); + time = counter->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - counter->ctx->timestamp; + time = counter->ctx->time + delta; + } + + task_clock_perf_counter_update(counter, time); } -static const struct hw_perf_counter_ops perf_ops_task_clock = { +static const struct pmu perf_ops_task_clock = { .enable = task_clock_perf_counter_enable, .disable = task_clock_perf_counter_disable, .read = task_clock_perf_counter_read, @@ -1947,7 +2802,7 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) cpu_migrations_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { +static const struct pmu perf_ops_cpu_migrations = { .enable = cpu_migrations_perf_counter_enable, .disable = cpu_migrations_perf_counter_disable, .read = cpu_migrations_perf_counter_read, @@ -1961,8 +2816,9 @@ void perf_tpcounter_event(int event_id) if (!regs) regs = task_pt_regs(current); - __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); } +EXPORT_SYMBOL_GPL(perf_tpcounter_event); extern int ftrace_profile_enable(int); extern void ftrace_profile_disable(int); @@ -1972,8 +2828,7 @@ static void tp_perf_counter_destroy(struct perf_counter *counter) ftrace_profile_disable(perf_event_id(&counter->hw_event)); } -static const struct hw_perf_counter_ops * -tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { int event_id = perf_event_id(&counter->hw_event); int ret; @@ -1988,19 +2843,15 @@ tp_perf_counter_init(struct perf_counter *counter) return &perf_ops_generic; } #else -static const struct hw_perf_counter_ops * -tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { return NULL; } #endif -static const struct hw_perf_counter_ops * -sw_perf_counter_init(struct perf_counter *counter) +static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; - const struct hw_perf_counter_ops *hw_ops = NULL; - struct hw_perf_counter *hwc = &counter->hw; + const struct pmu *pmu = NULL; /* * Software counters (currently) can't in general distinguish @@ -2011,10 +2862,8 @@ sw_perf_counter_init(struct perf_counter *counter) */ switch (perf_event_id(&counter->hw_event)) { case PERF_COUNT_CPU_CLOCK: - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; - if (hw_event->irq_period && hw_event->irq_period < 10000) - hw_event->irq_period = 10000; break; case PERF_COUNT_TASK_CLOCK: /* @@ -2022,29 +2871,24 @@ sw_perf_counter_init(struct perf_counter *counter) * use the cpu_clock counter instead. */ if (counter->ctx->task) - hw_ops = &perf_ops_task_clock; + pmu = &perf_ops_task_clock; else - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; - if (hw_event->irq_period && hw_event->irq_period < 10000) - hw_event->irq_period = 10000; break; case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: case PERF_COUNT_CONTEXT_SWITCHES: - hw_ops = &perf_ops_generic; + pmu = &perf_ops_generic; break; case PERF_COUNT_CPU_MIGRATIONS: if (!counter->hw_event.exclude_kernel) - hw_ops = &perf_ops_cpu_migrations; + pmu = &perf_ops_cpu_migrations; break; } - if (hw_ops) - hwc->irq_period = hw_event->irq_period; - - return hw_ops; + return pmu; } /* @@ -2057,12 +2901,14 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, struct perf_counter *group_leader, gfp_t gfpflags) { - const struct hw_perf_counter_ops *hw_ops; + const struct pmu *pmu; struct perf_counter *counter; + struct hw_perf_counter *hwc; + long err; counter = kzalloc(sizeof(*counter), gfpflags); if (!counter) - return NULL; + return ERR_PTR(-ENOMEM); /* * Single counters are their own group leaders, with an @@ -2083,42 +2929,67 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->cpu = cpu; counter->hw_event = *hw_event; - counter->wakeup_pending = 0; counter->group_leader = group_leader; - counter->hw_ops = NULL; + counter->pmu = NULL; counter->ctx = ctx; counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; - hw_ops = NULL; + pmu = NULL; + + hwc = &counter->hw; + if (hw_event->freq && hw_event->irq_freq) + hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq); + else + hwc->irq_period = hw_event->irq_period; + + /* + * we currently do not support PERF_RECORD_GROUP on inherited counters + */ + if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP)) + goto done; if (perf_event_raw(hw_event)) { - hw_ops = hw_perf_counter_init(counter); + pmu = hw_perf_counter_init(counter); goto done; } switch (perf_event_type(hw_event)) { case PERF_TYPE_HARDWARE: - hw_ops = hw_perf_counter_init(counter); + pmu = hw_perf_counter_init(counter); break; case PERF_TYPE_SOFTWARE: - hw_ops = sw_perf_counter_init(counter); + pmu = sw_perf_counter_init(counter); break; case PERF_TYPE_TRACEPOINT: - hw_ops = tp_perf_counter_init(counter); + pmu = tp_perf_counter_init(counter); break; } +done: + err = 0; + if (!pmu) + err = -EINVAL; + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); - if (!hw_ops) { + if (err) { kfree(counter); - return NULL; + return ERR_PTR(err); } -done: - counter->hw_ops = hw_ops; + + counter->pmu = pmu; + + atomic_inc(&nr_counters); + if (counter->hw_event.mmap) + atomic_inc(&nr_mmap_tracking); + if (counter->hw_event.munmap) + atomic_inc(&nr_munmap_tracking); + if (counter->hw_event.comm) + atomic_inc(&nr_comm_tracking); return counter; } @@ -2190,10 +3061,10 @@ SYSCALL_DEFINE5(perf_counter_open, goto err_put_context; } - ret = -EINVAL; counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, GFP_KERNEL); - if (!counter) + ret = PTR_ERR(counter); + if (IS_ERR(counter)) goto err_put_context; ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); @@ -2265,15 +3136,14 @@ inherit_counter(struct perf_counter *parent_counter, child_counter = perf_counter_alloc(&parent_counter->hw_event, parent_counter->cpu, child_ctx, group_leader, GFP_KERNEL); - if (!child_counter) - return NULL; + if (IS_ERR(child_counter)) + return child_counter; /* * Link it up in the child's context: */ child_counter->task = child; - list_add_counter(child_counter, child_ctx); - child_ctx->nr_counters++; + add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; /* @@ -2318,15 +3188,17 @@ static int inherit_group(struct perf_counter *parent_counter, { struct perf_counter *leader; struct perf_counter *sub; + struct perf_counter *child_ctr; leader = inherit_counter(parent_counter, parent, parent_ctx, child, NULL, child_ctx); - if (!leader) - return -ENOMEM; + if (IS_ERR(leader)) + return PTR_ERR(leader); list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { - if (!inherit_counter(sub, parent, parent_ctx, - child, leader, child_ctx)) - return -ENOMEM; + child_ctr = inherit_counter(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); } return 0; } @@ -2334,15 +3206,18 @@ static int inherit_group(struct perf_counter *parent_counter, static void sync_child_counter(struct perf_counter *child_counter, struct perf_counter *parent_counter) { - u64 parent_val, child_val; + u64 child_val; - parent_val = atomic64_read(&parent_counter->count); child_val = atomic64_read(&child_counter->count); /* * Add back the child's count to the parent's count: */ atomic64_add(child_val, &parent_counter->count); + atomic64_add(child_counter->total_time_enabled, + &parent_counter->child_total_time_enabled); + atomic64_add(child_counter->total_time_running, + &parent_counter->child_total_time_running); /* * Remove this counter from the parent's list @@ -2364,7 +3239,6 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter_context *child_ctx) { struct perf_counter *parent_counter; - struct perf_counter *sub, *tmp; /* * If we do not self-reap then we have to wait for the @@ -2376,11 +3250,11 @@ __perf_counter_exit_task(struct task_struct *child, */ if (child != current) { wait_task_inactive(child, 0); - list_del_init(&child_counter->list_entry); + update_counter_times(child_counter); + list_del_counter(child_counter, child_ctx); } else { struct perf_cpu_context *cpuctx; unsigned long flags; - u64 perf_flags; /* * Disable and unlink this counter. @@ -2388,19 +3262,18 @@ __perf_counter_exit_task(struct task_struct *child, * Be careful about zapping the list - IRQ/NMI context * could still be processing it: */ - curr_rq_lock_irq_save(&flags); - perf_flags = hw_perf_save_disable(); + local_irq_save(flags); + perf_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); group_sched_out(child_counter, cpuctx, child_ctx); + update_counter_times(child_counter); - list_del_init(&child_counter->list_entry); - - child_ctx->nr_counters--; + list_del_counter(child_counter, child_ctx); - hw_perf_restore(perf_flags); - curr_rq_unlock_irq_restore(&flags); + perf_enable(); + local_irq_restore(flags); } parent_counter = child_counter->parent; @@ -2411,13 +3284,6 @@ __perf_counter_exit_task(struct task_struct *child, */ if (parent_counter) { sync_child_counter(child_counter, parent_counter); - list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list, - list_entry) { - if (sub->parent) { - sync_child_counter(sub, sub->parent); - free_counter(sub); - } - } free_counter(child_counter); } } @@ -2433,14 +3299,25 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter *child_counter, *tmp; struct perf_counter_context *child_ctx; + WARN_ON_ONCE(child != current); + child_ctx = &child->perf_counter_ctx; if (likely(!child_ctx->nr_counters)) return; +again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) __perf_counter_exit_task(child, child_counter, child_ctx); + + /* + * If the last counter was a group counter, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->counter_list)) + goto again; } /* @@ -2494,9 +3371,9 @@ static void __cpuinit perf_counter_init_cpu(int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); __perf_counter_init_context(&cpuctx->ctx, NULL); - mutex_lock(&perf_resource_mutex); + spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; - mutex_unlock(&perf_resource_mutex); + spin_unlock(&perf_resource_lock); hw_perf_counter_setup(cpu); } @@ -2552,15 +3429,12 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { .notifier_call = perf_cpu_notify, }; -static int __init perf_counter_init(void) +void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); - - return 0; } -early_initcall(perf_counter_init); static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) { @@ -2582,7 +3456,7 @@ perf_set_reserve_percpu(struct sysdev_class *class, if (val > perf_max_counters) return -EINVAL; - mutex_lock(&perf_resource_mutex); + spin_lock(&perf_resource_lock); perf_reserved_percpu = val; for_each_online_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); @@ -2592,7 +3466,7 @@ perf_set_reserve_percpu(struct sysdev_class *class, cpuctx->max_pertask = mpt; spin_unlock_irq(&cpuctx->ctx.lock); } - mutex_unlock(&perf_resource_mutex); + spin_unlock(&perf_resource_lock); return count; } @@ -2614,9 +3488,9 @@ perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) if (val > 1) return -EINVAL; - mutex_lock(&perf_resource_mutex); + spin_lock(&perf_resource_lock); perf_overcommit = val; - mutex_unlock(&perf_resource_mutex); + spin_unlock(&perf_resource_lock); return count; }