From aa9c4c0f967fdb482ea95e8473ec3d201e6e0781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:10:57 +0100 Subject: [PATCH] perfcounters: fix task clock counter Impact: fix per task clock counter precision Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 8 +++++ kernel/exit.c | 17 ++++++--- kernel/perf_counter.c | 70 ++++++++++++++++++++++++++++--------- kernel/sched.c | 49 ++++++++++++++++++++++++-- 4 files changed, 120 insertions(+), 24 deletions(-) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 4a145caeee07..1b2e3242497c 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -66,7 +66,15 @@ static inline unsigned int kstat_irqs(unsigned int irq) return sum; } + +/* + * Lock/unlock the current runqueue - to extract task statistics: + */ +extern void curr_rq_lock_irq_save(unsigned long *flags); +extern void curr_rq_unlock_irq_restore(unsigned long *flags); +extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); + extern void account_user_time(struct task_struct *, cputime_t); extern void account_user_time_scaled(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); diff --git a/kernel/exit.c b/kernel/exit.c index d336c90a5f13..244edfd96865 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -922,6 +922,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1093,11 +1099,6 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); #ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); @@ -1121,6 +1122,12 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + /* + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may sleep: + */ + perf_counter_exit_task(tsk); + preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 961d651aa574..f1110ac1267b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* @@ -106,7 +107,8 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->disable(counter); @@ -135,7 +137,8 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); } @@ -209,7 +212,8 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the @@ -232,7 +236,8 @@ static void __perf_install_in_context(void *info) if (!ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); } /* @@ -438,15 +443,19 @@ int perf_counter_task_disable(void) struct task_struct *curr = current; struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + unsigned long flags; u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - local_irq_disable(); + curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); + /* force the update of the task clock: */ + __task_delta_exec(curr, 1); + perf_counter_task_sched_out(curr, cpu); spin_lock(&ctx->lock); @@ -463,7 +472,7 @@ int perf_counter_task_disable(void) spin_unlock(&ctx->lock); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); return 0; } @@ -473,15 +482,19 @@ int perf_counter_task_enable(void) struct task_struct *curr = current; struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + unsigned long flags; u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - local_irq_disable(); + curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); + /* force the update of the task clock: */ + __task_delta_exec(curr, 1); + spin_lock(&ctx->lock); /* @@ -493,6 +506,7 @@ int perf_counter_task_enable(void) if (counter->state != PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -500,7 +514,7 @@ int perf_counter_task_enable(void) perf_counter_task_sched_in(curr, cpu); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); return 0; } @@ -540,8 +554,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) static void __read(void *info) { struct perf_counter *counter = info; + unsigned long flags; + curr_rq_lock_irq_save(&flags); counter->hw_ops->read(counter); + curr_rq_unlock_irq_restore(&flags); } static u64 perf_counter_read(struct perf_counter *counter) @@ -860,13 +877,27 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .read = cpu_clock_perf_counter_read, }; -static void task_clock_perf_counter_update(struct perf_counter *counter) +/* + * Called from within the scheduler: + */ +static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) { - u64 prev, now; + struct task_struct *curr = counter->task; + u64 delta; + + WARN_ON_ONCE(counter->task != current); + + delta = __task_delta_exec(curr, update); + + return curr->se.sum_exec_runtime + delta; +} + +static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) +{ + u64 prev; s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = current->se.sum_exec_runtime; atomic64_set(&counter->hw.prev_count, now); @@ -877,17 +908,23 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { - task_clock_perf_counter_update(counter); + u64 now = task_clock_perf_counter_val(counter, 1); + + task_clock_perf_counter_update(counter, now); } static void task_clock_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime); + u64 now = task_clock_perf_counter_val(counter, 0); + + atomic64_set(&counter->hw.prev_count, now); } static void task_clock_perf_counter_disable(struct perf_counter *counter) { - task_clock_perf_counter_update(counter); + u64 now = task_clock_perf_counter_val(counter, 0); + + task_clock_perf_counter_update(counter, now); } static const struct hw_perf_counter_ops perf_ops_task_clock = { @@ -1267,6 +1304,7 @@ __perf_counter_exit_task(struct task_struct *child, { struct perf_counter *parent_counter; u64 parent_val, child_val; + unsigned long flags; u64 perf_flags; /* @@ -1275,7 +1313,7 @@ __perf_counter_exit_task(struct task_struct *child, * Be careful about zapping the list - IRQ/NMI context * could still be processing it: */ - local_irq_disable(); + curr_rq_lock_irq_save(&flags); perf_flags = hw_perf_save_disable(); if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { @@ -1294,7 +1332,7 @@ __perf_counter_exit_task(struct task_struct *child, list_del_init(&child_counter->list_entry); hw_perf_restore(perf_flags); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); parent_counter = child_counter->parent; /* diff --git a/kernel/sched.c b/kernel/sched.c index 382cfdb5e38d..4d84ff4c8774 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -638,7 +638,7 @@ static inline int cpu_of(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -static inline void update_rq_clock(struct rq *rq) +inline void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_cpu(cpu_of(rq)); } @@ -969,6 +969,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } +void curr_rq_lock_irq_save(unsigned long *flags) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_save(*flags); + rq = cpu_rq(smp_processor_id()); + spin_lock(&rq->lock); +} + +void curr_rq_unlock_irq_restore(unsigned long *flags) + __releases(rq->lock) +{ + struct rq *rq; + + rq = cpu_rq(smp_processor_id()); + spin_unlock(&rq->lock); + local_irq_restore(*flags); +} + void task_rq_unlock_wait(struct task_struct *p) { struct rq *rq = task_rq(p); @@ -2558,7 +2578,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { fire_sched_out_preempt_notifiers(prev, next); - perf_counter_task_sched_out(prev, cpu_of(rq)); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -4089,6 +4108,29 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); +/* + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. + */ +unsigned long long __task_delta_exec(struct task_struct *p, int update) +{ + s64 delta_exec; + struct rq *rq; + + rq = task_rq(p); + WARN_ON_ONCE(!runqueue_is_locked()); + WARN_ON_ONCE(!task_current(rq, p)); + + if (update) + update_rq_clock(rq); + + delta_exec = rq->clock - p->se.exec_start; + + WARN_ON_ONCE(delta_exec < 0); + + return delta_exec; +} + /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. @@ -4316,13 +4358,13 @@ void scheduler_tick(void) update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); + perf_counter_task_tick(curr, cpu); spin_unlock(&rq->lock); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif - perf_counter_task_tick(curr, cpu); } #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ @@ -4512,6 +4554,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); + perf_counter_task_sched_out(prev, cpu); rq->nr_switches++; rq->curr = next; -- 2.20.1