perf_counter: unify and fix delayed counter wakeup
author Peter Zijlstra <a.p.zijlstra@chello.nl>
Mon, 30 Mar 2009 17:07:02 +0000 (19:07 +0200)
committer Ingo Molnar <mingo@elte.hu>
Mon, 6 Apr 2009 07:30:36 +0000 (09:30 +0200)
While going over the wakeup code I noticed that delayed wakeups only
work for hardware counters, even though basically all software
counters rely on them.

This patch unifies and generalizes the delayed wakeup to fix this
issue.

Since we're dealing with NMI context here, use a cmpxchg()-based
singly linked list to track counters that have pending wakeups.

[ This should really be generic code for delayed wakeups, but since we
  cannot use cmpxchg()/xchg() in generic code, I've let it live in the
  perf_counter code. -- Eric Dumazet could use it to aggregate the
  network wakeups. ]
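
To make that concrete: an entry's ->next pointer doubles as the
"queued" flag, and a sentinel tail value distinguishes "queued, end
of list" from "not queued", so both the push and the drain need
nothing stronger than cmpxchg()/xchg(). A minimal sketch of the
pattern (the helper names here are made up for illustration; the real
code is in the kernel/perf_counter.c hunks below):

	struct pending_entry {
		struct pending_entry *next;	/* NULL means "not queued" */
	};

	#define PENDING_TAIL ((struct pending_entry *)-1UL)

	static void pending_push(struct pending_entry **head,
				 struct pending_entry *e)
	{
		struct pending_entry *prev;

		/* Claim the entry; non-NULL ->next means already queued. */
		if (cmpxchg(&e->next, NULL, PENDING_TAIL) != NULL)
			return;

		do {
			prev = e->next = *head;	/* link to current head */
		} while (cmpxchg(head, prev, e) != prev);
	}

	static void pending_drain(struct pending_entry **head,
				  void (*func)(struct pending_entry *))
	{
		/* Detach the whole list with a single atomic op. */
		struct pending_entry *e = xchg(head, PENDING_TAIL);

		while (e != PENDING_TAIL) {
			struct pending_entry *next = e->next;

			e->next = NULL;	/* mark dequeued ... */
			smp_wmb();	/* ... before the wakeup callback */
			func(e);
			e = next;
		}
	}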

Furthermore, the x86 method of using TIF flags was flawed in that it's
quite possible to end up setting the bit on the idle task, losing the
wakeup.

The powerpc method uses per-cpu storage and does appear to be
sufficient.
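
In essence the per-cpu variant is (a sketch only, with invented names;
powerpc actually keeps the word in its PACA, and x86 now stubs these
out entirely, as the asm/perf_counter.h hunk below shows):

	static DEFINE_PER_CPU(unsigned long, perf_pending_flag);

	static inline void set_perf_counter_pending(void)
	{
		__get_cpu_var(perf_pending_flag) = 1;	/* plain store: NMI-safe */
	}

	static inline unsigned long test_perf_counter_pending(void)
	{
		return __get_cpu_var(perf_pending_flag);
	}

	static inline void clear_perf_counter_pending(void)
	{
		__get_cpu_var(perf_pending_flag) = 0;
	}

Because the state is keyed to the CPU rather than to whatever task
happens to be current, an NMI landing on the idle task can no longer
lose the wakeup.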

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Orig-LKML-Reference: <20090330171023.153932974@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
arch/powerpc/include/asm/hw_irq.h
arch/powerpc/kernel/irq.c
arch/powerpc/kernel/perf_counter.c
arch/x86/include/asm/perf_counter.h
arch/x86/include/asm/thread_info.h
arch/x86/kernel/cpu/perf_counter.c
arch/x86/kernel/signal.c
include/linux/perf_counter.h
kernel/perf_counter.c
kernel/timer.c

diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h
index cb32d571c9c7f2ac557477343890f73ae881c2d5..20a44d0c9fdde3a016fb3bdab168c7d9475753da 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -132,7 +132,7 @@ static inline int irqs_disabled_flags(unsigned long flags)
 struct irq_chip;
 
 #ifdef CONFIG_PERF_COUNTERS
-static inline unsigned long get_perf_counter_pending(void)
+static inline unsigned long test_perf_counter_pending(void)
 {
        unsigned long x;
 
@@ -160,7 +160,7 @@ extern void perf_counter_do_pending(void);
 
 #else
 
-static inline unsigned long get_perf_counter_pending(void)
+static inline unsigned long test_perf_counter_pending(void)
 {
        return 0;
 }
diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c
index 469e9635ff04b505812805f308190d50e158c59c..2cd471f92fe65cbee6135bee2c61c1b7c90ba4a4 100644
--- a/arch/powerpc/kernel/irq.c
+++ b/arch/powerpc/kernel/irq.c
@@ -135,7 +135,7 @@ notrace void raw_local_irq_restore(unsigned long en)
                        iseries_handle_interrupts();
        }
 
-       if (get_perf_counter_pending()) {
+       if (test_perf_counter_pending()) {
                clear_perf_counter_pending();
                perf_counter_do_pending();
        }
diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c
index df007fe0cc0bb6e6c989dc273a82349df8e5af99..cde720fc495cd245604be0a25cc8757a65932f10 100644
--- a/arch/powerpc/kernel/perf_counter.c
+++ b/arch/powerpc/kernel/perf_counter.c
@@ -649,24 +649,6 @@ hw_perf_counter_init(struct perf_counter *counter)
        return &power_perf_ops;
 }
 
-/*
- * Handle wakeups.
- */
-void perf_counter_do_pending(void)
-{
-       int i;
-       struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
-       struct perf_counter *counter;
-
-       for (i = 0; i < cpuhw->n_counters; ++i) {
-               counter = cpuhw->counter[i];
-               if (counter && counter->wakeup_pending) {
-                       counter->wakeup_pending = 0;
-                       wake_up(&counter->waitq);
-               }
-       }
-}
-
 /*
  * A counter has overflowed; update its count and record
  * things if requested.  Note that interrupts are hard-disabled
@@ -720,7 +702,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
        struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters);
        struct perf_counter *counter;
        long val;
-       int need_wakeup = 0, found = 0;
+       int found = 0;
 
        for (i = 0; i < cpuhw->n_counters; ++i) {
                counter = cpuhw->counter[i];
@@ -761,7 +743,7 @@ static void perf_counter_interrupt(struct pt_regs *regs)
         * immediately; otherwise we'll have to do the wakeup when interrupts
         * get soft-enabled.
         */
-       if (get_perf_counter_pending() && regs->softe) {
+       if (test_perf_counter_pending() && regs->softe) {
                irq_enter();
                clear_perf_counter_pending();
                perf_counter_do_pending();
diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h
index 1662043b340fb527d296d8bd049590c96a04a6db..e2b0e66b2353539482042db1894047c67d1fd00a 100644
--- a/arch/x86/include/asm/perf_counter.h
+++ b/arch/x86/include/asm/perf_counter.h
@@ -84,8 +84,9 @@ union cpuid10_edx {
 #define MSR_ARCH_PERFMON_FIXED_CTR2                    0x30b
 #define X86_PMC_IDX_FIXED_BUS_CYCLES                   (X86_PMC_IDX_FIXED + 2)
 
-#define set_perf_counter_pending()     \
-               set_tsk_thread_flag(current, TIF_PERF_COUNTERS);
+#define set_perf_counter_pending()     do { } while (0)
+#define clear_perf_counter_pending()   do { } while (0)
+#define test_perf_counter_pending()    (0)
 
 #ifdef CONFIG_PERF_COUNTERS
 extern void init_hw_perf_counters(void);
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 3ffd5d2a3676910165f4db2ef82c8048970b7cb7..8820a73ae090aae29aa3454d6f3241ffcbef5440 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,7 +83,6 @@ struct thread_info {
 #define TIF_SYSCALL_AUDIT      7       /* syscall auditing active */
 #define TIF_SECCOMP            8       /* secure computing */
 #define TIF_MCE_NOTIFY         10      /* notify userspace of an MCE */
-#define TIF_PERF_COUNTERS      11      /* notify perf counter work */
 #define TIF_NOTSC              16      /* TSC is not accessible in userland */
 #define TIF_IA32               17      /* 32bit process */
 #define TIF_FORK               18      /* ret_from_fork */
@@ -107,7 +106,6 @@ struct thread_info {
 #define _TIF_SYSCALL_AUDIT     (1 << TIF_SYSCALL_AUDIT)
 #define _TIF_SECCOMP           (1 << TIF_SECCOMP)
 #define _TIF_MCE_NOTIFY                (1 << TIF_MCE_NOTIFY)
-#define _TIF_PERF_COUNTERS     (1 << TIF_PERF_COUNTERS)
 #define _TIF_NOTSC             (1 << TIF_NOTSC)
 #define _TIF_IA32              (1 << TIF_IA32)
 #define _TIF_FORK              (1 << TIF_FORK)
@@ -141,7 +139,7 @@ struct thread_info {
 
 /* Only used for 64 bit */
 #define _TIF_DO_NOTIFY_MASK                                            \
-       (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME)
+       (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME)
 
 /* flags to check in __switch_to() */
 #define _TIF_WORK_CTXSW                                                        \
diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 3f95b0cdc550e1ed5ad351379c9d131c96dd9941..7aab177fb566378ab40a815dd8c073b7ce367b1f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -227,7 +227,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
                 */
                hwc->config |= pmc_ops->event_map(perf_event_id(hw_event));
        }
-       counter->wakeup_pending = 0;
 
        return 0;
 }
@@ -773,34 +772,6 @@ void smp_perf_counter_interrupt(struct pt_regs *regs)
        irq_exit();
 }
 
-/*
- * This handler is triggered by NMI contexts:
- */
-void perf_counter_notify(struct pt_regs *regs)
-{
-       struct cpu_hw_counters *cpuc;
-       unsigned long flags;
-       int bit, cpu;
-
-       local_irq_save(flags);
-       cpu = smp_processor_id();
-       cpuc = &per_cpu(cpu_hw_counters, cpu);
-
-       for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) {
-               struct perf_counter *counter = cpuc->counters[bit];
-
-               if (!counter)
-                       continue;
-
-               if (counter->wakeup_pending) {
-                       counter->wakeup_pending = 0;
-                       wake_up(&counter->waitq);
-               }
-       }
-
-       local_irq_restore(flags);
-}
-
 void perf_counters_lapic_init(int nmi)
 {
        u32 apic_val;
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 611615a92c9029ad882638f6619f45f0d8e6348c..0a813b17b172437760b6d8beae71104f6fc878ba 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -6,7 +6,6 @@
  *  2000-06-20  Pentium III FXSR, SSE support by Gareth Hughes
  *  2000-2002   x86-64 support by Andi Kleen
  */
-#include <linux/perf_counter.h>
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
@@ -872,11 +871,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
                tracehook_notify_resume(regs);
        }
 
-       if (thread_info_flags & _TIF_PERF_COUNTERS) {
-               clear_thread_flag(TIF_PERF_COUNTERS);
-               perf_counter_notify(regs);
-       }
-
 #ifdef CONFIG_X86_32
        clear_thread_flag(TIF_IRET);
 #endif /* CONFIG_X86_32 */
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6bf67ce17625990f46a6248ae8c4164874ac11da..0d833228eee5a6b307c7beecb3d5bcfef072fdb8 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -275,6 +275,10 @@ struct perf_mmap_data {
        void                            *data_pages[0];
 };
 
+struct perf_wakeup_entry {
+       struct perf_wakeup_entry *next;
+};
+
 /**
  * struct perf_counter - performance counter kernel representation:
  */
@@ -350,7 +354,7 @@ struct perf_counter {
        /* poll related */
        wait_queue_head_t               waitq;
        /* optional: for NMIs */
-       int                             wakeup_pending;
+       struct perf_wakeup_entry        wakeup;
 
        void (*destroy)(struct perf_counter *);
        struct rcu_head                 rcu_head;
@@ -427,7 +431,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task, int cpu);
 extern void perf_counter_task_tick(struct task_struct *task, int cpu);
 extern void perf_counter_init_task(struct task_struct *child);
 extern void perf_counter_exit_task(struct task_struct *child);
-extern void perf_counter_notify(struct pt_regs *regs);
+extern void perf_counter_do_pending(void);
 extern void perf_counter_print_debug(void);
 extern void perf_counter_unthrottle(void);
 extern u64 hw_perf_save_disable(void);
@@ -461,7 +465,7 @@ static inline void
 perf_counter_task_tick(struct task_struct *task, int cpu)              { }
 static inline void perf_counter_init_task(struct task_struct *child)   { }
 static inline void perf_counter_exit_task(struct task_struct *child)   { }
-static inline void perf_counter_notify(struct pt_regs *regs)           { }
+static inline void perf_counter_do_pending(void)                       { }
 static inline void perf_counter_print_debug(void)                      { }
 static inline void perf_counter_unthrottle(void)                       { }
 static inline void hw_perf_restore(u64 ctrl)                           { }
@@ -469,8 +473,9 @@ static inline u64 hw_perf_save_disable(void)                      { return 0; }
 static inline int perf_counter_task_disable(void)      { return -EINVAL; }
 static inline int perf_counter_task_enable(void)       { return -EINVAL; }
 
-static inline void perf_swcounter_event(u32 event, u64 nr,
-                                       int nmi, struct pt_regs *regs)  { }
+static inline void
+perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { }
+
 #endif
 
 #endif /* __KERNEL__ */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 3b862a7988cda5180f79723073e9e77828a41fe6..f70ff80e79d7a6f99f42793a1d5a286fab2c57e4 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -1197,8 +1197,12 @@ static void free_counter_rcu(struct rcu_head *head)
        kfree(counter);
 }
 
+static void perf_pending_sync(struct perf_counter *counter);
+
 static void free_counter(struct perf_counter *counter)
 {
+       perf_pending_sync(counter);
+
        if (counter->destroy)
                counter->destroy(counter);
 
@@ -1528,6 +1532,118 @@ static const struct file_operations perf_fops = {
        .mmap                   = perf_mmap,
 };
 
+/*
+ * Perf counter wakeup
+ *
+ * If there's data, ensure we set the poll() state and publish everything
+ * to user-space before waking everybody up.
+ */
+
+void perf_counter_wakeup(struct perf_counter *counter)
+{
+       struct perf_mmap_data *data;
+
+       rcu_read_lock();
+       data = rcu_dereference(counter->data);
+       if (data) {
+               (void)atomic_xchg(&data->wakeup, POLL_IN);
+               __perf_counter_update_userpage(counter, data);
+       }
+       rcu_read_unlock();
+
+       wake_up_all(&counter->waitq);
+}
+
+/*
+ * Pending wakeups
+ *
+ * Handle the case where we need to wake up from NMI (or rq->lock) context.
+ *
+ * The NMI bit means we cannot possibly take locks. Therefore, maintain a
+ * single linked list and use cmpxchg() to add entries lockless.
+ */
+
+#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL)
+
+static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = {
+       PENDING_TAIL,
+};
+
+static void perf_pending_queue(struct perf_counter *counter)
+{
+       struct perf_wakeup_entry **head;
+       struct perf_wakeup_entry *prev, *next;
+
+       if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL)
+               return;
+
+       head = &get_cpu_var(perf_wakeup_head);
+
+       do {
+               prev = counter->wakeup.next = *head;
+               next = &counter->wakeup;
+       } while (cmpxchg(head, prev, next) != prev);
+
+       set_perf_counter_pending();
+
+       put_cpu_var(perf_wakeup_head);
+}
+
+static int __perf_pending_run(void)
+{
+       struct perf_wakeup_entry *list;
+       int nr = 0;
+
+       list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL);
+       while (list != PENDING_TAIL) {
+               struct perf_counter *counter = container_of(list,
+                               struct perf_counter, wakeup);
+
+               list = list->next;
+
+               counter->wakeup.next = NULL;
+               /*
+                * Ensure we observe the unqueue before we issue the wakeup,
+                * so that we won't be waiting forever.
+                * -- see perf_not_pending().
+                */
+               smp_wmb();
+
+               perf_counter_wakeup(counter);
+               nr++;
+       }
+
+       return nr;
+}
+
+static inline int perf_not_pending(struct perf_counter *counter)
+{
+       /*
+        * If we flush on whatever cpu we run, there is a chance we don't
+        * need to wait.
+        */
+       get_cpu();
+       __perf_pending_run();
+       put_cpu();
+
+       /*
+        * Ensure we see the proper queue state before going to sleep
+        * so that we do not miss the wakeup. -- see __perf_pending_run()
+        */
+       smp_rmb();
+       return counter->wakeup.next == NULL;
+}
+
+static void perf_pending_sync(struct perf_counter *counter)
+{
+       wait_event(counter->waitq, perf_not_pending(counter));
+}
+
+void perf_counter_do_pending(void)
+{
+       __perf_pending_run();
+}
+
 /*
  * Output
  */
@@ -1611,13 +1727,10 @@ static void perf_output_copy(struct perf_output_handle *handle,
 static void perf_output_end(struct perf_output_handle *handle, int nmi)
 {
        if (handle->wakeup) {
-               (void)atomic_xchg(&handle->data->wakeup, POLL_IN);
-               __perf_counter_update_userpage(handle->counter, handle->data);
-               if (nmi) {
-                       handle->counter->wakeup_pending = 1;
-                       set_perf_counter_pending();
-               } else
-                       wake_up(&handle->counter->waitq);
+               if (nmi)
+                       perf_pending_queue(handle->counter);
+               else
+                       perf_counter_wakeup(handle->counter);
        }
        rcu_read_unlock();
 }
@@ -2211,7 +2324,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event,
 
        counter->cpu                    = cpu;
        counter->hw_event               = *hw_event;
-       counter->wakeup_pending         = 0;
        counter->group_leader           = group_leader;
        counter->hw_ops                 = NULL;
        counter->ctx                    = ctx;
diff --git a/kernel/timer.c b/kernel/timer.c
index b4555568b4e4ad16f34a887eabed6f21e05abfba..672ca25fbc43a9c151f48be2ec6fab222337d357 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/tick.h>
 #include <linux/kallsyms.h>
+#include <linux/perf_counter.h>
 
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
@@ -1167,6 +1168,8 @@ static void run_timer_softirq(struct softirq_action *h)
 {
        struct tvec_base *base = __get_cpu_var(tvec_bases);
 
+       perf_counter_do_pending();
+
        hrtimer_run_pending();
 
        if (time_after_eq(jiffies, base->timer_jiffies))