timer: Reduce timer migration overhead if disabled
author Thomas Gleixner <tglx@linutronix.de>
Tue, 26 May 2015 22:50:33 +0000 (22:50 +0000)
committer Thomas Gleixner <tglx@linutronix.de>
Fri, 19 Jun 2015 13:18:28 +0000 (15:18 +0200)
Eric reported that the timer_migration sysctl is not really nice
performance wise as it needs to check at every timer insertion whether
the feature is enabled or not. Further the check does not live in the
timer code, so we have an extra function call which checks an extra
cache line to figure out that it is disabled.

We can do better and store that information in the per cpu (hr)timer
bases. I pondered to use a static key, but that's a nightmare to
update from the nohz code and the timer base cache line is hot anyway
when we select a timer base.

The old logic enabled the timer migration unconditionally if
CONFIG_NO_HZ was set even if nohz was disabled on the kernel command
line.

With this modification, we start off with migration disabled. The
user-visible sysctl is still set to enabled. If the kernel switches to
NOHZ, migration is enabled unless the user disabled it via the sysctl
prior to the switch. If nohz=off is on the kernel command line,
migration stays disabled no matter what.

Before:
  47.76%  hog       [.] main
  14.84%  [kernel]  [k] _raw_spin_lock_irqsave
   9.55%  [kernel]  [k] _raw_spin_unlock_irqrestore
   6.71%  [kernel]  [k] mod_timer
   6.24%  [kernel]  [k] lock_timer_base.isra.38
   3.76%  [kernel]  [k] detach_if_pending
   3.71%  [kernel]  [k] del_timer
   2.50%  [kernel]  [k] internal_add_timer
   1.51%  [kernel]  [k] get_nohz_timer_target
   1.28%  [kernel]  [k] __internal_add_timer
   0.78%  [kernel]  [k] timerfn
   0.48%  [kernel]  [k] wake_up_nohz_cpu

After:
  48.10%  hog       [.] main
  15.25%  [kernel]  [k] _raw_spin_lock_irqsave
   9.76%  [kernel]  [k] _raw_spin_unlock_irqrestore
   6.50%  [kernel]  [k] mod_timer
   6.44%  [kernel]  [k] lock_timer_base.isra.38
   3.87%  [kernel]  [k] detach_if_pending
   3.80%  [kernel]  [k] del_timer
   2.67%  [kernel]  [k] internal_add_timer
   1.33%  [kernel]  [k] __internal_add_timer
   0.73%  [kernel]  [k] timerfn
   0.54%  [kernel]  [k] wake_up_nohz_cpu

Reported-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Paul McKenney <paulmck@linux.vnet.ibm.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Viresh Kumar <viresh.kumar@linaro.org>
Cc: John Stultz <john.stultz@linaro.org>
Cc: Joonwoo Park <joonwoop@codeaurora.org>
Cc: Wenbo Wang <wenbo.wang@memblaze.com>
Link: http://lkml.kernel.org/r/20150526224512.127050787@linutronix.de
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
12 files changed:
include/linux/hrtimer.h
include/linux/sched.h
include/linux/sched/sysctl.h
include/linux/timer.h
kernel/rcu/tree_plugin.h
kernel/sched/core.c
kernel/sysctl.c
kernel/time/hrtimer.c
kernel/time/tick-internal.h
kernel/time/tick-sched.c
kernel/time/timer.c
kernel/time/timer_list.c

index 5db055821ef3282dcec7f794823e8551a91480e3..69551020bb97250d763eaceda60f6877c867840a 100644 (file)
@@ -163,6 +163,7 @@ enum  hrtimer_base_type {
  * @cpu:               cpu number
  * @active_bases:      Bitfield to mark bases with active timers
  * @clock_was_set_seq: Sequence counter of clock was set events
+ * @migration_enabled: The migration of hrtimers to other cpus is enabled
  * @expires_next:      absolute time of the next event which was scheduled
  *                     via clock_set_next_event()
  * @next_timer:                Pointer to the first expiring timer
@@ -186,6 +187,7 @@ struct hrtimer_cpu_base {
        unsigned int                    cpu;
        unsigned int                    active_bases;
        unsigned int                    clock_was_set_seq;
+       bool                            migration_enabled;
 #ifdef CONFIG_HIGH_RES_TIMERS
        unsigned int                    in_hrtirq       : 1,
                                        hres_active     : 1,
index 26a2e6122734f8237ac44d47fb6bf4e96cca124b..d7151460b0cfc98d437211a347524164a5d3bc0f 100644 (file)
@@ -335,14 +335,10 @@ extern int runqueue_is_locked(int cpu);
 #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
 extern void nohz_balance_enter_idle(int cpu);
 extern void set_cpu_sd_state_idle(void);
-extern int get_nohz_timer_target(int pinned);
+extern int get_nohz_timer_target(void);
 #else
 static inline void nohz_balance_enter_idle(int cpu) { }
 static inline void set_cpu_sd_state_idle(void) { }
-static inline int get_nohz_timer_target(int pinned)
-{
-       return smp_processor_id();
-}
 #endif
 
 /*
index 596a0e007c62d97e57d040ee45fa3df784403880..c9e4731cf10b8e97956b160c503e447490991931 100644 (file)
@@ -57,24 +57,12 @@ extern unsigned int sysctl_numa_balancing_scan_size;
 extern unsigned int sysctl_sched_migration_cost;
 extern unsigned int sysctl_sched_nr_migrate;
 extern unsigned int sysctl_sched_time_avg;
-extern unsigned int sysctl_timer_migration;
 extern unsigned int sysctl_sched_shares_window;
 
 int sched_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *length,
                loff_t *ppos);
 #endif
-#ifdef CONFIG_SCHED_DEBUG
-static inline unsigned int get_sysctl_timer_migration(void)
-{
-       return sysctl_timer_migration;
-}
-#else
-static inline unsigned int get_sysctl_timer_migration(void)
-{
-       return 1;
-}
-#endif
 
 /*
  *  control realtime throttling:
index ff0689b6e297355232488ecf18200474df448632..61aa61dc410cf5035beb63c2873a471be4c50372 100644 (file)
@@ -238,6 +238,15 @@ extern void run_local_timers(void);
 struct hrtimer;
 extern enum hrtimer_restart it_real_fn(struct hrtimer *);
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+#include <linux/sysctl.h>
+
+extern unsigned int sysctl_timer_migration;
+int timer_migration_handler(struct ctl_table *table, int write,
+                           void __user *buffer, size_t *lenp,
+                           loff_t *ppos);
+#endif
+
 unsigned long __round_jiffies(unsigned long j, int cpu);
 unsigned long __round_jiffies_relative(unsigned long j, int cpu);
 unsigned long round_jiffies(unsigned long j);
index 0ef80a0bbabbc6736533b49e1daf68691794e206..d72fa24f23128a640a386525070676a5446de2cb 100644 (file)
@@ -1432,8 +1432,6 @@ module_param(rcu_idle_gp_delay, int, 0644);
 static int rcu_idle_lazy_gp_delay = RCU_IDLE_LAZY_GP_DELAY;
 module_param(rcu_idle_lazy_gp_delay, int, 0644);
 
-extern int tick_nohz_active;
-
 /*
  * Try to advance callbacks for all flavors of RCU on the current CPU, but
  * only if it has been awhile since the last time we did so.  Afterwards,
index ecb7c4216350cf00f3be979246fc44b8a14a0c23..e9f25ce70c77396d312335552428e43535ce1df4 100644 (file)
@@ -572,13 +572,12 @@ void resched_cpu(int cpu)
  * selecting an idle cpu will add more delays to the timers than intended
  * (as that cpu's timer base may not be uptodate wrt jiffies etc).
  */
-int get_nohz_timer_target(int pinned)
+int get_nohz_timer_target(void)
 {
-       int cpu = smp_processor_id();
-       int i;
+       int i, cpu = smp_processor_id();
        struct sched_domain *sd;
 
-       if (pinned || !get_sysctl_timer_migration() || !idle_cpu(cpu))
+       if (!idle_cpu(cpu))
                return cpu;
 
        rcu_read_lock();
@@ -7050,8 +7049,6 @@ void __init sched_init_smp(void)
 }
 #endif /* CONFIG_SMP */
 
-const_debug unsigned int sysctl_timer_migration = 1;
-
 int in_sched_functions(unsigned long addr)
 {
        return in_lock_functions(addr) ||
index 2082b1a88fb9a451a00a759379bec8d786c3bab7..b13e9d2de302411438ba62898ca27697130d38b0 100644 (file)
@@ -349,15 +349,6 @@ static struct ctl_table kern_table[] = {
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
-       {
-               .procname       = "timer_migration",
-               .data           = &sysctl_timer_migration,
-               .maxlen         = sizeof(unsigned int),
-               .mode           = 0644,
-               .proc_handler   = proc_dointvec_minmax,
-               .extra1         = &zero,
-               .extra2         = &one,
-       },
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_NUMA_BALANCING
        {
@@ -1132,6 +1123,15 @@ static struct ctl_table kern_table[] = {
                .extra1         = &zero,
                .extra2         = &one,
        },
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+       {
+               .procname       = "timer_migration",
+               .data           = &sysctl_timer_migration,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = timer_migration_handler,
+       },
+#endif
        { }
 };
 
index f026413de4d68fb6fdd5ca28dc21dc837340ec46..6115f4df119b91a44143d60f2fe97d40a5b4d9e0 100644 (file)
@@ -177,6 +177,24 @@ hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base)
 #endif
 }
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+static inline
+struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+                                        int pinned)
+{
+       if (pinned || !base->migration_enabled)
+               return this_cpu_ptr(&hrtimer_bases);
+       return &per_cpu(hrtimer_bases, get_nohz_timer_target());
+}
+#else
+static inline
+struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base,
+                                        int pinned)
+{
+       return this_cpu_ptr(&hrtimer_bases);
+}
+#endif
+
 /*
  * Switch the timer base to the current CPU when possible.
  */
@@ -184,14 +202,13 @@ static inline struct hrtimer_clock_base *
 switch_hrtimer_base(struct hrtimer *timer, struct hrtimer_clock_base *base,
                    int pinned)
 {
+       struct hrtimer_cpu_base *new_cpu_base, *this_base;
        struct hrtimer_clock_base *new_base;
-       struct hrtimer_cpu_base *new_cpu_base;
-       int this_cpu = smp_processor_id();
-       int cpu = get_nohz_timer_target(pinned);
        int basenum = base->index;
 
+       this_base = this_cpu_ptr(&hrtimer_bases);
+       new_cpu_base = get_target_base(this_base, pinned);
 again:
-       new_cpu_base = &per_cpu(hrtimer_bases, cpu);
        new_base = &new_cpu_base->clock_base[basenum];
 
        if (base != new_base) {
@@ -212,17 +229,19 @@ again:
                raw_spin_unlock(&base->cpu_base->lock);
                raw_spin_lock(&new_base->cpu_base->lock);
 
-               if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-                       cpu = this_cpu;
+               if (new_cpu_base != this_base &&
+                   hrtimer_check_target(timer, new_base)) {
                        raw_spin_unlock(&new_base->cpu_base->lock);
                        raw_spin_lock(&base->cpu_base->lock);
+                       new_cpu_base = this_base;
                        timer->base = base;
                        goto again;
                }
                timer->base = new_base;
        } else {
-               if (cpu != this_cpu && hrtimer_check_target(timer, new_base)) {
-                       cpu = this_cpu;
+               if (new_cpu_base != this_base &&
+                   hrtimer_check_target(timer, new_base)) {
+                       new_cpu_base = this_base;
                        goto again;
                }
        }
index ec2208aabdd180ec5b34fe73538a66f8ed121f9d..2edde84744df92a99d283a29ec2bb13f1499370e 100644 (file)
@@ -149,4 +149,18 @@ extern void tick_nohz_init(void);
 static inline void tick_nohz_init(void) { }
 #endif
 
+#ifdef CONFIG_NO_HZ_COMMON
+extern unsigned long tick_nohz_active;
+#else
+#define tick_nohz_active (0)
+#endif
+
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+extern void timers_update_migration(void);
+#else
+static inline void timers_update_migration(void) { }
+#endif
+
+DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
+
 extern u64 get_next_timer_interrupt(unsigned long basej, u64 basem);
index 812f7a3b9898368d85952f1e15d732fb588ad6dc..b1cb0169935509aa1ff77146e61db4b8afb90d85 100644 (file)
@@ -399,7 +399,7 @@ void __init tick_nohz_init(void)
  * NO HZ enabled ?
  */
 static int tick_nohz_enabled __read_mostly  = 1;
-int tick_nohz_active  __read_mostly;
+unsigned long tick_nohz_active  __read_mostly;
 /*
  * Enable / Disable tickless mode
  */
@@ -956,6 +956,16 @@ static void tick_nohz_handler(struct clock_event_device *dev)
        tick_program_event(hrtimer_get_expires(&ts->sched_timer), 1);
 }
 
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode)
+{
+       if (!tick_nohz_enabled)
+               return;
+       ts->nohz_mode = mode;
+       /* One update is enough */
+       if (!test_and_set_bit(0, &tick_nohz_active))
+               timers_update_migration();
+}
+
 /**
  * tick_nohz_switch_to_nohz - switch to nohz mode
  */
@@ -970,9 +980,6 @@ static void tick_nohz_switch_to_nohz(void)
        if (tick_switch_to_oneshot(tick_nohz_handler))
                return;
 
-       tick_nohz_active = 1;
-       ts->nohz_mode = NOHZ_MODE_LOWRES;
-
        /*
         * Recycle the hrtimer in ts, so we can share the
         * hrtimer_forward with the highres code.
@@ -984,6 +991,7 @@ static void tick_nohz_switch_to_nohz(void)
        hrtimer_forward_now(&ts->sched_timer, tick_period);
        hrtimer_set_expires(&ts->sched_timer, next);
        tick_program_event(next, 1);
+       tick_nohz_activate(ts, NOHZ_MODE_LOWRES);
 }
 
 /*
@@ -1035,6 +1043,7 @@ static inline void tick_nohz_irq_enter(void)
 
 static inline void tick_nohz_switch_to_nohz(void) { }
 static inline void tick_nohz_irq_enter(void) { }
+static inline void tick_nohz_activate(struct tick_sched *ts, int mode) { }
 
 #endif /* CONFIG_NO_HZ_COMMON */
 
@@ -1117,13 +1126,7 @@ void tick_setup_sched_timer(void)
 
        hrtimer_forward(&ts->sched_timer, now, tick_period);
        hrtimer_start_expires(&ts->sched_timer, HRTIMER_MODE_ABS_PINNED);
-
-#ifdef CONFIG_NO_HZ_COMMON
-       if (tick_nohz_enabled) {
-               ts->nohz_mode = NOHZ_MODE_HIGHRES;
-               tick_nohz_active = 1;
-       }
-#endif
+       tick_nohz_activate(ts, NOHZ_MODE_HIGHRES);
 }
 #endif /* HIGH_RES_TIMERS */
 
index 3398d93c74a7a0cdd30fbc355a29c43177520e66..343142ed996aa10149a30ed82a36f5fd78d1680d 100644 (file)
@@ -85,6 +85,7 @@ struct tvec_base {
        unsigned long active_timers;
        unsigned long all_timers;
        int cpu;
+       bool migration_enabled;
        struct tvec_root tv1;
        struct tvec tv2;
        struct tvec tv3;
@@ -95,6 +96,54 @@ struct tvec_base {
 
 static DEFINE_PER_CPU(struct tvec_base, tvec_bases);
 
+#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
+unsigned int sysctl_timer_migration = 1;
+
+void timers_update_migration(void)
+{
+       bool on = sysctl_timer_migration && tick_nohz_active;
+       unsigned int cpu;
+
+       /* Avoid the loop, if nothing to update */
+       if (this_cpu_read(tvec_bases.migration_enabled) == on)
+               return;
+
+       for_each_possible_cpu(cpu) {
+               per_cpu(tvec_bases.migration_enabled, cpu) = on;
+               per_cpu(hrtimer_bases.migration_enabled, cpu) = on;
+       }
+}
+
+int timer_migration_handler(struct ctl_table *table, int write,
+                           void __user *buffer, size_t *lenp,
+                           loff_t *ppos)
+{
+       static DEFINE_MUTEX(mutex);
+       int ret;
+
+       mutex_lock(&mutex);
+       ret = proc_dointvec(table, write, buffer, lenp, ppos);
+       if (!ret && write)
+               timers_update_migration();
+       mutex_unlock(&mutex);
+       return ret;
+}
+
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+                                               int pinned)
+{
+       if (pinned || !base->migration_enabled)
+               return this_cpu_ptr(&tvec_bases);
+       return per_cpu_ptr(&tvec_bases, get_nohz_timer_target());
+}
+#else
+static inline struct tvec_base *get_target_base(struct tvec_base *base,
+                                               int pinned)
+{
+       return this_cpu_ptr(&tvec_bases);
+}
+#endif
+
 static unsigned long round_jiffies_common(unsigned long j, int cpu,
                bool force_up)
 {
@@ -716,11 +765,11 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
 
 static inline int
 __mod_timer(struct timer_list *timer, unsigned long expires,
-                                               bool pending_only, int pinned)
+           bool pending_only, int pinned)
 {
        struct tvec_base *base, *new_base;
        unsigned long flags;
-       int ret = 0 , cpu;
+       int ret = 0;
 
        timer_stats_timer_set_start_info(timer);
        BUG_ON(!timer->function);
@@ -733,8 +782,7 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
 
        debug_activate(timer, expires);
 
-       cpu = get_nohz_timer_target(pinned);
-       new_base = per_cpu_ptr(&tvec_bases, cpu);
+       new_base = get_target_base(base, pinned);
 
        if (base != new_base) {
                /*
@@ -751,7 +799,8 @@ __mod_timer(struct timer_list *timer, unsigned long expires,
                        spin_unlock(&base->lock);
                        base = new_base;
                        spin_lock(&base->lock);
-                       timer->flags = (timer->flags & ~TIMER_BASEMASK) | cpu;
+                       timer->flags &= ~TIMER_BASEMASK;
+                       timer->flags |= base->cpu;
                }
        }
 
index 1327004429be6fee0b29a7aa9a19ff16a31362e2..a4536e1e3e2ab7f0e298322e2217474a2f05579d 100644 (file)
@@ -29,8 +29,6 @@ struct timer_list_iter {
 
 typedef void (*print_fn_t)(struct seq_file *m, unsigned int *classes);
 
-DECLARE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases);
-
 /*
  * This allows printing both to /proc/timer_list and
  * to the console (on SysRq-Q):