sched/fair: Prefer sibiling only if local group is under-utilized
authorSrikar Dronamraju <srikar@linux.vnet.ibm.com>
Wed, 22 Mar 2017 17:57:50 +0000 (23:27 +0530)
committerIngo Molnar <mingo@kernel.org>
Mon, 27 Mar 2017 08:22:26 +0000 (10:22 +0200)
If the child domain prefers tasks to go siblings, the local group could
end up pulling tasks to itself even if the local group is almost equally
loaded as the source group.

Lets assume a 4 core,smt==2 machine running 5 thread ebizzy workload.
Everytime, local group has capacity and source group has atleast 2 threads,
local group tries to pull the task. This causes the threads to constantly
move between different cores. This is even more profound if the cores have
more threads, like in Power 8, smt 8 mode.

Fix this by only allowing local group to pull a task, if the source group
has more number of tasks than the local group.

Here are the relevant perf stat numbers of a 22 core,smt 8 Power 8 machine.

Without patch:
 Performance counter stats for 'ebizzy -t 22 -S 100' (5 runs):

             1,440      context-switches          #    0.001 K/sec                    ( +-  1.26% )
               366      cpu-migrations            #    0.000 K/sec                    ( +-  5.58% )
             3,933      page-faults               #    0.002 K/sec                    ( +- 11.08% )

 Performance counter stats for 'ebizzy -t 48 -S 100' (5 runs):

             6,287      context-switches          #    0.001 K/sec                    ( +-  3.65% )
             3,776      cpu-migrations            #    0.001 K/sec                    ( +-  4.84% )
             5,702      page-faults               #    0.001 K/sec                    ( +-  9.36% )

 Performance counter stats for 'ebizzy -t 96 -S 100' (5 runs):

             8,776      context-switches          #    0.001 K/sec                    ( +-  0.73% )
             2,790      cpu-migrations            #    0.000 K/sec                    ( +-  0.98% )
            10,540      page-faults               #    0.001 K/sec                    ( +-  3.12% )

With patch:

 Performance counter stats for 'ebizzy -t 22 -S 100' (5 runs):

             1,133      context-switches          #    0.001 K/sec                    ( +-  4.72% )
               123      cpu-migrations            #    0.000 K/sec                    ( +-  3.42% )
             3,858      page-faults               #    0.002 K/sec                    ( +-  8.52% )

 Performance counter stats for 'ebizzy -t 48 -S 100' (5 runs):

             2,169      context-switches          #    0.000 K/sec                    ( +-  6.19% )
               189      cpu-migrations            #    0.000 K/sec                    ( +- 12.75% )
             5,917      page-faults               #    0.001 K/sec                    ( +-  8.09% )

 Performance counter stats for 'ebizzy -t 96 -S 100' (5 runs):

             5,333      context-switches          #    0.001 K/sec                    ( +-  5.91% )
               506      cpu-migrations            #    0.000 K/sec                    ( +-  3.35% )
            10,792      page-faults               #    0.001 K/sec                    ( +-  7.75% )

Which show that in these workloads CPU migrations get reduced significantly.

Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Link: http://lkml.kernel.org/r/1490205470-10249-1-git-send-email-srikar@linux.vnet.ibm.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
kernel/sched/fair.c

index 03adf9fb48b1c12dba2e5328809b63b97af0b1bc..31453d57e8f5cc8eff7d042a0f5bb5e07459ca78 100644 (file)
@@ -7565,6 +7565,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
 {
        struct sched_domain *child = env->sd->child;
        struct sched_group *sg = env->sd->groups;
+       struct sg_lb_stats *local = &sds->local_stat;
        struct sg_lb_stats tmp_sgs;
        int load_idx, prefer_sibling = 0;
        bool overload = false;
@@ -7581,7 +7582,7 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
                if (local_group) {
                        sds->local = sg;
-                       sgs = &sds->local_stat;
+                       sgs = local;
 
                        if (env->idle != CPU_NEWLY_IDLE ||
                            time_after_eq(jiffies, sg->sgc->next_update))
@@ -7605,8 +7606,8 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
                 * the tasks on the system).
                 */
                if (prefer_sibling && sds->local &&
-                   group_has_capacity(env, &sds->local_stat) &&
-                   (sgs->sum_nr_running > 1)) {
+                   group_has_capacity(env, local) &&
+                   (sgs->sum_nr_running > local->sum_nr_running + 1)) {
                        sgs->group_no_capacity = 1;
                        sgs->group_type = group_classify(sg, sgs);
                }