BACKPORT: sched/fair: Fix cpu_util_wake() for 'execl' type workloads
author    Chris Redpath <chris.redpath@arm.com>  Wed, 14 Nov 2018 10:46:34 +0000 (10:46 +0000)
committer Chris Redpath <chris.redpath@arm.com>  Thu, 29 Nov 2018 12:33:13 +0000 (12:33 +0000)
A performance regression was fixed upstream in util_est. Since upstream
4.14 does not contain util_est, that fix will not reach android-4.14
through any stable updates, so it is backported here instead.

Original Commit log:

A ~10% regression has been reported for UnixBench's execl throughput
test by Aaron Lu and Ye Xiaolong:

  https://lkml.org/lkml/2018/10/30/765

That test is pretty simple, it does a "recursive" execve() syscall on the
same binary. Starting from the syscall, this sequence is possible:

   do_execve()
     do_execveat_common()
       __do_execve_file()
         sched_exec()
           select_task_rq_fair()          <==| Task already enqueued
             find_idlest_cpu()
               find_idlest_group()
                 capacity_spare_wake()    <==| Functions not called from
                   cpu_util_wake()           | the wakeup path

which means we can end up calling cpu_util_wake() not only from the
"wakeup path", as its name would suggest. Indeed, the task doing an
execve() syscall is already enqueued on the CPU we want to get the
cpu_util_wake() for.

The estimated utilization for a CPU computed in cpu_util_wake() was
written under the assumption that the function can be called only from
the wakeup path. If instead the task is already enqueued, we end up with
a utilization which does not remove the current task's contribution from
the estimated utilization of the CPU.
This wrongly reports a reduced spare capacity on the current CPU and
increases the chances of migrating the task on execve.
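
To make the over-estimation concrete, here is a minimal standalone
sketch (plain userspace C, not kernel code; the toy_cpu struct, its
fields, the helper names and the example numbers are illustrative
assumptions). Only the discounting logic mirrors the hunks below: the
pre-fix path removes the task from util_avg but not from the estimated
utilization, while the fixed path removes it from both when the task is
currently enqueued:

  #include <stdio.h>

  #define CPU_CAPACITY 1024u

  struct toy_cpu {
          unsigned int util_avg;          /* models cfs_rq->avg.util_avg */
          unsigned int util_est_enqueued; /* models cfs_rq->avg.util_est.enqueued */
  };

  static unsigned int min_u(unsigned int a, unsigned int b)
  {
          return a < b ? a : b;
  }

  static unsigned int max_u(unsigned int a, unsigned int b)
  {
          return a > b ? a : b;
  }

  /* Pre-fix: discount the task from util_avg only; util_est still counts it. */
  static unsigned int util_old(struct toy_cpu *c, unsigned int task_util,
                               unsigned int task_est)
  {
          unsigned int util = c->util_avg;

          (void)task_est; /* the task's estimate is never discounted */
          util -= min_u(util, task_util);
          return min_u(CPU_CAPACITY, max_u(util, c->util_est_enqueued));
  }

  /* Fixed: also discount the task's estimate while it is enqueued/current. */
  static unsigned int util_fixed(struct toy_cpu *c, unsigned int task_util,
                                 unsigned int task_est)
  {
          unsigned int util = c->util_avg;
          unsigned int est = c->util_est_enqueued;

          util -= min_u(util, task_util);
          est -= min_u(est, task_est);
          return min_u(CPU_CAPACITY, max_u(util, est));
  }

  int main(void)
  {
          /* The execl task (util ~300) is the only load on this CPU. */
          struct toy_cpu c = { .util_avg = 300, .util_est_enqueued = 300 };

          printf("spare, pre-fix: %u\n", CPU_CAPACITY - util_old(&c, 300, 300));
          printf("spare, fixed:   %u\n", CPU_CAPACITY - util_fixed(&c, 300, 300));
          return 0;
  }

With the execl task as the only load, the pre-fix computation still
accounts ~300 units of estimated utilization to the CPU and therefore
reports ~300 units less spare capacity than is actually available.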

The regression is tracked down to:

 commit d519329f72a6 ("sched/fair: Update util_est only on util_avg updates")

because that patch turns on the UTIL_EST sched feature by default.
However, the real issue is introduced by:

 commit f9be3e5961c5 ("sched/fair: Use util_est in LB and WU paths")

Let's fix this by always discounting the task's estimated utilization
from the CPU's estimated utilization when the task is also the current
one. The same benchmark from the bug report, executed on a dual-socket,
40-CPU Intel(R) Xeon(R) CPU E5-2690 v2 @ 3.00GHz machine, reports these
"Execl Throughput" figures (higher is better):

   mainline     : 48136.5 lps
   mainline+fix : 55376.5 lps

which corresponds to a ~15% speedup ((55376.5 - 48136.5) / 48136.5 ≈ 0.15).

Moreover, since {cpu_util,capacity_spare}_wake() are not really only
used from the wakeup path, let's remove this ambiguity by using a better
matching name: {cpu_util,capacity_spare}_without().

While we are at it, let's also improve the existing documentation.

Reported-by: Aaron Lu <aaron.lu@intel.com>
Reported-by: Ye Xiaolong <xiaolong.ye@intel.com>
Tested-by: Aaron Lu <aaron.lu@intel.com>
Signed-off-by: Patrick Bellasi <patrick.bellasi@arm.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
Cc: Juri Lelli <juri.lelli@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Morten Rasmussen <morten.rasmussen@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Quentin Perret <quentin.perret@arm.com>
Cc: Steve Muckle <smuckle@google.com>
Cc: Suren Baghdasaryan <surenb@google.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Todd Kjos <tkjos@google.com>
Cc: Vincent Guittot <vincent.guittot@linaro.org>
Fixes: f9be3e5961c5 ("sched/fair: Use util_est in LB and WU paths")
Link: https://lore.kernel.org/lkml/20181025093100.GB13236@e110439-lin/
Signed-off-by: Ingo Molnar <mingo@kernel.org>
(backported to android-4.14 from c469933e772132 in tip/sched/urgent)
Signed-off-by: Chris Redpath <chris.redpath@arm.com>
Change-Id: Ifce5d54b73f352d137ea70d1d880a7ea92db2309

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 9fd3bd0ba2fe525849faa9dce786c7503a302c07..30cb2055126271f823f5e8e840513a9bffbf18fb 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5942,10 +5942,19 @@ static inline unsigned long cpu_util_freq(int cpu)
 }
 
 /*
- * cpu_util_wake: Compute CPU utilization with any contributions from
- * the waking task p removed.
+ * cpu_util_without: compute cpu utilization without any contributions from *p
+ * @cpu: the CPU which utilization is requested
+ * @p: the task which utilization should be discounted
+ *
+ * The utilization of a CPU is defined by the utilization of tasks currently
+ * enqueued on that CPU as well as tasks which are currently sleeping after an
+ * execution on that CPU.
+ *
+ * This method returns the utilization of the specified CPU by discounting the
+ * utilization of the specified task, whenever the task is currently
+ * contributing to the CPU utilization.
  */
-static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p)
 {
        struct cfs_rq *cfs_rq;
        unsigned int util;
@@ -5968,7 +5977,7 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
        cfs_rq = &cpu_rq(cpu)->cfs;
        util = READ_ONCE(cfs_rq->avg.util_avg);
 
-       /* Discount task's blocked util from CPU's util */
+       /* Discount task's util from CPU's util */
        util -= min_t(unsigned int, util, task_util(p));
 
        /*
@@ -5977,14 +5986,14 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
         * a) if *p is the only task sleeping on this CPU, then:
         *      cpu_util (== task_util) > util_est (== 0)
         *    and thus we return:
-        *      cpu_util_wake = (cpu_util - task_util) = 0
+        *      cpu_util_without = (cpu_util - task_util) = 0
         *
         * b) if other tasks are SLEEPING on this CPU, which is now exiting
         *    IDLE, then:
         *      cpu_util >= task_util
         *      cpu_util > util_est (== 0)
         *    and thus we discount *p's blocked utilization to return:
-        *      cpu_util_wake = (cpu_util - task_util) >= 0
+        *      cpu_util_without = (cpu_util - task_util) >= 0
         *
         * c) if other tasks are RUNNABLE on that CPU and
         *      util_est > cpu_util
@@ -5997,8 +6006,33 @@ static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
         * covered by the following code when estimated utilization is
         * enabled.
         */
-       if (sched_feat(UTIL_EST))
-               util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+       if (sched_feat(UTIL_EST)) {
+               unsigned int estimated =
+                       READ_ONCE(cfs_rq->avg.util_est.enqueued);
+
+               /*
+                * Despite the following checks we still have a small window
+                * for a possible race, when an execl's select_task_rq_fair()
+                * races with LB's detach_task():
+                *
+                *   detach_task()
+                *     p->on_rq = TASK_ON_RQ_MIGRATING;
+                *     ---------------------------------- A
+                *     deactivate_task()                   \
+                *       dequeue_task()                     + RaceTime
+                *         util_est_dequeue()              /
+                *     ---------------------------------- B
+                *
+                * The additional check on "current == p" is required to
+                * properly fix the execl regression and it helps in further
+                * reducing the chances for the above race.
+                */
+               if (unlikely(task_on_rq_queued(p) || current == p)) {
+                       estimated -= min_t(unsigned int, estimated,
+                                          (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+               }
+               util = max(util, estimated);
+       }
 
        /*
         * Utilization (estimated) can exceed the CPU capacity, thus let's
@@ -6015,7 +6049,7 @@ static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
        int cpu;
 
        for_each_cpu(cpu, sched_group_span(eenv->sg_cap)) {
-               util = cpu_util_wake(cpu, eenv->p);
+               util = cpu_util_without(cpu, eenv->p);
 
                /*
                 * If we are looking at the target CPU specified by the eenv,
@@ -6048,7 +6082,7 @@ long group_norm_util(struct energy_env *eenv, int cpu_idx)
        int cpu;
 
        for_each_cpu(cpu, sched_group_span(eenv->sg)) {
-               util = cpu_util_wake(cpu, eenv->p);
+               util = cpu_util_without(cpu, eenv->p);
 
                /*
                 * If we are looking at the target CPU specified by the eenv,
@@ -6709,9 +6743,11 @@ boosted_task_util(struct task_struct *task)
        return util + margin;
 }
 
-static unsigned long capacity_spare_wake(int cpu, struct task_struct *p)
+static unsigned long cpu_util_without(int cpu, struct task_struct *p);
+
+static unsigned long capacity_spare_without(int cpu, struct task_struct *p)
 {
-       return max_t(long, capacity_of(cpu) - cpu_util_wake(cpu, p), 0);
+       return max_t(long, capacity_of(cpu) - cpu_util_without(cpu, p), 0);
 }
 
 /*
@@ -6771,7 +6807,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p,
 
                        avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs);
 
-                       spare_cap = capacity_spare_wake(i, p);
+                       spare_cap = capacity_spare_without(i, p);
 
                        if (spare_cap > max_spare_cap)
                                max_spare_cap = spare_cap;
@@ -7310,7 +7346,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
                         * so prev_cpu will receive a negative bias due to the double
                         * accounting. However, the blocked utilization may be zero.
                         */
-                       wake_util = cpu_util_wake(i, p);
+                       wake_util = cpu_util_without(i, p);
                        new_util = wake_util + task_util_est(p);
 
                        /*
@@ -7767,7 +7803,7 @@ static int find_energy_efficient_cpu(struct sched_domain *sd,
                         * Consider only CPUs where the task is expected to
                         * fit without making the CPU overutilized.
                         */
-                       spare = capacity_spare_wake(cpu_iter, p);
+                       spare = capacity_spare_without(cpu_iter, p);
                        if (spare * 1024 < capacity_margin * task_util_est(p))
                                continue;
 
@@ -7963,7 +7999,7 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f
 
        if (sd && !(sd_flag & SD_BALANCE_FORK)) {
                /*
-                * We're going to need the task's util for capacity_spare_wake
+                * We're going to need the task's util for capacity_spare_without
                 * in find_idlest_group. Sync it up to prev_cpu's
                 * last_update_time.
                 */