Merge 4.14.95 into android-4.14-p
[GitHub/moto-9609/android_kernel_motorola_exynos9610.git] / kernel / sched / fair.c
index 710d9ab354f2aca9cb3f3e2ddbb3ecfde2deae07..08d90c69d123d31fc05f316592399fa454603e78 100644 (file)
@@ -387,10 +387,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
        }
 }
 
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                     \
-       list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
-                                leaf_cfs_rq_list)
+/* Iterate through all leaf cfs_rq's on a runqueue: */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+       list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 
 /* Do the two (enqueued) entities belong to the same group ? */
 static inline struct cfs_rq *
@@ -483,8 +482,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 {
 }
 
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)     \
-               for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
+#define for_each_leaf_cfs_rq(rq, cfs_rq)       \
+               for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 
 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 {
@@ -729,6 +728,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 
 static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
 static unsigned long task_h_load(struct task_struct *p);
+static unsigned long capacity_of(int cpu);
 
 /* Give new sched_entity start runnable values to heavy its load in infant time */
 void init_entity_runnable_average(struct sched_entity *se)
@@ -775,11 +775,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
  * To solve this problem, we also cap the util_avg of successive tasks to
  * only 1/2 of the left utilization budget:
  *
- *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
  *
- * where n denotes the nth task.
+ * where n denotes the nth task and cpu_scale the CPU capacity.
  *
- * For example, a simplest series from the beginning would be like:
+ * For example, for a CPU with a capacity of 1024, the simplest series from
+ * the beginning would look like:
  *
  *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
  * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
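(Worked step of the formula above, with cpu_scale = 1024: the 1st task is capped at (1024 - 0) / 2 = 512, the 2nd at (1024 - 512) / 2 = 256, the 3rd at (1024 - 768) / 2 = 128, i.e. cpu_scale / 2^n, which reproduces the two series shown in the comment.)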
@@ -791,7 +792,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
 {
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        struct sched_avg *sa = &se->avg;
-       long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+       long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+       long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
 
        if (cap > 0) {
                if (cfs_rq->avg.util_avg != 0) {
@@ -2829,9 +2831,6 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
                 * See cpu_util().
                 */
                cpufreq_update_util(rq, 0);
-#ifdef CONFIG_SMP
-               trace_sched_load_avg_cpu(cpu_of(rq), cfs_rq);
-#endif
        }
 }
 
@@ -3057,6 +3056,32 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa,
        return 1;
 }
 
+/*
+ * When a task is dequeued, its estimated utilization should not be updated if
+ * its util_avg has not been updated at least once.
+ * This flag is used to synchronize util_avg updates with util_est updates.
+ * We map this information into the LSB bit of the utilization saved at
+ * dequeue time (i.e. util_est.dequeued).
+ */
+#define UTIL_AVG_UNCHANGED 0x1
+
+static inline void cfs_se_util_change(struct sched_avg *avg)
+{
+       unsigned int enqueued;
+
+       if (!sched_feat(UTIL_EST))
+               return;
+
+       /* Avoid store if the flag has been already set */
+       enqueued = avg->util_est.enqueued;
+       if (!(enqueued & UTIL_AVG_UNCHANGED))
+               return;
+
+       /* Reset flag to report util_avg has been updated */
+       enqueued &= ~UTIL_AVG_UNCHANGED;
+       WRITE_ONCE(avg->util_est.enqueued, enqueued);
+}
+
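The flag is set on the utilization sample saved at dequeue time (and on the task contribution added to the root cfs_rq at enqueue), cleared by the helper above once PELT actually updates the task's util_avg, and tested on the next dequeue to skip the EWMA update when nothing has changed. Since it lives in bit 0, tagging a sample perturbs it by at most 1 out of SCHED_CAPACITY_SCALE, which is negligible. A minimal stand-alone illustration, not part of the patch (util_est_tag() and util_est_untag() are hypothetical helpers):

        #define UTIL_AVG_UNCHANGED 0x1

        /* Store a utilization sample with the "unchanged" flag set in bit 0. */
        static unsigned int util_est_tag(unsigned int util)
        {
                return util | UTIL_AVG_UNCHANGED;       /* e.g. 416 -> 417 */
        }

        /* First PELT update afterwards: clear the flag, keep the sample. */
        static unsigned int util_est_untag(unsigned int util)
        {
                return util & ~UTIL_AVG_UNCHANGED;      /* e.g. 417 -> 416 */
        }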
 static int
 __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 {
@@ -3066,9 +3091,36 @@ __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
 static int
 __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
-       return ___update_load_avg(now, cpu, &se->avg,
-                                 se->on_rq * scale_load_down(se->load.weight),
-                                 cfs_rq->curr == se, NULL, NULL);
+       if (___update_load_avg(now, cpu, &se->avg,
+                              se->on_rq * scale_load_down(se->load.weight),
+                              cfs_rq->curr == se, NULL, NULL)) {
+               cfs_se_util_change(&se->avg);
+
+#ifdef UTIL_EST_DEBUG
+               /*
+                * Trace utilization only for actual tasks.
+                *
+                * These trace events are mostly useful to get easier-to-read
+                * plots of the estimated utilization, where we can compare it
+                * with the actual growth/decay of the original PELT signal.
+                * Let's keep them disabled by default in "production kernels".
+                */
+               if (entity_is_task(se)) {
+                       struct task_struct *tsk = task_of(se);
+
+                       trace_sched_util_est_task(tsk, &se->avg);
+
+                       /* Trace utilization only for top level CFS RQ */
+                       cfs_rq = &(task_rq(tsk)->cfs);
+                       trace_sched_util_est_cpu(cpu, cfs_rq);
+               }
+#endif /* UTIL_EST_DEBUG */
+
+               return 1;
+       }
+
+       return 0;
 }
 
 static int
@@ -3598,6 +3650,157 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq)
 
 static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
 
+static inline int task_fits_capacity(struct task_struct *p, long capacity);
+
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
+{
+       if (!static_branch_unlikely(&sched_asym_cpucapacity))
+               return;
+
+       if (!p) {
+               rq->misfit_task_load = 0;
+               return;
+       }
+
+       if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+               rq->misfit_task_load = 0;
+               return;
+       }
+
+       rq->misfit_task_load = task_h_load(p);
+}
+
+static inline unsigned long task_util(struct task_struct *p)
+{
+#ifdef CONFIG_SCHED_WALT
+       if (likely(!walt_disabled && sysctl_sched_use_walt_task_util))
+               return (p->ravg.demand /
+                       (walt_ravg_window >> SCHED_CAPACITY_SHIFT));
+#endif
+       return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+       struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+       return max(ue.ewma, ue.enqueued);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+#ifdef CONFIG_SCHED_WALT
+       if (likely(!walt_disabled && sysctl_sched_use_walt_task_util))
+               return (p->ravg.demand /
+                       (walt_ravg_window >> SCHED_CAPACITY_SHIFT));
+#endif
+       return max(task_util(p), _task_util_est(p));
+}
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+                                   struct task_struct *p)
+{
+       unsigned int enqueued;
+
+       if (!sched_feat(UTIL_EST))
+               return;
+
+       /* Update root cfs_rq's estimated utilization */
+       enqueued  = cfs_rq->avg.util_est.enqueued;
+       enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
+       WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+       trace_sched_util_est_task(p, &p->se.avg);
+       trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+}
+
+/*
+ * Check if a (signed) value is within a specified (unsigned) margin,
+ * based on the observation that:
+ *
+ *     abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + margin < INT_MAX.
+ */
+static inline bool within_margin(int value, int margin)
+{
+       return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
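A quick way to convince yourself of the unsigned-compare identity above is an exhaustive check against abs(); the stand-alone harness below is illustrative only (within_margin_check() mirrors the helper above, and the margin of 10 matches the SCHED_CAPACITY_SCALE / 100 used by the caller):

        #include <assert.h>
        #include <stdbool.h>
        #include <stdlib.h>

        /* Same identity as within_margin(): abs(value) < margin via one unsigned compare. */
        static bool within_margin_check(int value, int margin)
        {
                return (unsigned int)(value + margin - 1) < (2 * margin - 1);
        }

        int main(void)
        {
                for (int value = -64; value <= 64; value++)
                        assert(within_margin_check(value, 10) == (abs(value) < 10));
                return 0;
        }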
+
+static void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+       long last_ewma_diff;
+       struct util_est ue;
+
+       if (!sched_feat(UTIL_EST))
+               return;
+
+       /*
+        * Update root cfs_rq's estimated utilization
+        *
+        * If *p is the last task then the root cfs_rq's estimated utilization
+        * of a CPU is 0 by definition.
+        */
+       ue.enqueued = 0;
+       if (cfs_rq->nr_running) {
+               ue.enqueued  = cfs_rq->avg.util_est.enqueued;
+               ue.enqueued -= min_t(unsigned int, ue.enqueued,
+                                    (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+       }
+       WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+
+       trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
+       /*
+        * Skip update of task's estimated utilization when the task has not
+        * yet completed an activation, e.g. being migrated.
+        */
+       if (!task_sleep)
+               return;
+
+       /*
+        * If the PELT values haven't changed since enqueue time,
+        * skip the util_est update.
+        */
+       ue = p->se.avg.util_est;
+       if (ue.enqueued & UTIL_AVG_UNCHANGED)
+               return;
+
+       /*
+        * Skip update of task's estimated utilization when its EWMA is
+        * already within ~1% of its last activation value.
+        */
+       ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+       last_ewma_diff = ue.enqueued - ue.ewma;
+       if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+               return;
+
+       /*
+        * Update Task's estimated utilization
+        *
+        * When *p completes an activation we can consolidate another sample
+        * of the task size. This is done by storing the current PELT value
+        * as ue.enqueued and by using this value to update the Exponential
+        * Weighted Moving Average (EWMA):
+        *
+        *  ewma(t) = w *  task_util(p) + (1-w) * ewma(t-1)
+        *          = w *  task_util(p) +         ewma(t-1)  - w * ewma(t-1)
+        *          = w * (task_util(p) -         ewma(t-1)) +     ewma(t-1)
+        *          = w * (      last_ewma_diff            ) +     ewma(t-1)
+        *          = w * (last_ewma_diff  +  ewma(t-1) / w)
+        *
+        * Where 'w' is the weight of new samples, which is configured to be
+        * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
+        */
+       ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+       ue.ewma  += last_ewma_diff;
+       ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+       WRITE_ONCE(p->se.avg.util_est, ue);
+
+       trace_sched_util_est_task(p, &p->se.avg);
+}
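Worked numbers for the shift-based EWMA update above, assuming UTIL_EST_WEIGHT_SHIFT == 2 (the w = 1/4 quoted in the comment); ewma_step() is an illustrative stand-alone helper, not part of the patch:

        #define UTIL_EST_WEIGHT_SHIFT 2                         /* assumed: w = 1/4 */

        /* ewma(t) = w * sample + (1 - w) * ewma(t-1), computed with shifts only. */
        static unsigned long ewma_step(unsigned long ewma, unsigned long sample)
        {
                long last_ewma_diff = (long)(sample - ewma);    /* 480 - 400 = 80   */

                ewma <<= UTIL_EST_WEIGHT_SHIFT;                 /* 400 -> 1600      */
                ewma  += last_ewma_diff;                        /* 1600 + 80 = 1680 */
                ewma >>= UTIL_EST_WEIGHT_SHIFT;                 /* 1680 / 4 = 420   */
                return ewma;    /* ewma_step(400, 480) == 420 == 400 + (480 - 400) / 4 */
        }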
+
 #else /* CONFIG_SMP */
 
 static inline int
@@ -3635,6 +3838,15 @@ static inline int idle_balance(struct rq *rq, struct rq_flags *rf)
        return 0;
 }
 
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
+
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+                bool task_sleep) {}
+
 #endif /* CONFIG_SMP */
 
 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
@@ -3883,7 +4095,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
         * put back on, and if we advance min_vruntime, we'll be placed back
         * further than we started -- ie. we'll be penalized.
         */
-       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
                update_min_vruntime(cfs_rq);
 }
 
@@ -4098,12 +4310,12 @@ static inline bool cfs_bandwidth_used(void)
 
 void cfs_bandwidth_usage_inc(void)
 {
-       static_key_slow_inc(&__cfs_bandwidth_used);
+       static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
 }
 
 void cfs_bandwidth_usage_dec(void)
 {
-       static_key_slow_dec(&__cfs_bandwidth_used);
+       static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
 }
 #else /* HAVE_JUMP_LABEL */
 static bool cfs_bandwidth_used(void)
@@ -4146,6 +4358,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
        now = sched_clock_cpu(smp_processor_id());
        cfs_b->runtime = cfs_b->quota;
        cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+       cfs_b->expires_seq++;
 }
 
 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4168,6 +4381,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
        struct task_group *tg = cfs_rq->tg;
        struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
        u64 amount = 0, min_amount, expires;
+       int expires_seq;
 
        /* note: this is a positive sum as runtime_remaining <= 0 */
        min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4184,6 +4398,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                        cfs_b->idle = 0;
                }
        }
+       expires_seq = cfs_b->expires_seq;
        expires = cfs_b->runtime_expires;
        raw_spin_unlock(&cfs_b->lock);
 
@@ -4193,8 +4408,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         * spread between our sched_clock and the one on which runtime was
         * issued.
         */
-       if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+       if (cfs_rq->expires_seq != expires_seq) {
+               cfs_rq->expires_seq = expires_seq;
                cfs_rq->runtime_expires = expires;
+       }
 
        return cfs_rq->runtime_remaining > 0;
 }
@@ -4220,12 +4437,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         * has not truly expired.
         *
         * Fortunately we can determine whether this is the case by checking
-        * whether the global deadline has advanced. It is valid to compare
-        * cfs_b->runtime_expires without any locks since we only care about
-        * exact equality, so a partial write will still work.
+        * whether the global deadline (cfs_b->expires_seq) has advanced.
         */
-
-       if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+       if (cfs_rq->expires_seq == cfs_b->expires_seq) {
                /* extend local deadline, drift is bounded above by 2 ticks */
                cfs_rq->runtime_expires += TICK_NSEC;
        } else {
@@ -4357,9 +4571,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
 
        /*
         * Add to the _head_ of the list, so that an already-started
-        * distribute_cfs_runtime will not see us
+        * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+        * not running, add to the tail so that later runqueues don't get starved.
         */
-       list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       if (cfs_b->distribute_running)
+               list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       else
+               list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
 
        /*
         * If we're the first throttled task, make sure the bandwidth
@@ -4503,14 +4721,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
         * in us over-using our runtime if it is all used during this loop, but
         * only by limited amounts in that extreme case.
         */
-       while (throttled && cfs_b->runtime > 0) {
+       while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
                runtime = cfs_b->runtime;
+               cfs_b->distribute_running = 1;
                raw_spin_unlock(&cfs_b->lock);
                /* we can't nest cfs_b->lock while distributing bandwidth */
                runtime = distribute_cfs_runtime(cfs_b, runtime,
                                                 runtime_expires);
                raw_spin_lock(&cfs_b->lock);
 
+               cfs_b->distribute_running = 0;
                throttled = !list_empty(&cfs_b->throttled_cfs_rq);
 
                cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4621,6 +4841,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 
        /* confirm we're still not at a refresh boundary */
        raw_spin_lock(&cfs_b->lock);
+       if (cfs_b->distribute_running) {
+               raw_spin_unlock(&cfs_b->lock);
+               return;
+       }
+
        if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
                raw_spin_unlock(&cfs_b->lock);
                return;
@@ -4630,6 +4855,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
                runtime = cfs_b->runtime;
 
        expires = cfs_b->runtime_expires;
+       if (runtime)
+               cfs_b->distribute_running = 1;
+
        raw_spin_unlock(&cfs_b->lock);
 
        if (!runtime)
@@ -4640,6 +4868,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
        raw_spin_lock(&cfs_b->lock);
        if (expires == cfs_b->runtime_expires)
                cfs_b->runtime -= min(runtime, cfs_b->runtime);
+       cfs_b->distribute_running = 0;
        raw_spin_unlock(&cfs_b->lock);
 }
 
@@ -4748,6 +4977,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
        cfs_b->period_timer.function = sched_cfs_period_timer;
        hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        cfs_b->slack_timer.function = sched_cfs_slack_timer;
+       cfs_b->distribute_running = 0;
 }
 
 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -4933,8 +5163,6 @@ static inline void hrtick_update(struct rq *rq)
 #ifdef CONFIG_SMP
 static bool cpu_overutilized(int cpu);
 
-static unsigned long cpu_util(int cpu);
-
 static bool sd_overutilized(struct sched_domain *sd)
 {
        return sd->shared->overutilized;
@@ -4964,11 +5192,11 @@ static inline void update_overutilized_status(struct rq *rq)
        rcu_read_unlock();
 }
 
-unsigned long boosted_cpu_util(int cpu);
+unsigned long boosted_cpu_util(int cpu, unsigned long other_util);
 #else
 
 #define update_overutilized_status(rq) do {} while (0)
-#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
+#define boosted_cpu_util(cpu, other_util) cpu_util_freq(cpu)
 
 #endif /* CONFIG_SMP */
 
@@ -4984,6 +5212,32 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        struct sched_entity *se = &p->se;
        int task_new = !(flags & ENQUEUE_WAKEUP);
 
+       /*
+        * The code below (indirectly) updates schedutil which looks at
+        * the cfs_rq utilization to select a frequency.
+        * Let's add the task's estimated utilization to the cfs_rq's
+        * estimated utilization, before we update schedutil.
+        */
+       util_est_enqueue(&rq->cfs, p);
+
+       /*
+        * The code below (indirectly) updates schedutil which looks at
+        * the cfs_rq utilization to select a frequency.
+        * Let's update schedtune here to ensure the boost value of the
+        * current task is accounted for in the selection of the OPP.
+        *
+        * We do it also in the case where we enqueue a throttled task;
+        * we could argue that a throttled task should not boost a CPU,
+        * however:
+        * a) properly implementing CPU boosting considering throttled
+        *    tasks will increase a lot the complexity of the solution
+        * b) it's not easy to quantify the benefits introduced by
+        *    such a more complex solution.
+        * Thus, for the time being we go for the simple solution and boost
+        * also for throttled RQs.
+        */
+       schedtune_enqueue_task(p, cpu_of(rq));
+
        /*
         * If in_iowait is set, the code below may not trigger any cpufreq
         * utilization updates, so do it here explicitly with the IOWAIT flag
@@ -5024,25 +5278,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                update_cfs_shares(se);
        }
 
-       /*
-        * Update SchedTune accounting.
-        *
-        * We do it before updating the CPU capacity to ensure the
-        * boost value of the current task is accounted for in the
-        * selection of the OPP.
-        *
-        * We do it also in the case where we enqueue a throttled task;
-        * we could argue that a throttled task should not boost a CPU,
-        * however:
-        * a) properly implementing CPU boosting considering throttled
-        *    tasks will increase a lot the complexity of the solution
-        * b) it's not easy to quantify the benefits introduced by
-        *    such a more complex solution.
-        * Thus, for the time being we go for the simple solution and boost
-        * also for throttled RQs.
-        */
-       schedtune_enqueue_task(p, cpu_of(rq));
-
        if (!se) {
                add_nr_running(rq, 1);
                if (!task_new)
@@ -5066,6 +5301,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        struct sched_entity *se = &p->se;
        int task_sleep = flags & DEQUEUE_SLEEP;
 
+       /*
+        * The code below (indirectly) updates schedutil which looks at
+        * the cfs_rq utilization to select a frequency.
+        * Let's update schedtune here to ensure the boost value of the
+        * current task is no longer accounted for in the selection of the OPP.
+        */
+       schedtune_dequeue_task(p, cpu_of(rq));
+
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                dequeue_entity(cfs_rq, se, flags);
@@ -5108,20 +5351,12 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                update_cfs_shares(se);
        }
 
-       /*
-        * Update SchedTune accounting
-        *
-        * We do it before updating the CPU capacity to ensure the
-        * boost value of the current task is accounted for in the
-        * selection of the OPP.
-        */
-       schedtune_dequeue_task(p, cpu_of(rq));
-
        if (!se) {
                sub_nr_running(rq, 1);
                walt_dec_cumulative_runnable_avg(rq, p);
        }
 
+       util_est_dequeue(&rq->cfs, p, task_sleep);
        hrtick_update(rq);
 }
 
@@ -5475,8 +5710,6 @@ static inline bool energy_aware(void)
        return sched_feat(ENERGY_AWARE);
 }
 
-static int cpu_util_wake(int cpu, struct task_struct *p);
-
 /*
  * __cpu_norm_util() returns the cpu util relative to a specific capacity,
  * i.e. it's busy ratio, in the range [0..SCHED_CAPACITY_SCALE] which is useful
@@ -5630,7 +5863,154 @@ struct energy_env {
        struct sched_group      *sg;
 };
 
-static int cpu_util_wake(int cpu, struct task_struct *p);
+/**
+ * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
+ * @cpu: the CPU to get the utilization of
+ *
+ * The unit of the return value must be the one of capacity so we can compare
+ * the utilization with the capacity of the CPU that is available for CFS
+ * tasks (i.e. cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency,
+ * i.e. arch_scale_cpu_capacity().
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * The estimated utilization of a CPU is defined to be the maximum between its
+ * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
+ * currently RUNNABLE on that CPU.
+ * This allows us to properly represent the expected utilization of a CPU which
+ * has just got a big task running after a long sleep period. At the same time,
+ * however, it preserves the benefits of the "blocked utilization" in
+ * describing the potential for other tasks waking up on the same CPU.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ *
+ * Return: the (estimated) utilization for the specified CPU
+ */
+static inline unsigned long cpu_util(int cpu)
+{
+       struct cfs_rq *cfs_rq;
+       unsigned int util;
+
+#ifdef CONFIG_SCHED_WALT
+       if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) {
+               u64 walt_cpu_util = cpu_rq(cpu)->cumulative_runnable_avg;
+
+               walt_cpu_util <<= SCHED_CAPACITY_SHIFT;
+               do_div(walt_cpu_util, walt_ravg_window);
+
+               return min_t(unsigned long, walt_cpu_util,
+                            capacity_orig_of(cpu));
+       }
+#endif
+
+       cfs_rq = &cpu_rq(cpu)->cfs;
+       util = READ_ONCE(cfs_rq->avg.util_avg);
+
+       if (sched_feat(UTIL_EST))
+               util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+
+       return min_t(unsigned long, util, capacity_orig_of(cpu));
+}
+
+static inline unsigned long cpu_util_freq(int cpu)
+{
+#ifdef CONFIG_SCHED_WALT
+       u64 walt_cpu_util;
+
+       if (unlikely(walt_disabled || !sysctl_sched_use_walt_cpu_util))
+               return cpu_util(cpu);
+
+       walt_cpu_util = cpu_rq(cpu)->prev_runnable_sum;
+       walt_cpu_util <<= SCHED_CAPACITY_SHIFT;
+       do_div(walt_cpu_util, walt_ravg_window);
+
+       return min_t(unsigned long, walt_cpu_util, capacity_orig_of(cpu));
+#else
+       return cpu_util(cpu);
+#endif
+}
+
+/*
+ * cpu_util_wake: Compute CPU utilization with any contributions from
+ * the waking task p removed.
+ */
+static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+{
+       struct cfs_rq *cfs_rq;
+       unsigned int util;
+
+#ifdef CONFIG_SCHED_WALT
+       /*
+        * WALT does not decay idle tasks in the same manner
+        * as PELT, so it makes little sense to subtract task
+        * utilization from cpu utilization. Instead just use
+        * cpu_util for this case.
+        */
+       if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util))
+               return cpu_util(cpu);
+#endif
+
+       /* Task has no contribution or is new */
+       if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+               return cpu_util(cpu);
+
+       cfs_rq = &cpu_rq(cpu)->cfs;
+       util = READ_ONCE(cfs_rq->avg.util_avg);
+
+       /* Discount task's blocked util from CPU's util */
+       util -= min_t(unsigned int, util, task_util(p));
+
+       /*
+        * Covered cases:
+        *
+        * a) if *p is the only task sleeping on this CPU, then:
+        *      cpu_util (== task_util) > util_est (== 0)
+        *    and thus we return:
+        *      cpu_util_wake = (cpu_util - task_util) = 0
+        *
+        * b) if other tasks are SLEEPING on this CPU, which is now exiting
+        *    IDLE, then:
+        *      cpu_util >= task_util
+        *      cpu_util > util_est (== 0)
+        *    and thus we discount *p's blocked utilization to return:
+        *      cpu_util_wake = (cpu_util - task_util) >= 0
+        *
+        * c) if other tasks are RUNNABLE on that CPU and
+        *      util_est > cpu_util
+        *    then we use util_est since it returns a more restrictive
+        *    estimation of the spare capacity on that CPU, by just
+        *    considering the expected utilization of tasks already
+        *    runnable on that CPU.
+        *
+        * Cases a) and b) are covered by the above code, while case c) is
+        * covered by the following code when estimated utilization is
+        * enabled.
+        */
+       if (sched_feat(UTIL_EST))
+               util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+
+       /*
+        * Utilization (estimated) can exceed the CPU capacity, thus let's
+        * clamp to the maximum CPU capacity to ensure consistency with
+        * the cpu_util call.
+        */
+       return min_t(unsigned long, util, capacity_orig_of(cpu));
+}
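(Worked numbers for case c) above: with cfs_rq->avg.util_avg = 600, task_util(p) = 200 and util_est.enqueued = 500 from the tasks still runnable on that CPU, the blocked-utilization path gives 600 - 200 = 400, the UTIL_EST path raises that to max(400, 500) = 500, and the final clamp keeps the result at or below capacity_orig.)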
 
 static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
 {
@@ -6210,8 +6590,6 @@ static int wake_affine(struct sched_domain *sd, struct task_struct *p,
        return affine;
 }
 
-static inline unsigned long task_util(struct task_struct *p);
-
 #ifdef CONFIG_SCHED_TUNE
 struct reciprocal_value schedtune_spc_rdiv;
 
@@ -6262,7 +6640,7 @@ schedtune_task_margin(struct task_struct *task)
        if (boost == 0)
                return 0;
 
-       util = task_util(task);
+       util = task_util_est(task);
        margin = schedtune_margin(util, boost);
 
        return margin;
@@ -6285,9 +6663,9 @@ schedtune_task_margin(struct task_struct *task)
 #endif /* CONFIG_SCHED_TUNE */
 
 unsigned long
-boosted_cpu_util(int cpu)
+boosted_cpu_util(int cpu, unsigned long other_util)
 {
-       unsigned long util = cpu_util_freq(cpu);
+       unsigned long util = cpu_util_freq(cpu) + other_util;
        long margin = schedtune_cpu_margin(util, cpu);
 
        trace_sched_boost_cpu(cpu, util, margin);
@@ -6298,7 +6676,7 @@ boosted_cpu_util(int cpu)
 static inline unsigned long
 boosted_task_util(struct task_struct *task)
 {
-       unsigned long util = task_util(task);
+       unsigned long util = task_util_est(task);
        long margin = schedtune_task_margin(task);
 
        trace_sched_boost_task(task, util, margin);
@@ -6546,6 +6924,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
 }
 
 #ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
 
 static inline void set_idle_cores(int cpu, int val)
 {
@@ -6832,44 +7211,6 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
        return select_idle_sibling_cstate_aware(p, prev, target);
 }
 
-static inline unsigned long task_util(struct task_struct *p)
-{
-#ifdef CONFIG_SCHED_WALT
-       if (!walt_disabled && sysctl_sched_use_walt_task_util) {
-               return (p->ravg.demand / (walt_ravg_window >> SCHED_CAPACITY_SHIFT));
-       }
-#endif
-       return p->se.avg.util_avg;
-}
-
-/*
- * cpu_util_wake: Compute cpu utilization with any contributions from
- * the waking task p removed.
- */
-static int cpu_util_wake(int cpu, struct task_struct *p)
-{
-       unsigned long util, capacity;
-
-#ifdef CONFIG_SCHED_WALT
-       /*
-        * WALT does not decay idle tasks in the same manner
-        * as PELT, so it makes little sense to subtract task
-        * utilization from cpu utilization. Instead just use
-        * cpu_util for this case.
-        */
-       if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
-               return cpu_util(cpu);
-#endif
-       /* Task has no contribution or is new */
-       if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
-               return cpu_util(cpu);
-
-       capacity = capacity_orig_of(cpu);
-       util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
-
-       return (util >= capacity) ? capacity : util;
-}
-
 static inline int task_fits_capacity(struct task_struct *p, long capacity)
 {
        return capacity * 1024 > boosted_task_util(p) * capacity_margin;
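(Assuming capacity_margin keeps its usual value of 1280, i.e. roughly a 20% headroom requirement, the test reduces to boosted_task_util(p) < capacity * 1024 / 1280, so a task fits a CPU only when its boosted utilization is below 80% of that CPU's capacity: a boosted util of 400 fits a capacity of 512 (400 < 409.6), while 420 does not.)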
@@ -6885,7 +7226,6 @@ static int start_cpu(bool boosted)
 static inline int find_best_target(struct task_struct *p, int *backup_cpu,
                                   bool boosted, bool prefer_idle)
 {
-       unsigned long best_idle_min_cap_orig = ULONG_MAX;
        unsigned long min_util = boosted_task_util(p);
        unsigned long target_capacity = ULONG_MAX;
        unsigned long min_wake_util = ULONG_MAX;
@@ -6902,6 +7242,18 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
 
        *backup_cpu = -1;
 
+       /*
+        * In most cases, target_capacity tracks the capacity_orig of the most
+        * energy efficient CPU candidate, so it has to be minimised; for these
+        * cases target_capacity is already initialized to ULONG_MAX.
+        * However, for prefer_idle and boosted tasks we look for a high
+        * performance CPU, so target_capacity has to be maximised. In this
+        * case we initialise target_capacity to 0.
+        */
+       if (prefer_idle && boosted)
+               target_capacity = 0;
+
        /* Find start CPU based on boost value */
        cpu = start_cpu(boosted);
        if (cpu < 0)
@@ -6919,6 +7271,8 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
                        unsigned long capacity_curr = capacity_curr_of(i);
                        unsigned long capacity_orig = capacity_orig_of(i);
                        unsigned long wake_util, new_util;
+                       long spare_cap;
+                       int idle_idx = INT_MAX;
 
                        if (!cpu_online(i))
                                continue;
@@ -6932,7 +7286,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
                         * accounting. However, the blocked utilization may be zero.
                         */
                        wake_util = cpu_util_wake(i, p);
-                       new_util = wake_util + task_util(p);
+                       new_util = wake_util + task_util_est(p);
 
                        /*
                         * Ensure minimum capacity to grant the required boost.
@@ -6943,6 +7297,17 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
                        if (new_util > capacity_orig)
                                continue;
 
+                       /*
+                        * Pre-compute the maximum possible capacity we expect
+                        * to have available on this CPU once the task is
+                        * enqueued here.
+                        */
+                       spare_cap = capacity_orig - new_util;
+
+                       if (idle_cpu(i))
+                               idle_idx = idle_get_state_idx(cpu_rq(i));
+
                        /*
                         * Case A) Latency sensitive tasks
                         *
@@ -6975,24 +7340,39 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
 
                                /*
                                 * Case A.1: IDLE CPU
-                                * Return the first IDLE CPU we find.
+                                * Return the best IDLE CPU we find:
+                                * - for boosted tasks: the CPU with the highest
+                                * performance (i.e. biggest capacity_orig)
+                                * - for !boosted tasks: the most energy
+                                * efficient CPU (i.e. smallest capacity_orig)
                                 */
                                if (idle_cpu(i)) {
-                                       trace_sched_find_best_target(p,
-                                                       prefer_idle, min_util,
-                                                       cpu, best_idle_cpu,
-                                                       best_active_cpu, i);
-
-                                       return i;
+                                       if (boosted &&
+                                           capacity_orig < target_capacity)
+                                               continue;
+                                       if (!boosted &&
+                                           capacity_orig > target_capacity)
+                                               continue;
+                                       if (capacity_orig == target_capacity &&
+                                           sysctl_sched_cstate_aware &&
+                                           best_idle_cstate <= idle_idx)
+                                               continue;
+
+                                       target_capacity = capacity_orig;
+                                       best_idle_cstate = idle_idx;
+                                       best_idle_cpu = i;
+                                       continue;
                                }
+                               if (best_idle_cpu != -1)
+                                       continue;
 
                                /*
                                 * Case A.2: Target ACTIVE CPU
                                 * Favor CPUs with max spare capacity.
                                 */
-                               if ((capacity_curr > new_util) &&
-                                       (capacity_orig - new_util > target_max_spare_cap)) {
-                                       target_max_spare_cap = capacity_orig - new_util;
+                               if (capacity_curr > new_util &&
+                                   spare_cap > target_max_spare_cap) {
+                                       target_max_spare_cap = spare_cap;
                                        target_cpu = i;
                                        continue;
                                }
@@ -7029,6 +7409,13 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
                            (capacity_orig * SCHED_CAPACITY_SCALE))
                                continue;
 
+                       /*
+                        * Favor CPUs with smaller capacity for non latency
+                        * sensitive tasks.
+                        */
+                       if (capacity_orig > target_capacity)
+                               continue;
+
                        /*
                         * Case B) Non latency sensitive tasks on IDLE CPUs.
                         *
@@ -7054,24 +7441,18 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
                         * consumptions without affecting performance.
                         */
                        if (idle_cpu(i)) {
-                               int idle_idx = idle_get_state_idx(cpu_rq(i));
-
-                               /* Select idle CPU with lower cap_orig */
-                               if (capacity_orig > best_idle_min_cap_orig)
-                                       continue;
-
                                /*
                                 * Skip CPUs in deeper idle state, but only
                                 * if they are also less energy efficient.
                                 * IOW, prefer a deep IDLE LITTLE CPU vs a
                                 * shallow idle big CPU.
                                 */
-                               if (sysctl_sched_cstate_aware &&
+                               if (capacity_orig == target_capacity &&
+                                   sysctl_sched_cstate_aware &&
                                    best_idle_cstate <= idle_idx)
                                        continue;
 
-                               /* Keep track of best idle CPU */
-                               best_idle_min_cap_orig = capacity_orig;
+                               target_capacity = capacity_orig;
                                best_idle_cstate = idle_idx;
                                best_idle_cpu = i;
                                continue;
@@ -7097,15 +7478,12 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
                         * capacity.
                         */
 
-                       /* Favor CPUs with smaller capacity */
-                       if (capacity_orig > target_capacity)
-                               continue;
-
                        /* Favor CPUs with maximum spare capacity */
-                       if ((capacity_orig - new_util) < target_max_spare_cap)
+                       if (capacity_orig == target_capacity &&
+                           spare_cap < target_max_spare_cap)
                                continue;
 
-                       target_max_spare_cap = capacity_orig - new_util;
+                       target_max_spare_cap = spare_cap;
                        target_capacity = capacity_orig;
                        target_util = new_util;
                        target_cpu = i;
@@ -7122,7 +7500,7 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
         *
         * - prefer_idle tasks:
         *
-        *   a) IDLE CPU available, we return immediately
+        *   a) IDLE CPU available: best_idle_cpu
         *   b) ACTIVE CPU where task fits and has the bigger maximum spare
         *      capacity (i.e. target_cpu)
         *   c) ACTIVE CPU with less contention due to other tasks
@@ -7133,6 +7511,15 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
         *   a) ACTIVE CPU: target_cpu
         *   b) IDLE CPU: best_idle_cpu
         */
+
+       if (prefer_idle && (best_idle_cpu != -1)) {
+               trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
+                                            best_idle_cpu, best_active_cpu,
+                                            best_idle_cpu);
+
+               return best_idle_cpu;
+       }
+
        if (target_cpu == -1)
                target_cpu = prefer_idle
                        ? best_active_cpu
@@ -7170,7 +7557,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
                return 0;
 
        min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
-       max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+       max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
 
        /* Minimum capacity is close to max, no need to abort wake_affine */
        if (max_cap - min_cap < max_cap >> 3)
@@ -7296,7 +7683,7 @@ static inline struct energy_env *get_eenv(struct task_struct *p, int prev_cpu)
         * during energy calculation, but unboosted task
         * util for group utilization calculations
         */
-       eenv->util_delta = task_util(p);
+       eenv->util_delta = task_util_est(p);
        eenv->util_delta_boosted = boosted_task_util(p);
 
        cpumask_and(&cpumask_possible_cpus, &p->cpus_allowed, cpu_online_mask);
@@ -7322,7 +7709,7 @@ static int find_energy_efficient_cpu(struct sched_domain *sd,
 {
        int use_fbt = sched_feat(FIND_BEST_TARGET);
        int cpu_iter, eas_cpu_idx = EAS_CPU_NXT;
-       int energy_cpu = -1;
+       int target_cpu = -1;
        struct energy_env *eenv;
 
        if (sysctl_sched_sync_hint_enable && sync) {
@@ -7334,7 +7721,7 @@ static int find_energy_efficient_cpu(struct sched_domain *sd,
        /* prepopulate energy diff environment */
        eenv = get_eenv(p, prev_cpu);
        if (eenv->max_cpu_count < 2)
-               return energy_cpu;
+               return -1;
 
        if(!use_fbt) {
                /*
@@ -7351,9 +7738,12 @@ static int find_energy_efficient_cpu(struct sched_domain *sd,
                        if (cpu_iter == prev_cpu)
                                continue;
 
+                       /*
+                        * Consider only CPUs where the task is expected to
+                        * fit without making the CPU overutilized.
+                        */
                        spare = capacity_spare_wake(cpu_iter, p);
-
-                       if (spare * 1024 < capacity_margin * task_util(p))
+                       if (spare * 1024 < capacity_margin * task_util_est(p))
                                continue;
 
                        /* Add CPU candidate */
@@ -7379,9 +7769,15 @@ static int find_energy_efficient_cpu(struct sched_domain *sd,
                eenv->max_cpu_count = EAS_CPU_BKP + 1;
 
                /* Find a cpu with sufficient capacity */
-               eenv->cpu[EAS_CPU_NXT].cpu_id = find_best_target(p,
-                               &eenv->cpu[EAS_CPU_BKP].cpu_id,
-                               boosted, prefer_idle);
+               target_cpu = find_best_target(p, &eenv->cpu[EAS_CPU_BKP].cpu_id,
+                                             boosted, prefer_idle);
+
+               /* Immediately return a found idle CPU for a prefer_idle task */
+               if (prefer_idle && target_cpu >= 0 && idle_cpu(target_cpu))
+                       return target_cpu;
+
+               /* Place target into NEXT slot */
+               eenv->cpu[EAS_CPU_NXT].cpu_id = target_cpu;
 
                /* take note if no backup was found */
                if (eenv->cpu[EAS_CPU_BKP].cpu_id < 0)
@@ -7398,14 +7794,14 @@ static int find_energy_efficient_cpu(struct sched_domain *sd,
                 * candidates beyond prev_cpu, so we will
                 * fall-back to the regular slow-path.
                 */
-               return energy_cpu;
+               return -1;
        }
 
        /* find most energy-efficient CPU */
-       energy_cpu = select_energy_cpu_idx(eenv) < 0 ? -1 :
+       target_cpu = select_energy_cpu_idx(eenv) < 0 ? -1 :
                                        eenv->cpu[eenv->next_idx].cpu_id;
 
-       return energy_cpu;
+       return target_cpu;
 }
 
 static inline bool nohz_kick_needed(struct rq *rq, bool only_update);
@@ -7449,7 +7845,7 @@ static inline int wake_energy(struct task_struct *p, int prev_cpu,
         * the heuristics we use there in selecting candidate
         * CPUs.
         */
-       if (unlikely(!sched_feat(FIND_BEST_TARGET) && !task_util(p)))
+       if (unlikely(!sched_feat(FIND_BEST_TARGET) && !task_util_est(p)))
                return false;
 
        if(!sched_feat(EAS_PREFER_IDLE)){
@@ -7796,29 +8192,6 @@ preempt:
                set_last_buddy(se);
 }
 
-static inline void update_misfit_task(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
-       rq->misfit_task = !task_fits_capacity(p, capacity_of(rq->cpu));
-#endif
-}
-
-static inline void clear_rq_misfit(struct rq *rq)
-{
-#ifdef CONFIG_SMP
-       rq->misfit_task = 0;
-#endif
-}
-
-static inline unsigned int rq_has_misfit(struct rq *rq)
-{
-#ifdef CONFIG_SMP
-       return rq->misfit_task;
-#else
-       return 0;
-#endif
-}
-
 static struct task_struct *
 pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
 {
@@ -7909,7 +8282,7 @@ again:
        if (hrtick_enabled(rq))
                hrtick_start_fair(rq, p);
 
-       update_misfit_task(rq, p);
+       update_misfit_status(p, rq);
 
        return p;
 simple:
@@ -7928,12 +8301,12 @@ simple:
        if (hrtick_enabled(rq))
                hrtick_start_fair(rq, p);
 
-       update_misfit_task(rq, p);
+       update_misfit_status(p, rq);
 
        return p;
 
 idle:
-       clear_rq_misfit(rq);
+       update_misfit_status(NULL, rq);
        new_tasks = idle_balance(rq, rf);
 
        /*
@@ -8537,27 +8910,10 @@ static void attach_tasks(struct lb_env *env)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
-       if (cfs_rq->load.weight)
-               return false;
-
-       if (cfs_rq->avg.load_sum)
-               return false;
-
-       if (cfs_rq->avg.util_sum)
-               return false;
-
-       if (cfs_rq->runnable_load_sum)
-               return false;
-
-       return true;
-}
-
 static void update_blocked_averages(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
-       struct cfs_rq *cfs_rq, *pos;
+       struct cfs_rq *cfs_rq;
        struct rq_flags rf;
 
        rq_lock_irqsave(rq, &rf);
@@ -8567,7 +8923,7 @@ static void update_blocked_averages(int cpu)
         * Iterates the task_group tree in a bottom up fashion, see
         * list_add_leaf_cfs_rq() for details.
         */
-       for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+       for_each_leaf_cfs_rq(rq, cfs_rq) {
                struct sched_entity *se;
 
                /* throttled entities do not contribute to load */
@@ -8581,13 +8937,6 @@ static void update_blocked_averages(int cpu)
                se = cfs_rq->tg->se[cpu];
                if (se && !skip_blocked_update(se))
                        update_load_avg(se, 0);
-
-               /*
-                * There can be a lot of idle CPU cgroups.  Don't let fully
-                * decayed cfs_rqs linger on the list.
-                */
-               if (cfs_rq_is_decayed(cfs_rq))
-                       list_del_leaf_cfs_rq(cfs_rq);
        }
        update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0);
 #ifdef CONFIG_NO_HZ_COMMON
@@ -8682,7 +9031,8 @@ struct sg_lb_stats {
        unsigned int group_weight;
        enum group_type group_type;
        int group_no_capacity;
-       int group_misfit_task; /* A cpu has a task too big for its capacity */
+       /* A cpu has a task too big for its capacity */
+       unsigned long group_misfit_task_load;
 #ifdef CONFIG_NUMA_BALANCING
        unsigned int nr_numa_running;
        unsigned int nr_preferred_running;
@@ -8784,13 +9134,46 @@ static unsigned long scale_rt_capacity(int cpu)
        return 1;
 }
 
+void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
+{
+       raw_spin_lock_init(&mcc->lock);
+       mcc->val = 0;
+       mcc->cpu = -1;
+}
+
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
        unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
        struct sched_group *sdg = sd->groups;
+       struct max_cpu_capacity *mcc;
+       unsigned long max_capacity;
+       int max_cap_cpu;
+       unsigned long flags;
 
        cpu_rq(cpu)->cpu_capacity_orig = capacity;
 
+       capacity *= arch_scale_max_freq_capacity(sd, cpu);
+       capacity >>= SCHED_CAPACITY_SHIFT;
+
+       mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
+
+       raw_spin_lock_irqsave(&mcc->lock, flags);
+       max_capacity = mcc->val;
+       max_cap_cpu = mcc->cpu;
+
+       if ((max_capacity > capacity && max_cap_cpu == cpu) ||
+           max_capacity < capacity) {
+               mcc->val = capacity;
+               mcc->cpu = cpu;
+#ifdef CONFIG_SCHED_DEBUG
+               raw_spin_unlock_irqrestore(&mcc->lock, flags);
+               pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+               goto skip_unlock;
+#endif
+       }
+       raw_spin_unlock_irqrestore(&mcc->lock, flags);
+
+skip_unlock: __attribute__ ((unused));
        capacity *= scale_rt_capacity(cpu);
        capacity >>= SCHED_CAPACITY_SHIFT;
 
@@ -8800,6 +9183,7 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
        cpu_rq(cpu)->cpu_capacity = capacity;
        sdg->sgc->capacity = capacity;
        sdg->sgc->min_capacity = capacity;
+       sdg->sgc->max_capacity = capacity;
 }
 
 void update_group_capacity(struct sched_domain *sd, int cpu)
@@ -8968,16 +9352,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
 }
 
 /*
- * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
  * per-CPU capacity than sched_group ref.
  */
 static inline bool
-group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
 {
        return sg->sgc->min_capacity * capacity_margin <
                                                ref->sgc->min_capacity * 1024;
 }
 
+/*
+ * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity_orig than sched_group ref.
+ */
+static inline bool
+group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+       return sg->sgc->max_capacity * capacity_margin <
+                                               ref->sgc->max_capacity * 1024;
+}
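(Assuming capacity_margin keeps its usual value of 1280, the test reduces to sg->sgc->max_capacity < ref->sgc->max_capacity * 1024 / 1280, i.e. sg only counts as smaller when its biggest CPU offers at most 80% of the reference group's biggest capacity: 512 vs 1024 qualifies, 900 vs 1024 does not.)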
+
 /*
  * group_similar_cpu_capacity: Returns true if the minimum capacity of the
  * compared groups differ by less than 12.5%.
@@ -9001,7 +9396,7 @@ group_type group_classify(struct sched_group *group,
        if (sg_imbalanced(group))
                return group_imbalanced;
 
-       if (sgs->group_misfit_task)
+       if (sgs->group_misfit_task_load)
                return group_misfit_task;
 
        return group_other;
@@ -9014,7 +9409,7 @@ group_type group_classify(struct sched_group *group,
  * @load_idx: Load index of sched_domain of this_cpu for load calc.
  * @local_group: Does group contain this_cpu.
  * @sgs: variable to hold the statistics for this group.
- * @overload: Indicate more than one runnable task for any CPU.
+ * @overload: Indicate pullable load (e.g. >1 runnable task).
  * @overutilized: Indicate overutilization for any CPU.
  */
 static inline void update_sg_lb_stats(struct lb_env *env,
@@ -9056,13 +9451,16 @@ static inline void update_sg_lb_stats(struct lb_env *env,
                        sgs->idle_cpus++;
 
                if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
-                   !sgs->group_misfit_task && rq_has_misfit(rq))
-                       sgs->group_misfit_task = capacity_of(i);
+                   sgs->group_misfit_task_load < rq->misfit_task_load) {
+                       sgs->group_misfit_task_load = rq->misfit_task_load;
+                       *overload = 1;
+               }
 
                if (cpu_overutilized(i)) {
                        *overutilized = true;
 
-                       if (rq_has_misfit(rq))
+                       if (rq->misfit_task_load)
                                *misfit_task = true;
                }
        }
@@ -9102,9 +9500,12 @@ static bool update_sd_pick_busiest(struct lb_env *env,
 
        /*
         * Don't try to pull misfit tasks we can't help.
+        * We can use max_capacity here, as a reduction in capacity on some
+        * CPUs in the group should either be possible to resolve
+        * internally or be covered by avg_load imbalance (eventually).
         */
        if (sgs->group_type == group_misfit_task &&
-           (!group_smaller_cpu_capacity(sg, sds->local) ||
+           (!group_smaller_max_cpu_capacity(sg, sds->local) ||
             !group_has_capacity(env, &sds->local_stat)))
                return false;
 
@@ -9127,7 +9528,7 @@ static bool update_sd_pick_busiest(struct lb_env *env,
         * power/energy consequences are not considered.
         */
        if (sgs->sum_nr_running <= sgs->group_weight &&
-           group_smaller_cpu_capacity(sds->local, sg))
+           group_smaller_min_cpu_capacity(sds->local, sg))
                return false;
 
        /*
@@ -9139,6 +9540,13 @@ static bool update_sd_pick_busiest(struct lb_env *env,
                !group_similar_cpu_capacity(sds->local, sg))
                return false;
 
+       /*
+        * If we have more than one misfit sg, go with the biggest misfit.
+        */
+       if (sgs->group_type == group_misfit_task &&
+           sgs->group_misfit_task_load < busiest->group_misfit_task_load)
+               return false;
+
 asym_packing:
        /* This is the busiest node in its class. */
        if (!(env->sd->flags & SD_ASYM_PACKING))
@@ -9219,11 +9627,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
        struct sched_group *sg = env->sd->groups;
        struct sg_lb_stats *local = &sds->local_stat;
        struct sg_lb_stats tmp_sgs;
-       int load_idx, prefer_sibling = 0;
+       int load_idx;
        bool overload = false, overutilized = false, misfit_task = false;
-
-       if (child && child->flags & SD_PREFER_SIBLING)
-               prefer_sibling = 1;
+       bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
 
 #ifdef CONFIG_NO_HZ_COMMON
        if (env->idle == CPU_NEWLY_IDLE) {
@@ -9311,8 +9717,8 @@ next_group:
 
        if (!lb_sd_parent(env->sd)) {
                /* update overload indicator if we are at root domain */
-               if (env->dst_rq->rd->overload != overload)
-                       env->dst_rq->rd->overload = overload;
+               if (READ_ONCE(env->dst_rq->rd->overload) != overload)
+                       WRITE_ONCE(env->dst_rq->rd->overload, overload);
        }
 
        if (overutilized)
@@ -9468,7 +9874,22 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
        capa_move /= SCHED_CAPACITY_SCALE;
 
        /* Move if we gain throughput */
-       if (capa_move > capa_now)
+       if (capa_move > capa_now) {
+               env->imbalance = busiest->load_per_task;
+               return;
+       }
+
+       /* We can't see a throughput improvement with the load-based
+        * method, but depending upon group size and capacity range
+        * there might still be an underutilized cpu available in an
+        * asymmetric capacity system. Do one last check just in
+        * case.
+        */
+       if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+           busiest->group_type == group_overloaded &&
+           busiest->sum_nr_running > busiest->group_weight &&
+           local->sum_nr_running < local->group_weight &&
+           local->group_capacity < busiest->group_capacity)
                env->imbalance = busiest->load_per_task;
 }
 
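
The tail added to fix_small_imbalance() is a last-resort pull: even when the load-based estimate shows no throughput gain, an overloaded busiest group on an asymmetric-capacity system may still be drained towards a local group that has idle CPUs and lower total capacity. A standalone model of just that predicate, with a worked big/LITTLE example (struct grp and last_resort_pull() are illustrative names, not kernel code):

#include <stdbool.h>
#include <stdio.h>

/* Simplified stand-in for the handful of sg_lb_stats fields used here. */
struct grp {
	unsigned int sum_nr_running;
	unsigned int group_weight;	/* number of CPUs in the group */
	unsigned long group_capacity;	/* summed capacity of the group */
	bool overloaded;		/* group_type == group_overloaded */
};

/* Model of the extra asymmetric-capacity check at the end of fix_small_imbalance(). */
static bool last_resort_pull(bool asym_cpucapacity,
			     const struct grp *busiest, const struct grp *local)
{
	return asym_cpucapacity &&
	       busiest->overloaded &&
	       busiest->sum_nr_running > busiest->group_weight &&
	       local->sum_nr_running < local->group_weight &&
	       local->group_capacity < busiest->group_capacity;
}

int main(void)
{
	/* 4 big CPUs running 5 tasks vs. 4 LITTLE CPUs running 3 tasks. */
	struct grp busiest = { .sum_nr_running = 5, .group_weight = 4,
			       .group_capacity = 4096, .overloaded = true };
	struct grp local   = { .sum_nr_running = 3, .group_weight = 4,
			       .group_capacity = 1848, .overloaded = false };

	/* Prints 1: set env->imbalance to busiest->load_per_task and pull. */
	printf("%d\n", last_resort_pull(true, &busiest, &local));
	return 0;
}
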
@@ -9537,10 +9958,20 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
                (sds->avg_load - local->avg_load) * local->group_capacity
        ) / SCHED_CAPACITY_SCALE;
 
-       /* Boost imbalance to allow misfit task to be balanced. */
-       if (busiest->group_type == group_misfit_task) {
+       /* Boost imbalance to allow a misfit task to be balanced.
+        * Always do this if we are doing a NEWLY_IDLE balance,
+        * on the assumption that any tasks we have must not be
+        * long-running (and hence we cannot rely upon load).
+        * However, if we are not idle, we should assume the tasks
+        * we have are longer-running and not override the load-based
+        * calculations above unless we are sure that the local
+        * group is underutilized.
+        */
+       if (busiest->group_type == group_misfit_task &&
+           (env->idle == CPU_NEWLY_IDLE ||
+            local->sum_nr_running < local->group_weight)) {
                env->imbalance = max_t(long, env->imbalance,
-                                      busiest->group_misfit_task);
+                                      busiest->group_misfit_task_load);
        }
 
        /*
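
Once the busiest group has been classified group_misfit_task, the boost above only overrides the load-based imbalance when the balance is NEWLY_IDLE or the local group still has spare CPUs. A small user-space model of just that gating (the enum and function below are illustrative, not kernel code):

#include <stdio.h>

enum idle_type { CPU_NOT_IDLE, CPU_NEWLY_IDLE };

/*
 * Model of the misfit boost in calculate_imbalance(): keep the load-based
 * figure unless we are newly idle or the local group is underutilized,
 * in which case raise the imbalance to at least the misfit load.
 */
static long boost_for_misfit(long imbalance, unsigned long misfit_load,
			     enum idle_type idle,
			     unsigned int local_nr_running,
			     unsigned int local_group_weight)
{
	if (idle == CPU_NEWLY_IDLE || local_nr_running < local_group_weight)
		return imbalance > (long)misfit_load ? imbalance : (long)misfit_load;
	return imbalance;
}

int main(void)
{
	/* Busy local group, not idle: keep the load-based figure. */
	printf("%ld\n", boost_for_misfit(120, 740, CPU_NOT_IDLE, 4, 4));	/* 120 */
	/* Newly idle: boost up to the misfit load. */
	printf("%ld\n", boost_for_misfit(120, 740, CPU_NEWLY_IDLE, 4, 4));	/* 740 */
	return 0;
}
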
@@ -9613,7 +10044,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
            busiest->group_no_capacity)
                goto force_balance;
 
-       /* Misfitting tasks should be dealt with regardless of the avg load */
+       /* Misfit tasks should be dealt with regardless of the avg load */
        if (busiest->group_type == group_misfit_task)
                goto force_balance;
 
@@ -9703,14 +10134,30 @@ static struct rq *find_busiest_queue(struct lb_env *env,
                        continue;
 
                /*
-                * For ASYM_CPUCAPACITY domains with misfit tasks we ignore
-                * load.
+                * For ASYM_CPUCAPACITY domains with misfit tasks we simply
+                * seek the "biggest" misfit task.
                 */
-               if (env->src_grp_type == group_misfit_task && rq_has_misfit(rq))
-                       return rq;
+               if (env->src_grp_type == group_misfit_task) {
+                       if (rq->misfit_task_load > busiest_load) {
+                               busiest_load = rq->misfit_task_load;
+                               busiest = rq;
+                       }
+                       continue;
+               }
 
                capacity = capacity_of(i);
 
+               /*
+                * For ASYM_CPUCAPACITY domains, don't pick a cpu that could
+                * eventually lead to active_balancing high->low capacity.
+                * Higher per-cpu capacity is considered better than balancing
+                * average load.
+                */
+               if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+                   capacity_of(env->dst_cpu) < capacity &&
+                   rq->nr_running == 1)
+                       continue;
+
                wl = weighted_cpuload(rq);
 
                /*
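
In find_busiest_queue(), the misfit path no longer returns the first runqueue with a misfit task; it keeps scanning for the one carrying the largest misfit_task_load. Separately, on the load-based path, CPUs with more capacity than the destination that run only a single task are skipped, since pulling from them could only active-balance a task from a bigger to a smaller CPU. A user-space model of the misfit selection (struct rq_view and pick_biggest_misfit() are illustrative names):

#include <stddef.h>
#include <stdio.h>

/* Minimal per-runqueue view: just the field used by the misfit path. */
struct rq_view {
	int cpu;
	unsigned long misfit_task_load;
};

/* Model of the group_misfit_task path: pick the rq carrying the biggest misfit load. */
static const struct rq_view *pick_biggest_misfit(const struct rq_view *rqs, int nr)
{
	const struct rq_view *busiest = NULL;
	unsigned long busiest_load = 0;

	for (int i = 0; i < nr; i++) {
		if (rqs[i].misfit_task_load > busiest_load) {
			busiest_load = rqs[i].misfit_task_load;
			busiest = &rqs[i];
		}
	}
	return busiest;
}

int main(void)
{
	struct rq_view rqs[] = {
		{ .cpu = 0, .misfit_task_load = 740 },
		{ .cpu = 1, .misfit_task_load = 520 },
		{ .cpu = 2, .misfit_task_load = 0   },
	};
	const struct rq_view *b = pick_biggest_misfit(rqs, 3);

	printf("busiest cpu: %d\n", b ? b->cpu : -1);	/* cpu 0 */
	return 0;
}
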
@@ -9786,6 +10233,13 @@ static int need_active_balance(struct lb_env *env)
                        return 1;
        }
 
+       if (env->src_grp_type == group_misfit_task)
+               return 1;
+
+       if (env->src_grp_type == group_overloaded &&
+           env->src_rq->misfit_task_load)
+               return 1;
+
        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
 
@@ -10124,7 +10578,7 @@ get_sd_balance_interval(struct sched_domain *sd, int cpu_busy)
        if (energy_aware() && sd_overutilized(sd)) {
                /* we know the root is overutilized, let's check for a misfit task */
                for_each_cpu(cpu, sched_domain_span(sd)) {
-                       if (rq_has_misfit(cpu_rq(cpu)))
+                       if (cpu_rq(cpu)->misfit_task_load)
                                return 1;
                }
        }
@@ -10177,7 +10631,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
        rq_unpin_lock(this_rq, rf);
 
        if (this_rq->avg_idle < sysctl_sched_migration_cost ||
-           !this_rq->rd->overload) {
+           !READ_ONCE(this_rq->rd->overload)) {
                rcu_read_lock();
                sd = rcu_dereference_check_sched_domain(this_rq->sd);
                if (sd)
@@ -10195,8 +10649,12 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
                int continue_balancing = 1;
                u64 t0, domain_cost;
 
-               if (!(sd->flags & SD_LOAD_BALANCE))
+               if (!(sd->flags & SD_LOAD_BALANCE)) {
+                       if (time_after_eq(jiffies,
+                                         sd->groups->sgc->next_update))
+                               update_group_capacity(sd, this_cpu);
                        continue;
+               }
 
                if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
                        update_next_balance(sd, &next_balance);
@@ -10531,8 +10989,12 @@ static void rebalance_domains(struct rq *rq, enum cpu_idle_type idle)
                if (energy_aware() && !sd_overutilized(sd))
                        continue;
 
-               if (!(sd->flags & SD_LOAD_BALANCE))
+               if (!(sd->flags & SD_LOAD_BALANCE)) {
+                       if (time_after_eq(jiffies,
+                                         sd->groups->sgc->next_update))
+                               update_group_capacity(sd, cpu);
                        continue;
+               }
 
                /*
                 * Stop the load balance at this level. There is another
@@ -10748,7 +11210,7 @@ static inline bool nohz_kick_needed(struct rq *rq, bool only_update)
 
        /* Do idle load balance if there is a misfit task */
        if (energy_aware())
-               return rq_has_misfit(rq);
+               return rq->misfit_task_load;
 
        rcu_read_lock();
        sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
@@ -10875,7 +11337,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
        if (static_branch_unlikely(&sched_numa_balancing))
                task_tick_numa(rq, curr);
 
-       update_misfit_task(rq, curr);
+       update_misfit_status(curr, rq);
 
        update_overutilized_status(rq);
 }
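
task_tick_fair() now calls update_misfit_status(curr, rq), the renamed (and argument-swapped) successor of update_misfit_task(). The helper itself is outside this excerpt; in the upstream misfit series it records the current task's load in rq->misfit_task_load when the task no longer fits the CPU's capacity and clears it otherwise. A rough user-space model of that idea, assuming the usual ~20% fitness margin (all names below are illustrative, not the kernel's exact implementation):

#include <stdio.h>

/* Assumed margin: a task "fits" if cap * 1024 > util * 1280 (i.e. util < ~80% of cap). */
#define CAPACITY_MARGIN 1280

static int task_fits_capacity(unsigned long util, unsigned long capacity)
{
	return capacity * 1024 > util * CAPACITY_MARGIN;
}

/* Model of a misfit-status update: remember the load of a task that outgrew its CPU. */
static void update_misfit_status_model(unsigned long task_util,
				       unsigned long task_load,
				       unsigned long cpu_capacity,
				       unsigned long *rq_misfit_task_load)
{
	*rq_misfit_task_load = task_fits_capacity(task_util, cpu_capacity) ? 0 : task_load;
}

int main(void)
{
	unsigned long misfit;

	/* A ~600-util task on a LITTLE CPU of capacity 462: misfit. */
	update_misfit_status_model(600, 740, 462, &misfit);
	printf("misfit_task_load=%lu\n", misfit);	/* 740 */

	/* The same task on a big CPU of capacity 1024: fits. */
	update_misfit_status_model(600, 740, 1024, &misfit);
	printf("misfit_task_load=%lu\n", misfit);	/* 0 */
	return 0;
}
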
@@ -10959,7 +11421,8 @@ static inline bool vruntime_normalized(struct task_struct *p)
         * - A task which has been woken up by try_to_wake_up() and
         *   waiting for actually being woken up by sched_ttwu_pending().
         */
-       if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+       if (!se->sum_exec_runtime ||
+           (p->state == TASK_WAKING && p->sched_remote_wakeup))
                return true;
 
        return false;
@@ -11385,10 +11848,10 @@ const struct sched_class fair_sched_class = {
 #ifdef CONFIG_SCHED_DEBUG
 void print_cfs_stats(struct seq_file *m, int cpu)
 {
-       struct cfs_rq *cfs_rq, *pos;
+       struct cfs_rq *cfs_rq;
 
        rcu_read_lock();
-       for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
+       for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
                print_cfs_rq(m, cpu, cfs_rq);
        rcu_read_unlock();
 }