}
}
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
- list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list, \
- leaf_cfs_rq_list)
+/* Iterate through all leaf cfs_rq's on a runqueue: */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
/* Do the two (enqueued) entities belong to the same group ? */
static inline struct cfs_rq *
{
}
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) \
- for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+ for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
static unsigned long task_h_load(struct task_struct *p);
+static unsigned long capacity_of(int cpu);
/* Give new sched_entity initial runnable values so that it is heavily loaded during its infancy */
void init_entity_runnable_average(struct sched_entity *se)
* To solve this problem, we also cap the util_avg of successive tasks to
* only 1/2 of the left utilization budget:
*
- * util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ * util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
*
- * where n denotes the nth task.
+ * where n denotes the nth task and cpu_scale the CPU capacity.
*
- * For example, a simplest series from the beginning would be like:
+ * For example, for a CPU with 1024 of capacity, a simplest series from
+ * the beginning would be like:
*
* task util_avg: 512, 256, 128, 64, 32, 16, 8, ...
* cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
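*
* As a worked instance of the capping above (assuming cpu_scale == 1024):
* the second task gets (1024 - 512) / 2 = 256, raising cfs_rq util_avg to
* 768, the third gets (1024 - 768) / 2 = 128, and so on: each new task is
* granted half of the remaining utilization budget, so the sum converges
* towards the full capacity.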
{
struct cfs_rq *cfs_rq = cfs_rq_of(se);
struct sched_avg *sa = &se->avg;
- long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+ long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+ long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
if (cap > 0) {
if (cfs_rq->avg.util_avg != 0) {
* See cpu_util().
*/
cpufreq_update_util(rq, 0);
-#ifdef CONFIG_SMP
- trace_sched_load_avg_cpu(cpu_of(rq), cfs_rq);
-#endif
}
}
return 1;
}
+/*
+ * When a task is dequeued, its estimated utilization should not be updated if
+ * its util_avg has not been updated at least once.
+ * This flag is used to synchronize util_avg updates with util_est updates.
+ * We map this information into the LSB bit of the utilization saved at
+ * dequeue time (i.e. util_est.enqueued).
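+ *
+ * In the code below, util_est_dequeue() sets this bit when it stores a new
+ * sample in p->se.avg.util_est.enqueued, cfs_se_util_change() clears it on
+ * the first util_avg update that follows, and a later util_est_dequeue()
+ * skips the EWMA update if the bit is still set.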
+ */
+#define UTIL_AVG_UNCHANGED 0x1
+
+static inline void cfs_se_util_change(struct sched_avg *avg)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Avoid store if the flag has been already set */
+ enqueued = avg->util_est.enqueued;
+ if (!(enqueued & UTIL_AVG_UNCHANGED))
+ return;
+
+ /* Reset flag to report util_avg has been updated */
+ enqueued &= ~UTIL_AVG_UNCHANGED;
+ WRITE_ONCE(avg->util_est.enqueued, enqueued);
+}
+
static int
__update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
{
static int
__update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
{
- return ___update_load_avg(now, cpu, &se->avg,
- se->on_rq * scale_load_down(se->load.weight),
- cfs_rq->curr == se, NULL, NULL);
+ if (___update_load_avg(now, cpu, &se->avg,
+ se->on_rq * scale_load_down(se->load.weight),
+ cfs_rq->curr == se, NULL, NULL)) {
+ cfs_se_util_change(&se->avg);
+
+#ifdef UTIL_EST_DEBUG
+ /*
+ * Trace utilization only for actual tasks.
+ *
+ * These trace events are mostly useful to get easier-to-read
+ * plots of the estimated utilization, where we can compare it
+ * with the actual growth and decay of the original PELT
+ * signal.
+ * Let's keep them disabled by default in "production kernels".
+ */
+ if (entity_is_task(se)) {
+ struct task_struct *tsk = task_of(se);
+
+ trace_sched_util_est_task(tsk, &se->avg);
+
+ /* Trace utilization only for top level CFS RQ */
+ cfs_rq = &(task_rq(tsk)->cfs);
+ trace_sched_util_est_cpu(cpu, cfs_rq);
+ }
+#endif /* UTIL_EST_DEBUG */
+
+ return 1;
+ }
+
+ return 0;
}
static int
static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
+static inline int task_fits_capacity(struct task_struct *p, long capacity);
+
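+/*
+ * For instance, assuming the default capacity_margin of 1280 (~20%) and a
+ * LITTLE CPU of capacity 512, a task with a (boosted) utilization of 700
+ * does not fit (512 * 1024 < 700 * 1280), so its task_h_load() is recorded
+ * in rq->misfit_task_load for the load balancer to act upon.
+ */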
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
+{
+ if (!static_branch_unlikely(&sched_asym_cpucapacity))
+ return;
+
+ if (!p) {
+ rq->misfit_task_load = 0;
+ return;
+ }
+
+ if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
+ rq->misfit_task_load = 0;
+ return;
+ }
+
+ rq->misfit_task_load = task_h_load(p);
+}
+
+static inline unsigned long task_util(struct task_struct *p)
+{
+#ifdef CONFIG_SCHED_WALT
+ if (likely(!walt_disabled && sysctl_sched_use_walt_task_util))
+ return (p->ravg.demand /
+ (walt_ravg_window >> SCHED_CAPACITY_SHIFT));
+#endif
+ return READ_ONCE(p->se.avg.util_avg);
+}
+
+static inline unsigned long _task_util_est(struct task_struct *p)
+{
+ struct util_est ue = READ_ONCE(p->se.avg.util_est);
+
+ return max(ue.ewma, ue.enqueued);
+}
+
+static inline unsigned long task_util_est(struct task_struct *p)
+{
+#ifdef CONFIG_SCHED_WALT
+ if (likely(!walt_disabled && sysctl_sched_use_walt_task_util))
+ return (p->ravg.demand /
+ (walt_ravg_window >> SCHED_CAPACITY_SHIFT));
+#endif
+ return max(task_util(p), _task_util_est(p));
+}
+
+static inline void util_est_enqueue(struct cfs_rq *cfs_rq,
+ struct task_struct *p)
+{
+ unsigned int enqueued;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /* Update root cfs_rq's estimated utilization */
+ enqueued = cfs_rq->avg.util_est.enqueued;
+ enqueued += (_task_util_est(p) | UTIL_AVG_UNCHANGED);
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, enqueued);
+
+ trace_sched_util_est_task(p, &p->se.avg);
+ trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+}
+
+/*
+ * Check if a (signed) value is within a specified (unsigned) margin,
+ * based on the observation that:
+ *
+ * abs(x) < y := (unsigned)(x + y - 1) < (2 * y - 1)
+ *
+ * NOTE: this only works when value + margin < INT_MAX.
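+ *
+ * For instance, with margin = SCHED_CAPACITY_SCALE / 100 = 10 (the ~1%
+ * margin used below): value = 3 gives (unsigned)(3 + 9) = 12 < 19, which is
+ * within the margin, while value = -42 gives (unsigned)(-42 + 9), which
+ * wraps to a huge number >= 19 and is therefore outside it.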
+ */
+static inline bool within_margin(int value, int margin)
+{
+ return ((unsigned int)(value + margin - 1) < (2 * margin - 1));
+}
+
+static void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
+{
+ long last_ewma_diff;
+ struct util_est ue;
+
+ if (!sched_feat(UTIL_EST))
+ return;
+
+ /*
+ * Update root cfs_rq's estimated utilization
+ *
+ * If *p is the last task then the root cfs_rq's estimated utilization
+ * of a CPU is 0 by definition.
+ */
+ ue.enqueued = 0;
+ if (cfs_rq->nr_running) {
+ ue.enqueued = cfs_rq->avg.util_est.enqueued;
+ ue.enqueued -= min_t(unsigned int, ue.enqueued,
+ (_task_util_est(p) | UTIL_AVG_UNCHANGED));
+ }
+ WRITE_ONCE(cfs_rq->avg.util_est.enqueued, ue.enqueued);
+
+ trace_sched_util_est_cpu(cpu_of(rq_of(cfs_rq)), cfs_rq);
+
+ /*
+ * Skip update of task's estimated utilization when the task has not
+ * yet completed an activation, e.g. being migrated.
+ */
+ if (!task_sleep)
+ return;
+
+ /*
+ * If the PELT values haven't changed since enqueue time,
+ * skip the util_est update.
+ */
+ ue = p->se.avg.util_est;
+ if (ue.enqueued & UTIL_AVG_UNCHANGED)
+ return;
+
+ /*
+ * Skip update of task's estimated utilization when its EWMA is
+ * already ~1% close to its last activation value.
+ */
+ ue.enqueued = (task_util(p) | UTIL_AVG_UNCHANGED);
+ last_ewma_diff = ue.enqueued - ue.ewma;
+ if (within_margin(last_ewma_diff, (SCHED_CAPACITY_SCALE / 100)))
+ return;
+
+ /*
+ * Update Task's estimated utilization
+ *
+ * When *p completes an activation we can consolidate another sample
+ * of the task size. This is done by storing the current PELT value
+ * as ue.enqueued and by using this value to update the Exponential
+ * Weighted Moving Average (EWMA):
+ *
+ * ewma(t) = w * task_util(p) + (1-w) * ewma(t-1)
+ * = w * task_util(p) + ewma(t-1) - w * ewma(t-1)
+ * = w * (task_util(p) - ewma(t-1)) + ewma(t-1)
+ * = w * ( last_ewma_diff ) + ewma(t-1)
+ * = w * (last_ewma_diff + ewma(t-1) / w)
+ *
+ * Where 'w' is the weight of new samples, which is configured to be
+ * 0.25, thus making w=1/4 ( >>= UTIL_EST_WEIGHT_SHIFT)
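+ *
+ * As a quick numerical check (ignoring the UTIL_AVG_UNCHANGED bit): with
+ * ewma(t-1) = 200 and task_util(p) = 600 we get last_ewma_diff = 400 and
+ * ewma(t) = ((200 << 2) + 400) >> 2 = 300, i.e. the EWMA moves one quarter
+ * of the way towards the new sample.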
+ */
+ ue.ewma <<= UTIL_EST_WEIGHT_SHIFT;
+ ue.ewma += last_ewma_diff;
+ ue.ewma >>= UTIL_EST_WEIGHT_SHIFT;
+ WRITE_ONCE(p->se.avg.util_est, ue);
+
+ trace_sched_util_est_task(p, &p->se.avg);
+}
+
#else /* CONFIG_SMP */
static inline int
return 0;
}
+static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
+
+static inline void
+util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
+
+static inline void
+util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
+ bool task_sleep) {}
+
#endif /* CONFIG_SMP */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
* put back on, and if we advance min_vruntime, we'll be placed back
* further than we started -- ie. we'll be penalized.
*/
- if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+ if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
update_min_vruntime(cfs_rq);
}
void cfs_bandwidth_usage_inc(void)
{
- static_key_slow_inc(&__cfs_bandwidth_used);
+ static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
}
void cfs_bandwidth_usage_dec(void)
{
- static_key_slow_dec(&__cfs_bandwidth_used);
+ static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
}
#else /* HAVE_JUMP_LABEL */
static bool cfs_bandwidth_used(void)
now = sched_clock_cpu(smp_processor_id());
cfs_b->runtime = cfs_b->quota;
cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+ cfs_b->expires_seq++;
}
static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
struct task_group *tg = cfs_rq->tg;
struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
u64 amount = 0, min_amount, expires;
+ int expires_seq;
/* note: this is a positive sum as runtime_remaining <= 0 */
min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
cfs_b->idle = 0;
}
}
+ expires_seq = cfs_b->expires_seq;
expires = cfs_b->runtime_expires;
raw_spin_unlock(&cfs_b->lock);
* spread between our sched_clock and the one on which runtime was
* issued.
*/
- if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+ if (cfs_rq->expires_seq != expires_seq) {
+ cfs_rq->expires_seq = expires_seq;
cfs_rq->runtime_expires = expires;
+ }
return cfs_rq->runtime_remaining > 0;
}
* has not truly expired.
*
* Fortunately we can determine whether this is the case by checking
- * whether the global deadline has advanced. It is valid to compare
- * cfs_b->runtime_expires without any locks since we only care about
- * exact equality, so a partial write will still work.
+ * whether the global deadline (cfs_b->expires_seq) has advanced.
*/
-
- if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+ if (cfs_rq->expires_seq == cfs_b->expires_seq) {
/* extend local deadline, drift is bounded above by 2 ticks */
cfs_rq->runtime_expires += TICK_NSEC;
} else {
/*
* Add to the _head_ of the list, so that an already-started
- * distribute_cfs_runtime will not see us
+ * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+ * not running, add to the tail so that later runqueues don't get starved.
*/
- list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ if (cfs_b->distribute_running)
+ list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+ else
+ list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
/*
* If we're the first throttled task, make sure the bandwidth
* in us over-using our runtime if it is all used during this loop, but
* only by limited amounts in that extreme case.
*/
- while (throttled && cfs_b->runtime > 0) {
+ while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
+ cfs_b->distribute_running = 1;
raw_spin_unlock(&cfs_b->lock);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
runtime_expires);
raw_spin_lock(&cfs_b->lock);
+ cfs_b->distribute_running = 0;
throttled = !list_empty(&cfs_b->throttled_cfs_rq);
cfs_b->runtime -= min(runtime, cfs_b->runtime);
/* confirm we're still not at a refresh boundary */
raw_spin_lock(&cfs_b->lock);
+ if (cfs_b->distribute_running) {
+ raw_spin_unlock(&cfs_b->lock);
+ return;
+ }
+
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
raw_spin_unlock(&cfs_b->lock);
return;
runtime = cfs_b->runtime;
expires = cfs_b->runtime_expires;
+ if (runtime)
+ cfs_b->distribute_running = 1;
+
raw_spin_unlock(&cfs_b->lock);
if (!runtime)
raw_spin_lock(&cfs_b->lock);
if (expires == cfs_b->runtime_expires)
cfs_b->runtime -= min(runtime, cfs_b->runtime);
+ cfs_b->distribute_running = 0;
raw_spin_unlock(&cfs_b->lock);
}
cfs_b->period_timer.function = sched_cfs_period_timer;
hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
cfs_b->slack_timer.function = sched_cfs_slack_timer;
+ cfs_b->distribute_running = 0;
}
static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
#ifdef CONFIG_SMP
static bool cpu_overutilized(int cpu);
-static unsigned long cpu_util(int cpu);
-
static bool sd_overutilized(struct sched_domain *sd)
{
return sd->shared->overutilized;
rcu_read_unlock();
}
-unsigned long boosted_cpu_util(int cpu);
+unsigned long boosted_cpu_util(int cpu, unsigned long other_util);
#else
#define update_overutilized_status(rq) do {} while (0)
-#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
+#define boosted_cpu_util(cpu, other_util) cpu_util_freq(cpu)
#endif /* CONFIG_SMP */
struct sched_entity *se = &p->se;
int task_new = !(flags & ENQUEUE_WAKEUP);
+ /*
+ * The code below (indirectly) updates schedutil which looks at
+ * the cfs_rq utilization to select a frequency.
+ * Let's add the task's estimated utilization to the cfs_rq's
+ * estimated utilization, before we update schedutil.
+ */
+ util_est_enqueue(&rq->cfs, p);
+
+ /*
+ * The code below (indirectly) updates schedutil which looks at
+ * the cfs_rq utilization to select a frequency.
+ * Let's update schedtune here to ensure the boost value of the
+ * current task is accounted for in the selection of the OPP.
+ *
+ * We do it also in the case where we enqueue a throttled task;
+ * we could argue that a throttled task should not boost a CPU,
+ * however:
+ * a) properly implementing CPU boosting considering throttled
+ * tasks will increase a lot the complexity of the solution
+ * b) it's not easy to quantify the benefits introduced by
+ * such a more complex solution.
+ * Thus, for the time being we go for the simple solution and boost
+ * also for throttled RQs.
+ */
+ schedtune_enqueue_task(p, cpu_of(rq));
+
/*
* If in_iowait is set, the code below may not trigger any cpufreq
* utilization updates, so do it here explicitly with the IOWAIT flag
update_cfs_shares(se);
}
- /*
- * Update SchedTune accounting.
- *
- * We do it before updating the CPU capacity to ensure the
- * boost value of the current task is accounted for in the
- * selection of the OPP.
- *
- * We do it also in the case where we enqueue a throttled task;
- * we could argue that a throttled task should not boost a CPU,
- * however:
- * a) properly implementing CPU boosting considering throttled
- * tasks will increase a lot the complexity of the solution
- * b) it's not easy to quantify the benefits introduced by
- * such a more complex solution.
- * Thus, for the time being we go for the simple solution and boost
- * also for throttled RQs.
- */
- schedtune_enqueue_task(p, cpu_of(rq));
-
if (!se) {
add_nr_running(rq, 1);
if (!task_new)
struct sched_entity *se = &p->se;
int task_sleep = flags & DEQUEUE_SLEEP;
+ /*
+ * The code below (indirectly) updates schedutil which looks at
+ * the cfs_rq utilization to select a frequency.
+ * Let's update schedtune here to ensure the boost value of the
+ * current task is no longer accounted for in the selection of the OPP.
+ */
+ schedtune_dequeue_task(p, cpu_of(rq));
+
for_each_sched_entity(se) {
cfs_rq = cfs_rq_of(se);
dequeue_entity(cfs_rq, se, flags);
update_cfs_shares(se);
}
- /*
- * Update SchedTune accounting
- *
- * We do it before updating the CPU capacity to ensure the
- * boost value of the current task is accounted for in the
- * selection of the OPP.
- */
- schedtune_dequeue_task(p, cpu_of(rq));
-
if (!se) {
sub_nr_running(rq, 1);
walt_dec_cumulative_runnable_avg(rq, p);
}
+ util_est_dequeue(&rq->cfs, p, task_sleep);
hrtick_update(rq);
}
return sched_feat(ENERGY_AWARE);
}
-static int cpu_util_wake(int cpu, struct task_struct *p);
-
/*
* __cpu_norm_util() returns the cpu util relative to a specific capacity,
* i.e. it's busy ratio, in the range [0..SCHED_CAPACITY_SCALE] which is useful
struct sched_group *sg;
};
-static int cpu_util_wake(int cpu, struct task_struct *p);
+/**
+ * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks
+ * @cpu: the CPU to get the utilization of
+ *
+ * The unit of the return value must be the one of capacity so we can compare
+ * the utilization with the capacity of the CPU that is available for CFS
+ * tasks (i.e. cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency,
+ * i.e. arch_scale_cpu_capacity().
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * The estimated utilization of a CPU is defined to be the maximum between its
+ * cfs_rq.avg.util_avg and the sum of the estimated utilization of the tasks
+ * currently RUNNABLE on that CPU.
+ * This allows us to properly represent the expected utilization of a CPU which
+ * has just got a big task running after a long sleep period. At the same time
+ * however it preserves the benefits of the "blocked utilization" in
+ * describing the potential for other tasks waking up on the same CPU.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it is useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ *
+ * Return: the (estimated) utilization for the specified CPU
+ */
+static inline unsigned long cpu_util(int cpu)
+{
+ struct cfs_rq *cfs_rq;
+ unsigned int util;
+
+#ifdef CONFIG_SCHED_WALT
+ if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util)) {
+ u64 walt_cpu_util = cpu_rq(cpu)->cumulative_runnable_avg;
+
+ walt_cpu_util <<= SCHED_CAPACITY_SHIFT;
+ do_div(walt_cpu_util, walt_ravg_window);
+
+ return min_t(unsigned long, walt_cpu_util,
+ capacity_orig_of(cpu));
+ }
+#endif
+
+ cfs_rq = &cpu_rq(cpu)->cfs;
+ util = READ_ONCE(cfs_rq->avg.util_avg);
+
+ if (sched_feat(UTIL_EST))
+ util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+
+ return min_t(unsigned long, util, capacity_orig_of(cpu));
+}
+
+static inline unsigned long cpu_util_freq(int cpu)
+{
+#ifdef CONFIG_SCHED_WALT
+ u64 walt_cpu_util;
+
+ if (unlikely(walt_disabled || !sysctl_sched_use_walt_cpu_util))
+ return cpu_util(cpu);
+
+ walt_cpu_util = cpu_rq(cpu)->prev_runnable_sum;
+ walt_cpu_util <<= SCHED_CAPACITY_SHIFT;
+ do_div(walt_cpu_util, walt_ravg_window);
+
+ return min_t(unsigned long, walt_cpu_util, capacity_orig_of(cpu));
+#else
+ return cpu_util(cpu);
+#endif
+}
+
+/*
+ * cpu_util_wake: Compute CPU utilization with any contributions from
+ * the waking task p removed.
+ */
+static unsigned long cpu_util_wake(int cpu, struct task_struct *p)
+{
+ struct cfs_rq *cfs_rq;
+ unsigned int util;
+
+#ifdef CONFIG_SCHED_WALT
+ /*
+ * WALT does not decay idle tasks in the same manner
+ * as PELT, so it makes little sense to subtract task
+ * utilization from cpu utilization. Instead just use
+ * cpu_util for this case.
+ */
+ if (likely(!walt_disabled && sysctl_sched_use_walt_cpu_util))
+ return cpu_util(cpu);
+#endif
+
+ /* Task has no contribution or is new */
+ if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
+ return cpu_util(cpu);
+
+ cfs_rq = &cpu_rq(cpu)->cfs;
+ util = READ_ONCE(cfs_rq->avg.util_avg);
+
+ /* Discount task's blocked util from CPU's util */
+ util -= min_t(unsigned int, util, task_util(p));
+
+ /*
+ * Covered cases:
+ *
+ * a) if *p is the only task sleeping on this CPU, then:
+ * cpu_util (== task_util) > util_est (== 0)
+ * and thus we return:
+ * cpu_util_wake = (cpu_util - task_util) = 0
+ *
+ * b) if other tasks are SLEEPING on this CPU, which is now exiting
+ * IDLE, then:
+ * cpu_util >= task_util
+ * cpu_util > util_est (== 0)
+ * and thus we discount *p's blocked utilization to return:
+ * cpu_util_wake = (cpu_util - task_util) >= 0
+ *
+ * c) if other tasks are RUNNABLE on that CPU and
+ * util_est > cpu_util
+ * then we use util_est since it returns a more restrictive
+ * estimation of the spare capacity on that CPU, by just
+ * considering the expected utilization of tasks already
+ * runnable on that CPU.
+ *
+ * Cases a) and b) are covered by the above code, while case c) is
+ * covered by the following code when estimated utilization is
+ * enabled.
+ */
+ if (sched_feat(UTIL_EST))
+ util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));
+
+ /*
+ * Utilization (estimated) can exceed the CPU capacity, thus let's
+ * clamp to the maximum CPU capacity to ensure consistency with
+ * the cpu_util call.
+ */
+ return min_t(unsigned long, util, capacity_orig_of(cpu));
+}
static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
{
return affine;
}
-static inline unsigned long task_util(struct task_struct *p);
-
#ifdef CONFIG_SCHED_TUNE
struct reciprocal_value schedtune_spc_rdiv;
if (boost == 0)
return 0;
- util = task_util(task);
+ util = task_util_est(task);
margin = schedtune_margin(util, boost);
return margin;
#endif /* CONFIG_SCHED_TUNE */
unsigned long
-boosted_cpu_util(int cpu)
+boosted_cpu_util(int cpu, unsigned long other_util)
{
- unsigned long util = cpu_util_freq(cpu);
+ unsigned long util = cpu_util_freq(cpu) + other_util;
long margin = schedtune_cpu_margin(util, cpu);
trace_sched_boost_cpu(cpu, util, margin);
static inline unsigned long
boosted_task_util(struct task_struct *task)
{
- unsigned long util = task_util(task);
+ unsigned long util = task_util_est(task);
long margin = schedtune_task_margin(task);
trace_sched_boost_task(task, util, margin);
}
#ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
static inline void set_idle_cores(int cpu, int val)
{
return select_idle_sibling_cstate_aware(p, prev, target);
}
-static inline unsigned long task_util(struct task_struct *p)
-{
-#ifdef CONFIG_SCHED_WALT
- if (!walt_disabled && sysctl_sched_use_walt_task_util) {
- return (p->ravg.demand / (walt_ravg_window >> SCHED_CAPACITY_SHIFT));
- }
-#endif
- return p->se.avg.util_avg;
-}
-
-/*
- * cpu_util_wake: Compute cpu utilization with any contributions from
- * the waking task p removed.
- */
-static int cpu_util_wake(int cpu, struct task_struct *p)
-{
- unsigned long util, capacity;
-
-#ifdef CONFIG_SCHED_WALT
- /*
- * WALT does not decay idle tasks in the same manner
- * as PELT, so it makes little sense to subtract task
- * utilization from cpu utilization. Instead just use
- * cpu_util for this case.
- */
- if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
- return cpu_util(cpu);
-#endif
- /* Task has no contribution or is new */
- if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
- return cpu_util(cpu);
-
- capacity = capacity_orig_of(cpu);
- util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
-
- return (util >= capacity) ? capacity : util;
-}
-
static inline int task_fits_capacity(struct task_struct *p, long capacity)
{
return capacity * 1024 > boosted_task_util(p) * capacity_margin;
static inline int find_best_target(struct task_struct *p, int *backup_cpu,
bool boosted, bool prefer_idle)
{
- unsigned long best_idle_min_cap_orig = ULONG_MAX;
unsigned long min_util = boosted_task_util(p);
unsigned long target_capacity = ULONG_MAX;
unsigned long min_wake_util = ULONG_MAX;
*backup_cpu = -1;
+ /*
+ * In most cases, target_capacity tracks the capacity_orig of the most
+ * energy efficient CPU candidate, and thus has to be minimised. For
+ * these cases target_capacity is already initialized to ULONG_MAX.
+ * However, for prefer_idle and boosted tasks we look for a high
+ * performance CPU, and thus target_capacity has to be maximised. In
+ * this case we initialise target_capacity to 0.
+ */
+ if (prefer_idle && boosted)
+ target_capacity = 0;
+
/* Find start CPU based on boost value */
cpu = start_cpu(boosted);
if (cpu < 0)
unsigned long capacity_curr = capacity_curr_of(i);
unsigned long capacity_orig = capacity_orig_of(i);
unsigned long wake_util, new_util;
+ long spare_cap;
+ int idle_idx = INT_MAX;
if (!cpu_online(i))
continue;
* accounting. However, the blocked utilization may be zero.
*/
wake_util = cpu_util_wake(i, p);
- new_util = wake_util + task_util(p);
+ new_util = wake_util + task_util_est(p);
/*
* Ensure minimum capacity to grant the required boost.
if (new_util > capacity_orig)
continue;
+ /*
+ * Pre-compute the maximum possible capacity we expect
+ * to have available on this CPU once the task is
+ * enqueued here.
+ */
+ spare_cap = capacity_orig - new_util;
+
+ if (idle_cpu(i))
+ idle_idx = idle_get_state_idx(cpu_rq(i));
+
/*
* Case A) Latency sensitive tasks
*
/*
* Case A.1: IDLE CPU
- * Return the first IDLE CPU we find.
+ * Return the best IDLE CPU we find:
+ * - for boosted tasks: the CPU with the highest
+ * performance (i.e. biggest capacity_orig)
+ * - for !boosted tasks: the most energy
+ * efficient CPU (i.e. smallest capacity_orig)
*/
if (idle_cpu(i)) {
- trace_sched_find_best_target(p,
- prefer_idle, min_util,
- cpu, best_idle_cpu,
- best_active_cpu, i);
-
- return i;
+ if (boosted &&
+ capacity_orig < target_capacity)
+ continue;
+ if (!boosted &&
+ capacity_orig > target_capacity)
+ continue;
+ if (capacity_orig == target_capacity &&
+ sysctl_sched_cstate_aware &&
+ best_idle_cstate <= idle_idx)
+ continue;
+
+ target_capacity = capacity_orig;
+ best_idle_cstate = idle_idx;
+ best_idle_cpu = i;
+ continue;
}
+ if (best_idle_cpu != -1)
+ continue;
/*
* Case A.2: Target ACTIVE CPU
* Favor CPUs with max spare capacity.
*/
- if ((capacity_curr > new_util) &&
- (capacity_orig - new_util > target_max_spare_cap)) {
- target_max_spare_cap = capacity_orig - new_util;
+ if (capacity_curr > new_util &&
+ spare_cap > target_max_spare_cap) {
+ target_max_spare_cap = spare_cap;
target_cpu = i;
continue;
}
(capacity_orig * SCHED_CAPACITY_SCALE))
continue;
+ /*
+ * Favor CPUs with smaller capacity for non latency
+ * sensitive tasks.
+ */
+ if (capacity_orig > target_capacity)
+ continue;
+
/*
* Case B) Non latency sensitive tasks on IDLE CPUs.
*
* consumptions without affecting performance.
*/
if (idle_cpu(i)) {
- int idle_idx = idle_get_state_idx(cpu_rq(i));
-
- /* Select idle CPU with lower cap_orig */
- if (capacity_orig > best_idle_min_cap_orig)
- continue;
-
/*
* Skip CPUs in deeper idle state, but only
* if they are also less energy efficient.
* IOW, prefer a deep IDLE LITTLE CPU vs a
* shallow idle big CPU.
*/
- if (sysctl_sched_cstate_aware &&
+ if (capacity_orig == target_capacity &&
+ sysctl_sched_cstate_aware &&
best_idle_cstate <= idle_idx)
continue;
- /* Keep track of best idle CPU */
- best_idle_min_cap_orig = capacity_orig;
+ target_capacity = capacity_orig;
best_idle_cstate = idle_idx;
best_idle_cpu = i;
continue;
* capacity.
*/
- /* Favor CPUs with smaller capacity */
- if (capacity_orig > target_capacity)
- continue;
-
/* Favor CPUs with maximum spare capacity */
- if ((capacity_orig - new_util) < target_max_spare_cap)
+ if (capacity_orig == target_capacity &&
+ spare_cap < target_max_spare_cap)
continue;
- target_max_spare_cap = capacity_orig - new_util;
+ target_max_spare_cap = spare_cap;
target_capacity = capacity_orig;
target_util = new_util;
target_cpu = i;
*
* - prefer_idle tasks:
*
- * a) IDLE CPU available, we return immediately
+ * a) IDLE CPU available: best_idle_cpu
* b) ACTIVE CPU where task fits and has the bigger maximum spare
* capacity (i.e. target_cpu)
* c) ACTIVE CPU with less contention due to other tasks
* a) ACTIVE CPU: target_cpu
* b) IDLE CPU: best_idle_cpu
*/
+
+ if (prefer_idle && (best_idle_cpu != -1)) {
+ trace_sched_find_best_target(p, prefer_idle, min_util, cpu,
+ best_idle_cpu, best_active_cpu,
+ best_idle_cpu);
+
+ return best_idle_cpu;
+ }
+
if (target_cpu == -1)
target_cpu = prefer_idle
? best_active_cpu
return 0;
min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
- max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+ max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
/* Minimum capacity is close to max, no need to abort wake_affine */
if (max_cap - min_cap < max_cap >> 3)
* during energy calculation, but unboosted task
* util for group utilization calculations
*/
- eenv->util_delta = task_util(p);
+ eenv->util_delta = task_util_est(p);
eenv->util_delta_boosted = boosted_task_util(p);
cpumask_and(&cpumask_possible_cpus, &p->cpus_allowed, cpu_online_mask);
{
int use_fbt = sched_feat(FIND_BEST_TARGET);
int cpu_iter, eas_cpu_idx = EAS_CPU_NXT;
- int energy_cpu = -1;
+ int target_cpu = -1;
struct energy_env *eenv;
if (sysctl_sched_sync_hint_enable && sync) {
/* prepopulate energy diff environment */
eenv = get_eenv(p, prev_cpu);
if (eenv->max_cpu_count < 2)
- return energy_cpu;
+ return -1;
if(!use_fbt) {
/*
if (cpu_iter == prev_cpu)
continue;
+ /*
+ * Consider only CPUs where the task is expected to
+ * fit without making the CPU overutilized.
+ */
spare = capacity_spare_wake(cpu_iter, p);
-
- if (spare * 1024 < capacity_margin * task_util(p))
+ if (spare * 1024 < capacity_margin * task_util_est(p))
continue;
/* Add CPU candidate */
eenv->max_cpu_count = EAS_CPU_BKP + 1;
/* Find a cpu with sufficient capacity */
- eenv->cpu[EAS_CPU_NXT].cpu_id = find_best_target(p,
- &eenv->cpu[EAS_CPU_BKP].cpu_id,
- boosted, prefer_idle);
+ target_cpu = find_best_target(p, &eenv->cpu[EAS_CPU_BKP].cpu_id,
+ boosted, prefer_idle);
+
+ /* Immediately return a found idle CPU for a prefer_idle task */
+ if (prefer_idle && target_cpu >= 0 && idle_cpu(target_cpu))
+ return target_cpu;
+
+ /* Place target into NEXT slot */
+ eenv->cpu[EAS_CPU_NXT].cpu_id = target_cpu;
/* take note if no backup was found */
if (eenv->cpu[EAS_CPU_BKP].cpu_id < 0)
* candidates beyond prev_cpu, so we will
* fall-back to the regular slow-path.
*/
- return energy_cpu;
+ return -1;
}
/* find most energy-efficient CPU */
- energy_cpu = select_energy_cpu_idx(eenv) < 0 ? -1 :
+ target_cpu = select_energy_cpu_idx(eenv) < 0 ? -1 :
eenv->cpu[eenv->next_idx].cpu_id;
- return energy_cpu;
+ return target_cpu;
}
static inline bool nohz_kick_needed(struct rq *rq, bool only_update);
* the heuristics we use there in selecting candidate
* CPUs.
*/
- if (unlikely(!sched_feat(FIND_BEST_TARGET) && !task_util(p)))
+ if (unlikely(!sched_feat(FIND_BEST_TARGET) && !task_util_est(p)))
return false;
if(!sched_feat(EAS_PREFER_IDLE)){
set_last_buddy(se);
}
-static inline void update_misfit_task(struct rq *rq, struct task_struct *p)
-{
-#ifdef CONFIG_SMP
- rq->misfit_task = !task_fits_capacity(p, capacity_of(rq->cpu));
-#endif
-}
-
-static inline void clear_rq_misfit(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- rq->misfit_task = 0;
-#endif
-}
-
-static inline unsigned int rq_has_misfit(struct rq *rq)
-{
-#ifdef CONFIG_SMP
- return rq->misfit_task;
-#else
- return 0;
-#endif
-}
-
static struct task_struct *
pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
- update_misfit_task(rq, p);
+ update_misfit_status(p, rq);
return p;
simple:
if (hrtick_enabled(rq))
hrtick_start_fair(rq, p);
- update_misfit_task(rq, p);
+ update_misfit_status(p, rq);
return p;
idle:
- clear_rq_misfit(rq);
+ update_misfit_status(NULL, rq);
new_tasks = idle_balance(rq, rf);
/*
#ifdef CONFIG_FAIR_GROUP_SCHED
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
- if (cfs_rq->load.weight)
- return false;
-
- if (cfs_rq->avg.load_sum)
- return false;
-
- if (cfs_rq->avg.util_sum)
- return false;
-
- if (cfs_rq->runnable_load_sum)
- return false;
-
- return true;
-}
-
static void update_blocked_averages(int cpu)
{
struct rq *rq = cpu_rq(cpu);
- struct cfs_rq *cfs_rq, *pos;
+ struct cfs_rq *cfs_rq;
struct rq_flags rf;
rq_lock_irqsave(rq, &rf);
* Iterates the task_group tree in a bottom up fashion, see
* list_add_leaf_cfs_rq() for details.
*/
- for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+ for_each_leaf_cfs_rq(rq, cfs_rq) {
struct sched_entity *se;
/* throttled entities do not contribute to load */
se = cfs_rq->tg->se[cpu];
if (se && !skip_blocked_update(se))
update_load_avg(se, 0);
-
- /*
- * There can be a lot of idle CPU cgroups. Don't let fully
- * decayed cfs_rqs linger on the list.
- */
- if (cfs_rq_is_decayed(cfs_rq))
- list_del_leaf_cfs_rq(cfs_rq);
}
update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0);
#ifdef CONFIG_NO_HZ_COMMON
unsigned int group_weight;
enum group_type group_type;
int group_no_capacity;
- int group_misfit_task; /* A cpu has a task too big for its capacity */
+ /* A cpu has a task too big for its capacity */
+ unsigned long group_misfit_task_load;
#ifdef CONFIG_NUMA_BALANCING
unsigned int nr_numa_running;
unsigned int nr_preferred_running;
return 1;
}
+void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
+{
+ raw_spin_lock_init(&mcc->lock);
+ mcc->val = 0;
+ mcc->cpu = -1;
+}
+
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = arch_scale_cpu_capacity(sd, cpu);
struct sched_group *sdg = sd->groups;
+ struct max_cpu_capacity *mcc;
+ unsigned long max_capacity;
+ int max_cap_cpu;
+ unsigned long flags;
cpu_rq(cpu)->cpu_capacity_orig = capacity;
+ capacity *= arch_scale_max_freq_capacity(sd, cpu);
+ capacity >>= SCHED_CAPACITY_SHIFT;
+
+ mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
+
+ raw_spin_lock_irqsave(&mcc->lock, flags);
+ max_capacity = mcc->val;
+ max_cap_cpu = mcc->cpu;
+
+ if ((max_capacity > capacity && max_cap_cpu == cpu) ||
+ max_capacity < capacity) {
+ mcc->val = capacity;
+ mcc->cpu = cpu;
+#ifdef CONFIG_SCHED_DEBUG
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+ pr_info("CPU%d: update max cpu_capacity %lu\n", cpu, capacity);
+ goto skip_unlock;
+#endif
+ }
+ raw_spin_unlock_irqrestore(&mcc->lock, flags);
+
+skip_unlock: __attribute__ ((unused));
capacity *= scale_rt_capacity(cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
cpu_rq(cpu)->cpu_capacity = capacity;
sdg->sgc->capacity = capacity;
sdg->sgc->min_capacity = capacity;
+ sdg->sgc->max_capacity = capacity;
}
void update_group_capacity(struct sched_domain *sd, int cpu)
}
/*
- * group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
+ * group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
* per-CPU capacity than sched_group ref.
*/
static inline bool
-group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
{
return sg->sgc->min_capacity * capacity_margin <
ref->sgc->min_capacity * 1024;
}
+/*
+ * group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
+ * per-CPU capacity_orig than sched_group ref.
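+ *
+ * For example, assuming the default capacity_margin of 1280 (~20%), a group
+ * of LITTLE CPUs with max_capacity 512 is reported as smaller than a group
+ * of big CPUs with max_capacity 1024, since 512 * 1280 < 1024 * 1024.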
+ */
+static inline bool
+group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
+{
+ return sg->sgc->max_capacity * capacity_margin <
+ ref->sgc->max_capacity * 1024;
+}
+
/*
* group_similar_cpu_capacity: Returns true if the minimum capacity of the
* compared groups differ by less than 12.5%.
if (sg_imbalanced(group))
return group_imbalanced;
- if (sgs->group_misfit_task)
+ if (sgs->group_misfit_task_load)
return group_misfit_task;
return group_other;
* @load_idx: Load index of sched_domain of this_cpu for load calc.
* @local_group: Does group contain this_cpu.
* @sgs: variable to hold the statistics for this group.
- * @overload: Indicate more than one runnable task for any CPU.
+ * @overload: Indicate pullable load (e.g. >1 runnable task).
* @overutilized: Indicate overutilization for any CPU.
*/
static inline void update_sg_lb_stats(struct lb_env *env,
sgs->idle_cpus++;
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
- !sgs->group_misfit_task && rq_has_misfit(rq))
- sgs->group_misfit_task = capacity_of(i);
+ sgs->group_misfit_task_load < rq->misfit_task_load) {
+ sgs->group_misfit_task_load = rq->misfit_task_load;
+ *overload = 1;
+ }
+
if (cpu_overutilized(i)) {
*overutilized = true;
- if (rq_has_misfit(rq))
+ if (rq->misfit_task_load)
*misfit_task = true;
}
}
/*
* Don't try to pull misfit tasks we can't help.
+ * We can use max_capacity here since a reduction in capacity on some
+ * cpus in the group should either be possible to resolve
+ * internally or be covered by avg_load imbalance (eventually).
*/
if (sgs->group_type == group_misfit_task &&
- (!group_smaller_cpu_capacity(sg, sds->local) ||
+ (!group_smaller_max_cpu_capacity(sg, sds->local) ||
!group_has_capacity(env, &sds->local_stat)))
return false;
* power/energy consequences are not considered.
*/
if (sgs->sum_nr_running <= sgs->group_weight &&
- group_smaller_cpu_capacity(sds->local, sg))
+ group_smaller_min_cpu_capacity(sds->local, sg))
return false;
/*
!group_similar_cpu_capacity(sds->local, sg))
return false;
+ /*
+ * If we have more than one misfit sg go with the biggest misfit.
+ */
+ if (sgs->group_type == group_misfit_task &&
+ sgs->group_misfit_task_load < busiest->group_misfit_task_load)
+ return false;
+
asym_packing:
/* This is the busiest node in its class. */
if (!(env->sd->flags & SD_ASYM_PACKING))
struct sched_group *sg = env->sd->groups;
struct sg_lb_stats *local = &sds->local_stat;
struct sg_lb_stats tmp_sgs;
- int load_idx, prefer_sibling = 0;
+ int load_idx;
bool overload = false, overutilized = false, misfit_task = false;
-
- if (child && child->flags & SD_PREFER_SIBLING)
- prefer_sibling = 1;
+ bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
#ifdef CONFIG_NO_HZ_COMMON
if (env->idle == CPU_NEWLY_IDLE) {
if (!lb_sd_parent(env->sd)) {
/* update overload indicator if we are at root domain */
- if (env->dst_rq->rd->overload != overload)
- env->dst_rq->rd->overload = overload;
+ if (READ_ONCE(env->dst_rq->rd->overload) != overload)
+ WRITE_ONCE(env->dst_rq->rd->overload, overload);
}
if (overutilized)
capa_move /= SCHED_CAPACITY_SCALE;
/* Move if we gain throughput */
- if (capa_move > capa_now)
+ if (capa_move > capa_now) {
+ env->imbalance = busiest->load_per_task;
+ return;
+ }
+
+ /* We can't see throughput improvement with the load-based
+ * method, but it is possible depending upon group size and
+ * capacity range that there might still be an underutilized
+ * cpu available in an asymmetric capacity system. Do one last
+ * check just in case.
+ */
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+ busiest->group_type == group_overloaded &&
+ busiest->sum_nr_running > busiest->group_weight &&
+ local->sum_nr_running < local->group_weight &&
+ local->group_capacity < busiest->group_capacity)
env->imbalance = busiest->load_per_task;
}
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
- /* Boost imbalance to allow misfit task to be balanced. */
- if (busiest->group_type == group_misfit_task) {
+ /* Boost imbalance to allow misfit task to be balanced.
+ * Always do this if we are doing a NEWLY_IDLE balance
+ * on the assumption that any tasks we have must not be
+ * long-running (and hence we cannot rely upon load).
+ * However if we are not idle, we should assume the tasks
+ * we have are longer running and not override load-based
+ * calculations above unless we are sure that the local
+ * group is underutilized.
+ */
+ if (busiest->group_type == group_misfit_task &&
+ (env->idle == CPU_NEWLY_IDLE ||
+ local->sum_nr_running < local->group_weight)) {
env->imbalance = max_t(long, env->imbalance,
- busiest->group_misfit_task);
+ busiest->group_misfit_task_load);
}
/*
busiest->group_no_capacity)
goto force_balance;
- /* Misfitting tasks should be dealt with regardless of the avg load */
+ /* Misfit tasks should be dealt with regardless of the avg load */
if (busiest->group_type == group_misfit_task)
goto force_balance;
continue;
/*
- * For ASYM_CPUCAPACITY domains with misfit tasks we ignore
- * load.
+ * For ASYM_CPUCAPACITY domains with misfit tasks we simply
+ * seek the "biggest" misfit task.
*/
- if (env->src_grp_type == group_misfit_task && rq_has_misfit(rq))
- return rq;
+ if (env->src_grp_type == group_misfit_task) {
+ if (rq->misfit_task_load > busiest_load) {
+ busiest_load = rq->misfit_task_load;
+ busiest = rq;
+ }
+ continue;
+ }
capacity = capacity_of(i);
+ /*
+ * For ASYM_CPUCAPACITY domains, don't pick a cpu that could
+ * eventually lead to active_balancing high->low capacity.
+ * Higher per-cpu capacity is considered better than balancing
+ * average load.
+ */
+ if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
+ capacity_of(env->dst_cpu) < capacity &&
+ rq->nr_running == 1)
+ continue;
+
wl = weighted_cpuload(rq);
/*
return 1;
}
+ if (env->src_grp_type == group_misfit_task)
+ return 1;
+
+ if (env->src_grp_type == group_overloaded &&
+ env->src_rq->misfit_task_load)
+ return 1;
+
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
if (energy_aware() && sd_overutilized(sd)) {
/* we know the root is overutilized, let's check for a misfit task */
for_each_cpu(cpu, sched_domain_span(sd)) {
- if (rq_has_misfit(cpu_rq(cpu)))
+ if (cpu_rq(cpu)->misfit_task_load)
return 1;
}
}
rq_unpin_lock(this_rq, rf);
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
- !this_rq->rd->overload) {
+ !READ_ONCE(this_rq->rd->overload)) {
rcu_read_lock();
sd = rcu_dereference_check_sched_domain(this_rq->sd);
if (sd)
int continue_balancing = 1;
u64 t0, domain_cost;
- if (!(sd->flags & SD_LOAD_BALANCE))
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
+ if (time_after_eq(jiffies,
+ sd->groups->sgc->next_update))
+ update_group_capacity(sd, this_cpu);
continue;
+ }
if (this_rq->avg_idle < curr_cost + sd->max_newidle_lb_cost) {
update_next_balance(sd, &next_balance);
if (energy_aware() && !sd_overutilized(sd))
continue;
- if (!(sd->flags & SD_LOAD_BALANCE))
+ if (!(sd->flags & SD_LOAD_BALANCE)) {
+ if (time_after_eq(jiffies,
+ sd->groups->sgc->next_update))
+ update_group_capacity(sd, cpu);
continue;
+ }
/*
* Stop the load balance at this level. There is another
/* Do idle load balance if there is a misfit task */
if (energy_aware())
- return rq_has_misfit(rq);
+ return rq->misfit_task_load;
rcu_read_lock();
sds = rcu_dereference(per_cpu(sd_llc_shared, cpu));
if (static_branch_unlikely(&sched_numa_balancing))
task_tick_numa(rq, curr);
- update_misfit_task(rq, curr);
+ update_misfit_status(curr, rq);
update_overutilized_status(rq);
}
* - A task which has been woken up by try_to_wake_up() and
* waiting for actually being woken up by sched_ttwu_pending().
*/
- if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+ if (!se->sum_exec_runtime ||
+ (p->state == TASK_WAKING && p->sched_remote_wakeup))
return true;
return false;
#ifdef CONFIG_SCHED_DEBUG
void print_cfs_stats(struct seq_file *m, int cpu)
{
- struct cfs_rq *cfs_rq, *pos;
+ struct cfs_rq *cfs_rq;
rcu_read_lock();
- for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
+ for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
print_cfs_rq(m, cpu, cfs_rq);
rcu_read_unlock();
}