[GitHub/mt8127/android_kernel_alcatel_ttab.git] / kernel / sched / fair.c
bf0f6f24
IM
1/*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
21805085
PZ
18 *
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
bf0f6f24
IM
21 */
22
9745512c 23#include <linux/latencytop.h>
1983a922 24#include <linux/sched.h>
3436ae12 25#include <linux/cpumask.h>
029632fb
PZ
26#include <linux/slab.h>
27#include <linux/profile.h>
28#include <linux/interrupt.h>
cbee9f88 29#include <linux/mempolicy.h>
e14808b4 30#include <linux/migrate.h>
cbee9f88 31#include <linux/task_work.h>
029632fb
PZ
32
33#include <trace/events/sched.h>
6fa3eb70
S
34#ifdef CONFIG_HMP_VARIABLE_SCALE
35#include <linux/sysfs.h>
36#include <linux/vmalloc.h>
37#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
38/* Include cpufreq header to add a notifier so that cpu frequency
39 * scaling can track the current CPU frequency
40 */
41#include <linux/cpufreq.h>
42#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
43#endif /* CONFIG_HMP_VARIABLE_SCALE */
029632fb
PZ
44
45#include "sched.h"
9745512c 46
6fa3eb70
S
47#include <mtlbprof/mtlbprof.h>
48
49
50#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
51#ifdef CONFIG_LOCAL_TIMERS
52unsigned long localtimer_get_counter(void);
53#endif
54#endif
55
56#ifdef CONFIG_HEVTASK_INTERFACE
57#include <linux/proc_fs.h>
58#include <linux/seq_file.h>
59#ifdef CONFIG_KGDB_KDB
60#include <linux/kdb.h>
61#endif
62#endif
63
bf0f6f24 64/*
21805085 65 * Targeted preemption latency for CPU-bound tasks:
864616ee 66 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
bf0f6f24 67 *
21805085 68 * NOTE: this latency value is not the same as the concept of
d274a4ce
IM
69 * 'timeslice length' - timeslices in CFS are of variable length
70 * and have no persistent notion like in traditional, time-slice
71 * based scheduling concepts.
bf0f6f24 72 *
d274a4ce
IM
73 * (to see the precise effective timeslice length of your workload,
74 * run vmstat and monitor the context-switches (cs) field)
bf0f6f24 75 */
21406928
MG
76unsigned int sysctl_sched_latency = 6000000ULL;
77unsigned int normalized_sysctl_sched_latency = 6000000ULL;
2bd8e6d4 78
1983a922
CE
79/*
80 * The initial- and re-scaling of tunables is configurable
81 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)))
82 *
83 * Options are:
84 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
85 * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *(1+ilog(ncpus))
86 * SCHED_TUNABLESCALING_LINEAR - scaled linearly, *ncpus
87 */
88enum sched_tunable_scaling sysctl_sched_tunable_scaling
89 = SCHED_TUNABLESCALING_LOG;
90
2bd8e6d4 91/*
b2be5e96 92 * Minimal preemption granularity for CPU-bound tasks:
864616ee 93 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
2bd8e6d4 94 */
0bf377bb
IM
95unsigned int sysctl_sched_min_granularity = 750000ULL;
96unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
21805085
PZ
97
98/*
b2be5e96
PZ
99 * This value is kept at sysctl_sched_latency / sysctl_sched_min_granularity
100 */
0bf377bb 101static unsigned int sched_nr_latency = 8;
b2be5e96
PZ
102
103/*
2bba22c5 104 * After fork, child runs first. If set to 0 (default) then
b2be5e96 105 * parent will (try to) run first.
21805085 106 */
2bba22c5 107unsigned int sysctl_sched_child_runs_first __read_mostly;
bf0f6f24 108
bf0f6f24
IM
109/*
110 * SCHED_OTHER wake-up granularity.
172e082a 111 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
bf0f6f24
IM
112 *
113 * This option delays the preemption effects of decoupled workloads
114 * and reduces their over-scheduling. Synchronous workloads will still
115 * have immediate wakeup/sleep latencies.
116 */
172e082a 117unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
0bcdcf28 118unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
bf0f6f24 119
6fa3eb70 120const_debug unsigned int sysctl_sched_migration_cost = 100000UL;
da84d961 121
a7a4f8a7
PT
122/*
123 * The exponential sliding window over which load is averaged for shares
124 * distribution.
125 * (default: 10msec)
126 */
127unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
128
ec12cb7f
PT
129#ifdef CONFIG_CFS_BANDWIDTH
130/*
131 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
132 * each time a cfs_rq requests quota.
133 *
134 * Note: in the case that the slice exceeds the runtime remaining (either due
135 * to consumption or the quota being specified to be smaller than the slice)
136 * we will always only issue the remaining available time.
137 *
138 * default: 5 msec, units: microseconds
139 */
140unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
141#endif
6fa3eb70
S
142#if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
143static int need_lazy_balance(int dst_cpu, int src_cpu, struct task_struct *p);
144#endif
ec12cb7f 145
029632fb
PZ
146/*
147 * Increase the granularity value when there are more CPUs,
148 * because with more CPUs the 'effective latency' as visible
149 * to users decreases. But the relationship is not linear,
150 * so pick a second-best guess by going with the log2 of the
151 * number of CPUs.
152 *
153 * This idea comes from the SD scheduler of Con Kolivas:
154 */
155static int get_update_sysctl_factor(void)
156{
157 unsigned int cpus = min_t(int, num_online_cpus(), 8);
158 unsigned int factor;
159
160 switch (sysctl_sched_tunable_scaling) {
161 case SCHED_TUNABLESCALING_NONE:
162 factor = 1;
163 break;
164 case SCHED_TUNABLESCALING_LINEAR:
165 factor = cpus;
166 break;
167 case SCHED_TUNABLESCALING_LOG:
168 default:
169 factor = 1 + ilog2(cpus);
170 break;
171 }
172
173 return factor;
174}
175
176static void update_sysctl(void)
177{
178 unsigned int factor = get_update_sysctl_factor();
179
180#define SET_SYSCTL(name) \
181 (sysctl_##name = (factor) * normalized_sysctl_##name)
182 SET_SYSCTL(sched_min_granularity);
183 SET_SYSCTL(sched_latency);
184 SET_SYSCTL(sched_wakeup_granularity);
185#undef SET_SYSCTL
186}
187
188void sched_init_granularity(void)
189{
190 update_sysctl();
191}
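/*
 * Illustrative note (added for clarity, not part of the original file):
 * with the default SCHED_TUNABLESCALING_LOG policy and 4 online CPUs,
 * get_update_sysctl_factor() returns 1 + ilog2(4) = 3, so update_sysctl()
 * produces:
 *
 *   sysctl_sched_latency            = 3 * 6ms    = 18ms
 *   sysctl_sched_min_granularity    = 3 * 0.75ms = 2.25ms
 *   sysctl_sched_wakeup_granularity = 3 * 1ms    = 3ms
 *
 * With SCHED_TUNABLESCALING_LINEAR the factor would be 4, and with
 * SCHED_TUNABLESCALING_NONE it stays 1.  Since the CPU count is clamped to
 * 8, the LOG factor never exceeds 1 + ilog2(8) = 4.
 */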
6fa3eb70
S
192#if defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK) || defined (CONFIG_HMP_PACK_SMALL_TASK)
193/*
194 * Save the id of the optimal CPU that should be used to pack small tasks
195 * The value -1 is used when no buddy has been found
196 */
197DEFINE_PER_CPU(int, sd_pack_buddy) = {-1};
198
199#ifdef CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK
200struct cpumask buddy_cpu_map = {{0}};
201#endif
202
203/* Look for the best buddy CPU that can be used to pack small tasks.
204 * We make the assumption that it isn't worth packing on CPUs that share the
205 * same powerline. We look for the 1st sched_domain without the
206 * SD_SHARE_POWERLINE flag. Then we look for the sched_group with the lowest
207 * power per core, based on the assumption that its power efficiency is
208 * better */
209void update_packing_domain(int cpu)
210{
211 struct sched_domain *sd;
212 int id = -1;
213
214#ifdef CONFIG_HMP_PACK_BUDDY_INFO
215 pr_info("[PACK] update_packing_domain() CPU%d\n", cpu);
216#endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */
217 mt_sched_printf("[PACK] update_packing_domain() CPU%d", cpu);
218
219 sd = highest_flag_domain(cpu, SD_SHARE_POWERLINE);
220 if (!sd)
221 {
222 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
223 }
224 else
225 if (cpumask_first(sched_domain_span(sd)) == cpu || !sd->parent)
226 sd = sd->parent;
227
228 while (sd) {
229 struct sched_group *sg = sd->groups;
230 struct sched_group *pack = sg;
231 struct sched_group *tmp = sg->next;
232
233#ifdef CONFIG_HMP_PACK_BUDDY_INFO
234 pr_info("[PACK] sd = 0x%08x, flags = %d\n", (unsigned int)sd, sd->flags);
235#endif /* CONFIG_HMP_PACK_BUDDY_INFO */
236
237#ifdef CONFIG_HMP_PACK_BUDDY_INFO
238 pr_info("[PACK] sg = 0x%08x\n", (unsigned int)sg);
239#endif /* CONFIG_HMP_PACK_BUDDY_INFO */
240
241 /* 1st CPU of the sched domain is a good candidate */
242 if (id == -1)
243 id = cpumask_first(sched_domain_span(sd));
244
245#ifdef CONFIG_HMP_PACK_BUDDY_INFO
246 pr_info("[PACK] First cpu in this sd id = %d\n", id);
247#endif /* CONFIG_HMP_PACK_BUDDY_INFO */
248
249 /* Find sched group of candidate */
250 tmp = sd->groups;
251 do {
252 if (cpumask_test_cpu(id, sched_group_cpus(tmp))) {
253 sg = tmp;
254 break;
255 }
256 } while (tmp = tmp->next, tmp != sd->groups);
257
258#ifdef CONFIG_HMP_PACK_BUDDY_INFO
259 pr_info("[PACK] pack = 0x%08x\n", (unsigned int)sg);
260#endif /* CONFIG_HMP_PACK_BUDDY_INFO */
261
262 pack = sg;
263 tmp = sg->next;
264
265 /* loop the sched groups to find the best one */
266 //Stop searching for the best one within the same load-balance domain
267 //while (tmp != sg) {
268 while (tmp != sg && !(sd->flags & SD_LOAD_BALANCE)) {
269 if (tmp->sgp->power * sg->group_weight <
270 sg->sgp->power * tmp->group_weight) {
271
272#ifdef CONFIG_HMP_PACK_BUDDY_INFO
273 pr_info("[PACK] Now sg power = %u, weight = %u, mask = %lu\n", sg->sgp->power, sg->group_weight, sg->cpumask[0]);
274 pr_info("[PACK] Better sg power = %u, weight = %u, mask = %lu\n", tmp->sgp->power, tmp->group_weight, tmp->cpumask[0]);
275#endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */
276
277 pack = tmp;
278 }
279 tmp = tmp->next;
280 }
281
282 /* we have found a better group */
283 if (pack != sg) {
284 id = cpumask_first(sched_group_cpus(pack));
285
286#ifdef CONFIG_HMP_PACK_BUDDY_INFO
287 pr_info("[PACK] Better sg, first cpu id = %d\n", id);
288#endif /* CONFIG_HMP_PACK_BUDDY_INFO */
289
290 }
291
292#ifdef CONFIG_HMP_PACK_BUDDY_INFO
293 if(sd->parent) {
294 pr_info("[PACK] cpu = %d, id = %d, sd->parent = 0x%08x, flags = %d, SD_LOAD_BALANCE = %d\n", cpu, id, (unsigned int)sd->parent, sd->parent->flags, SD_LOAD_BALANCE);
295 pr_info("[PACK] %d\n", (id != cpu));
296 pr_info("[PACK] 0x%08x\n", (unsigned int)(sd->parent));
297 pr_info("[PACK] %d\n", (sd->parent->flags & SD_LOAD_BALANCE));
298 }
299 else {
300 pr_info("[PACK] cpu = %d, id = %d, sd->parent = 0x%08x\n", cpu, id, (unsigned int)sd->parent);
301 }
302#endif /* CONFIG_HMP_PACK_BUDDY_INFO */
303
304
305 /* Look for another CPU than itself */
306 if ((id != cpu) ||
307 ((sd->parent) && (sd->parent->flags & SD_LOAD_BALANCE))) {
308
309#ifdef CONFIG_HMP_PACK_BUDDY_INFO
310 pr_info("[PACK] Break\n");
311#endif /*CONFIG_HMP_PACK_BUDDY_INFO */
312
313 break;
314 }
315 sd = sd->parent;
316 }
317
318#ifdef CONFIG_HMP_PACK_BUDDY_INFO
319 pr_info("[PACK] CPU%d packing on CPU%d\n", cpu, id);
320#endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */
321 mt_sched_printf("[PACK] CPU%d packing on CPU%d", cpu, id);
322
323#ifdef CONFIG_HMP_PACK_SMALL_TASK
324 per_cpu(sd_pack_buddy, cpu) = id;
325#else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */
326 if(per_cpu(sd_pack_buddy, cpu) != -1)
327 cpu_clear(per_cpu(sd_pack_buddy, cpu), buddy_cpu_map);
328 per_cpu(sd_pack_buddy, cpu) = id;
329 if(id != -1)
330 cpumask_set_cpu(id, &buddy_cpu_map);
331#endif
332}
333
334#ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
335DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_USAGE);
336DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_PERIOD);
337DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_NR);
338DEFINE_PER_CPU(u32, TASK_USGAE);
339DEFINE_PER_CPU(u32, TASK_PERIOD);
340u32 PACK_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
341u32 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
342u32 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
343u32 TASK_PACK_CPU_COUNT[4][NR_CPUS] = {{0}};
344u32 PA_ENABLE = 1;
345u32 PA_MON_ENABLE = 0;
346char PA_MON[4][TASK_COMM_LEN]={{0}};
347#endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER */
348
349#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
350DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_USAGE);
351DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_PERIOD);
352DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_NR);
353DEFINE_PER_CPU(u32, TASK_USGAE);
354DEFINE_PER_CPU(u32, TASK_PERIOD);
355u32 PACK_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
356u32 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
357u32 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
358u32 HMP_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
359u32 PA_ENABLE = 1;
360u32 LB_ENABLE = 1;
361u32 PA_MON_ENABLE = 0;
362char PA_MON[TASK_COMM_LEN];
363
364#ifdef CONFIG_HMP_TRACER
365#define POWER_AWARE_ACTIVE_MODULE_PACK_FORM_CPUX_TO_CPUY (0)
366#define POWER_AWARE_ACTIVE_MODULE_AVOID_WAKE_UP_FORM_CPUX_TO_CPUY (1)
367#define POWER_AWARE_ACTIVE_MODULE_AVOID_BALANCE_FORM_CPUX_TO_CPUY (2)
368#define POWER_AWARE_ACTIVE_MODULE_AVOID_FORCE_UP_FORM_CPUX_TO_CPUY (3)
369#endif /* CONFIG_HMP_TRACER */
370
371#endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER */
372
373
374static inline bool is_buddy_busy(int cpu)
375{
376#ifdef CONFIG_HMP_PACK_SMALL_TASK
377 struct rq *rq;
378
379 if (cpu < 0)
380 return 0;
381
382 rq = cpu_rq(cpu);
383#else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */
384 struct rq *rq = cpu_rq(cpu);
385#endif
386 /*
387 * A busy buddy is a CPU with a high load or a small load with a lot of
388 * running tasks.
389 */
390
391#if defined (CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER)
392 per_cpu(BUDDY_CPU_RQ_USAGE, cpu) = rq->avg.usage_avg_sum;
393 per_cpu(BUDDY_CPU_RQ_PERIOD, cpu) = rq->avg.runnable_avg_period;
394 per_cpu(BUDDY_CPU_RQ_NR, cpu) = rq->nr_running;
395#endif /*(CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER) */
396
397 return ((rq->avg.usage_avg_sum << rq->nr_running) >
398 rq->avg.runnable_avg_period);
399
400}
401
402static inline bool is_light_task(struct task_struct *p)
403{
404#if defined (CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER)
405 per_cpu(TASK_USGAE, task_cpu(p)) = p->se.avg.usage_avg_sum;
406 per_cpu(TASK_PERIOD, task_cpu(p)) = p->se.avg.runnable_avg_period;
407#endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER || CONFIG_HMP_POWER_AWARE_CONTROLLER*/
408
409 /* A light task runs less than 25% of the time on average */
410 return ((p->se.avg.usage_avg_sum << 2) < p->se.avg.runnable_avg_period);
411}
412
413
414static int check_pack_buddy(int cpu, struct task_struct *p)
415{
416#ifdef CONFIG_HMP_PACK_SMALL_TASK
417 int buddy;
418
419 if(cpu >= NR_CPUS || cpu < 0)
420 return false;
421 buddy = per_cpu(sd_pack_buddy, cpu);
422#else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */
423 int buddy = cpu;
424#endif
425
426 /* No pack buddy for this CPU */
427 if (buddy == -1)
428 return false;
429
430 /*
431 * If a task is waiting to run on the CPU which is its own buddy,
432 * let the default behavior look for a better CPU if one is available.
433 * The threshold has been set to 37.5%
434 */
435#ifdef CONFIG_HMP_PACK_SMALL_TASK
436 if ((buddy == cpu)
437 && ((p->se.avg.usage_avg_sum << 3) < (p->se.avg.runnable_avg_sum * 5)))
438 return false;
439#endif
440
441 /* buddy is not an allowed CPU */
442 if (!cpumask_test_cpu(buddy, tsk_cpus_allowed(p)))
443 return false;
444
445 /*
446 * If the task is a small one and the buddy is not overloaded,
447 * we use buddy cpu
448 */
449 if (!is_light_task(p) || is_buddy_busy(buddy))
450 return false;
451
452 return true;
453}
454#endif /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK || CONFIG_HMP_PACK_SMALL_TASK*/
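/*
 * Illustrative note (added for clarity, not part of the original file):
 * check_pack_buddy() only packs a task onto its buddy CPU when the task is
 * "light" and the buddy is not "busy".  With the thresholds above:
 *
 *   is_light_task(): usage_avg_sum * 4 < runnable_avg_period,
 *                    i.e. the task ran less than ~25% of its tracked
 *                    period (e.g. usage 200 vs. period 1000 -> light).
 *
 *   is_buddy_busy(): usage_avg_sum << nr_running > runnable_avg_period,
 *                    e.g. a buddy with usage 300, period 1000 and 2 running
 *                    tasks gives 300 << 2 = 1200 > 1000 -> busy, so the
 *                    task is not packed there.
 */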
029632fb
PZ
455
456#if BITS_PER_LONG == 32
457# define WMULT_CONST (~0UL)
458#else
459# define WMULT_CONST (1UL << 32)
460#endif
461
462#define WMULT_SHIFT 32
463
464/*
465 * Shift right and round:
466 */
467#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
468
469/*
470 * delta *= weight / lw
471 */
472static unsigned long
473calc_delta_mine(unsigned long delta_exec, unsigned long weight,
474 struct load_weight *lw)
475{
476 u64 tmp;
477
478 /*
479 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
480 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
481 * 2^SCHED_LOAD_RESOLUTION.
482 */
483 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
484 tmp = (u64)delta_exec * scale_load_down(weight);
485 else
486 tmp = (u64)delta_exec;
487
488 if (!lw->inv_weight) {
489 unsigned long w = scale_load_down(lw->weight);
490
491 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
492 lw->inv_weight = 1;
493 else if (unlikely(!w))
494 lw->inv_weight = WMULT_CONST;
495 else
496 lw->inv_weight = WMULT_CONST / w;
497 }
498
499 /*
500 * Check whether we'd overflow the 64-bit multiplication:
501 */
502 if (unlikely(tmp > WMULT_CONST))
503 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
504 WMULT_SHIFT/2);
505 else
506 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
507
508 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
509}
510
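/*
 * Worked example (added for clarity, not part of the original file):
 * calc_delta_mine() computes delta_exec * weight / lw->weight using a
 * cached fixed-point inverse, inv_weight = 2^32 / lw->weight, so the
 * division becomes a multiply and a shift:
 *
 *   delta_exec = 1000000 (1ms), weight = 1024 (NICE_0_LOAD),
 *   lw->weight = 2048  ->  inv_weight = 2^32 / 2048 = 2097152
 *   result = (1000000 * 1024 * 2097152) >> 32 = 500000
 *
 * i.e. an entity holding half of the queue weight is credited half of the
 * wall-clock delta.
 */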
511
512const struct sched_class fair_sched_class;
a4c2f00f 513
bf0f6f24
IM
514/**************************************************************
515 * CFS operations on generic schedulable entities:
516 */
517
62160e3f 518#ifdef CONFIG_FAIR_GROUP_SCHED
bf0f6f24 519
62160e3f 520/* cpu runqueue to which this cfs_rq is attached */
bf0f6f24
IM
521static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
522{
62160e3f 523 return cfs_rq->rq;
bf0f6f24
IM
524}
525
62160e3f
IM
526/* An entity is a task if it doesn't "own" a runqueue */
527#define entity_is_task(se) (!se->my_q)
bf0f6f24 528
8f48894f
PZ
529static inline struct task_struct *task_of(struct sched_entity *se)
530{
531#ifdef CONFIG_SCHED_DEBUG
532 WARN_ON_ONCE(!entity_is_task(se));
533#endif
534 return container_of(se, struct task_struct, se);
535}
536
b758149c
PZ
537/* Walk up scheduling entities hierarchy */
538#define for_each_sched_entity(se) \
539 for (; se; se = se->parent)
540
541static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
542{
543 return p->se.cfs_rq;
544}
545
546/* runqueue on which this entity is (to be) queued */
547static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
548{
549 return se->cfs_rq;
550}
551
552/* runqueue "owned" by this group */
553static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
554{
555 return grp->my_q;
556}
557
aff3e498
PT
558static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
559 int force_update);
9ee474f5 560
3d4b47b4
PZ
561static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
562{
563 if (!cfs_rq->on_list) {
67e86250
PT
564 /*
565 * Ensure we either appear before our parent (if already
566 * enqueued) or force our parent to appear after us when it is
567 * enqueued. The fact that we always enqueue bottom-up
568 * reduces this to two cases.
569 */
570 if (cfs_rq->tg->parent &&
571 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
572 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
573 &rq_of(cfs_rq)->leaf_cfs_rq_list);
574 } else {
575 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
3d4b47b4 576 &rq_of(cfs_rq)->leaf_cfs_rq_list);
67e86250 577 }
3d4b47b4
PZ
578
579 cfs_rq->on_list = 1;
9ee474f5 580 /* We should have no load, but we need to update last_decay. */
aff3e498 581 update_cfs_rq_blocked_load(cfs_rq, 0);
3d4b47b4
PZ
582 }
583}
584
585static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
586{
587 if (cfs_rq->on_list) {
588 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
589 cfs_rq->on_list = 0;
590 }
591}
592
b758149c
PZ
593/* Iterate thr' all leaf cfs_rq's on a runqueue */
594#define for_each_leaf_cfs_rq(rq, cfs_rq) \
595 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
596
597/* Do the two (enqueued) entities belong to the same group ? */
598static inline int
599is_same_group(struct sched_entity *se, struct sched_entity *pse)
600{
6fa3eb70
S
601 if (se && pse)
602 {
603 if (se->cfs_rq == pse->cfs_rq)
604 return 1;
605 }
b758149c
PZ
606
607 return 0;
608}
609
610static inline struct sched_entity *parent_entity(struct sched_entity *se)
611{
612 return se->parent;
613}
614
464b7527
PZ
615/* return depth at which a sched entity is present in the hierarchy */
616static inline int depth_se(struct sched_entity *se)
617{
618 int depth = 0;
619
620 for_each_sched_entity(se)
621 depth++;
622
623 return depth;
624}
625
626static void
627find_matching_se(struct sched_entity **se, struct sched_entity **pse)
628{
629 int se_depth, pse_depth;
630
631 /*
632 * preemption test can be made between sibling entities who are in the
633 * same cfs_rq, i.e. who have a common parent. Walk up the hierarchy of
634 * both tasks until we find their ancestors who are siblings of a common
635 * parent.
636 */
637
638 /* First walk up until both entities are at same depth */
639 se_depth = depth_se(*se);
640 pse_depth = depth_se(*pse);
641
642 while (se_depth > pse_depth) {
643 se_depth--;
644 *se = parent_entity(*se);
645 }
646
647 while (pse_depth > se_depth) {
648 pse_depth--;
649 *pse = parent_entity(*pse);
650 }
651
652 while (!is_same_group(*se, *pse)) {
653 *se = parent_entity(*se);
654 *pse = parent_entity(*pse);
655 }
656}
657
8f48894f
PZ
658#else /* !CONFIG_FAIR_GROUP_SCHED */
659
660static inline struct task_struct *task_of(struct sched_entity *se)
661{
662 return container_of(se, struct task_struct, se);
663}
bf0f6f24 664
62160e3f
IM
665static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
666{
667 return container_of(cfs_rq, struct rq, cfs);
bf0f6f24
IM
668}
669
670#define entity_is_task(se) 1
671
b758149c
PZ
672#define for_each_sched_entity(se) \
673 for (; se; se = NULL)
bf0f6f24 674
b758149c 675static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
bf0f6f24 676{
b758149c 677 return &task_rq(p)->cfs;
bf0f6f24
IM
678}
679
b758149c
PZ
680static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
681{
682 struct task_struct *p = task_of(se);
683 struct rq *rq = task_rq(p);
684
685 return &rq->cfs;
686}
687
688/* runqueue "owned" by this group */
689static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
690{
691 return NULL;
692}
693
3d4b47b4
PZ
694static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
695{
696}
697
698static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
699{
700}
701
b758149c
PZ
702#define for_each_leaf_cfs_rq(rq, cfs_rq) \
703 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
704
705static inline int
706is_same_group(struct sched_entity *se, struct sched_entity *pse)
707{
708 return 1;
709}
710
711static inline struct sched_entity *parent_entity(struct sched_entity *se)
712{
713 return NULL;
714}
715
464b7527
PZ
716static inline void
717find_matching_se(struct sched_entity **se, struct sched_entity **pse)
718{
719}
720
b758149c
PZ
721#endif /* CONFIG_FAIR_GROUP_SCHED */
722
6c16a6dc
PZ
723static __always_inline
724void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
bf0f6f24
IM
725
726/**************************************************************
727 * Scheduling class tree data structure manipulation methods:
728 */
729
1bf08230 730static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
02e0431a 731{
1bf08230 732 s64 delta = (s64)(vruntime - max_vruntime);
368059a9 733 if (delta > 0)
1bf08230 734 max_vruntime = vruntime;
02e0431a 735
1bf08230 736 return max_vruntime;
02e0431a
PZ
737}
738
0702e3eb 739static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
b0ffd246
PZ
740{
741 s64 delta = (s64)(vruntime - min_vruntime);
742 if (delta < 0)
743 min_vruntime = vruntime;
744
745 return min_vruntime;
746}
747
54fdc581
FC
748static inline int entity_before(struct sched_entity *a,
749 struct sched_entity *b)
750{
751 return (s64)(a->vruntime - b->vruntime) < 0;
752}
753
1af5f730
PZ
754static void update_min_vruntime(struct cfs_rq *cfs_rq)
755{
756 u64 vruntime = cfs_rq->min_vruntime;
757
758 if (cfs_rq->curr)
759 vruntime = cfs_rq->curr->vruntime;
760
761 if (cfs_rq->rb_leftmost) {
762 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
763 struct sched_entity,
764 run_node);
765
e17036da 766 if (!cfs_rq->curr)
1af5f730
PZ
767 vruntime = se->vruntime;
768 else
769 vruntime = min_vruntime(vruntime, se->vruntime);
770 }
771
1bf08230 772 /* ensure we never gain time by being placed backwards. */
1af5f730 773 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
3fe1698b
PZ
774#ifndef CONFIG_64BIT
775 smp_wmb();
776 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
777#endif
1af5f730
PZ
778}
779
bf0f6f24
IM
780/*
781 * Enqueue an entity into the rb-tree:
782 */
0702e3eb 783static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24
IM
784{
785 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
786 struct rb_node *parent = NULL;
787 struct sched_entity *entry;
bf0f6f24
IM
788 int leftmost = 1;
789
790 /*
791 * Find the right place in the rbtree:
792 */
793 while (*link) {
794 parent = *link;
795 entry = rb_entry(parent, struct sched_entity, run_node);
796 /*
797 * We dont care about collisions. Nodes with
798 * the same key stay together.
799 */
2bd2d6f2 800 if (entity_before(se, entry)) {
bf0f6f24
IM
801 link = &parent->rb_left;
802 } else {
803 link = &parent->rb_right;
804 leftmost = 0;
805 }
806 }
807
808 /*
809 * Maintain a cache of leftmost tree entries (it is frequently
810 * used):
811 */
1af5f730 812 if (leftmost)
57cb499d 813 cfs_rq->rb_leftmost = &se->run_node;
bf0f6f24
IM
814
815 rb_link_node(&se->run_node, parent, link);
816 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
bf0f6f24
IM
817}
818
0702e3eb 819static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 820{
3fe69747
PZ
821 if (cfs_rq->rb_leftmost == &se->run_node) {
822 struct rb_node *next_node;
3fe69747
PZ
823
824 next_node = rb_next(&se->run_node);
825 cfs_rq->rb_leftmost = next_node;
3fe69747 826 }
e9acbff6 827
bf0f6f24 828 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
bf0f6f24
IM
829}
830
029632fb 831struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
bf0f6f24 832{
f4b6755f
PZ
833 struct rb_node *left = cfs_rq->rb_leftmost;
834
835 if (!left)
836 return NULL;
837
838 return rb_entry(left, struct sched_entity, run_node);
bf0f6f24
IM
839}
840
ac53db59
RR
841static struct sched_entity *__pick_next_entity(struct sched_entity *se)
842{
843 struct rb_node *next = rb_next(&se->run_node);
844
845 if (!next)
846 return NULL;
847
848 return rb_entry(next, struct sched_entity, run_node);
849}
850
851#ifdef CONFIG_SCHED_DEBUG
029632fb 852struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
aeb73b04 853{
7eee3e67 854 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
aeb73b04 855
70eee74b
BS
856 if (!last)
857 return NULL;
7eee3e67
IM
858
859 return rb_entry(last, struct sched_entity, run_node);
aeb73b04
PZ
860}
861
bf0f6f24
IM
862/**************************************************************
863 * Scheduling class statistics methods:
864 */
865
acb4a848 866int sched_proc_update_handler(struct ctl_table *table, int write,
8d65af78 867 void __user *buffer, size_t *lenp,
b2be5e96
PZ
868 loff_t *ppos)
869{
8d65af78 870 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
acb4a848 871 int factor = get_update_sysctl_factor();
b2be5e96
PZ
872
873 if (ret || !write)
874 return ret;
875
876 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
877 sysctl_sched_min_granularity);
878
acb4a848
CE
879#define WRT_SYSCTL(name) \
880 (normalized_sysctl_##name = sysctl_##name / (factor))
881 WRT_SYSCTL(sched_min_granularity);
882 WRT_SYSCTL(sched_latency);
883 WRT_SYSCTL(sched_wakeup_granularity);
acb4a848
CE
884#undef WRT_SYSCTL
885
b2be5e96
PZ
886 return 0;
887}
888#endif
647e7cac 889
a7be37ac 890/*
f9c0b095 891 * delta /= w
a7be37ac
PZ
892 */
893static inline unsigned long
894calc_delta_fair(unsigned long delta, struct sched_entity *se)
895{
f9c0b095
PZ
896 if (unlikely(se->load.weight != NICE_0_LOAD))
897 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
a7be37ac
PZ
898
899 return delta;
900}
901
647e7cac
IM
902/*
903 * The idea is to set a period in which each task runs once.
904 *
532b1858 905 * When there are too many tasks (sched_nr_latency) we have to stretch
647e7cac
IM
906 * this period because otherwise the slices get too small.
907 *
908 * p = (nr <= nl) ? l : l*nr/nl
909 */
4d78e7b6
PZ
910static u64 __sched_period(unsigned long nr_running)
911{
912 u64 period = sysctl_sched_latency;
b2be5e96 913 unsigned long nr_latency = sched_nr_latency;
4d78e7b6
PZ
914
915 if (unlikely(nr_running > nr_latency)) {
4bf0b771 916 period = sysctl_sched_min_granularity;
4d78e7b6 917 period *= nr_running;
4d78e7b6
PZ
918 }
919
920 return period;
921}
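/*
 * Worked example (added for clarity, not part of the original file):
 * with the defaults above (sched_latency = 6ms, min_granularity = 0.75ms,
 * sched_nr_latency = 8):
 *
 *   5 runnable tasks  -> period = 6ms               (nr <= nr_latency)
 *   16 runnable tasks -> period = 0.75ms * 16 = 12ms
 *
 * so the period stretches rather than squeezing slices below the minimum
 * granularity.
 */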
922
647e7cac
IM
923/*
924 * We calculate the wall-time slice from the period by taking a part
925 * proportional to the weight.
926 *
f9c0b095 927 * s = p*P[w/rw]
647e7cac 928 */
6d0f0ebd 929static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
21805085 930{
0a582440 931 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
f9c0b095 932
0a582440 933 for_each_sched_entity(se) {
6272d68c 934 struct load_weight *load;
3104bf03 935 struct load_weight lw;
6272d68c
LM
936
937 cfs_rq = cfs_rq_of(se);
938 load = &cfs_rq->load;
f9c0b095 939
0a582440 940 if (unlikely(!se->on_rq)) {
3104bf03 941 lw = cfs_rq->load;
0a582440
MG
942
943 update_load_add(&lw, se->load.weight);
944 load = &lw;
945 }
946 slice = calc_delta_mine(slice, se->load.weight, load);
947 }
948 return slice;
bf0f6f24
IM
949}
950
647e7cac 951/*
660cc00f 952 * We calculate the vruntime slice of a to-be-inserted task.
647e7cac 953 *
f9c0b095 954 * vs = s/w
647e7cac 955 */
f9c0b095 956static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
67e9fb2a 957{
f9c0b095 958 return calc_delta_fair(sched_slice(cfs_rq, se), se);
a7be37ac
PZ
959}
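/*
 * Worked example (added for clarity, not part of the original file):
 * sched_slice() hands each entity a share of __sched_period() proportional
 * to its weight.  With two runnable nice-0 tasks (weight 1024 each) and a
 * 6ms period:
 *
 *   slice = 6ms * 1024 / (1024 + 1024) = 3ms
 *
 * sched_vslice() then converts that wall-clock slice to vruntime (s/w),
 * which for a nice-0 task is unchanged, i.e. 3ms.
 */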
960
6fa3eb70
S
961
962#ifdef CONFIG_SMP
963static inline void __update_task_entity_contrib(struct sched_entity *se);
964
965static long __update_task_entity_ratio(struct sched_entity *se);
966
967#define LOAD_AVG_PERIOD 32
968#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
969#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
970#define LOAD_AVG_VARIABLE_PERIOD 512
971static unsigned int init_task_load_period = 4000;
972
973/* Give a new task initial runnable values so its load is weighted heavily during its infancy */
974void init_task_runnable_average(struct task_struct *p)
975{
976 u32 slice;
977
978 p->se.avg.decay_count = 0;
979 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
980 p->se.avg.runnable_avg_sum = (init_task_load_period) ? 0 : slice;
981 p->se.avg.runnable_avg_period = (init_task_load_period)?(init_task_load_period):slice;
982 __update_task_entity_contrib(&p->se);
983
984#ifdef CONFIG_MTK_SCHED_CMP
985 /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */
986 p->se.avg.usage_avg_sum = (init_task_load_period) ? 0 : slice;
987#endif
988 __update_task_entity_ratio(&p->se);
989 trace_sched_task_entity_avg(0, p, &p->se.avg);
990}
991#else
992void init_task_runnable_average(struct task_struct *p)
993{
994}
995#endif
996
bf0f6f24
IM
997/*
998 * Update the current task's runtime statistics. Skip current tasks that
999 * are not in our scheduling class.
1000 */
1001static inline void
8ebc91d9
IM
1002__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
1003 unsigned long delta_exec)
bf0f6f24 1004{
bbdba7c0 1005 unsigned long delta_exec_weighted;
bf0f6f24 1006
41acab88
LDM
1007 schedstat_set(curr->statistics.exec_max,
1008 max((u64)delta_exec, curr->statistics.exec_max));
bf0f6f24
IM
1009
1010 curr->sum_exec_runtime += delta_exec;
7a62eabc 1011 schedstat_add(cfs_rq, exec_clock, delta_exec);
a7be37ac 1012 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
88ec22d3 1013
e9acbff6 1014 curr->vruntime += delta_exec_weighted;
1af5f730 1015 update_min_vruntime(cfs_rq);
bf0f6f24
IM
1016}
1017
b7cc0896 1018static void update_curr(struct cfs_rq *cfs_rq)
bf0f6f24 1019{
429d43bc 1020 struct sched_entity *curr = cfs_rq->curr;
305e6835 1021 u64 now = rq_of(cfs_rq)->clock_task;
bf0f6f24
IM
1022 unsigned long delta_exec;
1023
1024 if (unlikely(!curr))
1025 return;
1026
1027 /*
1028 * Get the amount of time the current task was running
1029 * since the last time we changed load (this cannot
1030 * overflow on 32 bits):
1031 */
8ebc91d9 1032 delta_exec = (unsigned long)(now - curr->exec_start);
34f28ecd
PZ
1033 if (!delta_exec)
1034 return;
bf0f6f24 1035
8ebc91d9
IM
1036 __update_curr(cfs_rq, curr, delta_exec);
1037 curr->exec_start = now;
d842de87
SV
1038
1039 if (entity_is_task(curr)) {
1040 struct task_struct *curtask = task_of(curr);
1041
f977bb49 1042 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
d842de87 1043 cpuacct_charge(curtask, delta_exec);
f06febc9 1044 account_group_exec_runtime(curtask, delta_exec);
d842de87 1045 }
ec12cb7f
PT
1046
1047 account_cfs_rq_runtime(cfs_rq, delta_exec);
bf0f6f24
IM
1048}
1049
1050static inline void
5870db5b 1051update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 1052{
41acab88 1053 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
bf0f6f24
IM
1054}
1055
bf0f6f24
IM
1056/*
1057 * Task is being enqueued - update stats:
1058 */
d2417e5a 1059static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 1060{
bf0f6f24
IM
1061 /*
1062 * Are we enqueueing a waiting task? (for current tasks
1063 * a dequeue/enqueue event is a NOP)
1064 */
429d43bc 1065 if (se != cfs_rq->curr)
5870db5b 1066 update_stats_wait_start(cfs_rq, se);
bf0f6f24
IM
1067}
1068
bf0f6f24 1069static void
9ef0a961 1070update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 1071{
41acab88
LDM
1072 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
1073 rq_of(cfs_rq)->clock - se->statistics.wait_start));
1074 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
1075 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
1076 rq_of(cfs_rq)->clock - se->statistics.wait_start);
768d0c27
PZ
1077#ifdef CONFIG_SCHEDSTATS
1078 if (entity_is_task(se)) {
1079 trace_sched_stat_wait(task_of(se),
41acab88 1080 rq_of(cfs_rq)->clock - se->statistics.wait_start);
768d0c27
PZ
1081 }
1082#endif
41acab88 1083 schedstat_set(se->statistics.wait_start, 0);
bf0f6f24
IM
1084}
1085
1086static inline void
19b6a2e3 1087update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 1088{
bf0f6f24
IM
1089 /*
1090 * Mark the end of the wait period if dequeueing a
1091 * waiting task:
1092 */
429d43bc 1093 if (se != cfs_rq->curr)
9ef0a961 1094 update_stats_wait_end(cfs_rq, se);
bf0f6f24
IM
1095}
1096
1097/*
1098 * We are picking a new current task - update its stats:
1099 */
1100static inline void
79303e9e 1101update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24
IM
1102{
1103 /*
1104 * We are starting a new run period:
1105 */
305e6835 1106 se->exec_start = rq_of(cfs_rq)->clock_task;
bf0f6f24
IM
1107}
1108
bf0f6f24
IM
1109/**************************************************
1110 * Scheduling class queueing methods:
1111 */
1112
cbee9f88
PZ
1113#ifdef CONFIG_NUMA_BALANCING
1114/*
6e5fb223 1115 * numa task sample period in ms
cbee9f88 1116 */
6e5fb223 1117unsigned int sysctl_numa_balancing_scan_period_min = 100;
b8593bfd
MG
1118unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
1119unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
6e5fb223
PZ
1120
1121/* Portion of address space to scan in MB */
1122unsigned int sysctl_numa_balancing_scan_size = 256;
cbee9f88 1123
4b96a29b
PZ
1124/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1125unsigned int sysctl_numa_balancing_scan_delay = 1000;
1126
cbee9f88
PZ
1127static void task_numa_placement(struct task_struct *p)
1128{
2832bc19 1129 int seq;
cbee9f88 1130
2832bc19
HD
1131 if (!p->mm) /* for example, ksmd faulting in a user's mm */
1132 return;
1133 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
cbee9f88
PZ
1134 if (p->numa_scan_seq == seq)
1135 return;
1136 p->numa_scan_seq = seq;
1137
1138 /* FIXME: Scheduling placement policy hints go here */
1139}
1140
1141/*
1142 * Got a PROT_NONE fault for a page on @node.
1143 */
b8593bfd 1144void task_numa_fault(int node, int pages, bool migrated)
cbee9f88
PZ
1145{
1146 struct task_struct *p = current;
1147
1a687c2e
MG
1148 if (!sched_feat_numa(NUMA))
1149 return;
1150
cbee9f88
PZ
1151 /* FIXME: Allocate task-specific structure for placement policy here */
1152
fb003b80 1153 /*
b8593bfd
MG
1154 * If pages are properly placed (did not migrate) then scan slower.
1155 * This is reset periodically in case of phase changes
fb003b80 1156 */
b8593bfd
MG
1157 if (!migrated)
1158 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
1159 p->numa_scan_period + jiffies_to_msecs(10));
fb003b80 1160
cbee9f88
PZ
1161 task_numa_placement(p);
1162}
1163
6e5fb223
PZ
1164static void reset_ptenuma_scan(struct task_struct *p)
1165{
1166 ACCESS_ONCE(p->mm->numa_scan_seq)++;
1167 p->mm->numa_scan_offset = 0;
1168}
1169
cbee9f88
PZ
1170/*
1171 * The expensive part of numa migration is done from task_work context.
1172 * Triggered from task_tick_numa().
1173 */
1174void task_numa_work(struct callback_head *work)
1175{
1176 unsigned long migrate, next_scan, now = jiffies;
1177 struct task_struct *p = current;
1178 struct mm_struct *mm = p->mm;
6e5fb223 1179 struct vm_area_struct *vma;
9f40604c
MG
1180 unsigned long start, end;
1181 long pages;
cbee9f88
PZ
1182
1183 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
1184
1185 work->next = work; /* protect against double add */
1186 /*
1187 * Who cares about NUMA placement when they're dying.
1188 *
1189 * NOTE: make sure not to dereference p->mm before this check,
1190 * exit_task_work() happens _after_ exit_mm() so we could be called
1191 * without p->mm even though we still had it when we enqueued this
1192 * work.
1193 */
1194 if (p->flags & PF_EXITING)
1195 return;
1196
5bca2303
MG
1197 /*
1198 * We do not care about task placement until a task runs on a node
1199 * other than the first one used by the address space. This is
1200 * largely because migrations are driven by what CPU the task
1201 * is running on. If it's never scheduled on another node, it'll
1202 * not migrate so why bother trapping the fault.
1203 */
1204 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
1205 mm->first_nid = numa_node_id();
1206 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
1207 /* Are we running on a new node yet? */
1208 if (numa_node_id() == mm->first_nid &&
1209 !sched_feat_numa(NUMA_FORCE))
1210 return;
1211
1212 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
1213 }
1214
b8593bfd
MG
1215 /*
1216 * Reset the scan period if enough time has gone by. Objective is that
1217 * scanning will be reduced if pages are properly placed. As tasks
1218 * can enter different phases this needs to be re-examined. Lacking
1219 * proper tracking of reference behaviour, this blunt hammer is used.
1220 */
1221 migrate = mm->numa_next_reset;
1222 if (time_after(now, migrate)) {
1223 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
1224 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
1225 xchg(&mm->numa_next_reset, next_scan);
1226 }
1227
cbee9f88
PZ
1228 /*
1229 * Enforce maximal scan/migration frequency..
1230 */
1231 migrate = mm->numa_next_scan;
1232 if (time_before(now, migrate))
1233 return;
1234
1235 if (p->numa_scan_period == 0)
1236 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
1237
fb003b80 1238 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
cbee9f88
PZ
1239 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
1240 return;
1241
e14808b4
MG
1242 /*
1243 * Do not set pte_numa if the current running node is rate-limited.
1244 * This loses statistics on the fault but if we are unwilling to
1245 * migrate to this node, it is less likely we can do useful work
1246 */
1247 if (migrate_ratelimited(numa_node_id()))
1248 return;
1249
9f40604c
MG
1250 start = mm->numa_scan_offset;
1251 pages = sysctl_numa_balancing_scan_size;
1252 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
1253 if (!pages)
1254 return;
cbee9f88 1255
6e5fb223 1256 down_read(&mm->mmap_sem);
9f40604c 1257 vma = find_vma(mm, start);
6e5fb223
PZ
1258 if (!vma) {
1259 reset_ptenuma_scan(p);
9f40604c 1260 start = 0;
6e5fb223
PZ
1261 vma = mm->mmap;
1262 }
9f40604c 1263 for (; vma; vma = vma->vm_next) {
6e5fb223
PZ
1264 if (!vma_migratable(vma))
1265 continue;
1266
1267 /* Skip small VMAs. They are not likely to be of relevance */
221392c3 1268 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
6e5fb223
PZ
1269 continue;
1270
13ea5487
MG
1271 /*
1272 * Skip inaccessible VMAs to avoid any confusion between
1273 * PROT_NONE and NUMA hinting ptes
1274 */
1275 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
1276 continue;
1277
9f40604c
MG
1278 do {
1279 start = max(start, vma->vm_start);
1280 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
1281 end = min(end, vma->vm_end);
1282 pages -= change_prot_numa(vma, start, end);
6e5fb223 1283
9f40604c
MG
1284 start = end;
1285 if (pages <= 0)
1286 goto out;
1287 } while (end != vma->vm_end);
cbee9f88 1288 }
6e5fb223 1289
9f40604c 1290out:
6e5fb223
PZ
1291 /*
1292 * It is possible to reach the end of the VMA list but the last few VMAs are
1293 * not guaranteed to be vma_migratable. If they are not, we would find the
1294 * !migratable VMA on the next scan but not reset the scanner to the start
1295 * so check it now.
1296 */
1297 if (vma)
9f40604c 1298 mm->numa_scan_offset = start;
6e5fb223
PZ
1299 else
1300 reset_ptenuma_scan(p);
1301 up_read(&mm->mmap_sem);
cbee9f88
PZ
1302}
1303
1304/*
1305 * Drive the periodic memory faults..
1306 */
1307void task_tick_numa(struct rq *rq, struct task_struct *curr)
1308{
1309 struct callback_head *work = &curr->numa_work;
1310 u64 period, now;
1311
1312 /*
1313 * We don't care about NUMA placement if we don't have memory.
1314 */
1315 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
1316 return;
1317
1318 /*
1319 * Using runtime rather than walltime has the dual advantage that
1320 * we (mostly) drive the selection from busy threads and that the
1321 * task needs to have done some actual work before we bother with
1322 * NUMA placement.
1323 */
1324 now = curr->se.sum_exec_runtime;
1325 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
1326
1327 if (now - curr->node_stamp > period) {
4b96a29b
PZ
1328 if (!curr->node_stamp)
1329 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
cbee9f88
PZ
1330 curr->node_stamp = now;
1331
1332 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1333 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
1334 task_work_add(curr, work, true);
1335 }
1336 }
1337}
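/*
 * Illustrative note (added for clarity, not part of the original file):
 * with sysctl_numa_balancing_scan_period_min = 100 (ms), a task starts with
 * numa_scan_period = 100ms, so task_tick_numa() queues task_numa_work()
 * roughly once per 100ms of *runtime* (sum_exec_runtime, not walltime).
 * Each invocation scans up to sysctl_numa_balancing_scan_size = 256MB of
 * the address space.  When faults hit pages that did not need to migrate,
 * the period is backed off by jiffies_to_msecs(10) per fault, capped at
 * scan_period_max = 100*50 = 5000ms, and it is reset to the minimum every
 * scan_period_reset = 100*600 ms = 60s.
 */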
1338#else
1339static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1340{
1341}
1342#endif /* CONFIG_NUMA_BALANCING */
1343
30cfdcfc
DA
1344static void
1345account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1346{
1347 update_load_add(&cfs_rq->load, se->load.weight);
c09595f6 1348 if (!parent_entity(se))
029632fb 1349 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
367456c7
PZ
1350#ifdef CONFIG_SMP
1351 if (entity_is_task(se))
eb95308e 1352 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
367456c7 1353#endif
30cfdcfc 1354 cfs_rq->nr_running++;
30cfdcfc
DA
1355}
1356
1357static void
1358account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1359{
1360 update_load_sub(&cfs_rq->load, se->load.weight);
c09595f6 1361 if (!parent_entity(se))
029632fb 1362 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
367456c7 1363 if (entity_is_task(se))
b87f1724 1364 list_del_init(&se->group_node);
30cfdcfc 1365 cfs_rq->nr_running--;
30cfdcfc
DA
1366}
1367
3ff6dcac
YZ
1368#ifdef CONFIG_FAIR_GROUP_SCHED
1369# ifdef CONFIG_SMP
cf5f0acf
PZ
1370static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1371{
1372 long tg_weight;
1373
1374 /*
1375 * Use this CPU's actual weight instead of the last load_contribution
1376 * to gain a more accurate current total weight. See
1377 * update_cfs_rq_load_contribution().
1378 */
6fa3eb70 1379 tg_weight = atomic_long_read(&tg->load_avg);
82958366 1380 tg_weight -= cfs_rq->tg_load_contrib;
cf5f0acf
PZ
1381 tg_weight += cfs_rq->load.weight;
1382
1383 return tg_weight;
1384}
1385
6d5ab293 1386static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
3ff6dcac 1387{
cf5f0acf 1388 long tg_weight, load, shares;
3ff6dcac 1389
cf5f0acf 1390 tg_weight = calc_tg_weight(tg, cfs_rq);
6d5ab293 1391 load = cfs_rq->load.weight;
3ff6dcac 1392
3ff6dcac 1393 shares = (tg->shares * load);
cf5f0acf
PZ
1394 if (tg_weight)
1395 shares /= tg_weight;
3ff6dcac
YZ
1396
1397 if (shares < MIN_SHARES)
1398 shares = MIN_SHARES;
1399 if (shares > tg->shares)
1400 shares = tg->shares;
1401
1402 return shares;
1403}
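/*
 * Worked example (added for clarity, not part of the original file):
 * calc_cfs_shares() splits the group's shares across CPUs in proportion to
 * each per-CPU cfs_rq's weight.  With tg->shares = 1024, this cfs_rq
 * carrying load.weight = 2048 and a total group weight (tg_weight) of 4096:
 *
 *   shares = 1024 * 2048 / 4096 = 512
 *
 * clamped to the range [MIN_SHARES, tg->shares].
 */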
3ff6dcac 1404# else /* CONFIG_SMP */
6d5ab293 1405static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
3ff6dcac
YZ
1406{
1407 return tg->shares;
1408}
3ff6dcac 1409# endif /* CONFIG_SMP */
2069dd75
PZ
1410static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1411 unsigned long weight)
1412{
19e5eebb
PT
1413 if (se->on_rq) {
1414 /* commit outstanding execution time */
1415 if (cfs_rq->curr == se)
1416 update_curr(cfs_rq);
2069dd75 1417 account_entity_dequeue(cfs_rq, se);
19e5eebb 1418 }
2069dd75
PZ
1419
1420 update_load_set(&se->load, weight);
1421
1422 if (se->on_rq)
1423 account_entity_enqueue(cfs_rq, se);
1424}
1425
82958366
PT
1426static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1427
6d5ab293 1428static void update_cfs_shares(struct cfs_rq *cfs_rq)
2069dd75
PZ
1429{
1430 struct task_group *tg;
1431 struct sched_entity *se;
3ff6dcac 1432 long shares;
2069dd75 1433
2069dd75
PZ
1434 tg = cfs_rq->tg;
1435 se = tg->se[cpu_of(rq_of(cfs_rq))];
64660c86 1436 if (!se || throttled_hierarchy(cfs_rq))
2069dd75 1437 return;
3ff6dcac
YZ
1438#ifndef CONFIG_SMP
1439 if (likely(se->load.weight == tg->shares))
1440 return;
1441#endif
6d5ab293 1442 shares = calc_cfs_shares(cfs_rq, tg);
2069dd75
PZ
1443
1444 reweight_entity(cfs_rq_of(se), se, shares);
1445}
1446#else /* CONFIG_FAIR_GROUP_SCHED */
6d5ab293 1447static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
2069dd75
PZ
1448{
1449}
1450#endif /* CONFIG_FAIR_GROUP_SCHED */
1451
6fa3eb70 1452#ifdef CONFIG_SMP
5b51f2f8
PT
1453/*
1454 * We choose a half-life close to 1 scheduling period.
1455 * Note: The tables below are dependent on this value.
1456 */
6fa3eb70
S
1457//#define LOAD_AVG_PERIOD 32
1458//#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1459//#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
5b51f2f8
PT
1460
1461/* Precomputed fixed inverse multiplies for multiplication by y^n */
1462static const u32 runnable_avg_yN_inv[] = {
1463 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1464 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1465 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1466 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1467 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1468 0x85aac367, 0x82cd8698,
1469};
1470
1471/*
1472 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1473 * over-estimates when re-combining.
1474 */
1475static const u32 runnable_avg_yN_sum[] = {
1476 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1477 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1478 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1479};
1480
9d85f21c
PT
1481/*
1482 * Approximate:
1483 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1484 */
1485static __always_inline u64 decay_load(u64 val, u64 n)
1486{
5b51f2f8
PT
1487 unsigned int local_n;
1488
1489 if (!n)
1490 return val;
1491 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1492 return 0;
1493
1494 /* after bounds checking we can collapse to 32-bit */
1495 local_n = n;
1496
1497 /*
1498 * As y^PERIOD = 1/2, we can combine
1499 * y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
1500 * With a look-up table which covers y^n (n < PERIOD)
1501 *
1502 * To achieve constant time decay_load.
1503 */
1504 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1505 val >>= local_n / LOAD_AVG_PERIOD;
1506 local_n %= LOAD_AVG_PERIOD;
9d85f21c
PT
1507 }
1508
5b51f2f8
PT
1509 val *= runnable_avg_yN_inv[local_n];
1510 /* We don't use SRR here since we always want to round down. */
1511 return val >> 32;
1512}
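/*
 * Worked example (added for clarity, not part of the original file):
 * decay_load(1024, 32): local_n = 32, so val >>= 32/32 = 1 (val = 512) and
 * local_n becomes 0; multiplying by runnable_avg_yN_inv[0] = 0xffffffff and
 * shifting right by 32 leaves ~511, i.e. a contribution is worth roughly
 * half as much after 32 periods (~32ms), matching y^32 = 0.5.
 */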
1513
1514/*
1515 * For updates fully spanning n periods, the contribution to runnable
1516 * average will be: \Sum 1024*y^n
1517 *
1518 * We can compute this reasonably efficiently by combining:
1519 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1520 */
1521static u32 __compute_runnable_contrib(u64 n)
1522{
1523 u32 contrib = 0;
1524
1525 if (likely(n <= LOAD_AVG_PERIOD))
1526 return runnable_avg_yN_sum[n];
1527 else if (unlikely(n >= LOAD_AVG_MAX_N))
1528 return LOAD_AVG_MAX;
1529
1530 /* Compute \Sum y^n combining precomputed values for y^i, \Sum y^j */
1531 do {
1532 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1533 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1534
1535 n -= LOAD_AVG_PERIOD;
1536 } while (n > LOAD_AVG_PERIOD);
1537
1538 contrib = decay_load(contrib, n);
1539 return contrib + runnable_avg_yN_sum[n];
9d85f21c
PT
1540}
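/*
 * Worked example (added for clarity, not part of the original file):
 * __compute_runnable_contrib(32) returns runnable_avg_yN_sum[32] = 23371
 * directly.  For n = 64 the loop leaves contrib = 23371 with n = 32, and
 * the result is decay_load(23371, 32) + 23371 ~= 11685 + 23371 = 35056,
 * converging towards LOAD_AVG_MAX = 47742 as n grows.
 */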
1541
6fa3eb70
S
1542#ifdef CONFIG_HMP_VARIABLE_SCALE
1543
1544#define HMP_VARIABLE_SCALE_SHIFT 16ULL
1545struct hmp_global_attr {
1546 struct attribute attr;
1547 ssize_t (*show)(struct kobject *kobj,
1548 struct attribute *attr, char *buf);
1549 ssize_t (*store)(struct kobject *a, struct attribute *b,
1550 const char *c, size_t count);
1551 int *value;
1552 int (*to_sysfs)(int);
1553 int (*from_sysfs)(int);
1554};
1555
1556#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1557#define HMP_DATA_SYSFS_MAX 5
1558#else
1559#define HMP_DATA_SYSFS_MAX 4
1560#endif
1561
1562struct hmp_data_struct {
1563#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1564 int freqinvar_load_scale_enabled;
1565#endif
1566 int multiplier; /* used to scale the time delta */
1567 struct attribute_group attr_group;
1568 struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
1569 struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
1570} hmp_data;
1571
1572static u64 hmp_variable_scale_convert(u64 delta);
1573#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1574/* Frequency-Invariant Load Modification:
1575 * Loads are calculated as in PJT's patch however we also scale the current
1576 * contribution in line with the frequency of the CPU that the task was
1577 * executed on.
1578 * In this version, we use a simple linear scale derived from the maximum
1579 * frequency reported by CPUFreq. As an example:
1580 *
1581 * Consider that we ran a task for 100% of the previous interval.
1582 *
1583 * Our CPU was under asynchronous frequency control through one of the
1584 * CPUFreq governors.
1585 *
1586 * The CPUFreq governor reports that it is able to scale the CPU between
1587 * 500MHz and 1GHz.
1588 *
1589 * During the period, the CPU was running at 1GHz.
1590 *
1591 * In this case, our load contribution for that period is calculated as
1592 * 1 * (number_of_active_microseconds)
1593 *
1594 * This results in our task being able to accumulate maximum load as normal.
1595 *
1596 *
1597 * Consider now that our CPU was executing at 500MHz.
1598 *
1599 * We now scale the load contribution such that it is calculated as
1600 * 0.5 * (number_of_active_microseconds)
1601 *
1602 * Our task can only record 50% maximum load during this period.
1603 *
1604 * This represents the task consuming 50% of the CPU's *possible* compute
1605 * capacity. However the task did consume 100% of the CPU's *available*
1606 * compute capacity which is the value seen by the CPUFreq governor and
1607 * user-side CPU Utilization tools.
1608 *
1609 * Restricting tracked load to be scaled by the CPU's frequency accurately
1610 * represents the consumption of possible compute capacity and allows the
1611 * HMP migration's simple threshold migration strategy to interact more
1612 * predictably with CPUFreq's asynchronous compute capacity changes.
1613 */
1614#define SCHED_FREQSCALE_SHIFT 10
1615struct cpufreq_extents {
1616 u32 curr_scale;
1617 u32 min;
1618 u32 max;
1619 u32 flags;
1620#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
1621 u32 const_max;
1622 u32 throttling;
1623#endif
1624};
1625/* Flag set when the governor in use only allows one frequency.
1626 * Disables scaling.
1627 */
1628#define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
1629
1630static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
1631#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1632#endif /* CONFIG_HMP_VARIABLE_SCALE */
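/*
 * Illustrative note (added for clarity, not part of the original file):
 * with frequency-invariant scaling enabled, curr_scale is a 10-bit
 * fixed-point ratio of current to maximum frequency
 * (SCHED_FREQSCALE_SHIFT = 10).  A CPU running at 500MHz out of a 1GHz
 * maximum gets curr_scale = 512, so a fully busy 1024us window only adds
 * (1024 * 512) >> 10 = 512 to runnable_avg_sum, i.e. 50% of the possible
 * compute capacity, matching the example in the comment above.
 */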
1633
1634#ifdef CONFIG_MTK_SCHED_CMP
1635int get_cluster_id(unsigned int cpu)
1636{
1637 return arch_get_cluster_id(cpu);
1638}
1639
1640void get_cluster_cpus(struct cpumask *cpus, int cluster_id,
1641 bool exclusive_offline)
1642{
1643 struct cpumask cls_cpus;
1644
1645 arch_get_cluster_cpus(&cls_cpus, cluster_id);
1646 if (exclusive_offline) {
1647 cpumask_and(cpus, cpu_online_mask, &cls_cpus);
1648 } else
1649 cpumask_copy(cpus, &cls_cpus);
1650}
1651
1652static int nr_cpus_in_cluster(int cluster_id, bool exclusive_offline)
1653{
1654 struct cpumask cls_cpus;
1655 int nr_cpus;
1656
1657 arch_get_cluster_cpus(&cls_cpus, cluster_id);
1658 if (exclusive_offline) {
1659 struct cpumask online_cpus;
1660 cpumask_and(&online_cpus, cpu_online_mask, &cls_cpus);
1661 nr_cpus = cpumask_weight(&online_cpus);
1662 } else
1663 nr_cpus = cpumask_weight(&cls_cpus);
1664
1665 return nr_cpus;
1666}
1667#endif /* CONFIG_MTK_SCHED_CMP */
1668
1669void sched_get_big_little_cpus(struct cpumask *big, struct cpumask *little)
1670{
1671 arch_get_big_little_cpus(big, little);
1672}
1673EXPORT_SYMBOL(sched_get_big_little_cpus);
1674
9d85f21c 1675/*
6fa3eb70
S
1676 * generic entry point for cpu mask construction, dedicated for
1677 * mediatek scheduler.
1678 */
1679static __init __inline void cmp_cputopo_domain_setup(void)
1680{
1681 WARN(smp_processor_id() != 0, "%s is supposed to run on CPU0 "
1682 "during kernel init", __func__);
1683#ifdef CONFIG_MTK_CPU_TOPOLOGY
1684 /*
1685 * sched_init
1686 * |-> cmp_cputopo_domain_setup()
1687 * ...
1688 * rest_init
1689 * ^ fork kernel_init
1690 * |-> kernel_init_freeable
1691 * ...
1692 * |-> arch_build_cpu_topology_domain
1693 *
1694 * here, we focus on building up the cpu topology and domains before the scheduler runs.
1695 */
1696 pr_debug("[CPUTOPO][%s] build CPU topology and cluster.\n", __func__);
1697 arch_build_cpu_topology_domain();
1698#endif
1699}
1700
1701#ifdef CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY
1702static u64 __inline variable_scale_convert(u64 delta)
1703{
1704 u64 high = delta >> 32ULL;
1705 u64 low = delta & 0xffffffffULL;
1706 low *= LOAD_AVG_VARIABLE_PERIOD;
1707 high *= LOAD_AVG_VARIABLE_PERIOD;
1708 return (low >> 16ULL) + (high << (32ULL - 16ULL));
1709}
1710#endif
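/*
 * Illustrative note (added for clarity, not part of the original file):
 * with LOAD_AVG_VARIABLE_PERIOD = 512 as defined above and the hard-coded
 * 16-bit shift, variable_scale_convert() effectively multiplies the raw
 * nanosecond delta by 512 / 65536 = 1/128 (the high/low 32-bit split only
 * keeps the 64-bit multiply from overflowing for very large deltas), so
 * each 1024-unit tracking period spans correspondingly more wall-clock
 * time than the stock ~1ms window.
 */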
1711
1712/* We can represent the historical contribution to runnable average as the
9d85f21c
PT
1713 * coefficients of a geometric series. To do this we sub-divide our runnable
1714 * history into segments of approximately 1ms (1024us); label the segment that
1715 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1716 *
1717 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1718 * p0 p1 p2
1719 * (now) (~1ms ago) (~2ms ago)
1720 *
1721 * Let u_i denote the fraction of p_i that the entity was runnable.
1722 *
1723 * We then designate the fractions u_i as our co-efficients, yielding the
1724 * following representation of historical load:
1725 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1726 *
1727 * We choose y based on the width of a reasonable scheduling period, fixing:
1728 * y^32 = 0.5
1729 *
1730 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1731 * approximately half as much as the contribution to load within the last ms
1732 * (u_0).
1733 *
1734 * When a period "rolls over" and we have new u_0`, multiplying the previous
1735 * sum again by y is sufficient to update:
1736 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1737 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1738 */
1739static __always_inline int __update_entity_runnable_avg(u64 now,
1740 struct sched_avg *sa,
6fa3eb70
S
1741 int runnable,
1742 int running,
1743 int cpu)
9d85f21c 1744{
6fa3eb70 1745 u64 delta, periods, lru;
5b51f2f8 1746 u32 runnable_contrib;
9d85f21c 1747 int delta_w, decayed = 0;
6fa3eb70
S
1748#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1749 u64 scaled_delta;
1750 u32 scaled_runnable_contrib;
1751 int scaled_delta_w;
1752 u32 curr_scale = 1024;
1753#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1754 u64 scaled_delta;
1755 u32 scaled_runnable_contrib;
1756 int scaled_delta_w;
1757 u32 curr_scale = CPUPOWER_FREQSCALE_DEFAULT;
1758#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
9d85f21c
PT
1759
1760 delta = now - sa->last_runnable_update;
6fa3eb70 1761 lru = sa->last_runnable_update;
9d85f21c
PT
1762 /*
1763 * This should only happen when time goes backwards, which it
1764 * unfortunately does during sched clock init when we swap over to TSC.
1765 */
1766 if ((s64)delta < 0) {
1767 sa->last_runnable_update = now;
1768 return 0;
1769 }
1770
6fa3eb70
S
1771#ifdef CONFIG_HMP_VARIABLE_SCALE
1772 delta = hmp_variable_scale_convert(delta);
1773#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1774 delta = variable_scale_convert(delta);
1775#endif
9d85f21c
PT
1776 /*
1777 * Use 1024ns as the unit of measurement since it's a reasonable
1778 * approximation of 1us and fast to compute.
1779 */
1780 delta >>= 10;
1781 if (!delta)
1782 return 0;
1783 sa->last_runnable_update = now;
1784
6fa3eb70
S
1785#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1786 WARN(cpu < 0, "[%s] CPU %d < 0 !!!\n", __func__, cpu);
1787 /* retrieve scale factor for load */
1788 if (cpu >= 0 && cpu < nr_cpu_ids && hmp_data.freqinvar_load_scale_enabled)
1789 curr_scale = freq_scale[cpu].curr_scale;
1790 mt_sched_printf("[%s] cpu=%d delta=%llu now=%llu last=%llu curr_scale=%u",
1791 __func__, cpu, delta, now, lru, curr_scale);
1792#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1793 WARN(cpu < 0, "[%s] CPU %d < 0 !!!\n", __func__, cpu);
1794 /* retrieve scale factor for load */
1795 if (cpu >= 0 && cpu < nr_cpu_ids)
1796 curr_scale = (topology_cpu_capacity(cpu) << CPUPOWER_FREQSCALE_SHIFT)
1797 / (topology_max_cpu_capacity(cpu)+1);
1798 mt_sched_printf("[%s] cpu=%d delta=%llu now=%llu last=%llu curr_scale=%u",
1799 __func__, cpu, delta, now, lru, curr_scale);
1800#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1801
9d85f21c
PT
1802 /* delta_w is the amount already accumulated against our next period */
1803 delta_w = sa->runnable_avg_period % 1024;
1804 if (delta + delta_w >= 1024) {
1805 /* period roll-over */
1806 decayed = 1;
1807
1808 /*
1809 * Now that we know we're crossing a period boundary, figure
1810 * out how much from delta we need to complete the current
1811 * period and accrue it.
1812 */
1813 delta_w = 1024 - delta_w;
6fa3eb70
S
1814#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1815 /* scale runnable time if necessary */
1816 scaled_delta_w = (delta_w * curr_scale)
1817 >> SCHED_FREQSCALE_SHIFT;
1818 if (runnable)
1819 sa->runnable_avg_sum += scaled_delta_w;
1820 if (running)
1821 sa->usage_avg_sum += scaled_delta_w;
1822#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1823 /* scale runnable time if necessary */
1824 scaled_delta_w = (delta_w * curr_scale)
1825 >> CPUPOWER_FREQSCALE_SHIFT;
1826 if (runnable)
1827 sa->runnable_avg_sum += scaled_delta_w;
1828 if (running)
1829 sa->usage_avg_sum += scaled_delta_w;
1830#else
5b51f2f8
PT
1831 if (runnable)
1832 sa->runnable_avg_sum += delta_w;
6fa3eb70
S
1833 if (running)
1834 sa->usage_avg_sum += delta_w;
1835#endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
5b51f2f8
PT
1836 sa->runnable_avg_period += delta_w;
1837
1838 delta -= delta_w;
1839
1840 /* Figure out how many additional periods this update spans */
1841 periods = delta / 1024;
1842 delta %= 1024;
6fa3eb70 1843 /* decay the load we have accumulated so far */
5b51f2f8
PT
1844 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1845 periods + 1);
1846 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1847 periods + 1);
6fa3eb70
S
1848 sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
1849 /* add the contribution from this period */
5b51f2f8
PT
1850 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1851 runnable_contrib = __compute_runnable_contrib(periods);
6fa3eb70
S
1852#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1853 /* Apply load scaling if necessary.
 1854 * Note that multiplying the whole series is the same as
1855 * multiplying all terms
1856 */
1857 scaled_runnable_contrib = (runnable_contrib * curr_scale)
1858 >> SCHED_FREQSCALE_SHIFT;
1859 if (runnable)
1860 sa->runnable_avg_sum += scaled_runnable_contrib;
1861 if (running)
1862 sa->usage_avg_sum += scaled_runnable_contrib;
1863#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1864 /* Apply load scaling if necessary.
 1865 * Note that multiplying the whole series is the same as
1866 * multiplying all terms
1867 */
1868 scaled_runnable_contrib = (runnable_contrib * curr_scale)
1869 >> CPUPOWER_FREQSCALE_SHIFT;
1870 if (runnable)
1871 sa->runnable_avg_sum += scaled_runnable_contrib;
1872 if (running)
1873 sa->usage_avg_sum += scaled_runnable_contrib;
1874#else
5b51f2f8
PT
1875 if (runnable)
1876 sa->runnable_avg_sum += runnable_contrib;
6fa3eb70
S
1877 if (running)
1878 sa->usage_avg_sum += runnable_contrib;
1879#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
5b51f2f8 1880 sa->runnable_avg_period += runnable_contrib;
9d85f21c
PT
1881 }
1882
1883 /* Remainder of delta accrued against u_0` */
6fa3eb70
S
1884#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1885 /* scale if necessary */
1886 scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
1887 if (runnable)
1888 sa->runnable_avg_sum += scaled_delta;
1889 if (running)
1890 sa->usage_avg_sum += scaled_delta;
1891#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1892 /* scale if necessary */
1893 scaled_delta = ((delta * curr_scale) >> CPUPOWER_FREQSCALE_SHIFT);
1894 if (runnable)
1895 sa->runnable_avg_sum += scaled_delta;
1896 if (running)
1897 sa->usage_avg_sum += scaled_delta;
1898#else
9d85f21c
PT
1899 if (runnable)
1900 sa->runnable_avg_sum += delta;
6fa3eb70
S
1901 if (running)
1902 sa->usage_avg_sum += delta;
1903#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
9d85f21c
PT
1904 sa->runnable_avg_period += delta;
1905
1906 return decayed;
1907}
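/*
 * Worked example of the decay above (illustrative figures, ignoring the
 * HMP/capacity scaling applied to each contribution): with y^32 = 0.5,
 * y ~= 0.9786, so decay_load(x, 32) halves x and decay_load(x, 64)
 * quarters it.  An entity that is runnable during every 1024us period
 * accumulates
 *	runnable_avg_sum = 1024 * (1 + y + y^2 + ...) ~= 47742
 * (LOAD_AVG_MAX), and at every roll-over the recurrence
 *	new_sum = 1024 + y * old_sum
 * leaves that saturated value unchanged: 1024 + 0.9786 * 47742 ~= 47742.
 */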
1908
9ee474f5 1909/* Synchronize an entity's decay with its parenting cfs_rq.*/
aff3e498 1910static inline u64 __synchronize_entity_decay(struct sched_entity *se)
9ee474f5
PT
1911{
1912 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1913 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1914
1915 decays -= se->avg.decay_count;
1916 if (!decays)
aff3e498 1917 return 0;
9ee474f5
PT
1918
1919 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1920 se->avg.decay_count = 0;
aff3e498
PT
1921
1922 return decays;
9ee474f5
PT
1923}
1924
c566e8e9
PT
1925#ifdef CONFIG_FAIR_GROUP_SCHED
1926static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1927 int force_update)
1928{
1929 struct task_group *tg = cfs_rq->tg;
6fa3eb70 1930 long tg_contrib;
c566e8e9
PT
1931
1932 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1933 tg_contrib -= cfs_rq->tg_load_contrib;
1934
6fa3eb70
S
1935 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1936 atomic_long_add(tg_contrib, &tg->load_avg);
c566e8e9
PT
1937 cfs_rq->tg_load_contrib += tg_contrib;
1938 }
1939}
8165e145 1940
bb17f655
PT
1941/*
1942 * Aggregate cfs_rq runnable averages into an equivalent task_group
1943 * representation for computing load contributions.
1944 */
1945static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1946 struct cfs_rq *cfs_rq)
1947{
1948 struct task_group *tg = cfs_rq->tg;
6fa3eb70 1949 long contrib, usage_contrib;
bb17f655
PT
1950
1951 /* The fraction of a cpu used by this cfs_rq */
1952 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1953 sa->runnable_avg_period + 1);
1954 contrib -= cfs_rq->tg_runnable_contrib;
1955
6fa3eb70
S
1956 usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT,
1957 sa->runnable_avg_period + 1);
1958 usage_contrib -= cfs_rq->tg_usage_contrib;
1959
1960 /*
1961 * contrib/usage at this point represent deltas, only update if they
1962 * are substantive.
1963 */
1964 if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
1965 (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
bb17f655
PT
1966 atomic_add(contrib, &tg->runnable_avg);
1967 cfs_rq->tg_runnable_contrib += contrib;
6fa3eb70
S
1968
1969 atomic_add(usage_contrib, &tg->usage_avg);
1970 cfs_rq->tg_usage_contrib += usage_contrib;
bb17f655
PT
1971 }
1972}
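/*
 * For scale: contrib above is the cfs_rq's usage expressed as a fixed-point
 * fraction of one CPU, where NICE_0_LOAD (1024 at the default load
 * resolution) means one fully busy CPU; e.g. a runnable_avg_sum of about a
 * quarter of runnable_avg_period yields a contrib of roughly 256.
 */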
1973
8165e145
PT
1974static inline void __update_group_entity_contrib(struct sched_entity *se)
1975{
1976 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1977 struct task_group *tg = cfs_rq->tg;
bb17f655
PT
1978 int runnable_avg;
1979
8165e145
PT
1980 u64 contrib;
1981
1982 contrib = cfs_rq->tg_load_contrib * tg->shares;
6fa3eb70
S
1983 se->avg.load_avg_contrib = div_u64(contrib,
1984 atomic_long_read(&tg->load_avg) + 1);
bb17f655
PT
1985
1986 /*
1987 * For group entities we need to compute a correction term in the case
1988 * that they are consuming <1 cpu so that we would contribute the same
1989 * load as a task of equal weight.
1990 *
1991 * Explicitly co-ordinating this measurement would be expensive, but
1992 * fortunately the sum of each cpus contribution forms a usable
1993 * lower-bound on the true value.
1994 *
1995 * Consider the aggregate of 2 contributions. Either they are disjoint
 1996 * (and the sum represents the true value) or they overlap and we are
1997 * understating by the aggregate of their overlap.
1998 *
1999 * Extending this to N cpus, for a given overlap, the maximum amount we
 2000 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
2001 * cpus that overlap for this interval and w_i is the interval width.
2002 *
 2003 * On a small machine, the first term is well-bounded, which bounds the
 2004 * total error since w_i is a subset of the period. Whereas on a
 2005 * larger machine, while this first term can be larger, if w_i is of
 2006 * consequential size then n_i*w_i is guaranteed to quickly converge to
 2007 * our upper bound of 1-cpu.
2008 */
2009 runnable_avg = atomic_read(&tg->runnable_avg);
2010 if (runnable_avg < NICE_0_LOAD) {
2011 se->avg.load_avg_contrib *= runnable_avg;
2012 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2013 }
8165e145 2014}
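/*
 * Example of the <1 cpu correction above: a group entitled to a full
 * nice-0 share (load_avg_contrib ~= 1024 at the default resolution) that is
 * runnable only ~25% of the time has tg->runnable_avg ~= 256, so its
 * contribution is scaled down to roughly 256, matching the load a 25%-busy
 * task of equal weight would report.
 */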
c566e8e9
PT
2015#else
2016static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
6fa3eb70 2017 int force_update) {}
bb17f655 2018static inline void __update_tg_runnable_avg(struct sched_avg *sa,
6fa3eb70 2019 struct cfs_rq *cfs_rq) {}
8165e145 2020static inline void __update_group_entity_contrib(struct sched_entity *se) {}
c566e8e9
PT
2021#endif
2022
8165e145
PT
2023static inline void __update_task_entity_contrib(struct sched_entity *se)
2024{
2025 u32 contrib;
2026
2027 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2028 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2029 contrib /= (se->avg.runnable_avg_period + 1);
2030 se->avg.load_avg_contrib = scale_load(contrib);
2031}
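/*
 * E.g. a nice-0 task (scale_load_down(se->load.weight) == 1024) that has
 * been runnable for about half of its tracked history has
 * runnable_avg_sum ~= runnable_avg_period / 2, giving a load_avg_contrib of
 * roughly 512.
 */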
2032
2dac754e
PT
2033/* Compute the current contribution to load_avg by se, return any delta */
2034static long __update_entity_load_avg_contrib(struct sched_entity *se)
2035{
2036 long old_contrib = se->avg.load_avg_contrib;
2037
8165e145
PT
2038 if (entity_is_task(se)) {
2039 __update_task_entity_contrib(se);
2040 } else {
bb17f655 2041 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
8165e145
PT
2042 __update_group_entity_contrib(se);
2043 }
2dac754e
PT
2044
2045 return se->avg.load_avg_contrib - old_contrib;
2046}
2047
6fa3eb70
S
2048#if defined(CONFIG_MTK_SCHED_CMP) || defined(CONFIG_SCHED_HMP_ENHANCEMENT)
2049/* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */
2050static long __update_task_entity_ratio(struct sched_entity *se)
2051{
2052 long old_ratio = se->avg.load_avg_ratio;
2053 u32 ratio;
2054
2055 ratio = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
2056 ratio /= (se->avg.runnable_avg_period + 1);
2057 se->avg.load_avg_ratio = scale_load(ratio);
2058
2059 return se->avg.load_avg_ratio - old_ratio;
2060}
2061#else
2062static inline long __update_task_entity_ratio(struct sched_entity *se) { return 0; }
2063#endif
2064
9ee474f5
PT
2065static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
2066 long load_contrib)
2067{
2068 if (likely(load_contrib < cfs_rq->blocked_load_avg))
2069 cfs_rq->blocked_load_avg -= load_contrib;
2070 else
2071 cfs_rq->blocked_load_avg = 0;
2072}
2073
6fa3eb70
S
2074#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2075unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL);
2076#endif
2077
2078#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
2079/* Schedule entity */
2080#define se_pid(se) ((se != NULL && entity_is_task(se))? \
2081 container_of(se,struct task_struct,se)->pid:-1)
2082#define se_load(se) se->avg.load_avg_ratio
2083#define se_contrib(se) se->avg.load_avg_contrib
2084
2085/* CPU related : load information */
2086#define cfs_pending_load(cpu) cpu_rq(cpu)->cfs.avg.pending_load
2087#define cfs_load(cpu) cpu_rq(cpu)->cfs.avg.load_avg_ratio
2088#define cfs_contrib(cpu) cpu_rq(cpu)->cfs.avg.load_avg_contrib
2089
2090/* CPU related : the number of tasks */
2091#define cfs_nr_normal_prio(cpu) cpu_rq(cpu)->cfs.avg.nr_normal_prio
2092#define cfs_nr_pending(cpu) cpu_rq(cpu)->cfs.avg.nr_pending
2093#define cfs_length(cpu) cpu_rq(cpu)->cfs.h_nr_running
2094#define rq_length(cpu) (cpu_rq(cpu)->nr_running + cfs_nr_pending(cpu))
2095
2096#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2097#define task_low_priority(prio) ((prio >= hmp_up_prio)?1:0)
2098#define cfs_nr_dequeuing_low_prio(cpu) \
2099 cpu_rq(cpu)->cfs.avg.nr_dequeuing_low_prio
2100#define cfs_reset_nr_dequeuing_low_prio(cpu) \
2101 (cfs_nr_dequeuing_low_prio(cpu) = 0)
2102#else
2103#define task_low_priority(prio) (0)
2104#define cfs_reset_nr_dequeuing_low_prio(cpu)
2105#endif /* CONFIG_SCHED_HMP_PRIO_FILTER */
2106#endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
2107
f1b17280
PT
2108static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2109
6fa3eb70
S
2110#ifdef CONFIG_MTK_SCHED_CMP_TGS
2111int group_leader_is_empty(struct task_struct *p) {
2112
2113 struct task_struct *tg = p->group_leader;
2114
2115 if (SIGNAL_GROUP_EXIT & p->signal->flags){
2116 // pr_warn("[%s] (0x%p/0x%p)(#%d/%s) leader: pid(%d) state(%d) exit_state(%d)signal_flags=%x p->signal->flags=%x group_exit_code=%x\n", __func__,
2117 // p, tg, get_nr_threads(p), thread_group_empty(p) ? "empty" : "not empty",
2118 // p->tgid, tg->state, tg->exit_state, tg->state, p->signal->flags, p->signal->group_exit_code);
2119 return 1;
2120 }
2121
 2122 /* workaround: the leader may already be freed (0x6b6b6b6b is the slab POISON_FREE pattern) */
 2123 if (tg->state == 0x6b6b6b6b) {
2124 // pr_warn("[%s] (0x%p/0x%p)(#%d/%s) leader: state(%d) exit_state(%d)\n", __func__,
2125 // p, tg, get_nr_threads(p), thread_group_empty(p) ? "empty" : "not empty",
2126 // tg->state, tg->exit_state);
2127 return 1;
2128 }
2129
2130 return 0;
2131}
2132
2133static inline void update_tg_info(struct cfs_rq *cfs_rq, struct sched_entity *se, long ratio_delta)
2134{
2135 struct task_struct *p = task_of(se);
2136 struct task_struct *tg = p->group_leader;
2137 int id;
2138 unsigned long flags;
2139
2140 if (group_leader_is_empty(p))
2141 return;
2142 id = get_cluster_id(cfs_rq->rq->cpu);
2143 if (unlikely(WARN_ON(id < 0)))
2144 return;
2145
2146 raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags);
2147 tg->thread_group_info[id].load_avg_ratio += ratio_delta;
2148 raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags);
2149
2150#ifdef CONFIG_MT_SCHED_INFO
2151 mt_sched_printf("update_tg_info %d:%s %d:%s %ld %ld %d %d %lu:%lu:%lu update",
2152 tg->pid, tg->comm, p->pid, p->comm,
2153 se->avg.load_avg_ratio, ratio_delta,
2154 cfs_rq->rq->cpu, id,
2155 tg->thread_group_info[id].nr_running,
2156 tg->thread_group_info[id].cfs_nr_running,
2157 tg->thread_group_info[id].load_avg_ratio);
2158/*
2159 mt_sched_printf("update %d:%s %d:%s %ld %ld %d %d %lu %lu %lu, %lu %lu %lu",
2160 tg->pid, tg->comm, p->pid, p->comm,
2161 se->avg.load_avg_ratio, ratio_delta,
2162 id, cfs_rq->rq->cpu,
2163 tg->thread_group_info[0].nr_running,
2164 tg->thread_group_info[0].cfs_nr_running,
2165 tg->thread_group_info[0].load_avg_ratio,
2166 tg->thread_group_info[1].nr_running,
2167 tg->thread_group_info[1].cfs_nr_running,
2168 tg->thread_group_info[1].load_avg_ratio);
2169*/
2170#endif
2171
2172}
2173#endif
2174
9d85f21c 2175/* Update a sched_entity's runnable average */
9ee474f5
PT
2176static inline void update_entity_load_avg(struct sched_entity *se,
2177 int update_cfs_rq)
9d85f21c 2178{
2dac754e
PT
2179 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2180 long contrib_delta;
f1b17280 2181 u64 now;
6fa3eb70
S
2182 long ratio_delta = 0;
2183 int cpu = -1; /* not used in normal case */
2184
2185#if defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE) \
2186 || defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
2187 cpu = cfs_rq->rq->cpu;
2188#endif
2dac754e 2189
f1b17280
PT
2190 /*
2191 * For a group entity we need to use their owned cfs_rq_clock_task() in
2192 * case they are the parent of a throttled hierarchy.
2193 */
2194 if (entity_is_task(se))
2195 now = cfs_rq_clock_task(cfs_rq);
2196 else
2197 now = cfs_rq_clock_task(group_cfs_rq(se));
2198
6fa3eb70
S
2199 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
2200 cfs_rq->curr == se, cpu)) {
2201#if 0
2202 if (entity_is_task(se)) {
2203 ratio_delta = __update_task_entity_ratio(se);
2204 if (update_cfs_rq)
2205 {
2206 cpu = cfs_rq->rq->cpu;
2207 cpu_rq(cpu)->cfs.avg.load_avg_ratio += ratio_delta;
2208#ifdef CONFIG_HMP_TRACER
2209 trace_sched_cfs_load_update(task_of(se),se_load(se),ratio_delta, cpu);
2210#endif /* CONFIG_HMP_TRACER */
2211 }
2212
2213 trace_sched_task_entity_avg(2, task_of(se), &se->avg);
2214#ifdef CONFIG_MTK_SCHED_CMP_TGS
2215 if (se->on_rq) {
2216 update_tg_info(cfs_rq, se, ratio_delta);
2217 }
2218#endif
2219 }
2220#endif
2dac754e 2221 return;
6fa3eb70 2222 }
2dac754e
PT
2223
2224 contrib_delta = __update_entity_load_avg_contrib(se);
9ee474f5 2225
6fa3eb70
S
2226 /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */
2227 if (entity_is_task(se)) {
2228 ratio_delta = __update_task_entity_ratio(se);
2229 /*
2230 * ratio is re-estimated just for entity of task; as
2231 * for contrib, mark tracer here for task entity while
2232 * mining tg's at __update_group_entity_contrib().
2233 *
2234 * track running usage in passing.
2235 */
2236 trace_sched_task_entity_avg(3, task_of(se), &se->avg);
2237 }
2238
9ee474f5
PT
2239 if (!update_cfs_rq)
2240 return;
2241
6fa3eb70 2242 if (se->on_rq) {
2dac754e 2243 cfs_rq->runnable_load_avg += contrib_delta;
6fa3eb70
S
2244 if (entity_is_task(se)) {
2245 cpu = cfs_rq->rq->cpu;
2246 cpu_rq(cpu)->cfs.avg.load_avg_ratio += ratio_delta;
2247 cpu_rq(cpu)->cfs.avg.load_avg_contrib += contrib_delta;
2248#ifdef CONFIG_HMP_TRACER
2249 trace_sched_cfs_load_update(task_of(se),se_load(se),ratio_delta,cpu);
2250#endif /* CONFIG_HMP_TRACER */
2251#ifdef CONFIG_MTK_SCHED_CMP_TGS
2252 update_tg_info(cfs_rq, se, ratio_delta);
2253#endif
2254 }
2255 }
9ee474f5
PT
2256 else
2257 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2258}
2259
6fa3eb70 2260
9ee474f5
PT
2261/*
2262 * Decay the load contributed by all blocked children and account this so that
 2263 * their contribution may be appropriately discounted when they wake up.
2264 */
aff3e498 2265static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
9ee474f5 2266{
f1b17280 2267 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
9ee474f5
PT
2268 u64 decays;
2269
2270 decays = now - cfs_rq->last_decay;
aff3e498 2271 if (!decays && !force_update)
9ee474f5
PT
2272 return;
2273
6fa3eb70
S
2274 if (atomic_long_read(&cfs_rq->removed_load)) {
2275 unsigned long removed_load;
2276 removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
aff3e498
PT
2277 subtract_blocked_load_contrib(cfs_rq, removed_load);
2278 }
9ee474f5 2279
aff3e498
PT
2280 if (decays) {
2281 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
2282 decays);
2283 atomic64_add(decays, &cfs_rq->decay_counter);
2284 cfs_rq->last_decay = now;
2285 }
c566e8e9
PT
2286
2287 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
9d85f21c 2288}
18bf2805
BS
2289
2290static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2291{
6fa3eb70
S
2292 u32 contrib;
2293 int cpu = -1; /* not used in normal case */
2294
2295#if defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE) \
2296 || defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
2297 cpu = rq->cpu;
2298#endif
2299 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
2300 runnable, cpu);
bb17f655 2301 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
6fa3eb70
S
2302 contrib = rq->avg.runnable_avg_sum * scale_load_down(1024);
2303 contrib /= (rq->avg.runnable_avg_period + 1);
2304 trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib));
2305 trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
18bf2805 2306}
2dac754e
PT
2307
2308/* Add the load generated by se into cfs_rq's child load-average */
2309static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
9ee474f5
PT
2310 struct sched_entity *se,
2311 int wakeup)
2dac754e 2312{
6fa3eb70
S
2313 int cpu = cfs_rq->rq->cpu;
2314
aff3e498
PT
2315 /*
2316 * We track migrations using entity decay_count <= 0, on a wake-up
2317 * migration we use a negative decay count to track the remote decays
2318 * accumulated while sleeping.
6fa3eb70
S
2319 *
2320 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
2321 * are seen by enqueue_entity_load_avg() as a migration with an already
2322 * constructed load_avg_contrib.
aff3e498
PT
2323 */
2324 if (unlikely(se->avg.decay_count <= 0)) {
9ee474f5 2325 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
aff3e498
PT
2326 if (se->avg.decay_count) {
2327 /*
2328 * In a wake-up migration we have to approximate the
2329 * time sleeping. This is because we can't synchronize
2330 * clock_task between the two cpus, and it is not
2331 * guaranteed to be read-safe. Instead, we can
2332 * approximate this using our carried decays, which are
2333 * explicitly atomically readable.
2334 */
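			/*
			 * decay_count is in ~1ms decay periods (one period is
			 * 1 << 20 ns, matching the ">> 20" used when advancing
			 * cfs_rq->last_decay), so shifting the missed periods
			 * left by 20 converts them back into nanoseconds of
			 * presumed sleep time.
			 */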
2335 se->avg.last_runnable_update -= (-se->avg.decay_count)
2336 << 20;
2337 update_entity_load_avg(se, 0);
2338 /* Indicate that we're now synchronized and on-rq */
2339 se->avg.decay_count = 0;
6fa3eb70
S
2340#ifdef CONFIG_MTK_SCHED_CMP
2341 } else {
2342 if (entity_is_task(se))
2343 trace_sched_task_entity_avg(1, task_of(se), &se->avg);
2344#endif
aff3e498 2345 }
9ee474f5
PT
2346 wakeup = 0;
2347 } else {
2348 __synchronize_entity_decay(se);
2349 }
2350
aff3e498
PT
2351 /* migrated tasks did not contribute to our blocked load */
2352 if (wakeup) {
9ee474f5 2353 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
aff3e498
PT
2354 update_entity_load_avg(se, 0);
2355 }
9ee474f5 2356
2dac754e 2357 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
6fa3eb70
S
2358#ifdef CONFIG_MTK_SCHED_CMP_TGS
 2359 if (entity_is_task(se)) {
2360 update_tg_info(cfs_rq, se, se->avg.load_avg_ratio);
2361 }
2362#endif
2363
2364 if (entity_is_task(se)) {
2365 cpu_rq(cpu)->cfs.avg.load_avg_contrib += se->avg.load_avg_contrib;
2366 cpu_rq(cpu)->cfs.avg.load_avg_ratio += se->avg.load_avg_ratio;
2367#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
2368 cfs_nr_pending(cpu) = 0;
2369 cfs_pending_load(cpu) = 0;
2370#endif
2371#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
 2372 if (!task_low_priority(task_of(se)->prio))
2373 cfs_nr_normal_prio(cpu)++;
2374#endif
2375#ifdef CONFIG_HMP_TRACER
2376 trace_sched_cfs_enqueue_task(task_of(se),se_load(se),cpu);
2377#endif
2378 }
2379
aff3e498
PT
2380 /* we force update consideration on load-balancer moves */
2381 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2dac754e
PT
2382}
2383
9ee474f5
PT
2384/*
2385 * Remove se's load from this cfs_rq child load-average, if the entity is
2386 * transitioning to a blocked state we track its projected decay using
2387 * blocked_load_avg.
2388 */
2dac754e 2389static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
9ee474f5
PT
2390 struct sched_entity *se,
2391 int sleep)
2dac754e 2392{
6fa3eb70
S
2393 int cpu = cfs_rq->rq->cpu;
2394
9ee474f5 2395 update_entity_load_avg(se, 1);
aff3e498
PT
2396 /* we force update consideration on load-balancer moves */
2397 update_cfs_rq_blocked_load(cfs_rq, !sleep);
9ee474f5 2398
2dac754e 2399 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
6fa3eb70
S
2400#ifdef CONFIG_MTK_SCHED_CMP_TGS
 2401 if (entity_is_task(se)) {
2402 update_tg_info(cfs_rq, se, -se->avg.load_avg_ratio);
2403 }
2404#endif
2405
2406 if (entity_is_task(se)) {
2407 cpu_rq(cpu)->cfs.avg.load_avg_contrib -= se->avg.load_avg_contrib;
2408 cpu_rq(cpu)->cfs.avg.load_avg_ratio -= se->avg.load_avg_ratio;
2409#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2410 cfs_reset_nr_dequeuing_low_prio(cpu);
 2411 if (!task_low_priority(task_of(se)->prio))
2412 cfs_nr_normal_prio(cpu)--;
2413#endif
2414#ifdef CONFIG_HMP_TRACER
2415 trace_sched_cfs_dequeue_task(task_of(se),se_load(se),cpu);
2416#endif
2417 }
2418
9ee474f5
PT
2419 if (sleep) {
2420 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
2421 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
2422 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
2dac754e 2423}
642dbc39
VG
2424
2425/*
2426 * Update the rq's load with the elapsed running time before entering
 2427 * idle. If the last scheduled task is not a CFS task, idle_enter will
2428 * be the only way to update the runnable statistic.
2429 */
2430void idle_enter_fair(struct rq *this_rq)
2431{
2432 update_rq_runnable_avg(this_rq, 1);
2433}
2434
2435/*
2436 * Update the rq's load with the elapsed idle time before a task is
 2437 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
2438 * be the only way to update the runnable statistic.
2439 */
2440void idle_exit_fair(struct rq *this_rq)
2441{
2442 update_rq_runnable_avg(this_rq, 0);
2443}
2444
9d85f21c 2445#else
9ee474f5
PT
2446static inline void update_entity_load_avg(struct sched_entity *se,
2447 int update_cfs_rq) {}
18bf2805 2448static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2dac754e 2449static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
9ee474f5
PT
2450 struct sched_entity *se,
2451 int wakeup) {}
2dac754e 2452static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
9ee474f5
PT
2453 struct sched_entity *se,
2454 int sleep) {}
aff3e498
PT
2455static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2456 int force_update) {}
9d85f21c
PT
2457#endif
2458
2396af69 2459static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 2460{
bf0f6f24 2461#ifdef CONFIG_SCHEDSTATS
e414314c
PZ
2462 struct task_struct *tsk = NULL;
2463
2464 if (entity_is_task(se))
2465 tsk = task_of(se);
2466
41acab88
LDM
2467 if (se->statistics.sleep_start) {
2468 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
bf0f6f24
IM
2469
2470 if ((s64)delta < 0)
2471 delta = 0;
2472
41acab88
LDM
2473 if (unlikely(delta > se->statistics.sleep_max))
2474 se->statistics.sleep_max = delta;
bf0f6f24 2475
8c79a045 2476 se->statistics.sleep_start = 0;
41acab88 2477 se->statistics.sum_sleep_runtime += delta;
9745512c 2478
768d0c27 2479 if (tsk) {
e414314c 2480 account_scheduler_latency(tsk, delta >> 10, 1);
768d0c27
PZ
2481 trace_sched_stat_sleep(tsk, delta);
2482 }
bf0f6f24 2483 }
41acab88
LDM
2484 if (se->statistics.block_start) {
2485 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
bf0f6f24
IM
2486
2487 if ((s64)delta < 0)
2488 delta = 0;
2489
41acab88
LDM
2490 if (unlikely(delta > se->statistics.block_max))
2491 se->statistics.block_max = delta;
bf0f6f24 2492
8c79a045 2493 se->statistics.block_start = 0;
41acab88 2494 se->statistics.sum_sleep_runtime += delta;
30084fbd 2495
e414314c 2496 if (tsk) {
8f0dfc34 2497 if (tsk->in_iowait) {
41acab88
LDM
2498 se->statistics.iowait_sum += delta;
2499 se->statistics.iowait_count++;
768d0c27 2500 trace_sched_stat_iowait(tsk, delta);
8f0dfc34
AV
2501 }
2502
b781a602
AV
2503 trace_sched_stat_blocked(tsk, delta);
2504
e414314c
PZ
2505 /*
2506 * Blocking time is in units of nanosecs, so shift by
2507 * 20 to get a milliseconds-range estimation of the
2508 * amount of time that the task spent sleeping:
2509 */
2510 if (unlikely(prof_on == SLEEP_PROFILING)) {
2511 profile_hits(SLEEP_PROFILING,
2512 (void *)get_wchan(tsk),
2513 delta >> 20);
2514 }
2515 account_scheduler_latency(tsk, delta >> 10, 0);
30084fbd 2516 }
bf0f6f24
IM
2517 }
2518#endif
2519}
2520
ddc97297
PZ
2521static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
2522{
2523#ifdef CONFIG_SCHED_DEBUG
2524 s64 d = se->vruntime - cfs_rq->min_vruntime;
2525
2526 if (d < 0)
2527 d = -d;
2528
2529 if (d > 3*sysctl_sched_latency)
2530 schedstat_inc(cfs_rq, nr_spread_over);
2531#endif
2532}
2533
aeb73b04
PZ
2534static void
2535place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
2536{
1af5f730 2537 u64 vruntime = cfs_rq->min_vruntime;
94dfb5e7 2538
2cb8600e
PZ
2539 /*
2540 * The 'current' period is already promised to the current tasks,
2541 * however the extra weight of the new task will slow them down a
2542 * little, place the new task so that it fits in the slot that
2543 * stays open at the end.
2544 */
94dfb5e7 2545 if (initial && sched_feat(START_DEBIT))
f9c0b095 2546 vruntime += sched_vslice(cfs_rq, se);
aeb73b04 2547
a2e7a7eb 2548 /* sleeps up to a single latency don't count. */
5ca9880c 2549 if (!initial) {
a2e7a7eb 2550 unsigned long thresh = sysctl_sched_latency;
a7be37ac 2551
a2e7a7eb
MG
2552 /*
2553 * Halve their sleep time's effect, to allow
2554 * for a gentler effect of sleepers:
2555 */
2556 if (sched_feat(GENTLE_FAIR_SLEEPERS))
2557 thresh >>= 1;
51e0304c 2558
a2e7a7eb 2559 vruntime -= thresh;
aeb73b04
PZ
2560 }
2561
b5d9d734 2562 /* ensure we never gain time by being placed backwards. */
16c8f1c7 2563 se->vruntime = max_vruntime(se->vruntime, vruntime);
aeb73b04
PZ
2564}
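/*
 * Illustration with the usual defaults: a newly forked task (START_DEBIT)
 * starts one sched_vslice() ahead of min_vruntime, so it cannot preempt
 * immediately; a task waking from sleep is placed up to sysctl_sched_latency
 * behind min_vruntime, halved to ~3ms of the 6ms default when
 * GENTLE_FAIR_SLEEPERS is enabled (which it is by default), and the
 * max_vruntime() above ensures re-placement never lowers an entity's
 * existing vruntime.
 */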
2565
d3d9dc33
PT
2566static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
2567
bf0f6f24 2568static void
88ec22d3 2569enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 2570{
88ec22d3
PZ
2571 /*
2572 * Update the normalized vruntime before updating min_vruntime
6fa3eb70 2573 * through calling update_curr().
88ec22d3 2574 */
371fd7e7 2575 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
88ec22d3
PZ
2576 se->vruntime += cfs_rq->min_vruntime;
2577
bf0f6f24 2578 /*
a2a2d680 2579 * Update run-time statistics of the 'current'.
bf0f6f24 2580 */
b7cc0896 2581 update_curr(cfs_rq);
f269ae04 2582 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
17bc14b7
LT
2583 account_entity_enqueue(cfs_rq, se);
2584 update_cfs_shares(cfs_rq);
bf0f6f24 2585
88ec22d3 2586 if (flags & ENQUEUE_WAKEUP) {
aeb73b04 2587 place_entity(cfs_rq, se, 0);
2396af69 2588 enqueue_sleeper(cfs_rq, se);
e9acbff6 2589 }
bf0f6f24 2590
d2417e5a 2591 update_stats_enqueue(cfs_rq, se);
ddc97297 2592 check_spread(cfs_rq, se);
83b699ed
SV
2593 if (se != cfs_rq->curr)
2594 __enqueue_entity(cfs_rq, se);
2069dd75 2595 se->on_rq = 1;
3d4b47b4 2596
d3d9dc33 2597 if (cfs_rq->nr_running == 1) {
3d4b47b4 2598 list_add_leaf_cfs_rq(cfs_rq);
d3d9dc33
PT
2599 check_enqueue_throttle(cfs_rq);
2600 }
bf0f6f24
IM
2601}
2602
2c13c919 2603static void __clear_buddies_last(struct sched_entity *se)
2002c695 2604{
2c13c919
RR
2605 for_each_sched_entity(se) {
2606 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2607 if (cfs_rq->last == se)
2608 cfs_rq->last = NULL;
2609 else
2610 break;
2611 }
2612}
2002c695 2613
2c13c919
RR
2614static void __clear_buddies_next(struct sched_entity *se)
2615{
2616 for_each_sched_entity(se) {
2617 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2618 if (cfs_rq->next == se)
2619 cfs_rq->next = NULL;
2620 else
2621 break;
2622 }
2002c695
PZ
2623}
2624
ac53db59
RR
2625static void __clear_buddies_skip(struct sched_entity *se)
2626{
2627 for_each_sched_entity(se) {
2628 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2629 if (cfs_rq->skip == se)
2630 cfs_rq->skip = NULL;
2631 else
2632 break;
2633 }
2634}
2635
a571bbea
PZ
2636static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
2637{
2c13c919
RR
2638 if (cfs_rq->last == se)
2639 __clear_buddies_last(se);
2640
2641 if (cfs_rq->next == se)
2642 __clear_buddies_next(se);
ac53db59
RR
2643
2644 if (cfs_rq->skip == se)
2645 __clear_buddies_skip(se);
a571bbea
PZ
2646}
2647
6c16a6dc 2648static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
d8b4986d 2649
bf0f6f24 2650static void
371fd7e7 2651dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 2652{
a2a2d680
DA
2653 /*
2654 * Update run-time statistics of the 'current'.
2655 */
2656 update_curr(cfs_rq);
17bc14b7 2657 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
a2a2d680 2658
19b6a2e3 2659 update_stats_dequeue(cfs_rq, se);
371fd7e7 2660 if (flags & DEQUEUE_SLEEP) {
67e9fb2a 2661#ifdef CONFIG_SCHEDSTATS
bf0f6f24
IM
2662 if (entity_is_task(se)) {
2663 struct task_struct *tsk = task_of(se);
2664
2665 if (tsk->state & TASK_INTERRUPTIBLE)
41acab88 2666 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
bf0f6f24 2667 if (tsk->state & TASK_UNINTERRUPTIBLE)
41acab88 2668 se->statistics.block_start = rq_of(cfs_rq)->clock;
bf0f6f24 2669 }
db36cc7d 2670#endif
67e9fb2a
PZ
2671 }
2672
2002c695 2673 clear_buddies(cfs_rq, se);
4793241b 2674
83b699ed 2675 if (se != cfs_rq->curr)
30cfdcfc 2676 __dequeue_entity(cfs_rq, se);
17bc14b7 2677 se->on_rq = 0;
30cfdcfc 2678 account_entity_dequeue(cfs_rq, se);
88ec22d3
PZ
2679
2680 /*
2681 * Normalize the entity after updating the min_vruntime because the
2682 * update can refer to the ->curr item and we need to reflect this
2683 * movement in our normalized position.
2684 */
371fd7e7 2685 if (!(flags & DEQUEUE_SLEEP))
88ec22d3 2686 se->vruntime -= cfs_rq->min_vruntime;
1e876231 2687
d8b4986d
PT
2688 /* return excess runtime on last dequeue */
2689 return_cfs_rq_runtime(cfs_rq);
2690
1e876231 2691 update_min_vruntime(cfs_rq);
17bc14b7 2692 update_cfs_shares(cfs_rq);
bf0f6f24
IM
2693}
2694
2695/*
2696 * Preempt the current task with a newly woken task if needed:
2697 */
7c92e54f 2698static void
2e09bf55 2699check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
bf0f6f24 2700{
11697830 2701 unsigned long ideal_runtime, delta_exec;
f4cfb33e
WX
2702 struct sched_entity *se;
2703 s64 delta;
11697830 2704
6d0f0ebd 2705 ideal_runtime = sched_slice(cfs_rq, curr);
11697830 2706 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
a9f3e2b5 2707 if (delta_exec > ideal_runtime) {
bf0f6f24 2708 resched_task(rq_of(cfs_rq)->curr);
a9f3e2b5
MG
2709 /*
2710 * The current task ran long enough, ensure it doesn't get
2711 * re-elected due to buddy favours.
2712 */
2713 clear_buddies(cfs_rq, curr);
f685ceac
MG
2714 return;
2715 }
2716
2717 /*
2718 * Ensure that a task that missed wakeup preemption by a
2719 * narrow margin doesn't have to wait for a full slice.
2720 * This also mitigates buddy induced latencies under load.
2721 */
f685ceac
MG
2722 if (delta_exec < sysctl_sched_min_granularity)
2723 return;
2724
f4cfb33e
WX
2725 se = __pick_first_entity(cfs_rq);
2726 delta = curr->vruntime - se->vruntime;
f685ceac 2727
f4cfb33e
WX
2728 if (delta < 0)
2729 return;
d7d82944 2730
f4cfb33e
WX
2731 if (delta > ideal_runtime)
2732 resched_task(rq_of(cfs_rq)->curr);
bf0f6f24
IM
2733}
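/*
 * Example: with two runnable nice-0 tasks and a 6ms latency target,
 * sched_slice() gives each an ideal_runtime of ~3ms, so the first test
 * reschedules current after ~3ms of execution.  The second test, skipped
 * until current has run at least sysctl_sched_min_granularity, preempts
 * current once its vruntime leads the leftmost entity by more than one
 * ideal_runtime.
 */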
2734
83b699ed 2735static void
8494f412 2736set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 2737{
83b699ed
SV
2738 /* 'current' is not kept within the tree. */
2739 if (se->on_rq) {
2740 /*
2741 * Any task has to be enqueued before it get to execute on
2742 * a CPU. So account for the time it spent waiting on the
2743 * runqueue.
2744 */
2745 update_stats_wait_end(cfs_rq, se);
2746 __dequeue_entity(cfs_rq, se);
6fa3eb70 2747 update_entity_load_avg(se, 1);
83b699ed
SV
2748 }
2749
79303e9e 2750 update_stats_curr_start(cfs_rq, se);
429d43bc 2751 cfs_rq->curr = se;
eba1ed4b
IM
2752#ifdef CONFIG_SCHEDSTATS
2753 /*
2754 * Track our maximum slice length, if the CPU's load is at
 2755 * least twice that of our own weight (i.e. don't track it
2756 * when there are only lesser-weight tasks around):
2757 */
495eca49 2758 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
41acab88 2759 se->statistics.slice_max = max(se->statistics.slice_max,
eba1ed4b
IM
2760 se->sum_exec_runtime - se->prev_sum_exec_runtime);
2761 }
2762#endif
4a55b450 2763 se->prev_sum_exec_runtime = se->sum_exec_runtime;
bf0f6f24
IM
2764}
2765
3f3a4904
PZ
2766static int
2767wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2768
ac53db59
RR
2769/*
2770 * Pick the next process, keeping these things in mind, in this order:
2771 * 1) keep things fair between processes/task groups
2772 * 2) pick the "next" process, since someone really wants that to run
2773 * 3) pick the "last" process, for cache locality
2774 * 4) do not run the "skip" process, if something else is available
2775 */
f4b6755f 2776static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
aa2ac252 2777{
ac53db59 2778 struct sched_entity *se = __pick_first_entity(cfs_rq);
f685ceac 2779 struct sched_entity *left = se;
f4b6755f 2780
ac53db59
RR
2781 /*
2782 * Avoid running the skip buddy, if running something else can
2783 * be done without getting too unfair.
2784 */
2785 if (cfs_rq->skip == se) {
2786 struct sched_entity *second = __pick_next_entity(se);
2787 if (second && wakeup_preempt_entity(second, left) < 1)
2788 se = second;
2789 }
aa2ac252 2790
f685ceac
MG
2791 /*
2792 * Prefer last buddy, try to return the CPU to a preempted task.
2793 */
2794 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
2795 se = cfs_rq->last;
2796
ac53db59
RR
2797 /*
2798 * Someone really wants this to run. If it's not unfair, run it.
2799 */
2800 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
2801 se = cfs_rq->next;
2802
f685ceac 2803 clear_buddies(cfs_rq, se);
4793241b
PZ
2804
2805 return se;
aa2ac252
PZ
2806}
2807
d3d9dc33
PT
2808static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2809
ab6cde26 2810static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
bf0f6f24
IM
2811{
2812 /*
2813 * If still on the runqueue then deactivate_task()
2814 * was not called and update_curr() has to be done:
2815 */
2816 if (prev->on_rq)
b7cc0896 2817 update_curr(cfs_rq);
bf0f6f24 2818
d3d9dc33
PT
2819 /* throttle cfs_rqs exceeding runtime */
2820 check_cfs_rq_runtime(cfs_rq);
2821
ddc97297 2822 check_spread(cfs_rq, prev);
30cfdcfc 2823 if (prev->on_rq) {
5870db5b 2824 update_stats_wait_start(cfs_rq, prev);
30cfdcfc
DA
2825 /* Put 'current' back into the tree. */
2826 __enqueue_entity(cfs_rq, prev);
9d85f21c 2827 /* in !on_rq case, update occurred at dequeue */
9ee474f5 2828 update_entity_load_avg(prev, 1);
30cfdcfc 2829 }
429d43bc 2830 cfs_rq->curr = NULL;
bf0f6f24
IM
2831}
2832
8f4d37ec
PZ
2833static void
2834entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
bf0f6f24 2835{
bf0f6f24 2836 /*
30cfdcfc 2837 * Update run-time statistics of the 'current'.
bf0f6f24 2838 */
30cfdcfc 2839 update_curr(cfs_rq);
bf0f6f24 2840
9d85f21c
PT
2841 /*
2842 * Ensure that runnable average is periodically updated.
2843 */
9ee474f5 2844 update_entity_load_avg(curr, 1);
aff3e498 2845 update_cfs_rq_blocked_load(cfs_rq, 1);
dead45bd 2846 update_cfs_shares(cfs_rq);
9d85f21c 2847
8f4d37ec
PZ
2848#ifdef CONFIG_SCHED_HRTICK
2849 /*
2850 * queued ticks are scheduled to match the slice, so don't bother
2851 * validating it and just reschedule.
2852 */
983ed7a6
HH
2853 if (queued) {
2854 resched_task(rq_of(cfs_rq)->curr);
2855 return;
2856 }
8f4d37ec
PZ
2857 /*
2858 * don't let the period tick interfere with the hrtick preemption
2859 */
2860 if (!sched_feat(DOUBLE_TICK) &&
2861 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
2862 return;
2863#endif
2864
2c2efaed 2865 if (cfs_rq->nr_running > 1)
2e09bf55 2866 check_preempt_tick(cfs_rq, curr);
bf0f6f24
IM
2867}
2868
ab84d31e
PT
2869
2870/**************************************************
2871 * CFS bandwidth control machinery
2872 */
2873
2874#ifdef CONFIG_CFS_BANDWIDTH
029632fb
PZ
2875
2876#ifdef HAVE_JUMP_LABEL
c5905afb 2877static struct static_key __cfs_bandwidth_used;
029632fb
PZ
2878
2879static inline bool cfs_bandwidth_used(void)
2880{
c5905afb 2881 return static_key_false(&__cfs_bandwidth_used);
029632fb
PZ
2882}
2883
9d80092f 2884void cfs_bandwidth_usage_inc(void)
029632fb 2885{
9d80092f
BS
2886 static_key_slow_inc(&__cfs_bandwidth_used);
2887}
2888
2889void cfs_bandwidth_usage_dec(void)
2890{
2891 static_key_slow_dec(&__cfs_bandwidth_used);
029632fb
PZ
2892}
2893#else /* HAVE_JUMP_LABEL */
2894static bool cfs_bandwidth_used(void)
2895{
2896 return true;
2897}
2898
9d80092f
BS
2899void cfs_bandwidth_usage_inc(void) {}
2900void cfs_bandwidth_usage_dec(void) {}
029632fb
PZ
2901#endif /* HAVE_JUMP_LABEL */
2902
ab84d31e
PT
2903/*
2904 * default period for cfs group bandwidth.
2905 * default: 0.1s, units: nanoseconds
2906 */
2907static inline u64 default_cfs_period(void)
2908{
2909 return 100000000ULL;
2910}
ec12cb7f
PT
2911
2912static inline u64 sched_cfs_bandwidth_slice(void)
2913{
2914 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2915}
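/*
 * For scale: with the 0.1s default period, a group whose quota is set to
 * 25ms may consume at most 25ms of CPU time per period (roughly 25% of one
 * CPU).  That global budget is handed out to per-cpu cfs_rqs in slices of
 * sysctl_sched_cfs_bandwidth_slice (5ms by default, so this helper usually
 * returns 5,000,000 ns), limiting contention on cfs_b->lock.
 */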
2916
a9cf55b2
PT
2917/*
2918 * Replenish runtime according to assigned quota and update expiration time.
2919 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2920 * additional synchronization around rq->lock.
2921 *
2922 * requires cfs_b->lock
2923 */
029632fb 2924void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
a9cf55b2
PT
2925{
2926 u64 now;
2927
2928 if (cfs_b->quota == RUNTIME_INF)
2929 return;
2930
2931 now = sched_clock_cpu(smp_processor_id());
2932 cfs_b->runtime = cfs_b->quota;
2933 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2934}
2935
029632fb
PZ
2936static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2937{
2938 return &tg->cfs_bandwidth;
2939}
2940
f1b17280
PT
2941/* rq->task_clock normalized against any time this cfs_rq has spent throttled */
2942static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2943{
2944 if (unlikely(cfs_rq->throttle_count))
2945 return cfs_rq->throttled_clock_task;
2946
2947 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2948}
2949
85dac906
PT
2950/* returns 0 on failure to allocate runtime */
2951static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
ec12cb7f
PT
2952{
2953 struct task_group *tg = cfs_rq->tg;
2954 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
a9cf55b2 2955 u64 amount = 0, min_amount, expires;
ec12cb7f
PT
2956
2957 /* note: this is a positive sum as runtime_remaining <= 0 */
2958 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2959
2960 raw_spin_lock(&cfs_b->lock);
2961 if (cfs_b->quota == RUNTIME_INF)
2962 amount = min_amount;
58088ad0 2963 else {
a9cf55b2
PT
2964 /*
2965 * If the bandwidth pool has become inactive, then at least one
2966 * period must have elapsed since the last consumption.
2967 * Refresh the global state and ensure bandwidth timer becomes
2968 * active.
2969 */
2970 if (!cfs_b->timer_active) {
2971 __refill_cfs_bandwidth_runtime(cfs_b);
58088ad0 2972 __start_cfs_bandwidth(cfs_b);
a9cf55b2 2973 }
58088ad0
PT
2974
2975 if (cfs_b->runtime > 0) {
2976 amount = min(cfs_b->runtime, min_amount);
2977 cfs_b->runtime -= amount;
2978 cfs_b->idle = 0;
2979 }
ec12cb7f 2980 }
a9cf55b2 2981 expires = cfs_b->runtime_expires;
ec12cb7f
PT
2982 raw_spin_unlock(&cfs_b->lock);
2983
2984 cfs_rq->runtime_remaining += amount;
a9cf55b2
PT
2985 /*
2986 * we may have advanced our local expiration to account for allowed
2987 * spread between our sched_clock and the one on which runtime was
2988 * issued.
2989 */
2990 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2991 cfs_rq->runtime_expires = expires;
85dac906
PT
2992
2993 return cfs_rq->runtime_remaining > 0;
ec12cb7f
PT
2994}
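/*
 * Note on min_amount above: runtime_remaining is <= 0 on entry, so the
 * request covers both the local deficit and one fresh slice; e.g. a cfs_rq
 * that is 2ms in debt with a 5ms slice asks the global pool for 7ms.
 */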
2995
a9cf55b2
PT
2996/*
2997 * Note: This depends on the synchronization provided by sched_clock and the
2998 * fact that rq->clock snapshots this value.
2999 */
3000static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
ec12cb7f 3001{
a9cf55b2
PT
3002 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3003 struct rq *rq = rq_of(cfs_rq);
3004
3005 /* if the deadline is ahead of our clock, nothing to do */
3006 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
ec12cb7f
PT
3007 return;
3008
a9cf55b2
PT
3009 if (cfs_rq->runtime_remaining < 0)
3010 return;
3011
3012 /*
3013 * If the local deadline has passed we have to consider the
3014 * possibility that our sched_clock is 'fast' and the global deadline
3015 * has not truly expired.
3016 *
 3017 * Fortunately we can determine whether this is the case by checking
3018 * whether the global deadline has advanced.
3019 */
3020
3021 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
3022 /* extend local deadline, drift is bounded above by 2 ticks */
3023 cfs_rq->runtime_expires += TICK_NSEC;
3024 } else {
3025 /* global deadline is ahead, expiration has passed */
3026 cfs_rq->runtime_remaining = 0;
3027 }
3028}
3029
3030static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
3031 unsigned long delta_exec)
3032{
3033 /* dock delta_exec before expiring quota (as it could span periods) */
ec12cb7f 3034 cfs_rq->runtime_remaining -= delta_exec;
a9cf55b2
PT
3035 expire_cfs_rq_runtime(cfs_rq);
3036
3037 if (likely(cfs_rq->runtime_remaining > 0))
ec12cb7f
PT
3038 return;
3039
85dac906
PT
3040 /*
3041 * if we're unable to extend our runtime we resched so that the active
3042 * hierarchy can be throttled
3043 */
3044 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3045 resched_task(rq_of(cfs_rq)->curr);
ec12cb7f
PT
3046}
3047
6c16a6dc
PZ
3048static __always_inline
3049void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
ec12cb7f 3050{
56f570e5 3051 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
ec12cb7f
PT
3052 return;
3053
3054 __account_cfs_rq_runtime(cfs_rq, delta_exec);
3055}
3056
85dac906
PT
3057static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3058{
56f570e5 3059 return cfs_bandwidth_used() && cfs_rq->throttled;
85dac906
PT
3060}
3061
64660c86
PT
3062/* check whether cfs_rq, or any parent, is throttled */
3063static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3064{
56f570e5 3065 return cfs_bandwidth_used() && cfs_rq->throttle_count;
64660c86
PT
3066}
3067
3068/*
3069 * Ensure that neither of the group entities corresponding to src_cpu or
3070 * dest_cpu are members of a throttled hierarchy when performing group
3071 * load-balance operations.
3072 */
3073static inline int throttled_lb_pair(struct task_group *tg,
3074 int src_cpu, int dest_cpu)
3075{
3076 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3077
3078 src_cfs_rq = tg->cfs_rq[src_cpu];
3079 dest_cfs_rq = tg->cfs_rq[dest_cpu];
3080
3081 return throttled_hierarchy(src_cfs_rq) ||
3082 throttled_hierarchy(dest_cfs_rq);
3083}
3084
3085/* updated child weight may affect parent so we have to do this bottom up */
3086static int tg_unthrottle_up(struct task_group *tg, void *data)
3087{
3088 struct rq *rq = data;
3089 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3090
3091 cfs_rq->throttle_count--;
3092#ifdef CONFIG_SMP
3093 if (!cfs_rq->throttle_count) {
f1b17280
PT
3094 /* adjust cfs_rq_clock_task() */
3095 cfs_rq->throttled_clock_task_time += rq->clock_task -
3096 cfs_rq->throttled_clock_task;
64660c86
PT
3097 }
3098#endif
3099
3100 return 0;
3101}
3102
3103static int tg_throttle_down(struct task_group *tg, void *data)
3104{
3105 struct rq *rq = data;
3106 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3107
82958366
PT
3108 /* group is entering throttled state, stop time */
3109 if (!cfs_rq->throttle_count)
f1b17280 3110 cfs_rq->throttled_clock_task = rq->clock_task;
64660c86
PT
3111 cfs_rq->throttle_count++;
3112
3113 return 0;
3114}
3115
d3d9dc33 3116static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
85dac906
PT
3117{
3118 struct rq *rq = rq_of(cfs_rq);
3119 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3120 struct sched_entity *se;
3121 long task_delta, dequeue = 1;
3122
3123 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3124
f1b17280 3125 /* freeze hierarchy runnable averages while throttled */
64660c86
PT
3126 rcu_read_lock();
3127 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3128 rcu_read_unlock();
85dac906
PT
3129
3130 task_delta = cfs_rq->h_nr_running;
3131 for_each_sched_entity(se) {
3132 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3133 /* throttled entity or throttle-on-deactivate */
3134 if (!se->on_rq)
3135 break;
3136
3137 if (dequeue)
3138 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3139 qcfs_rq->h_nr_running -= task_delta;
3140
3141 if (qcfs_rq->load.weight)
3142 dequeue = 0;
3143 }
3144
3145 if (!se)
3146 rq->nr_running -= task_delta;
3147
3148 cfs_rq->throttled = 1;
f1b17280 3149 cfs_rq->throttled_clock = rq->clock;
85dac906
PT
3150 raw_spin_lock(&cfs_b->lock);
3151 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
5232a719
BS
3152 if (!cfs_b->timer_active)
3153 __start_cfs_bandwidth(cfs_b);
85dac906
PT
3154 raw_spin_unlock(&cfs_b->lock);
3155}
3156
029632fb 3157void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
671fd9da
PT
3158{
3159 struct rq *rq = rq_of(cfs_rq);
3160 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3161 struct sched_entity *se;
3162 int enqueue = 1;
3163 long task_delta;
3164
6fa3eb70 3165 se = cfs_rq->tg->se[cpu_of(rq)];
671fd9da
PT
3166
3167 cfs_rq->throttled = 0;
3168 raw_spin_lock(&cfs_b->lock);
f1b17280 3169 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
671fd9da
PT
3170 list_del_rcu(&cfs_rq->throttled_list);
3171 raw_spin_unlock(&cfs_b->lock);
3172
64660c86
PT
3173 update_rq_clock(rq);
3174 /* update hierarchical throttle state */
3175 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3176
671fd9da
PT
3177 if (!cfs_rq->load.weight)
3178 return;
3179
3180 task_delta = cfs_rq->h_nr_running;
3181 for_each_sched_entity(se) {
3182 if (se->on_rq)
3183 enqueue = 0;
3184
3185 cfs_rq = cfs_rq_of(se);
3186 if (enqueue)
3187 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3188 cfs_rq->h_nr_running += task_delta;
3189
3190 if (cfs_rq_throttled(cfs_rq))
3191 break;
3192 }
3193
3194 if (!se)
3195 rq->nr_running += task_delta;
3196
3197 /* determine whether we need to wake up potentially idle cpu */
3198 if (rq->curr == rq->idle && rq->cfs.nr_running)
3199 resched_task(rq->curr);
3200}
3201
3202static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3203 u64 remaining, u64 expires)
3204{
3205 struct cfs_rq *cfs_rq;
3206 u64 runtime = remaining;
3207
3208 rcu_read_lock();
3209 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3210 throttled_list) {
3211 struct rq *rq = rq_of(cfs_rq);
3212
3213 raw_spin_lock(&rq->lock);
3214 if (!cfs_rq_throttled(cfs_rq))
3215 goto next;
3216
3217 runtime = -cfs_rq->runtime_remaining + 1;
3218 if (runtime > remaining)
3219 runtime = remaining;
3220 remaining -= runtime;
3221
3222 cfs_rq->runtime_remaining += runtime;
3223 cfs_rq->runtime_expires = expires;
3224
3225 /* we check whether we're throttled above */
3226 if (cfs_rq->runtime_remaining > 0)
3227 unthrottle_cfs_rq(cfs_rq);
3228
3229next:
3230 raw_spin_unlock(&rq->lock);
3231
3232 if (!remaining)
3233 break;
3234 }
3235 rcu_read_unlock();
3236
3237 return remaining;
3238}
3239
58088ad0
PT
3240/*
3241 * Responsible for refilling a task_group's bandwidth and unthrottling its
3242 * cfs_rqs as appropriate. If there has been no activity within the last
3243 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3244 * used to track this state.
3245 */
3246static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3247{
671fd9da
PT
3248 u64 runtime, runtime_expires;
3249 int idle = 1, throttled;
58088ad0
PT
3250
3251 raw_spin_lock(&cfs_b->lock);
3252 /* no need to continue the timer with no bandwidth constraint */
3253 if (cfs_b->quota == RUNTIME_INF)
3254 goto out_unlock;
3255
671fd9da
PT
3256 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3257 /* idle depends on !throttled (for the case of a large deficit) */
3258 idle = cfs_b->idle && !throttled;
e8da1b18 3259 cfs_b->nr_periods += overrun;
671fd9da 3260
a9cf55b2
PT
3261 /* if we're going inactive then everything else can be deferred */
3262 if (idle)
3263 goto out_unlock;
3264
9ca715c4
BS
3265 /*
3266 * if we have relooped after returning idle once, we need to update our
3267 * status as actually running, so that other cpus doing
3268 * __start_cfs_bandwidth will stop trying to cancel us.
3269 */
3270 cfs_b->timer_active = 1;
3271
a9cf55b2
PT
3272 __refill_cfs_bandwidth_runtime(cfs_b);
3273
671fd9da
PT
3274 if (!throttled) {
3275 /* mark as potentially idle for the upcoming period */
3276 cfs_b->idle = 1;
3277 goto out_unlock;
3278 }
3279
e8da1b18
NR
3280 /* account preceding periods in which throttling occurred */
3281 cfs_b->nr_throttled += overrun;
3282
671fd9da
PT
3283 /*
3284 * There are throttled entities so we must first use the new bandwidth
3285 * to unthrottle them before making it generally available. This
3286 * ensures that all existing debts will be paid before a new cfs_rq is
3287 * allowed to run.
3288 */
3289 runtime = cfs_b->runtime;
3290 runtime_expires = cfs_b->runtime_expires;
3291 cfs_b->runtime = 0;
3292
3293 /*
3294 * This check is repeated as we are holding onto the new bandwidth
3295 * while we unthrottle. This can potentially race with an unthrottled
3296 * group trying to acquire new bandwidth from the global pool.
3297 */
3298 while (throttled && runtime > 0) {
3299 raw_spin_unlock(&cfs_b->lock);
3300 /* we can't nest cfs_b->lock while distributing bandwidth */
3301 runtime = distribute_cfs_runtime(cfs_b, runtime,
3302 runtime_expires);
3303 raw_spin_lock(&cfs_b->lock);
3304
3305 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3306 }
58088ad0 3307
671fd9da
PT
3308 /* return (any) remaining runtime */
3309 cfs_b->runtime = runtime;
3310 /*
3311 * While we are ensured activity in the period following an
3312 * unthrottle, this also covers the case in which the new bandwidth is
3313 * insufficient to cover the existing bandwidth deficit. (Forcing the
3314 * timer to remain active while there are any throttled entities.)
3315 */
3316 cfs_b->idle = 0;
58088ad0
PT
3317out_unlock:
3318 if (idle)
3319 cfs_b->timer_active = 0;
3320 raw_spin_unlock(&cfs_b->lock);
3321
3322 return idle;
3323}
d3d9dc33 3324
d8b4986d
PT
3325/* a cfs_rq won't donate quota below this amount */
3326static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3327/* minimum remaining period time to redistribute slack quota */
3328static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3329/* how long we wait to gather additional slack before distributing */
3330static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
3331
373e0a59
BS
3332/*
3333 * Are we near the end of the current quota period?
3334 *
3335 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3336 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3337 * migrate_hrtimers, base is never cleared, so we are fine.
3338 */
d8b4986d
PT
3339static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3340{
3341 struct hrtimer *refresh_timer = &cfs_b->period_timer;
3342 u64 remaining;
3343
3344 /* if the call-back is running a quota refresh is already occurring */
3345 if (hrtimer_callback_running(refresh_timer))
3346 return 1;
3347
3348 /* is a quota refresh about to occur? */
3349 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3350 if (remaining < min_expire)
3351 return 1;
3352
3353 return 0;
3354}
3355
3356static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3357{
3358 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3359
3360 /* if there's a quota refresh soon don't bother with slack */
3361 if (runtime_refresh_within(cfs_b, min_left))
3362 return;
3363
3364 start_bandwidth_timer(&cfs_b->slack_timer,
3365 ns_to_ktime(cfs_bandwidth_slack_period));
3366}
3367
3368/* we know any runtime found here is valid as update_curr() precedes return */
3369static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3370{
3371 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3372 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3373
3374 if (slack_runtime <= 0)
3375 return;
3376
3377 raw_spin_lock(&cfs_b->lock);
3378 if (cfs_b->quota != RUNTIME_INF &&
3379 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3380 cfs_b->runtime += slack_runtime;
3381
3382 /* we are under rq->lock, defer unthrottling using a timer */
3383 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3384 !list_empty(&cfs_b->throttled_cfs_rq))
3385 start_cfs_slack_bandwidth(cfs_b);
3386 }
3387 raw_spin_unlock(&cfs_b->lock);
3388
3389 /* even if it's not valid for return we don't want to try again */
3390 cfs_rq->runtime_remaining -= slack_runtime;
3391}
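/*
 * Example: a cfs_rq emptying with 3ms of local runtime left donates
 * 3ms - min_cfs_rq_runtime = 2ms back to the global pool (provided the
 * quota is finite and the expiration stamps still match) and keeps 1ms
 * so that a quick re-wakeup does not immediately hit the slow path.
 */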
3392
3393static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3394{
56f570e5
PT
3395 if (!cfs_bandwidth_used())
3396 return;
3397
fccfdc6f 3398 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
d8b4986d
PT
3399 return;
3400
3401 __return_cfs_rq_runtime(cfs_rq);
3402}
3403
3404/*
3405 * This is done with a timer (instead of inline with bandwidth return) since
3406 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3407 */
3408static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3409{
3410 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3411 u64 expires;
3412
3413 /* confirm we're still not at a refresh boundary */
373e0a59
BS
3414 raw_spin_lock(&cfs_b->lock);
3415 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3416 raw_spin_unlock(&cfs_b->lock);
d8b4986d 3417 return;
373e0a59 3418 }
d8b4986d 3419
d8b4986d
PT
3420 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
3421 runtime = cfs_b->runtime;
3422 cfs_b->runtime = 0;
3423 }
3424 expires = cfs_b->runtime_expires;
3425 raw_spin_unlock(&cfs_b->lock);
3426
3427 if (!runtime)
3428 return;
3429
3430 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3431
3432 raw_spin_lock(&cfs_b->lock);
3433 if (expires == cfs_b->runtime_expires)
3434 cfs_b->runtime = runtime;
3435 raw_spin_unlock(&cfs_b->lock);
3436}
3437
d3d9dc33
PT
3438/*
3439 * When a group wakes up we want to make sure that its quota is not already
3440 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
 3441 * runtime as update_curr() throttling can not trigger until it's on-rq.
3442 */
3443static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3444{
56f570e5
PT
3445 if (!cfs_bandwidth_used())
3446 return;
3447
d3d9dc33
PT
3448 /* an active group must be handled by the update_curr()->put() path */
3449 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3450 return;
3451
3452 /* ensure the group is not already throttled */
3453 if (cfs_rq_throttled(cfs_rq))
3454 return;
3455
3456 /* update runtime allocation */
3457 account_cfs_rq_runtime(cfs_rq, 0);
3458 if (cfs_rq->runtime_remaining <= 0)
3459 throttle_cfs_rq(cfs_rq);
3460}
3461
3462/* conditionally throttle active cfs_rq's from put_prev_entity() */
3463static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3464{
56f570e5
PT
3465 if (!cfs_bandwidth_used())
3466 return;
3467
d3d9dc33
PT
3468 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3469 return;
3470
3471 /*
3472 * it's possible for a throttled entity to be forced into a running
3473 * state (e.g. set_curr_task), in this case we're finished.
3474 */
3475 if (cfs_rq_throttled(cfs_rq))
3476 return;
3477
3478 throttle_cfs_rq(cfs_rq);
3479}
029632fb
PZ
3480
3481static inline u64 default_cfs_period(void);
3482static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
3483static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
3484
3485static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
3486{
3487 struct cfs_bandwidth *cfs_b =
3488 container_of(timer, struct cfs_bandwidth, slack_timer);
3489 do_sched_cfs_slack_timer(cfs_b);
3490
3491 return HRTIMER_NORESTART;
3492}
3493
3494static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3495{
3496 struct cfs_bandwidth *cfs_b =
3497 container_of(timer, struct cfs_bandwidth, period_timer);
3498 ktime_t now;
3499 int overrun;
3500 int idle = 0;
3501
3502 for (;;) {
3503 now = hrtimer_cb_get_time(timer);
3504 overrun = hrtimer_forward(timer, now, cfs_b->period);
3505
3506 if (!overrun)
3507 break;
3508
3509 idle = do_sched_cfs_period_timer(cfs_b, overrun);
3510 }
3511
3512 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3513}
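/*
 * Note on the loop above: hrtimer_forward() returns how many whole periods
 * the timer had to be advanced past 'now'. A callback running on time
 * typically sees one pass with overrun > 0 followed by a pass returning 0,
 * which ends the loop; a callback delayed by several periods reports them
 * all in 'overrun' so the missed periods can be accounted (see
 * cfs_b->nr_throttled above).
 */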
3514
3515void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3516{
3517 raw_spin_lock_init(&cfs_b->lock);
3518 cfs_b->runtime = 0;
3519 cfs_b->quota = RUNTIME_INF;
3520 cfs_b->period = ns_to_ktime(default_cfs_period());
3521
3522 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
3523 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3524 cfs_b->period_timer.function = sched_cfs_period_timer;
3525 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3526 cfs_b->slack_timer.function = sched_cfs_slack_timer;
3527}
3528
3529static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3530{
3531 cfs_rq->runtime_enabled = 0;
3532 INIT_LIST_HEAD(&cfs_rq->throttled_list);
3533}
3534
3535/* requires cfs_b->lock, may release to reprogram timer */
3536void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3537{
3538 /*
3539 * The timer may be active because we're trying to set a new bandwidth
3540 * period or because we're racing with the tear-down path
3541 * (timer_active==0 becomes visible before the hrtimer call-back
3542 * terminates). In either case we ensure that it's re-programmed
3543 */
9ca715c4
BS
3544 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3545 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3546 /* bounce the lock to allow do_sched_cfs_period_timer to run */
029632fb 3547 raw_spin_unlock(&cfs_b->lock);
9ca715c4 3548 cpu_relax();
029632fb
PZ
3549 raw_spin_lock(&cfs_b->lock);
3550 /* if someone else restarted the timer then we're done */
3551 if (cfs_b->timer_active)
3552 return;
3553 }
3554
3555 cfs_b->timer_active = 1;
3556 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
3557}
3558
3559static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3560{
3561 hrtimer_cancel(&cfs_b->period_timer);
3562 hrtimer_cancel(&cfs_b->slack_timer);
3563}
3564
38dc3348 3565static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
029632fb
PZ
3566{
3567 struct cfs_rq *cfs_rq;
3568
3569 for_each_leaf_cfs_rq(rq, cfs_rq) {
3570 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3571
3572 if (!cfs_rq->runtime_enabled)
3573 continue;
3574
3575 /*
3576 * clock_task is not advancing so we just need to make sure
3577 * there's some valid quota amount
3578 */
3579 cfs_rq->runtime_remaining = cfs_b->quota;
3580 if (cfs_rq_throttled(cfs_rq))
3581 unthrottle_cfs_rq(cfs_rq);
3582 }
3583}
3584
3585#else /* CONFIG_CFS_BANDWIDTH */
f1b17280
PT
3586static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3587{
3588 return rq_of(cfs_rq)->clock_task;
3589}
3590
3591static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
3592 unsigned long delta_exec) {}
d3d9dc33
PT
3593static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3594static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
6c16a6dc 3595static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
85dac906
PT
3596
3597static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3598{
3599 return 0;
3600}
64660c86
PT
3601
3602static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3603{
3604 return 0;
3605}
3606
3607static inline int throttled_lb_pair(struct task_group *tg,
3608 int src_cpu, int dest_cpu)
3609{
3610 return 0;
3611}
029632fb
PZ
3612
3613void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3614
3615#ifdef CONFIG_FAIR_GROUP_SCHED
3616static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
ab84d31e
PT
3617#endif
3618
029632fb
PZ
3619static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3620{
3621 return NULL;
3622}
3623static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
a4c96ae3 3624static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
029632fb
PZ
3625
3626#endif /* CONFIG_CFS_BANDWIDTH */
3627
bf0f6f24
IM
3628/**************************************************
3629 * CFS operations on tasks:
3630 */
3631
8f4d37ec
PZ
3632#ifdef CONFIG_SCHED_HRTICK
3633static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3634{
8f4d37ec
PZ
3635 struct sched_entity *se = &p->se;
3636 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3637
3638 WARN_ON(task_rq(p) != rq);
3639
b39e66ea 3640 if (cfs_rq->nr_running > 1) {
8f4d37ec
PZ
3641 u64 slice = sched_slice(cfs_rq, se);
3642 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
3643 s64 delta = slice - ran;
3644
3645 if (delta < 0) {
3646 if (rq->curr == p)
3647 resched_task(p);
3648 return;
3649 }
3650
3651 /*
3652 * Don't schedule slices shorter than 10000ns, that just
3653 * doesn't make sense. Rely on vruntime for fairness.
3654 */
31656519 3655 if (rq->curr != p)
157124c1 3656 delta = max_t(s64, 10000LL, delta);
8f4d37ec 3657
31656519 3658 hrtick_start(rq, delta);
8f4d37ec
PZ
3659 }
3660}
a4c2f00f
PZ
3661
3662/*
3663 * called from enqueue/dequeue and updates the hrtick when the
3664 * current task is from our class and nr_running is low enough
3665 * to matter.
3666 */
3667static void hrtick_update(struct rq *rq)
3668{
3669 struct task_struct *curr = rq->curr;
3670
b39e66ea 3671 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
a4c2f00f
PZ
3672 return;
3673
3674 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
3675 hrtick_start_fair(rq, curr);
3676}
55e12e5e 3677#else /* !CONFIG_SCHED_HRTICK */
8f4d37ec
PZ
3678static inline void
3679hrtick_start_fair(struct rq *rq, struct task_struct *p)
3680{
3681}
a4c2f00f
PZ
3682
3683static inline void hrtick_update(struct rq *rq)
3684{
3685}
8f4d37ec
PZ
3686#endif
3687
6fa3eb70
S
3688#if defined(CONFIG_SCHED_HMP) || defined(CONFIG_MTK_SCHED_CMP)
3689
3690/* CPU cluster statistics for task migration control */
3691#define HMP_GB (0x1000)
3692#define HMP_SELECT_RQ (0x2000)
3693#define HMP_LB (0x4000)
3694#define HMP_MAX_LOAD (NICE_0_LOAD - 1)
3695
3696
3697struct clb_env {
3698 struct clb_stats bstats;
3699 struct clb_stats lstats;
3700 int btarget, ltarget;
3701
3702 struct cpumask *bcpus;
3703 struct cpumask *lcpus;
3704
3705 unsigned int flags;
3706 struct mcheck {
3707 int status; /* Details of this migration check */
3708 int result; /* Indicate whether we should perform this task migration */
3709 } mcheck;
3710};
3711
3712unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu);
3713
3714static void collect_cluster_stats(struct clb_stats *clbs,
3715 struct cpumask *cluster_cpus, int target)
3716{
3717#define HMP_RESOLUTION_SCALING (4)
3718#define hmp_scale_down(w) ((w) >> HMP_RESOLUTION_SCALING)
3719
 3720 /* Update cluster information */
3721 int cpu;
3722 for_each_cpu(cpu, cluster_cpus) {
3723 if(cpu_online(cpu)) {
3724 clbs->ncpu ++;
3725 clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running;
3726 clbs->load_avg += cpu_rq(cpu)->cfs.avg.load_avg_ratio;
3727#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
3728 clbs->nr_normal_prio_task += cfs_nr_normal_prio(cpu);
3729 clbs->nr_dequeuing_low_prio += cfs_nr_dequeuing_low_prio(cpu);
3730#endif
3731 }
3732 }
3733
3734 if(!clbs->ncpu || NR_CPUS == target || !cpumask_test_cpu(target,cluster_cpus))
3735 return;
3736
3737 clbs->cpu_power = (int) arch_scale_freq_power(NULL, target);
3738
3739 /* Scale current CPU compute capacity in accordance with frequency */
3740 clbs->cpu_capacity = HMP_MAX_LOAD;
3741#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
3742 if (hmp_data.freqinvar_load_scale_enabled) {
3743 cpu = cpumask_any(cluster_cpus);
3744 if (freq_scale[cpu].throttling == 1){
3745 clbs->cpu_capacity *= freq_scale[cpu].curr_scale;
3746 }else {
3747 clbs->cpu_capacity *= freq_scale[cpu].max;
3748 }
3749 clbs->cpu_capacity >>= SCHED_FREQSCALE_SHIFT;
3750
3751 if (clbs->cpu_capacity > HMP_MAX_LOAD){
3752 clbs->cpu_capacity = HMP_MAX_LOAD;
3753 }
3754 }
3755#elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
3756 if (topology_cpu_inv_power_en()) {
3757 cpu = cpumask_any(cluster_cpus);
3758 if (topology_cpu_throttling(cpu))
3759 clbs->cpu_capacity *=
3760 (topology_cpu_capacity(cpu) << CPUPOWER_FREQSCALE_SHIFT)
3761 / (topology_max_cpu_capacity(cpu)+1);
3762 else
3763 clbs->cpu_capacity *= topology_max_cpu_capacity(cpu);
3764 clbs->cpu_capacity >>= CPUPOWER_FREQSCALE_SHIFT;
3765
3766 if (clbs->cpu_capacity > HMP_MAX_LOAD){
3767 clbs->cpu_capacity = HMP_MAX_LOAD;
3768 }
3769 }
3770#endif
3771
3772 /*
3773 * Calculate available CPU capacity
3774 * Calculate available task space
3775 *
 3776 * Why should the load ratio be multiplied by the number of tasks?
 3777 * The task is the basic unit of scheduling, so the scheduler has to take
 3778 * it into account. Considering task load alone is not enough; multiplying
 3779 * by the number of tasks adjusts the load ratio to a more
 3780 * reasonable value.
3781 */
3782 clbs->load_avg /= clbs->ncpu;
3783 clbs->acap = clbs->cpu_capacity - cpu_rq(target)->cfs.avg.load_avg_ratio;
3784 clbs->scaled_acap = hmp_scale_down(clbs->acap);
3785 clbs->scaled_atask = cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.load_avg_ratio;
3786 clbs->scaled_atask = clbs->cpu_capacity - clbs->scaled_atask;
3787 clbs->scaled_atask = hmp_scale_down(clbs->scaled_atask);
3788
3789 mt_sched_printf("[%s] cpu/cluster:%d/%02lx load/len:%lu/%u stats:%d,%d,%d,%d,%d,%d,%d,%d\n", __func__,
3790 target, *cpumask_bits(cluster_cpus),
3791 cpu_rq(target)->cfs.avg.load_avg_ratio, cpu_rq(target)->cfs.h_nr_running,
3792 clbs->ncpu, clbs->ntask, clbs->load_avg, clbs->cpu_capacity,
3793 clbs->acap, clbs->scaled_acap, clbs->scaled_atask, clbs->threshold);
3794}
3795
3796//#define USE_HMP_DYNAMIC_THRESHOLD
3797#if defined(CONFIG_SCHED_HMP) && defined(USE_HMP_DYNAMIC_THRESHOLD)
3798static inline void hmp_dynamic_threshold(struct clb_env *clbenv);
3799#endif
3800
3801/*
3802 * Task Dynamic Migration Threshold Adjustment.
3803 *
3804 * If the workload between clusters is not balanced, adjust migration
3805 * threshold in an attempt to move task precisely.
3806 *
3807 * Diff. = Max Threshold - Min Threshold
3808 *
3809 * Dynamic UP-Threshold =
3810 * B_nacap B_natask
3811 * Max Threshold - Diff. x ----------------- x -------------------
3812 * B_nacap + L_nacap B_natask + L_natask
3813 *
3814 *
3815 * Dynamic Down-Threshold =
3816 * L_nacap L_natask
3817 * Min Threshold + Diff. x ----------------- x -------------------
3818 * B_nacap + L_nacap B_natask + L_natask
3819 */
3820static void adj_threshold(struct clb_env *clbenv)
3821{
3822#define TSKLD_SHIFT (2)
3823#define POSITIVE(x) ((int)(x) < 0 ? 0 : (x))
3824
3825 int bcpu, lcpu;
3826 unsigned long b_cap=0, l_cap=0;
3827 unsigned long b_load=0, l_load=0;
3828 unsigned long b_task=0, l_task=0;
3829 int b_nacap, l_nacap, b_natask, l_natask;
3830
3831#if defined(CONFIG_SCHED_HMP) && defined(USE_HMP_DYNAMIC_THRESHOLD)
3832 hmp_dynamic_threshold(clbenv);
3833 return;
3834#endif
3835
3836 bcpu = clbenv->btarget;
3837 lcpu = clbenv->ltarget;
3838 if (bcpu < nr_cpu_ids) {
3839 b_load = cpu_rq(bcpu)->cfs.avg.load_avg_ratio;
3840 b_task = cpu_rq(bcpu)->cfs.h_nr_running;
3841 }
3842 if (lcpu < nr_cpu_ids) {
3843 l_load = cpu_rq(lcpu)->cfs.avg.load_avg_ratio;
3844 l_task = cpu_rq(lcpu)->cfs.h_nr_running;
3845 }
3846
3847#ifdef CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY
3848 if (bcpu < nr_cpu_ids) {
3849 b_cap = topology_cpu_capacity(bcpu);
3850 }
3851 if (lcpu < nr_cpu_ids) {
3852 l_cap = topology_cpu_capacity(lcpu);
3853 }
3854
3855 b_nacap = POSITIVE(b_cap - b_load);
3856 b_natask = POSITIVE(b_cap - ((b_task * b_load) >> TSKLD_SHIFT));
3857 l_nacap = POSITIVE(l_cap - l_load);
3858 l_natask = POSITIVE(l_cap - ((l_task * l_load) >> TSKLD_SHIFT));
3859#else /* !CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY */
3860 b_cap = clbenv->bstats.cpu_power;
3861 l_cap = clbenv->lstats.cpu_power;
3862 b_nacap = POSITIVE(clbenv->bstats.scaled_acap *
3863 clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1));
3864 b_natask = POSITIVE(clbenv->bstats.scaled_atask *
3865 clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1));
3866 l_nacap = POSITIVE(clbenv->lstats.scaled_acap);
3867 l_natask = POSITIVE(clbenv->bstats.scaled_atask);
3868
3869#endif /* CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY */
3870
3871 clbenv->bstats.threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask /
3872 ((b_nacap + l_nacap) * (b_natask + l_natask)+1);
3873 clbenv->lstats.threshold = HMP_MAX_LOAD * l_nacap * l_natask /
3874 ((b_nacap + l_nacap) * (b_natask + l_natask)+1);
3875
3876 mt_sched_printf("[%s]\tup/dl:%4d/%4d L(%d:%4lu,%4lu/%4lu) b(%d:%4lu,%4lu/%4lu)\n", __func__,
3877 clbenv->bstats.threshold, clbenv->lstats.threshold,
3878 lcpu, l_load, l_task, l_cap,
3879 bcpu, b_load, b_task, b_cap);
3880}
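/*
 * Worked example (illustrative numbers; HMP_MAX_LOAD is NICE_0_LOAD - 1,
 * i.e. 1023 under the usual load scale): with two perfectly balanced
 * clusters (b_nacap == l_nacap, b_natask == l_natask) each product term is
 * about 1/4 of the denominator, giving an up-threshold of roughly
 * 1023 - 1023/4 ~= 767 and a down-threshold of roughly 1023/4 ~= 255.
 * As the big cluster fills up, b_nacap and b_natask shrink, pushing the
 * up-threshold towards HMP_MAX_LOAD (up-migration gets harder) and the
 * down-threshold up as well (down-migration gets easier).
 */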
3881
3882static void sched_update_clbstats(struct clb_env *clbenv)
3883{
3884 collect_cluster_stats(&clbenv->bstats, clbenv->bcpus, clbenv->btarget);
3885 collect_cluster_stats(&clbenv->lstats, clbenv->lcpus, clbenv->ltarget);
3886 adj_threshold(clbenv);
3887}
3888#endif /* #if defined(CONFIG_SCHED_HMP) || defined(CONFIG_SCHED_CMP) */
3889
3890
3891#ifdef CONFIG_SCHED_HMP
3892/*
 3893 * Heterogeneous multiprocessor (HMP) optimizations
3894 *
3895 * The cpu types are distinguished using a list of hmp_domains
3896 * which each represent one cpu type using a cpumask.
3897 * The list is assumed ordered by compute capacity with the
3898 * fastest domain first.
3899 */
3900DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
3901/* We need to know which cpus are fast and slow. */
3902extern struct cpumask hmp_fast_cpu_mask;
3903extern struct cpumask hmp_slow_cpu_mask;
3904
3905extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
3906
3907/* Setup hmp_domains */
3908static int __init hmp_cpu_mask_setup(void)
3909{
3910 char buf[64];
3911 struct hmp_domain *domain;
3912 struct list_head *pos;
3913 int dc, cpu;
3914
3915#if defined(CONFIG_SCHED_HMP_ENHANCEMENT) || \
3916 defined(CONFIG_MT_RT_SCHED) || defined(CONFIG_MT_RT_SCHED_LOG)
3917 cpumask_clear(&hmp_fast_cpu_mask);
3918 cpumask_clear(&hmp_slow_cpu_mask);
3919#endif
3920
3921 pr_debug("Initializing HMP scheduler:\n");
3922
3923 /* Initialize hmp_domains using platform code */
3924 arch_get_hmp_domains(&hmp_domains);
3925 if (list_empty(&hmp_domains)) {
3926 pr_debug("HMP domain list is empty!\n");
3927 return 0;
3928 }
3929
3930 /* Print hmp_domains */
3931 dc = 0;
3932 list_for_each(pos, &hmp_domains) {
3933 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3934 cpulist_scnprintf(buf, 64, &domain->possible_cpus);
3935 pr_debug(" HMP domain %d: %s\n", dc, buf);
3936
3937 /*
 3938 * According to the description in "arch_get_hmp_domains",
 3939 * the fastest domain is at the head of the list. Thus, the fast-cpu mask
 3940 * should be initialized first, followed by the slow-cpu mask.
3941 */
3942#if defined(CONFIG_SCHED_HMP_ENHANCEMENT) || \
3943 defined(CONFIG_MT_RT_SCHED) || defined(CONFIG_MT_RT_SCHED_LOG)
3944 if(cpumask_empty(&hmp_fast_cpu_mask)) {
3945 cpumask_copy(&hmp_fast_cpu_mask,&domain->possible_cpus);
3946 for_each_cpu(cpu, &hmp_fast_cpu_mask)
3947 pr_debug(" HMP fast cpu : %d\n",cpu);
3948 } else if (cpumask_empty(&hmp_slow_cpu_mask)){
3949 cpumask_copy(&hmp_slow_cpu_mask,&domain->possible_cpus);
3950 for_each_cpu(cpu, &hmp_slow_cpu_mask)
3951 pr_debug(" HMP slow cpu : %d\n",cpu);
3952 }
3953#endif
3954
3955 for_each_cpu_mask(cpu, domain->possible_cpus) {
3956 per_cpu(hmp_cpu_domain, cpu) = domain;
3957 }
3958 dc++;
3959 }
3960
3961 return 1;
3962}
3963
3964static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu)
3965{
3966 struct hmp_domain *domain;
3967 struct list_head *pos;
3968
3969 list_for_each(pos, &hmp_domains) {
3970 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3971 if(cpumask_test_cpu(cpu, &domain->possible_cpus))
3972 return domain;
3973 }
3974 return NULL;
3975}
3976
3977static void hmp_online_cpu(int cpu)
3978{
3979 struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
3980
3981 if(domain)
3982 cpumask_set_cpu(cpu, &domain->cpus);
3983}
3984
3985static void hmp_offline_cpu(int cpu)
3986{
3987 struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
3988
3989 if(domain)
3990 cpumask_clear_cpu(cpu, &domain->cpus);
3991}
3992
3993/*
3994 * Migration thresholds should be in the range [0..1023]
3995 * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
3996 * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
3997 * The default values (512, 256) offer good responsiveness, but may need
 3998 * tweaking to suit particular needs.
 3999 *
 4000 * hmp_up_prio: only up-migrate tasks with high priority (< hmp_up_prio)
4001 * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
4002 * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
4003 */
4004#ifdef CONFIG_HMP_DYNAMIC_THRESHOLD
4005unsigned int hmp_up_threshold = 1023;
4006unsigned int hmp_down_threshold = 0;
4007#else
4008unsigned int hmp_up_threshold = 512;
4009unsigned int hmp_down_threshold = 256;
4010#endif
4011
4012unsigned int hmp_next_up_threshold = 4096;
4013unsigned int hmp_next_down_threshold = 4096;
4014#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
4015#define hmp_last_up_migration(cpu) \
4016 cpu_rq(cpu)->cfs.avg.hmp_last_up_migration
4017#define hmp_last_down_migration(cpu) \
4018 cpu_rq(cpu)->cfs.avg.hmp_last_down_migration
4019static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p,
4020 int prev_cpu, int new_cpu);
4021#else
4022static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se);
4023static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
4024#endif
4025static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
4026 int *min_cpu);
4027
4028/* Check if cpu is in fastest hmp_domain */
4029static inline unsigned int hmp_cpu_is_fastest(int cpu)
4030{
4031 struct list_head *pos;
4032
4033 pos = &hmp_cpu_domain(cpu)->hmp_domains;
4034 return pos == hmp_domains.next;
4035}
4036
4037/* Check if cpu is in slowest hmp_domain */
4038static inline unsigned int hmp_cpu_is_slowest(int cpu)
4039{
4040 struct list_head *pos;
4041
4042 pos = &hmp_cpu_domain(cpu)->hmp_domains;
4043 return list_is_last(pos, &hmp_domains);
4044}
4045
4046/* Next (slower) hmp_domain relative to cpu */
4047static inline struct hmp_domain *hmp_slower_domain(int cpu)
4048{
4049 struct list_head *pos;
4050
4051 pos = &hmp_cpu_domain(cpu)->hmp_domains;
4052 return list_entry(pos->next, struct hmp_domain, hmp_domains);
4053}
4054
4055/* Previous (faster) hmp_domain relative to cpu */
4056static inline struct hmp_domain *hmp_faster_domain(int cpu)
4057{
4058 struct list_head *pos;
4059
4060 pos = &hmp_cpu_domain(cpu)->hmp_domains;
4061 return list_entry(pos->prev, struct hmp_domain, hmp_domains);
4062}
4063
4064/*
4065 * Selects a cpu in previous (faster) hmp_domain
4066 * Note that cpumask_any_and() returns the first cpu in the cpumask
4067 */
4068static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
4069 int cpu)
4070{
4071 int lowest_cpu=NR_CPUS;
4072 __always_unused int lowest_ratio = hmp_domain_min_load(hmp_faster_domain(cpu), &lowest_cpu);
4073 /*
 4074 * If the lowest-loaded CPU in the domain is allowed by the task's affinity,
 4075 * select that one; otherwise select any CPU which is allowed
4076 */
4077 if(lowest_cpu < nr_cpu_ids && cpumask_test_cpu(lowest_cpu,tsk_cpus_allowed(tsk)))
4078 return lowest_cpu;
4079 else
4080 return cpumask_any_and(&hmp_faster_domain(cpu)->cpus,
4081 tsk_cpus_allowed(tsk));
4082}
4083
4084/*
4085 * Selects a cpu in next (slower) hmp_domain
4086 * Note that cpumask_any_and() returns the first cpu in the cpumask
4087 */
4088static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
4089 int cpu)
4090{
4091 int lowest_cpu=NR_CPUS;
4092 __always_unused int lowest_ratio = hmp_domain_min_load(hmp_slower_domain(cpu), &lowest_cpu);
4093 /*
 4094 * If the lowest-loaded CPU in the domain is allowed by the task's affinity,
 4095 * select that one; otherwise select any CPU which is allowed
4096 */
4097 if(lowest_cpu < nr_cpu_ids && cpumask_test_cpu(lowest_cpu,tsk_cpus_allowed(tsk)))
4098 return lowest_cpu;
4099 else
4100 return cpumask_any_and(&hmp_slower_domain(cpu)->cpus,
4101 tsk_cpus_allowed(tsk));
4102}
4103
4104static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
4105{
4106#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
4107 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4108 hmp_last_up_migration(cpu) = cfs_rq_clock_task(cfs_rq);
4109 hmp_last_down_migration(cpu) = 0;
4110#else
4111 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4112
4113 se->avg.hmp_last_up_migration = cfs_rq_clock_task(cfs_rq);
4114 se->avg.hmp_last_down_migration = 0;
4115#endif
4116}
4117
4118static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
4119{
4120#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
4121 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4122 hmp_last_down_migration(cpu) = cfs_rq_clock_task(cfs_rq);
4123 hmp_last_up_migration(cpu) = 0;
4124#else
4125 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4126
4127 se->avg.hmp_last_down_migration = cfs_rq_clock_task(cfs_rq);
4128 se->avg.hmp_last_up_migration = 0;
4129#endif
4130}
4131
4132#ifdef CONFIG_HMP_VARIABLE_SCALE
4133/*
 4134 * Heterogeneous multiprocessor (HMP) optimizations
4135 *
 4136 * These functions allow changing the growth rate of the load_avg_ratio;
 4137 * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms.
 4138 * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
 4139 *
 4140 * These functions also allow changing the up and down thresholds of HMP
 4141 * using /sys/kernel/hmp/{up,down}_threshold.
 4142 * Both must be between 0 and 1023. The threshold that is compared
 4143 * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
 4144 *
 4145 * For instance, if load_avg_period = 64 and up_threshold = 512, an idle
 4146 * task with a load of 0 will reach the threshold after 64ms of busy looping.
 4147 *
 4148 * Changing load_avg_period_ms has the same effect as changing the
 4149 * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
 4150 * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the latter
 4151 * could trigger overflows.
 4152 * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
 4153 * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
 4154 * could overflow for a weight > 2^12 even if the load_avg_contrib
 4155 * should still be a 32-bit result. This would not happen when multiplying
 4156 * the delta time by 1/22 and setting load_avg_period_ms = 706.
4157 */
4158
4159/*
 4160 * By scaling the delta time this ends up increasing or decreasing the
 4161 * growing speed of the per-entity load_avg_ratio.
 4162 * The scale factor hmp_data.multiplier is a fixed-point
4163 * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
4164 */
4165static u64 hmp_variable_scale_convert(u64 delta)
4166{
4167 u64 high = delta >> 32ULL;
4168 u64 low = delta & 0xffffffffULL;
4169 low *= hmp_data.multiplier;
4170 high *= hmp_data.multiplier;
4171 return (low >> HMP_VARIABLE_SCALE_SHIFT)
4172 + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
4173}
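/*
 * Worked example: hmp_data.multiplier is produced by
 * hmp_period_tofrom_sysfs(), i.e.
 * (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / load_avg_period_ms.
 * With load_avg_period_ms == LOAD_AVG_PERIOD the multiplier is exactly
 * 1 << HMP_VARIABLE_SCALE_SHIFT and the conversion above is the identity
 * (each 32-bit half is multiplied by 2^SHIFT and shifted straight back).
 * Halving load_avg_period_ms doubles the multiplier, so delta is doubled
 * and the load_avg_ratio ramps twice as fast.
 */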
4174
4175static ssize_t hmp_show(struct kobject *kobj,
4176 struct attribute *attr, char *buf)
4177{
4178 ssize_t ret = 0;
4179 struct hmp_global_attr *hmp_attr =
4180 container_of(attr, struct hmp_global_attr, attr);
4181 int temp = *(hmp_attr->value);
4182 if (hmp_attr->to_sysfs != NULL)
4183 temp = hmp_attr->to_sysfs(temp);
4184 ret = sprintf(buf, "%d\n", temp);
4185 return ret;
4186}
4187
4188static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
4189 const char *buf, size_t count)
4190{
4191 int temp;
4192 ssize_t ret = count;
4193 struct hmp_global_attr *hmp_attr =
4194 container_of(attr, struct hmp_global_attr, attr);
4195 char *str = vmalloc(count + 1);
4196 if (str == NULL)
4197 return -ENOMEM;
4198 memcpy(str, buf, count);
4199 str[count] = 0;
4200 if (sscanf(str, "%d", &temp) < 1)
4201 ret = -EINVAL;
4202 else {
4203 if (hmp_attr->from_sysfs != NULL)
4204 temp = hmp_attr->from_sysfs(temp);
4205 if (temp < 0)
4206 ret = -EINVAL;
4207 else
4208 *(hmp_attr->value) = temp;
4209 }
4210 vfree(str);
4211 return ret;
4212}
4213
4214static int hmp_period_tofrom_sysfs(int value)
4215{
4216 return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
4217}
4218
4219/* max value for threshold is 1024 */
4220static int hmp_theshold_from_sysfs(int value)
4221{
4222 if (value > 1024)
4223 return -1;
4224 return value;
4225}
4226#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
4227/* freqinvar control is only 0,1 off/on */
4228static int hmp_freqinvar_from_sysfs(int value)
4229{
4230 if (value < 0 || value > 1)
4231 return -1;
4232 return value;
4233}
4234#endif
4235static void hmp_attr_add(
4236 const char *name,
4237 int *value,
4238 int (*to_sysfs)(int),
4239 int (*from_sysfs)(int))
4240{
4241 int i = 0;
4242 while (hmp_data.attributes[i] != NULL) {
4243 i++;
4244 if (i >= HMP_DATA_SYSFS_MAX)
4245 return;
4246 }
4247 hmp_data.attr[i].attr.mode = 0644;
4248 hmp_data.attr[i].show = hmp_show;
4249 hmp_data.attr[i].store = hmp_store;
4250 hmp_data.attr[i].attr.name = name;
4251 hmp_data.attr[i].value = value;
4252 hmp_data.attr[i].to_sysfs = to_sysfs;
4253 hmp_data.attr[i].from_sysfs = from_sysfs;
4254 hmp_data.attributes[i] = &hmp_data.attr[i].attr;
4255 hmp_data.attributes[i + 1] = NULL;
4256}
4257
4258static int hmp_attr_init(void)
4259{
4260 int ret;
 4261 memset(&hmp_data, 0, sizeof(hmp_data));
4262 /* by default load_avg_period_ms == LOAD_AVG_PERIOD
4263 * meaning no change
4264 */
 4265 /* LOAD_AVG_PERIOD is too short to trigger the heavy-task indicator,
 4266 so we change it to LOAD_AVG_VARIABLE_PERIOD */
4267 hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_VARIABLE_PERIOD);
4268
4269 hmp_attr_add("load_avg_period_ms",
4270 &hmp_data.multiplier,
4271 hmp_period_tofrom_sysfs,
4272 hmp_period_tofrom_sysfs);
4273 hmp_attr_add("up_threshold",
4274 &hmp_up_threshold,
4275 NULL,
4276 hmp_theshold_from_sysfs);
4277 hmp_attr_add("down_threshold",
4278 &hmp_down_threshold,
4279 NULL,
4280 hmp_theshold_from_sysfs);
4281 hmp_attr_add("init_task_load_period",
4282 &init_task_load_period,
4283 NULL,
4284 NULL);
4285#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
4286 /* default frequency-invariant scaling ON */
4287 hmp_data.freqinvar_load_scale_enabled = 1;
4288 hmp_attr_add("frequency_invariant_load_scale",
4289 &hmp_data.freqinvar_load_scale_enabled,
4290 NULL,
4291 hmp_freqinvar_from_sysfs);
4292#endif
4293 hmp_data.attr_group.name = "hmp";
4294 hmp_data.attr_group.attrs = hmp_data.attributes;
4295 ret = sysfs_create_group(kernel_kobj,
4296 &hmp_data.attr_group);
4297 return 0;
4298}
4299late_initcall(hmp_attr_init);
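/*
 * Usage sketch (paths as registered above; the values shown are only an
 * illustration):
 *
 *	# cat /sys/kernel/hmp/up_threshold
 *	512
 *	# echo 640 > /sys/kernel/hmp/up_threshold
 *
 * Writes go through hmp_store(); anything a from_sysfs() hook rejects
 * (e.g. a threshold above 1024 in hmp_theshold_from_sysfs()) is returned
 * as -EINVAL and leaves the tunable unchanged.
 */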
4300#endif /* CONFIG_HMP_VARIABLE_SCALE */
4301
4302static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
4303 int *min_cpu)
4304{
4305 int cpu;
4306 int min_cpu_runnable_temp = NR_CPUS;
4307 unsigned long min_runnable_load = INT_MAX;
4308 unsigned long contrib;
4309
4310 for_each_cpu_mask(cpu, hmpd->cpus) {
4311 /* don't use the divisor in the loop, just at the end */
4312 contrib = cpu_rq(cpu)->avg.runnable_avg_sum * scale_load_down(1024);
4313 if (contrib < min_runnable_load) {
4314 min_runnable_load = contrib;
4315 min_cpu_runnable_temp = cpu;
4316 }
4317 }
4318
4319 if (min_cpu)
4320 *min_cpu = min_cpu_runnable_temp;
4321
4322 /* domain will often have at least one empty CPU */
4323 return min_runnable_load ? min_runnable_load / (LOAD_AVG_MAX + 1) : 0;
4324}
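/*
 * Example: an idle CPU has runnable_avg_sum == 0, so the domain minimum is
 * reported as 0 and that CPU comes back through *min_cpu. When no CPU in
 * the domain is idle, the return value is the smallest per-cpu contribution
 * scaled into the usual 0..1023 load range (the division by LOAD_AVG_MAX + 1
 * is done once, after the loop, as the comment above notes).
 */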
4325
4326/*
4327 * Calculate the task starvation
 4328 * This is the ratio of time actually spent running vs. runnable time.
4329 * If the two are equal the task is getting the cpu time it needs or
4330 * it is alone on the cpu and the cpu is fully utilized.
4331 */
4332static inline unsigned int hmp_task_starvation(struct sched_entity *se)
4333{
4334 u32 starvation;
4335
4336 starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD);
4337 starvation /= (se->avg.runnable_avg_sum + 1);
4338
4339 return scale_load(starvation);
4340}
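/*
 * Example (under the usual 1024 load scale): if usage_avg_sum equals
 * runnable_avg_sum the task ran whenever it was runnable and the starvation
 * value is close to NICE_0_LOAD; a value of 768 means the task only ran for
 * about 75% of the time it was runnable, which is the cut-off
 * hmp_offload_down() below uses when deciding whether the task is starved.
 */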
4341
4342static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se)
4343{
4344 int min_usage;
4345 int dest_cpu = NR_CPUS;
4346
4347 if (hmp_cpu_is_slowest(cpu))
4348 return NR_CPUS;
4349
4350 /* Is the current domain fully loaded? */
4351 /* load < ~50% */
4352 min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL);
4353 if (min_usage < (NICE_0_LOAD>>1))
4354 return NR_CPUS;
4355
4356 /* Is the task alone on the cpu? */
4357 if (cpu_rq(cpu)->cfs.nr_running < 2)
4358 return NR_CPUS;
4359
4360 /* Is the task actually starving? */
 4361 /* waiting >= 25% of its runnable time means the task is starving */
4362 if (hmp_task_starvation(se) > 768)
4363 return NR_CPUS;
4364
4365 /* Does the slower domain have spare cycles? */
4366 min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu);
4367 /* load > 50% */
4368 if (min_usage > NICE_0_LOAD/2)
4369 return NR_CPUS;
4370
4371 if (cpumask_test_cpu(dest_cpu, &hmp_slower_domain(cpu)->cpus))
4372 return dest_cpu;
4373
4374 return NR_CPUS;
4375}
4376#endif /* CONFIG_SCHED_HMP */
4377
4378
4379#ifdef CONFIG_MTK_SCHED_CMP
 4380/* CMP up/down migration thresholds (same 0..1023 scale as the HMP thresholds above) */
4381unsigned int cmp_up_threshold = 512;
4382unsigned int cmp_down_threshold = 256;
4383#endif /* CONFIG_MTK_SCHED_CMP */
4384
4385#ifdef CONFIG_MTK_SCHED_CMP_TGS
4386static void sched_tg_enqueue_fair(struct rq *rq, struct task_struct *p)
4387{
4388 int id;
4389 unsigned long flags;
4390 struct task_struct *tg = p->group_leader;
4391
4392 if (group_leader_is_empty(p))
4393 return;
4394 id = get_cluster_id(rq->cpu);
4395 if (unlikely(WARN_ON(id < 0)))
4396 return;
4397
4398 raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags);
4399 tg->thread_group_info[id].cfs_nr_running++;
4400 raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags);
4401}
4402
4403static void sched_tg_dequeue_fair(struct rq *rq, struct task_struct *p)
4404{
4405 int id;
4406 unsigned long flags;
4407 struct task_struct *tg = p->group_leader;
4408
4409 if (group_leader_is_empty(p))
4410 return;
4411 id = get_cluster_id(rq->cpu);
4412 if (unlikely(WARN_ON(id < 0)))
4413 return;
4414
4415 raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags);
4416 tg->thread_group_info[id].cfs_nr_running--;
4417 raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags);
4418}
4419
4420#endif
bf0f6f24
IM
4421/*
4422 * The enqueue_task method is called before nr_running is
4423 * increased. Here we update the fair scheduling stats and
4424 * then put the task into the rbtree:
4425 */
ea87bb78 4426static void
371fd7e7 4427enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24
IM
4428{
4429 struct cfs_rq *cfs_rq;
62fb1851 4430 struct sched_entity *se = &p->se;
bf0f6f24
IM
4431
4432 for_each_sched_entity(se) {
62fb1851 4433 if (se->on_rq)
bf0f6f24
IM
4434 break;
4435 cfs_rq = cfs_rq_of(se);
88ec22d3 4436 enqueue_entity(cfs_rq, se, flags);
85dac906
PT
4437
4438 /*
4439 * end evaluation on encountering a throttled cfs_rq
4440 *
4441 * note: in the case of encountering a throttled cfs_rq we will
4442 * post the final h_nr_running increment below.
4443 */
4444 if (cfs_rq_throttled(cfs_rq))
4445 break;
953bfcd1 4446 cfs_rq->h_nr_running++;
85dac906 4447
88ec22d3 4448 flags = ENQUEUE_WAKEUP;
bf0f6f24 4449 }
8f4d37ec 4450
2069dd75 4451 for_each_sched_entity(se) {
0f317143 4452 cfs_rq = cfs_rq_of(se);
953bfcd1 4453 cfs_rq->h_nr_running++;
2069dd75 4454
85dac906
PT
4455 if (cfs_rq_throttled(cfs_rq))
4456 break;
4457
17bc14b7 4458 update_cfs_shares(cfs_rq);
9ee474f5 4459 update_entity_load_avg(se, 1);
2069dd75
PZ
4460 }
4461
18bf2805
BS
4462 if (!se) {
4463 update_rq_runnable_avg(rq, rq->nr_running);
85dac906 4464 inc_nr_running(rq);
6fa3eb70
S
4465#ifndef CONFIG_CFS_BANDWIDTH
4466 BUG_ON(rq->cfs.nr_running > rq->cfs.h_nr_running);
4467#endif
18bf2805 4468 }
a4c2f00f 4469 hrtick_update(rq);
6fa3eb70
S
4470#ifdef CONFIG_HMP_TRACER
4471 trace_sched_runqueue_length(rq->cpu,rq->nr_running);
4472 trace_sched_cfs_length(rq->cpu,rq->cfs.h_nr_running);
4473#endif
4474#ifdef CONFIG_MET_SCHED_HMP
4475 RqLen(rq->cpu,rq->nr_running);
4476 CfsLen(rq->cpu,rq->cfs.h_nr_running);
4477#endif
4478
4479#ifdef CONFIG_MTK_SCHED_CMP_TGS
4480 sched_tg_enqueue_fair(rq, p);
4481#endif
bf0f6f24
IM
4482}
4483
2f36825b
VP
4484static void set_next_buddy(struct sched_entity *se);
4485
bf0f6f24
IM
4486/*
4487 * The dequeue_task method is called before nr_running is
4488 * decreased. We remove the task from the rbtree and
4489 * update the fair scheduling stats:
4490 */
371fd7e7 4491static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24
IM
4492{
4493 struct cfs_rq *cfs_rq;
62fb1851 4494 struct sched_entity *se = &p->se;
2f36825b 4495 int task_sleep = flags & DEQUEUE_SLEEP;
bf0f6f24
IM
4496
4497 for_each_sched_entity(se) {
4498 cfs_rq = cfs_rq_of(se);
371fd7e7 4499 dequeue_entity(cfs_rq, se, flags);
85dac906
PT
4500
4501 /*
4502 * end evaluation on encountering a throttled cfs_rq
4503 *
4504 * note: in the case of encountering a throttled cfs_rq we will
4505 * post the final h_nr_running decrement below.
4506 */
4507 if (cfs_rq_throttled(cfs_rq))
4508 break;
953bfcd1 4509 cfs_rq->h_nr_running--;
2069dd75 4510
bf0f6f24 4511 /* Don't dequeue parent if it has other entities besides us */
2f36825b
VP
4512 if (cfs_rq->load.weight) {
4513 /*
4514 * Bias pick_next to pick a task from this cfs_rq, as
4515 * p is sleeping when it is within its sched_slice.
4516 */
4517 if (task_sleep && parent_entity(se))
4518 set_next_buddy(parent_entity(se));
9598c82d
PT
4519
4520 /* avoid re-evaluating load for this entity */
4521 se = parent_entity(se);
bf0f6f24 4522 break;
2f36825b 4523 }
371fd7e7 4524 flags |= DEQUEUE_SLEEP;
bf0f6f24 4525 }
8f4d37ec 4526
2069dd75 4527 for_each_sched_entity(se) {
0f317143 4528 cfs_rq = cfs_rq_of(se);
953bfcd1 4529 cfs_rq->h_nr_running--;
2069dd75 4530
85dac906
PT
4531 if (cfs_rq_throttled(cfs_rq))
4532 break;
4533
17bc14b7 4534 update_cfs_shares(cfs_rq);
9ee474f5 4535 update_entity_load_avg(se, 1);
2069dd75
PZ
4536 }
4537
18bf2805 4538 if (!se) {
85dac906 4539 dec_nr_running(rq);
6fa3eb70
S
4540#ifndef CONFIG_CFS_BANDWIDTH
4541 BUG_ON(rq->cfs.nr_running > rq->cfs.h_nr_running);
4542#endif
18bf2805
BS
4543 update_rq_runnable_avg(rq, 1);
4544 }
a4c2f00f 4545 hrtick_update(rq);
6fa3eb70
S
4546#ifdef CONFIG_HMP_TRACER
4547 trace_sched_runqueue_length(rq->cpu,rq->nr_running);
4548 trace_sched_cfs_length(rq->cpu,rq->cfs.h_nr_running);
4549#endif
4550#ifdef CONFIG_MET_SCHED_HMP
4551 RqLen(rq->cpu,rq->nr_running);
4552 CfsLen(rq->cpu,rq->cfs.h_nr_running);
4553#endif
4554
4555#ifdef CONFIG_MTK_SCHED_CMP_TGS
4556 sched_tg_dequeue_fair(rq, p);
4557#endif
bf0f6f24
IM
4558}
4559
e7693a36 4560#ifdef CONFIG_SMP
029632fb
PZ
4561/* Used instead of source_load when we know the type == 0 */
4562static unsigned long weighted_cpuload(const int cpu)
4563{
6fa3eb70 4564 return cpu_rq(cpu)->cfs.runnable_load_avg;
029632fb
PZ
4565}
4566
4567/*
4568 * Return a low guess at the load of a migration-source cpu weighted
4569 * according to the scheduling class and "nice" value.
4570 *
4571 * We want to under-estimate the load of migration sources, to
4572 * balance conservatively.
4573 */
4574static unsigned long source_load(int cpu, int type)
4575{
4576 struct rq *rq = cpu_rq(cpu);
4577 unsigned long total = weighted_cpuload(cpu);
4578
4579 if (type == 0 || !sched_feat(LB_BIAS))
4580 return total;
4581
4582 return min(rq->cpu_load[type-1], total);
4583}
4584
4585/*
4586 * Return a high guess at the load of a migration-target cpu weighted
4587 * according to the scheduling class and "nice" value.
4588 */
4589static unsigned long target_load(int cpu, int type)
4590{
4591 struct rq *rq = cpu_rq(cpu);
4592 unsigned long total = weighted_cpuload(cpu);
4593
4594 if (type == 0 || !sched_feat(LB_BIAS))
4595 return total;
4596
4597 return max(rq->cpu_load[type-1], total);
4598}
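/*
 * Example (illustrative numbers): with cpu_load[idx-1] == 900 and a current
 * runnable_load_avg of 700, source_load() reports 700 (the lower,
 * optimistic figure) while target_load() reports 900 (the higher,
 * pessimistic one); the asymmetry keeps migration decisions conservative,
 * as noted above.
 */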
4599
4600static unsigned long power_of(int cpu)
4601{
4602 return cpu_rq(cpu)->cpu_power;
4603}
4604
4605static unsigned long cpu_avg_load_per_task(int cpu)
4606{
4607 struct rq *rq = cpu_rq(cpu);
4608 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
6fa3eb70 4609 unsigned long load_avg = rq->cfs.runnable_load_avg;
029632fb
PZ
4610
4611 if (nr_running)
6fa3eb70 4612 return load_avg / nr_running;
029632fb
PZ
4613
4614 return 0;
4615}
4616
098fb9db 4617
74f8e4b2 4618static void task_waking_fair(struct task_struct *p)
88ec22d3
PZ
4619{
4620 struct sched_entity *se = &p->se;
4621 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3fe1698b
PZ
4622 u64 min_vruntime;
4623
4624#ifndef CONFIG_64BIT
4625 u64 min_vruntime_copy;
88ec22d3 4626
3fe1698b
PZ
4627 do {
4628 min_vruntime_copy = cfs_rq->min_vruntime_copy;
4629 smp_rmb();
4630 min_vruntime = cfs_rq->min_vruntime;
4631 } while (min_vruntime != min_vruntime_copy);
4632#else
4633 min_vruntime = cfs_rq->min_vruntime;
4634#endif
88ec22d3 4635
3fe1698b 4636 se->vruntime -= min_vruntime;
88ec22d3
PZ
4637}
4638
bb3469ac 4639#ifdef CONFIG_FAIR_GROUP_SCHED
f5bfb7d9
PZ
4640/*
4641 * effective_load() calculates the load change as seen from the root_task_group
4642 *
4643 * Adding load to a group doesn't make a group heavier, but can cause movement
4644 * of group shares between cpus. Assuming the shares were perfectly aligned one
4645 * can calculate the shift in shares.
cf5f0acf
PZ
4646 *
4647 * Calculate the effective load difference if @wl is added (subtracted) to @tg
4648 * on this @cpu and results in a total addition (subtraction) of @wg to the
4649 * total group weight.
4650 *
4651 * Given a runqueue weight distribution (rw_i) we can compute a shares
4652 * distribution (s_i) using:
4653 *
4654 * s_i = rw_i / \Sum rw_j (1)
4655 *
4656 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4657 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4658 * shares distribution (s_i):
4659 *
4660 * rw_i = { 2, 4, 1, 0 }
4661 * s_i = { 2/7, 4/7, 1/7, 0 }
4662 *
4663 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4664 * task used to run on and the CPU the waker is running on), we need to
4665 * compute the effect of waking a task on either CPU and, in case of a sync
4666 * wakeup, compute the effect of the current task going to sleep.
4667 *
4668 * So for a change of @wl to the local @cpu with an overall group weight change
4669 * of @wl we can compute the new shares distribution (s'_i) using:
4670 *
4671 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
4672 *
4673 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4674 * differences in waking a task to CPU 0. The additional task changes the
4675 * weight and shares distributions like:
4676 *
4677 * rw'_i = { 3, 4, 1, 0 }
4678 * s'_i = { 3/8, 4/8, 1/8, 0 }
4679 *
4680 * We can then compute the difference in effective weight by using:
4681 *
4682 * dw_i = S * (s'_i - s_i) (3)
4683 *
4684 * Where 'S' is the group weight as seen by its parent.
4685 *
4686 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4687 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4688 * 4/7) times the weight of the group.
f5bfb7d9 4689 */
2069dd75 4690static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
bb3469ac 4691{
4be9daaa 4692 struct sched_entity *se = tg->se[cpu];
f1d239f7 4693
cf5f0acf 4694 if (!tg->parent) /* the trivial, non-cgroup case */
f1d239f7
PZ
4695 return wl;
4696
4be9daaa 4697 for_each_sched_entity(se) {
cf5f0acf 4698 long w, W;
4be9daaa 4699
977dda7c 4700 tg = se->my_q->tg;
bb3469ac 4701
cf5f0acf
PZ
4702 /*
4703 * W = @wg + \Sum rw_j
4704 */
4705 W = wg + calc_tg_weight(tg, se->my_q);
4be9daaa 4706
cf5f0acf
PZ
4707 /*
4708 * w = rw_i + @wl
4709 */
4710 w = se->my_q->load.weight + wl;
940959e9 4711
cf5f0acf
PZ
4712 /*
4713 * wl = S * s'_i; see (2)
4714 */
4715 if (W > 0 && w < W)
4716 wl = (w * tg->shares) / W;
977dda7c
PT
4717 else
4718 wl = tg->shares;
940959e9 4719
cf5f0acf
PZ
4720 /*
4721 * Per the above, wl is the new se->load.weight value; since
4722 * those are clipped to [MIN_SHARES, ...) do so now. See
4723 * calc_cfs_shares().
4724 */
977dda7c
PT
4725 if (wl < MIN_SHARES)
4726 wl = MIN_SHARES;
cf5f0acf
PZ
4727
4728 /*
4729 * wl = dw_i = S * (s'_i - s_i); see (3)
4730 */
977dda7c 4731 wl -= se->load.weight;
cf5f0acf
PZ
4732
4733 /*
4734 * Recursively apply this logic to all parent groups to compute
4735 * the final effective load change on the root group. Since
4736 * only the @tg group gets extra weight, all parent groups can
4737 * only redistribute existing shares. @wl is the shift in shares
4738 * resulting from this level per the above.
4739 */
4be9daaa 4740 wg = 0;
4be9daaa 4741 }
bb3469ac 4742
4be9daaa 4743 return wl;
bb3469ac
PZ
4744}
4745#else
4be9daaa 4746
83378269
PZ
4747static inline unsigned long effective_load(struct task_group *tg, int cpu,
4748 unsigned long wl, unsigned long wg)
4be9daaa 4749{
83378269 4750 return wl;
bb3469ac 4751}
4be9daaa 4752
bb3469ac
PZ
4753#endif
4754
c88d5910 4755static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
098fb9db 4756{
e37b6a7b 4757 s64 this_load, load;
c88d5910 4758 int idx, this_cpu, prev_cpu;
098fb9db 4759 unsigned long tl_per_task;
c88d5910 4760 struct task_group *tg;
83378269 4761 unsigned long weight;
b3137bc8 4762 int balanced;
098fb9db 4763
c88d5910
PZ
4764 idx = sd->wake_idx;
4765 this_cpu = smp_processor_id();
4766 prev_cpu = task_cpu(p);
4767 load = source_load(prev_cpu, idx);
4768 this_load = target_load(this_cpu, idx);
098fb9db 4769
b3137bc8
MG
4770 /*
4771 * If sync wakeup then subtract the (maximum possible)
4772 * effect of the currently running task from the load
4773 * of the current CPU:
4774 */
83378269
PZ
4775 if (sync) {
4776 tg = task_group(current);
4777 weight = current->se.load.weight;
4778
c88d5910 4779 this_load += effective_load(tg, this_cpu, -weight, -weight);
83378269
PZ
4780 load += effective_load(tg, prev_cpu, 0, -weight);
4781 }
b3137bc8 4782
83378269
PZ
4783 tg = task_group(p);
4784 weight = p->se.load.weight;
b3137bc8 4785
71a29aa7
PZ
4786 /*
4787 * In low-load situations, where prev_cpu is idle and this_cpu is idle
c88d5910
PZ
4788 * due to the sync cause above having dropped this_load to 0, we'll
4789 * always have an imbalance, but there's really nothing you can do
4790 * about that, so that's good too.
71a29aa7
PZ
4791 *
4792 * Otherwise check if either cpus are near enough in load to allow this
4793 * task to be woken on this_cpu.
4794 */
e37b6a7b
PT
4795 if (this_load > 0) {
4796 s64 this_eff_load, prev_eff_load;
e51fd5e2
PZ
4797
4798 this_eff_load = 100;
4799 this_eff_load *= power_of(prev_cpu);
4800 this_eff_load *= this_load +
4801 effective_load(tg, this_cpu, weight, weight);
4802
4803 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4804 prev_eff_load *= power_of(this_cpu);
4805 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4806
4807 balanced = this_eff_load <= prev_eff_load;
4808 } else
4809 balanced = true;
b3137bc8 4810
098fb9db 4811 /*
4ae7d5ce
IM
4812 * If the currently running task will sleep within
4813 * a reasonable amount of time then attract this newly
4814 * woken task:
098fb9db 4815 */
2fb7635c
PZ
4816 if (sync && balanced)
4817 return 1;
098fb9db 4818
41acab88 4819 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
098fb9db
IM
4820 tl_per_task = cpu_avg_load_per_task(this_cpu);
4821
c88d5910
PZ
4822 if (balanced ||
4823 (this_load <= load &&
4824 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
098fb9db
IM
4825 /*
4826 * This domain has SD_WAKE_AFFINE and
4827 * p is cache cold in this domain, and
4828 * there is no bad imbalance.
4829 */
c88d5910 4830 schedstat_inc(sd, ttwu_move_affine);
41acab88 4831 schedstat_inc(p, se.statistics.nr_wakeups_affine);
098fb9db
IM
4832
4833 return 1;
4834 }
4835 return 0;
4836}
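/*
 * Worked example for the 'balanced' check above (illustrative numbers,
 * assuming equal cpu_power on both cpus and imbalance_pct == 125):
 * prev_eff_load carries a factor of 100 + 25/2 = 112 against the 100 used
 * for this_eff_load, so the wake is considered balanced as long as the
 * waking cpu's projected load is no more than 12% above the previous cpu's.
 */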
4837
aaee1203
PZ
4838/*
4839 * find_idlest_group finds and returns the least busy CPU group within the
4840 * domain.
4841 */
4842static struct sched_group *
78e7ed53 4843find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5158f4e4 4844 int this_cpu, int load_idx)
e7693a36 4845{
b3bd3de6 4846 struct sched_group *idlest = NULL, *group = sd->groups;
aaee1203 4847 unsigned long min_load = ULONG_MAX, this_load = 0;
aaee1203 4848 int imbalance = 100 + (sd->imbalance_pct-100)/2;
e7693a36 4849
aaee1203
PZ
4850 do {
4851 unsigned long load, avg_load;
4852 int local_group;
4853 int i;
e7693a36 4854
aaee1203
PZ
4855 /* Skip over this group if it has no CPUs allowed */
4856 if (!cpumask_intersects(sched_group_cpus(group),
fa17b507 4857 tsk_cpus_allowed(p)))
aaee1203
PZ
4858 continue;
4859
4860 local_group = cpumask_test_cpu(this_cpu,
4861 sched_group_cpus(group));
4862
4863 /* Tally up the load of all CPUs in the group */
4864 avg_load = 0;
4865
4866 for_each_cpu(i, sched_group_cpus(group)) {
4867 /* Bias balancing toward cpus of our domain */
4868 if (local_group)
4869 load = source_load(i, load_idx);
4870 else
4871 load = target_load(i, load_idx);
4872
4873 avg_load += load;
6fa3eb70
S
4874
4875 mt_sched_printf("find_idlest_group cpu=%d avg=%lu",
4876 i, avg_load);
aaee1203
PZ
4877 }
4878
4879 /* Adjust by relative CPU power of the group */
9c3f75cb 4880 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
aaee1203
PZ
4881
4882 if (local_group) {
4883 this_load = avg_load;
6fa3eb70
S
4884 mt_sched_printf("find_idlest_group this_load=%lu",
4885 this_load);
aaee1203
PZ
4886 } else if (avg_load < min_load) {
4887 min_load = avg_load;
4888 idlest = group;
6fa3eb70
S
4889 mt_sched_printf("find_idlest_group min_load=%lu",
4890 min_load);
aaee1203
PZ
4891 }
4892 } while (group = group->next, group != sd->groups);
4893
6fa3eb70
S
4894 if (!idlest || 100*this_load < imbalance*min_load){
4895 mt_sched_printf("find_idlest_group fail this_load=%lu min_load=%lu, imbalance=%d",
4896 this_load, min_load, imbalance);
aaee1203 4897 return NULL;
6fa3eb70 4898 }
aaee1203
PZ
4899 return idlest;
4900}
4901
4902/*
4903 * find_idlest_cpu - find the idlest cpu among the cpus in group.
4904 */
4905static int
4906find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4907{
4908 unsigned long load, min_load = ULONG_MAX;
4909 int idlest = -1;
4910 int i;
4911
4912 /* Traverse only the allowed CPUs */
fa17b507 4913 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
aaee1203
PZ
4914 load = weighted_cpuload(i);
4915
4916 if (load < min_load || (load == min_load && i == this_cpu)) {
4917 min_load = load;
4918 idlest = i;
e7693a36
GH
4919 }
4920 }
4921
aaee1203
PZ
4922 return idlest;
4923}
e7693a36 4924
a50bde51
PZ
4925/*
4926 * Try and locate an idle CPU in the sched_domain.
4927 */
99bd5e2f 4928static int select_idle_sibling(struct task_struct *p, int target)
a50bde51 4929{
99bd5e2f 4930 struct sched_domain *sd;
37407ea7 4931 struct sched_group *sg;
e0a79f52 4932 int i = task_cpu(p);
a50bde51 4933
e0a79f52
MG
4934 if (idle_cpu(target))
4935 return target;
99bd5e2f
SS
4936
4937 /*
e0a79f52 4938 * If the prevous cpu is cache affine and idle, don't be stupid.
99bd5e2f 4939 */
e0a79f52
MG
4940 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4941 return i;
a50bde51
PZ
4942
4943 /*
37407ea7 4944 * Otherwise, iterate the domains and find an elegible idle cpu.
a50bde51 4945 */
518cd623 4946 sd = rcu_dereference(per_cpu(sd_llc, target));
970e1789 4947 for_each_lower_domain(sd) {
37407ea7
LT
4948 sg = sd->groups;
4949 do {
4950 if (!cpumask_intersects(sched_group_cpus(sg),
4951 tsk_cpus_allowed(p)))
4952 goto next;
4953
4954 for_each_cpu(i, sched_group_cpus(sg)) {
e0a79f52 4955 if (i == target || !idle_cpu(i))
37407ea7
LT
4956 goto next;
4957 }
970e1789 4958
37407ea7
LT
4959 target = cpumask_first_and(sched_group_cpus(sg),
4960 tsk_cpus_allowed(p));
4961 goto done;
4962next:
4963 sg = sg->next;
4964 } while (sg != sd->groups);
4965 }
4966done:
a50bde51
PZ
4967 return target;
4968}
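/*
 * Example walk-through (assumed topology, not taken from this file): if the
 * task last ran on cpu 0, target is cpu 2, both share the LLC and cpu 0 is
 * idle, the shortcut above returns cpu 0 without touching the domain groups;
 * only when neither target nor the previous cpu is idle does the loop look
 * for a sched_group whose cpus (other than target) are all idle.
 */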
4969
6fa3eb70 4970#ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
aaee1203 4971/*
6fa3eb70
S
 4972 * @p: the task to be placed.
 4973 * @clid: the CPU cluster id to search for the target CPU
 4974 * @target: the appropriate CPU for task p, updated by this function.
aaee1203 4975 *
6fa3eb70 4976 * Return:
aaee1203 4977 *
6fa3eb70
S
4978 * 1 on success
4979 * 0 if target CPU is not found in this CPU cluster
aaee1203 4980 */
6fa3eb70 4981static int cmp_find_idle_cpu(struct task_struct *p, int clid, int *target)
aaee1203 4982{
6fa3eb70
S
4983 struct cpumask cls_cpus;
4984 int j;
4985
4986 get_cluster_cpus(&cls_cpus, clid, true);
4987 *target = cpumask_any_and(&cls_cpus, tsk_cpus_allowed(p));
4988 for_each_cpu(j, &cls_cpus) {
4989 if (idle_cpu(j) && cpumask_test_cpu(j, tsk_cpus_allowed(p))) {
4990 *target = j;
4991 break;
4992 }
4993 }
4994 if (*target >= nr_cpu_ids)
 4995 return 0; // task is not allowed in this CPU cluster
4996 mt_sched_printf("wakeup %d %s cpu=%d, max_clid/max_idle_clid=%d",
4997 p->pid, p->comm, *target, clid);
4998
4999 return 1;
5000}
5001
5002#if !defined(CONFIG_SCHED_HMP)
5003#define TGS_WAKEUP_EXPERIMENT
5004#endif
5005static int cmp_select_task_rq_fair(struct task_struct *p, int sd_flag, int *cpu)
5006{
5007 int i, j;
5008 int max_cnt=0, tskcnt;
5009 int tgs_clid=-1;
5010 int idle_cnt, max_idle_cnt=0;
5011 int in_prev=0, prev_cluster=0;
5012 struct cpumask cls_cpus;
5013 int num_cluster;
5014
5015 num_cluster=arch_get_nr_clusters();
5016 for(i=0; i< num_cluster; i++) {
5017 tskcnt= p->group_leader->thread_group_info[i].nr_running;
5018 idle_cnt = 0;
5019 get_cluster_cpus(&cls_cpus, i, true);
5020
5021 for_each_cpu(j, &cls_cpus) {
5022#ifdef TGS_WAKEUP_EXPERIMENT
5023 if (arch_is_big_little()) {
5024 int bcpu = arch_cpu_is_big(j);
5025 if (bcpu && p->se.avg.load_avg_ratio >= cmp_up_threshold) {
5026 in_prev = 0;
5027 tgs_clid = i;
5028 mt_sched_printf("[heavy task] wakeup load=%ld up_th=%u pid=%d name=%s cpu=%d, tgs_clid=%d in_prev=%d",
5029 p->se.avg.load_avg_ratio, cmp_up_threshold, p->pid, p->comm, *cpu, tgs_clid, in_prev);
5030 goto find_idle_cpu;
5031 }
5032 if (!bcpu && p->se.avg.load_avg_ratio < cmp_down_threshold) {
5033 in_prev = 0;
5034 tgs_clid = i;
5035 mt_sched_printf("[light task] wakeup load=%ld down_th=%u pid=%d name=%s cpu=%d, tgs_clid=%d in_prev=%d",
5036 p->se.avg.load_avg_ratio, cmp_down_threshold, p->pid, p->comm, *cpu, tgs_clid, in_prev);
5037 goto find_idle_cpu;
5038 }
5039 }
5040#endif
5041 if (idle_cpu(j))
5042 idle_cnt++;
5043 }
5044 mt_sched_printf("wakeup load=%ld pid=%d name=%s clid=%d idle_cnt=%d tskcnt=%d max_cnt=%d, cls_cpus=%02lx, onlineCPU=%02lx",
5045 p->se.avg.load_avg_ratio, p->pid, p->comm, i, idle_cnt, tskcnt, max_cnt,
5046 *cpumask_bits(&cls_cpus), *cpumask_bits(cpu_online_mask));
5047
5048 if (idle_cnt == 0)
5049 continue;
5050
5051 if (i == get_cluster_id(*cpu))
5052 prev_cluster = 1;
5053
5054 if (tskcnt > 0) {
5055 if ( (tskcnt > max_cnt) || ((tskcnt == max_cnt) && prev_cluster)) {
5056 in_prev = prev_cluster;
5057 tgs_clid = i;
5058 max_cnt = tskcnt;
5059 }
5060 } else if (0 == max_cnt) {
5061 if ((idle_cnt > max_idle_cnt) || ((idle_cnt == max_idle_cnt) && prev_cluster)) {
5062 in_prev = prev_cluster;
5063 tgs_clid = i ;
5064 max_idle_cnt = idle_cnt;
5065 }
5066
5067 }
5068 mt_sched_printf("wakeup %d %s i=%d idle_cnt=%d tgs_clid=%d max_cnt=%d max_idle_cnt=%d in_prev=%d",
5069 p->pid, p->comm, i, idle_cnt, tgs_clid, max_cnt, max_idle_cnt, in_prev);
5070 }
5071
5072#ifdef TGS_WAKEUP_EXPERIMENT
5073find_idle_cpu:
5074#endif
5075 mt_sched_printf("wakeup %d %s cpu=%d, tgs_clid=%d in_prev=%d",
5076 p->pid, p->comm, *cpu, tgs_clid, in_prev);
5077
5078 if(-1 != tgs_clid && !in_prev && cmp_find_idle_cpu(p, tgs_clid, cpu))
5079 return 1;
5080
5081 return 0;
5082}
5083#endif
5084
5085#ifdef CONFIG_MTK_SCHED_TRACERS
5086#define LB_RESET 0
5087#define LB_AFFINITY 0x10
5088#define LB_BUDDY 0x20
5089#define LB_FORK 0x30
5090#define LB_CMP_SHIFT 8
5091#define LB_CMP 0x4000
5092#define LB_SMP_SHIFT 16
5093#define LB_SMP 0x500000
5094#define LB_HMP_SHIFT 24
5095#define LB_HMP 0x60000000
5096#endif
5097
5098/*
5099 * sched_balance_self: balance the current task (running on cpu) in domains
5100 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
5101 * SD_BALANCE_EXEC.
5102 *
5103 * Balance, ie. select the least loaded group.
5104 *
5105 * Returns the target CPU number, or the same CPU if no balancing is needed.
5106 *
5107 * preempt must be disabled.
5108 */
5109static int
5110select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
5111{
5112 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5113 int cpu = smp_processor_id();
c88d5910
PZ
5114 int prev_cpu = task_cpu(p);
5115 int new_cpu = cpu;
99bd5e2f 5116 int want_affine = 0;
5158f4e4 5117 int sync = wake_flags & WF_SYNC;
6fa3eb70
S
5118#if defined(CONFIG_SCHED_HMP) && !defined(CONFIG_SCHED_HMP_ENHANCEMENT)
5119 int target_cpu = nr_cpu_ids;
5120#endif
5121#ifdef CONFIG_MTK_SCHED_TRACERS
5122 int policy = 0;
5123#endif
5124#ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
5125 int cmp_cpu;
5126 int cmp_cpu_found=0;
5127#endif
5128#ifdef CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK
5129 int buddy_cpu = per_cpu(sd_pack_buddy, cpu);
5130#endif
c88d5910 5131
29baa747 5132 if (p->nr_cpus_allowed == 1)
6fa3eb70
S
5133 {
5134#ifdef CONFIG_MTK_SCHED_TRACERS
5135 trace_sched_select_task_rq(p, (LB_AFFINITY | prev_cpu), prev_cpu, prev_cpu);
5136#endif
76854c7e 5137 return prev_cpu;
6fa3eb70
S
5138 }
5139
5140#ifdef CONFIG_HMP_PACK_SMALL_TASK
5141#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
5142 if (check_pack_buddy(cpu, p) && PA_ENABLE) {
5143 PACK_FROM_CPUX_TO_CPUY_COUNT[cpu][per_cpu(sd_pack_buddy, cpu)]++;
5144
5145#ifdef CONFIG_HMP_TRACER
5146 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_PACK_FORM_CPUX_TO_CPUY, p->pid, cpu, per_cpu(sd_pack_buddy, cpu));
5147#endif /* CONFIG_HMP_TRACER */
5148
5149 if(PA_MON_ENABLE) {
5150 if(strcmp(p->comm, PA_MON) == 0 && cpu != per_cpu(sd_pack_buddy, cpu)) {
5151 printk(KERN_EMERG "[PA] %s PACK From CPU%d to CPU%d\n", p->comm, cpu, per_cpu(sd_pack_buddy, cpu));
5152 printk(KERN_EMERG "[PA] Buddy RQ Usage = %u, Period = %u, NR = %u\n",
5153 per_cpu(BUDDY_CPU_RQ_USAGE, per_cpu(sd_pack_buddy, cpu)),
5154 per_cpu(BUDDY_CPU_RQ_PERIOD, per_cpu(sd_pack_buddy, cpu)),
5155 per_cpu(BUDDY_CPU_RQ_NR, per_cpu(sd_pack_buddy, cpu)));
5156 printk(KERN_EMERG "[PA] Task Usage = %u, Period = %u\n",
5157 per_cpu(TASK_USGAE, cpu),
5158 per_cpu(TASK_PERIOD, cpu));
5159 }
5160 }
5161#else /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5162 if (check_pack_buddy(cpu, p)) {
5163#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5164#ifdef CONFIG_MTK_SCHED_TRACERS
5165 new_cpu = per_cpu(sd_pack_buddy, cpu);
5166 trace_sched_select_task_rq(p, (LB_BUDDY | new_cpu), prev_cpu, new_cpu);
5167#endif
5168 return per_cpu(sd_pack_buddy, cpu);
5169 }
5170#elif defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK)
5171#ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
5172 if (PA_ENABLE && (sd_flag & SD_BALANCE_WAKE) && (check_pack_buddy(buddy_cpu, p))) {
5173#else
5174 if ((sd_flag & SD_BALANCE_WAKE) && (check_pack_buddy(buddy_cpu, p))) {
5175#endif
5176 struct thread_group_info_t *src_tginfo, *dst_tginfo;
 5177 src_tginfo = &p->group_leader->thread_group_info[get_cluster_id(prev_cpu)]; /* compare with the previous cpu (not the current cpu) */
5178 dst_tginfo = &p->group_leader->thread_group_info[get_cluster_id(buddy_cpu)];
5179 if((get_cluster_id(prev_cpu) == get_cluster_id(buddy_cpu)) ||
5180 (src_tginfo->nr_running < dst_tginfo->nr_running))
5181 {
5182#ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
5183 PACK_FROM_CPUX_TO_CPUY_COUNT[cpu][buddy_cpu]++;
5184 mt_sched_printf("[PA]pid=%d, Pack to CPU%d(CPU%d's buddy)\n", p->pid,buddy_cpu,cpu);
5185 if(PA_MON_ENABLE) {
5186 u8 i=0;
5187 for(i=0;i<4; i++) {
5188 if(strcmp(p->comm, &PA_MON[i][0]) == 0) {
5189 TASK_PACK_CPU_COUNT[i][buddy_cpu]++;
5190 printk(KERN_EMERG "[PA] %s PACK to CPU%d(CPU%d's buddy), pre(cpu%d)\n", p->comm, buddy_cpu,cpu, prev_cpu);
5191 printk(KERN_EMERG "[PA] Buddy RQ Usage = %u, Period = %u, NR = %u\n",
5192 per_cpu(BUDDY_CPU_RQ_USAGE, buddy_cpu),
5193 per_cpu(BUDDY_CPU_RQ_PERIOD, buddy_cpu),
5194 per_cpu(BUDDY_CPU_RQ_NR, buddy_cpu));
5195 printk(KERN_EMERG "[PA] Task Usage = %u, Period = %u\n",
5196 per_cpu(TASK_USGAE, cpu),
5197 per_cpu(TASK_PERIOD, cpu));
5198 break;
5199 }
5200 }
5201 }
5202#endif //CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
5203#ifdef CONFIG_MTK_SCHED_TRACERS
5204 trace_sched_select_task_rq(p, (LB_BUDDY | buddy_cpu), prev_cpu, buddy_cpu);
5205#endif
5206 return buddy_cpu;
5207 }
5208 }
5209#endif /* CONFIG_HMP_PACK_SMALL_TASK */
5210
5211#ifdef CONFIG_SCHED_HMP
5212 /* always put non-kernel forking tasks on a big domain */
5213 if (p->mm && (sd_flag & SD_BALANCE_FORK)) {
5214 if(hmp_cpu_is_fastest(prev_cpu)) {
5215 struct hmp_domain *hmpdom = list_entry(&hmp_cpu_domain(prev_cpu)->hmp_domains, struct hmp_domain, hmp_domains);
5216 __always_unused int lowest_ratio = hmp_domain_min_load(hmpdom, &new_cpu);
5217 if(new_cpu < nr_cpu_ids && cpumask_test_cpu(new_cpu,tsk_cpus_allowed(p)))
5218 {
5219#ifdef CONFIG_MTK_SCHED_TRACERS
5220 trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);
5221#endif
5222 return new_cpu;
5223 }
5224 else
5225 {
5226 new_cpu = cpumask_any_and(&hmp_faster_domain(cpu)->cpus,
5227 tsk_cpus_allowed(p));
5228 if(new_cpu < nr_cpu_ids)
5229 {
5230#ifdef CONFIG_MTK_SCHED_TRACERS
5231 trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);
5232#endif
5233 return new_cpu;
5234 }
5235 }
5236 } else {
5237 new_cpu = hmp_select_faster_cpu(p, prev_cpu);
5238 if (new_cpu < nr_cpu_ids)
5239 {
5240#ifdef CONFIG_MTK_SCHED_TRACERS
5241 trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);
5242#endif
5243 return new_cpu;
5244 }
5245 }
 5246 /* recover the new_cpu value */
5247 if (new_cpu >= nr_cpu_ids)
5248 new_cpu = cpu;
5249 }
5250#endif
76854c7e 5251
0763a660 5252 if (sd_flag & SD_BALANCE_WAKE) {
fa17b507 5253 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
c88d5910
PZ
5254 want_affine = 1;
5255 new_cpu = prev_cpu;
5256 }
aaee1203 5257
6fa3eb70
S
5258#ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
5259 cmp_cpu = prev_cpu;
5260 cmp_cpu_found = cmp_select_task_rq_fair(p, sd_flag, &cmp_cpu);
5261 if (cmp_cpu_found && (cmp_cpu < nr_cpu_ids)) {
5262 cpu = cmp_cpu;
5263 new_cpu = cmp_cpu;
5264#ifdef CONFIG_MTK_SCHED_TRACERS
5265 policy |= (new_cpu << LB_CMP_SHIFT);
5266 policy |= LB_CMP;
5267#endif
5268 mt_sched_printf("wakeup %d %s sd_flag=%x cmp_cpu_found=%d, cpu=%d, want_affine=%d ",
5269 p->pid, p->comm, sd_flag, cmp_cpu_found, cpu, want_affine);
5270 goto cmp_found;
5271 }
5272#endif
dce840a0 5273 rcu_read_lock();
aaee1203 5274 for_each_domain(cpu, tmp) {
6fa3eb70
S
5275 mt_sched_printf("wakeup %d %s tmp->flags=%x, cpu=%d, prev_cpu=%d, new_cpu=%d",
5276 p->pid, p->comm, tmp->flags, cpu, prev_cpu, new_cpu);
5277
e4f42888
PZ
5278 if (!(tmp->flags & SD_LOAD_BALANCE))
5279 continue;
5280
fe3bcfe1 5281 /*
99bd5e2f
SS
5282 * If both cpu and prev_cpu are part of this domain,
5283 * cpu is a valid SD_WAKE_AFFINE target.
fe3bcfe1 5284 */
99bd5e2f
SS
5285 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
5286 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
5287 affine_sd = tmp;
29cd8bae 5288 break;
f03542a7 5289 }
29cd8bae 5290
f03542a7 5291 if (tmp->flags & sd_flag)
29cd8bae
PZ
5292 sd = tmp;
5293 }
5294
8b911acd 5295 if (affine_sd) {
f03542a7 5296 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
dce840a0
PZ
5297 prev_cpu = cpu;
5298
5299 new_cpu = select_idle_sibling(p, prev_cpu);
5300 goto unlock;
8b911acd 5301 }
e7693a36 5302
6fa3eb70
S
5303 mt_sched_printf("wakeup %d %s sd=%p", p->pid, p->comm, sd);
5304
aaee1203 5305 while (sd) {
5158f4e4 5306 int load_idx = sd->forkexec_idx;
aaee1203 5307 struct sched_group *group;
c88d5910 5308 int weight;
098fb9db 5309
6fa3eb70
S
5310 mt_sched_printf("wakeup %d %s find_idlest_group cpu=%d sd->flags=%x sd_flag=%x",
5311 p->pid, p->comm, cpu, sd->flags, sd_flag);
5312
0763a660 5313 if (!(sd->flags & sd_flag)) {
aaee1203
PZ
5314 sd = sd->child;
5315 continue;
5316 }
098fb9db 5317
5158f4e4
PZ
5318 if (sd_flag & SD_BALANCE_WAKE)
5319 load_idx = sd->wake_idx;
098fb9db 5320
6fa3eb70
S
5321 mt_sched_printf("wakeup %d %s find_idlest_group cpu=%d",
5322 p->pid, p->comm, cpu);
5158f4e4 5323 group = find_idlest_group(sd, p, cpu, load_idx);
aaee1203
PZ
5324 if (!group) {
5325 sd = sd->child;
6fa3eb70
S
5326 mt_sched_printf("wakeup %d %s find_idlest_group child",
5327 p->pid, p->comm);
aaee1203
PZ
5328 continue;
5329 }
4ae7d5ce 5330
d7c33c49 5331 new_cpu = find_idlest_cpu(group, p, cpu);
aaee1203
PZ
5332 if (new_cpu == -1 || new_cpu == cpu) {
5333 /* Now try balancing at a lower domain level of cpu */
5334 sd = sd->child;
6fa3eb70
S
5335 mt_sched_printf("wakeup %d %s find_idlest_cpu sd->child=%p",
5336 p->pid, p->comm, sd);
aaee1203 5337 continue;
e7693a36 5338 }
aaee1203
PZ
5339
5340 /* Now try balancing at a lower domain level of new_cpu */
6fa3eb70
S
5341 mt_sched_printf("wakeup %d %s find_idlest_cpu cpu=%d sd=%p",
5342 p->pid, p->comm, new_cpu, sd);
aaee1203 5343 cpu = new_cpu;
669c55e9 5344 weight = sd->span_weight;
aaee1203
PZ
5345 sd = NULL;
5346 for_each_domain(cpu, tmp) {
669c55e9 5347 if (weight <= tmp->span_weight)
aaee1203 5348 break;
0763a660 5349 if (tmp->flags & sd_flag)
aaee1203 5350 sd = tmp;
6fa3eb70
S
5351 mt_sched_printf("wakeup %d %s sd=%p weight=%d, tmp->span_weight=%d",
5352 p->pid, p->comm, sd, weight, tmp->span_weight);
aaee1203
PZ
5353 }
5354 /* while loop will break here if sd == NULL */
e7693a36 5355 }
6fa3eb70
S
5356
5357#ifdef CONFIG_MTK_SCHED_TRACERS
5358 policy |= (new_cpu << LB_SMP_SHIFT);
5359 policy |= LB_SMP;
5360#endif
5361
dce840a0
PZ
5362unlock:
5363 rcu_read_unlock();
6fa3eb70
S
5364 mt_sched_printf("wakeup %d %s new_cpu=%x", p->pid, p->comm, new_cpu);
5365
5366#ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
5367cmp_found:
5368#endif
5369
5370#ifdef CONFIG_SCHED_HMP
5371#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
5372 new_cpu = hmp_select_task_rq_fair(sd_flag, p, prev_cpu, new_cpu);
5373#ifdef CONFIG_MTK_SCHED_TRACERS
5374 policy |= (new_cpu << LB_HMP_SHIFT);
5375 policy |= LB_HMP;
5376#endif
5377
5378#else
5379 if (hmp_up_migration(prev_cpu, &target_cpu, &p->se)) {
5380 new_cpu = hmp_select_faster_cpu(p, prev_cpu);
5381 hmp_next_up_delay(&p->se, new_cpu);
5382 trace_sched_hmp_migrate(p, new_cpu, 0);
5383 return new_cpu;
5384 }
5385 if (hmp_down_migration(prev_cpu, &p->se)) {
5386 new_cpu = hmp_select_slower_cpu(p, prev_cpu);
5387 hmp_next_down_delay(&p->se, new_cpu);
5388 trace_sched_hmp_migrate(p, new_cpu, 0);
5389 return new_cpu;
5390 }
5391 /* Make sure that the task stays in its previous hmp domain */
5392 if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus))
5393 return prev_cpu;
5394#endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
5395#endif /* CONFIG_SCHED_HMP */
5396
5397#ifdef CONFIG_MTK_SCHED_TRACERS
5398 trace_sched_select_task_rq(p, policy, prev_cpu, new_cpu);
5399#endif
5400
5401#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
5402 if(PA_MON_ENABLE) {
5403 if(strcmp(p->comm, PA_MON) == 0 && cpu != new_cpu) {
5404 printk(KERN_EMERG "[PA] %s Select From CPU%d to CPU%d\n", p->comm, cpu, new_cpu);
5405 }
5406 }
5407#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
e7693a36 5408
c88d5910 5409 return new_cpu;
e7693a36 5410}
0a74bef8
PT
5411
5412/*
5413 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5414 * cfs_rq_of(p) references at time of call are still valid and identify the
5415 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
5416 * other assumptions, including the state of rq->lock, should be made.
5417 */
5418static void
5419migrate_task_rq_fair(struct task_struct *p, int next_cpu)
5420{
aff3e498
PT
5421 struct sched_entity *se = &p->se;
5422 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5423
5424 /*
5425 * Load tracking: accumulate removed load so that it can be processed
5426 * when we next update owning cfs_rq under rq->lock. Tasks contribute
5427 * to blocked load iff they have a positive decay-count. It can never
5428 * be negative here since on-rq tasks have decay-count == 0.
5429 */
5430 if (se->avg.decay_count) {
5431 se->avg.decay_count = -__synchronize_entity_decay(se);
6fa3eb70
S
5432 atomic_long_add(se->avg.load_avg_contrib,
5433 &cfs_rq->removed_load);
aff3e498 5434 }
0a74bef8 5435}
e7693a36
GH
5436#endif /* CONFIG_SMP */
5437
e52fb7c0
PZ
5438static unsigned long
5439wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
0bbd3336
PZ
5440{
5441 unsigned long gran = sysctl_sched_wakeup_granularity;
5442
5443 /*
e52fb7c0
PZ
 5444 * Since it's curr that is running now, convert the gran from real-time
 5445 * to virtual-time in its units.
13814d42
MG
5446 *
5447 * By using 'se' instead of 'curr' we penalize light tasks, so
5448 * they get preempted easier. That is, if 'se' < 'curr' then
5449 * the resulting gran will be larger, therefore penalizing the
 5450 * lighter; if, on the other hand, 'se' > 'curr' then the resulting gran will
5451 * be smaller, again penalizing the lighter task.
5452 *
5453 * This is especially important for buddies when the leftmost
5454 * task is higher priority than the buddy.
0bbd3336 5455 */
f4ad9bd2 5456 return calc_delta_fair(gran, se);
0bbd3336
PZ
5457}
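/*
 * Worked example, added for illustration and not part of the original file:
 * calc_delta_fair() above scales gran roughly as gran * NICE_0_LOAD /
 * se->load.weight. With a 1 ms wakeup granularity, a nice 0 'se' (weight
 * 1024) keeps gran at 1 ms, a nice 5 'se' (weight 335) sees about 3 ms and a
 * nice -5 'se' (weight 3121) about 0.33 ms, so heavier wakers preempt
 * sooner. The helper below is a hypothetical standalone sketch of that
 * arithmetic, assuming NICE_0_LOAD resolves to 1024.
 */
#if 0
static unsigned long wakeup_gran_scaled(unsigned long gran_ns, unsigned long weight)
{
	return gran_ns * 1024UL / weight;	/* 1024 stands in for NICE_0_LOAD */
}
#endif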
5458
464b7527
PZ
5459/*
5460 * Should 'se' preempt 'curr'.
5461 *
5462 * |s1
5463 * |s2
5464 * |s3
5465 * g
5466 * |<--->|c
5467 *
5468 * w(c, s1) = -1
5469 * w(c, s2) = 0
5470 * w(c, s3) = 1
5471 *
5472 */
5473static int
5474wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5475{
5476 s64 gran, vdiff = curr->vruntime - se->vruntime;
5477
5478 if (vdiff <= 0)
5479 return -1;
5480
e52fb7c0 5481 gran = wakeup_gran(curr, se);
464b7527
PZ
5482 if (vdiff > gran)
5483 return 1;
5484
5485 return 0;
5486}
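/*
 * Illustrative sketch, not part of the original file: a standalone replica
 * of the decision above with the three cases of the s1/s2/s3 diagram as
 * concrete numbers (units arbitrary). preempt_decision() is a hypothetical
 * name and the block is guarded by #if 0 so it is never compiled.
 */
#if 0
static int preempt_decision(long curr_vruntime, long se_vruntime, long gran)
{
	long vdiff = curr_vruntime - se_vruntime;

	if (vdiff <= 0)
		return -1;	/* s1: se is not ahead of curr */
	if (vdiff > gran)
		return 1;	/* s3: se leads by more than one granularity */
	return 0;		/* s2: se leads, but within the granularity */
}
/* preempt_decision(100, 120, 30) == -1, (100, 90, 30) == 0, (100, 50, 30) == 1 */
#endif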
5487
02479099
PZ
5488static void set_last_buddy(struct sched_entity *se)
5489{
69c80f3e
VP
5490 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5491 return;
5492
5493 for_each_sched_entity(se)
5494 cfs_rq_of(se)->last = se;
02479099
PZ
5495}
5496
5497static void set_next_buddy(struct sched_entity *se)
5498{
69c80f3e
VP
5499 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5500 return;
5501
5502 for_each_sched_entity(se)
5503 cfs_rq_of(se)->next = se;
02479099
PZ
5504}
5505
ac53db59
RR
5506static void set_skip_buddy(struct sched_entity *se)
5507{
69c80f3e
VP
5508 for_each_sched_entity(se)
5509 cfs_rq_of(se)->skip = se;
ac53db59
RR
5510}
5511
bf0f6f24
IM
5512/*
5513 * Preempt the current task with a newly woken task if needed:
5514 */
5a9b86f6 5515static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
bf0f6f24
IM
5516{
5517 struct task_struct *curr = rq->curr;
8651a86c 5518 struct sched_entity *se = &curr->se, *pse = &p->se;
03e89e45 5519 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
f685ceac 5520 int scale = cfs_rq->nr_running >= sched_nr_latency;
2f36825b 5521 int next_buddy_marked = 0;
bf0f6f24 5522
4ae7d5ce
IM
5523 if (unlikely(se == pse))
5524 return;
5525
5238cdd3 5526 /*
ddcdf6e7 5527 * This is possible from callers such as move_task(), in which we
5238cdd3
PT
 5528 * unconditionally check_preempt_curr() after an enqueue (which may have
5529 * lead to a throttle). This both saves work and prevents false
5530 * next-buddy nomination below.
5531 */
5532 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5533 return;
5534
2f36825b 5535 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3cb63d52 5536 set_next_buddy(pse);
2f36825b
VP
5537 next_buddy_marked = 1;
5538 }
57fdc26d 5539
aec0a514
BR
5540 /*
5541 * We can come here with TIF_NEED_RESCHED already set from new task
5542 * wake up path.
5238cdd3
PT
5543 *
5544 * Note: this also catches the edge-case of curr being in a throttled
5545 * group (e.g. via set_curr_task), since update_curr() (in the
5546 * enqueue of curr) will have resulted in resched being set. This
5547 * prevents us from potentially nominating it as a false LAST_BUDDY
5548 * below.
aec0a514
BR
5549 */
5550 if (test_tsk_need_resched(curr))
5551 return;
5552
a2f5c9ab
DH
5553 /* Idle tasks are by definition preempted by non-idle tasks. */
5554 if (unlikely(curr->policy == SCHED_IDLE) &&
5555 likely(p->policy != SCHED_IDLE))
5556 goto preempt;
5557
91c234b4 5558 /*
a2f5c9ab
DH
5559 * Batch and idle tasks do not preempt non-idle tasks (their preemption
5560 * is driven by the tick):
91c234b4 5561 */
8ed92e51 5562 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
91c234b4 5563 return;
bf0f6f24 5564
464b7527 5565 find_matching_se(&se, &pse);
9bbd7374 5566 update_curr(cfs_rq_of(se));
002f128b 5567 BUG_ON(!pse);
2f36825b
VP
5568 if (wakeup_preempt_entity(se, pse) == 1) {
5569 /*
5570 * Bias pick_next to pick the sched entity that is
5571 * triggering this preemption.
5572 */
5573 if (!next_buddy_marked)
5574 set_next_buddy(pse);
3a7e73a2 5575 goto preempt;
2f36825b 5576 }
464b7527 5577
3a7e73a2 5578 return;
a65ac745 5579
3a7e73a2
PZ
5580preempt:
5581 resched_task(curr);
5582 /*
5583 * Only set the backward buddy when the current task is still
5584 * on the rq. This can happen when a wakeup gets interleaved
5585 * with schedule on the ->pre_schedule() or idle_balance()
 5586 * point, either of which can drop the rq lock.
5587 *
5588 * Also, during early boot the idle thread is in the fair class,
 5589 * for obvious reasons it's a bad idea to schedule back to it.
5590 */
5591 if (unlikely(!se->on_rq || curr == rq->idle))
5592 return;
5593
5594 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5595 set_last_buddy(se);
bf0f6f24
IM
5596}
5597
fb8d4724 5598static struct task_struct *pick_next_task_fair(struct rq *rq)
bf0f6f24 5599{
8f4d37ec 5600 struct task_struct *p;
bf0f6f24
IM
5601 struct cfs_rq *cfs_rq = &rq->cfs;
5602 struct sched_entity *se;
5603
6fa3eb70
S
 5604 /* in case nr_running != 0 but h_nr_running == 0 */
5605 if (!cfs_rq->nr_running || !cfs_rq->h_nr_running)
bf0f6f24
IM
5606 return NULL;
5607
5608 do {
9948f4b2 5609 se = pick_next_entity(cfs_rq);
f4b6755f 5610 set_next_entity(cfs_rq, se);
bf0f6f24
IM
5611 cfs_rq = group_cfs_rq(se);
5612 } while (cfs_rq);
5613
8f4d37ec 5614 p = task_of(se);
b39e66ea
MG
5615 if (hrtick_enabled(rq))
5616 hrtick_start_fair(rq, p);
8f4d37ec
PZ
5617
5618 return p;
bf0f6f24
IM
5619}
5620
5621/*
5622 * Account for a descheduled task:
5623 */
31ee529c 5624static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
bf0f6f24
IM
5625{
5626 struct sched_entity *se = &prev->se;
5627 struct cfs_rq *cfs_rq;
5628
5629 for_each_sched_entity(se) {
5630 cfs_rq = cfs_rq_of(se);
ab6cde26 5631 put_prev_entity(cfs_rq, se);
bf0f6f24
IM
5632 }
5633}
5634
ac53db59
RR
5635/*
5636 * sched_yield() is very simple
5637 *
5638 * The magic of dealing with the ->skip buddy is in pick_next_entity.
5639 */
5640static void yield_task_fair(struct rq *rq)
5641{
5642 struct task_struct *curr = rq->curr;
5643 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5644 struct sched_entity *se = &curr->se;
5645
5646 /*
5647 * Are we the only task in the tree?
5648 */
5649 if (unlikely(rq->nr_running == 1))
5650 return;
5651
5652 clear_buddies(cfs_rq, se);
5653
5654 if (curr->policy != SCHED_BATCH) {
5655 update_rq_clock(rq);
5656 /*
5657 * Update run-time statistics of the 'current'.
5658 */
5659 update_curr(cfs_rq);
916671c0
MG
5660 /*
5661 * Tell update_rq_clock() that we've just updated,
5662 * so we don't do microscopic update in schedule()
5663 * and double the fastpath cost.
5664 */
5665 rq->skip_clock_update = 1;
ac53db59
RR
5666 }
5667
5668 set_skip_buddy(se);
5669}
5670
d95f4122
MG
5671static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5672{
5673 struct sched_entity *se = &p->se;
5674
5238cdd3
PT
5675 /* throttled hierarchies are not runnable */
5676 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
d95f4122
MG
5677 return false;
5678
5679 /* Tell the scheduler that we'd really like pse to run next. */
5680 set_next_buddy(se);
5681
d95f4122
MG
5682 yield_task_fair(rq);
5683
5684 return true;
5685}
5686
681f3e68 5687#ifdef CONFIG_SMP
bf0f6f24 5688/**************************************************
e9c84cb8
PZ
5689 * Fair scheduling class load-balancing methods.
5690 *
5691 * BASICS
5692 *
5693 * The purpose of load-balancing is to achieve the same basic fairness the
5694 * per-cpu scheduler provides, namely provide a proportional amount of compute
5695 * time to each task. This is expressed in the following equation:
5696 *
5697 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
5698 *
5699 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
5700 * W_i,0 is defined as:
5701 *
5702 * W_i,0 = \Sum_j w_i,j (2)
5703 *
5704 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
5705 * is derived from the nice value as per prio_to_weight[].
5706 *
5707 * The weight average is an exponential decay average of the instantaneous
5708 * weight:
5709 *
5710 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
5711 *
5712 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
5713 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
5714 * can also include other factors [XXX].
5715 *
5716 * To achieve this balance we define a measure of imbalance which follows
5717 * directly from (1):
5718 *
5719 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
5720 *
 5721 * We then move tasks around to minimize the imbalance. In the continuous
5722 * function space it is obvious this converges, in the discrete case we get
5723 * a few fun cases generally called infeasible weight scenarios.
5724 *
5725 * [XXX expand on:
5726 * - infeasible weights;
5727 * - local vs global optima in the discrete case. ]
5728 *
5729 *
5730 * SCHED DOMAINS
5731 *
5732 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
5733 * for all i,j solution, we create a tree of cpus that follows the hardware
5734 * topology where each level pairs two lower groups (or better). This results
5735 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
5736 * tree to only the first of the previous level and we decrease the frequency
5737 * of load-balance at each level inv. proportional to the number of cpus in
5738 * the groups.
5739 *
5740 * This yields:
5741 *
5742 * log_2 n 1 n
5743 * \Sum { --- * --- * 2^i } = O(n) (5)
5744 * i = 0 2^i 2^i
5745 * `- size of each group
5746 * | | `- number of cpus doing load-balance
5747 * | `- freq
5748 * `- sum over all levels
5749 *
5750 * Coupled with a limit on how many tasks we can migrate every balance pass,
5751 * this makes (5) the runtime complexity of the balancer.
5752 *
5753 * An important property here is that each CPU is still (indirectly) connected
5754 * to every other cpu in at most O(log n) steps:
5755 *
5756 * The adjacency matrix of the resulting graph is given by:
5757 *
5758 * log_2 n
5759 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
5760 * k = 0
5761 *
5762 * And you'll find that:
5763 *
5764 * A^(log_2 n)_i,j != 0 for all i,j (7)
5765 *
5766 * Showing there's indeed a path between every cpu in at most O(log n) steps.
5767 * The task movement gives a factor of O(m), giving a convergence complexity
5768 * of:
5769 *
5770 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
5771 *
5772 *
5773 * WORK CONSERVING
5774 *
5775 * In order to avoid CPUs going idle while there's still work to do, new idle
5776 * balancing is more aggressive and has the newly idle cpu iterate up the domain
5777 * tree itself instead of relying on other CPUs to bring it work.
5778 *
5779 * This adds some complexity to both (5) and (8) but it reduces the total idle
5780 * time.
5781 *
5782 * [XXX more?]
5783 *
5784 *
5785 * CGROUPS
5786 *
5787 * Cgroups make a horror show out of (2), instead of a simple sum we get:
5788 *
5789 * s_k,i
5790 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
5791 * S_k
5792 *
5793 * Where
5794 *
5795 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
5796 *
5797 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
5798 *
 5799 * The big problem is S_k, it's a global sum needed to compute a local (W_i)
5800 * property.
5801 *
5802 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5803 * rewrite all of this once again.]
5804 */
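/*
 * Worked example of (4), added for illustration and not part of the original
 * file: with two cpus of equal power P = 1024, W_1 = 2048 and W_2 = 1024 give
 * avg(W/P) = 1.5, W_1/P_1 = 2 and W_2/P_2 = 1, so imb_1,2 = max{1.5, 2} -
 * min{1.5, 1} = 1, i.e. about one nice-0 task worth of weight should move
 * from cpu 1 to cpu 2. The hypothetical helper below redoes that arithmetic
 * in integers, scaled by 1024 as a stand-in for SCHED_POWER_SCALE.
 */
#if 0
static unsigned long imbalance_example(unsigned long w_i, unsigned long p_i,
				       unsigned long w_j, unsigned long p_j)
{
	unsigned long scale = 1024;		/* stand-in for SCHED_POWER_SCALE */
	unsigned long li = w_i * scale / p_i;	/* W_i / P_i */
	unsigned long lj = w_j * scale / p_j;	/* W_j / P_j */
	unsigned long avg = (li + lj) / 2;	/* avg(W/P) over the two cpus  */
	unsigned long hi = li > avg ? li : avg;	/* max{avg(W/P), W_i/P_i} */
	unsigned long lo = lj < avg ? lj : avg;	/* min{avg(W/P), W_j/P_j} */

	return hi - lo;	/* imbalance_example(2048, 1024, 1024, 1024) == 1024 */
}
#endif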
bf0f6f24 5805
ed387b78
HS
5806static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5807
ddcdf6e7 5808#define LBF_ALL_PINNED 0x01
367456c7 5809#define LBF_NEED_BREAK 0x02
88b8dac0 5810#define LBF_SOME_PINNED 0x04
ddcdf6e7
PZ
5811
5812struct lb_env {
5813 struct sched_domain *sd;
5814
ddcdf6e7 5815 struct rq *src_rq;
85c1e7da 5816 int src_cpu;
ddcdf6e7
PZ
5817
5818 int dst_cpu;
5819 struct rq *dst_rq;
5820
88b8dac0
SV
5821 struct cpumask *dst_grpmask;
5822 int new_dst_cpu;
ddcdf6e7 5823 enum cpu_idle_type idle;
bd939f45 5824 long imbalance;
b9403130
MW
5825 /* The set of CPUs under consideration for load-balancing */
5826 struct cpumask *cpus;
5827
ddcdf6e7 5828 unsigned int flags;
367456c7
PZ
5829
5830 unsigned int loop;
5831 unsigned int loop_break;
5832 unsigned int loop_max;
6fa3eb70
S
5833#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
5834 int mt_check_cache_in_idle;
5835#endif
5836#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5837 unsigned int fail_reason;
5838#endif
ddcdf6e7
PZ
5839};
5840
1e3c88bd 5841/*
ddcdf6e7 5842 * move_task - move a task from one runqueue to another runqueue.
1e3c88bd
PZ
5843 * Both runqueues must be locked.
5844 */
ddcdf6e7 5845static void move_task(struct task_struct *p, struct lb_env *env)
1e3c88bd 5846{
ddcdf6e7
PZ
5847 deactivate_task(env->src_rq, p, 0);
5848 set_task_cpu(p, env->dst_cpu);
5849 activate_task(env->dst_rq, p, 0);
5850 check_preempt_curr(env->dst_rq, p, 0);
6fa3eb70
S
5851
5852#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
5853 if(PA_MON_ENABLE) {
5854 if(strcmp(p->comm, PA_MON) == 0) {
5855 printk(KERN_EMERG "[PA] %s Balance From CPU%d to CPU%d\n", p->comm, env->src_rq->cpu, env->dst_rq->cpu);
5856 }
5857 }
5858#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5859
1e3c88bd
PZ
5860}
5861
029632fb
PZ
5862/*
5863 * Is this task likely cache-hot:
5864 */
6fa3eb70
S
5865#if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT)
5866static int
5867task_hot(struct task_struct *p, u64 now, struct sched_domain *sd, int mt_check_cache_in_idle)
5868#else
029632fb
PZ
5869static int
5870task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
6fa3eb70 5871#endif
029632fb
PZ
5872{
5873 s64 delta;
5874
5875 if (p->sched_class != &fair_sched_class)
5876 return 0;
5877
5878 if (unlikely(p->policy == SCHED_IDLE))
5879 return 0;
5880
5881 /*
5882 * Buddy candidates are cache hot:
5883 */
6fa3eb70
S
5884#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
5885 if (!mt_check_cache_in_idle){
5886 if ( !this_rq()->nr_running && (task_rq(p)->nr_running >= 2) )
5887 return 0;
5888 }
5889#endif
029632fb
PZ
5890 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
5891 (&p->se == cfs_rq_of(&p->se)->next ||
5892 &p->se == cfs_rq_of(&p->se)->last))
5893 return 1;
5894
5895 if (sysctl_sched_migration_cost == -1)
5896 return 1;
5897 if (sysctl_sched_migration_cost == 0)
5898 return 0;
5899
5900 delta = now - p->se.exec_start;
5901
5902 return delta < (s64)sysctl_sched_migration_cost;
5903}
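/*
 * Worked numbers, added for illustration and not part of the original file:
 * with the usual sysctl_sched_migration_cost of 0.5 ms, a task that last ran
 * 0.2 ms ago yields delta < cost and is treated as cache hot (migrated only
 * reluctantly), while one that has been off the cpu for 2 ms is considered
 * cold and may be pulled freely.
 */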
5904
1e3c88bd
PZ
5905/*
5906 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
5907 */
5908static
8e45cb54 5909int can_migrate_task(struct task_struct *p, struct lb_env *env)
1e3c88bd
PZ
5910{
5911 int tsk_cache_hot = 0;
5912 /*
5913 * We do not migrate tasks that are:
d3198084 5914 * 1) throttled_lb_pair, or
1e3c88bd 5915 * 2) cannot be migrated to this CPU due to cpus_allowed, or
d3198084
JK
5916 * 3) running (obviously), or
5917 * 4) are cache-hot on their current CPU.
1e3c88bd 5918 */
d3198084
JK
5919 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
5920 return 0;
5921
ddcdf6e7 5922 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
e02e60c1 5923 int cpu;
88b8dac0 5924
41acab88 5925 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
6fa3eb70
S
5926#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5927 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_AFFINITY);
5928 if(mt_lbprof_lt (env->sd->mt_lbprof_nr_balance_failed, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND)){
5929 char strings[128]="";
5930 snprintf(strings, 128, "%d:balance fail:affinity:%d:%d:%s:0x%lu"
5931 , env->dst_cpu, env->src_cpu, p->pid, p->comm, p->cpus_allowed.bits[0]);
5932 trace_sched_lbprof_log(strings);
5933 }
5934#endif
88b8dac0
SV
5935
5936 /*
5937 * Remember if this task can be migrated to any other cpu in
5938 * our sched_group. We may want to revisit it if we couldn't
5939 * meet load balance goals by pulling other tasks on src_cpu.
5940 *
5941 * Also avoid computing new_dst_cpu if we have already computed
5942 * one in current iteration.
5943 */
5944 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
5945 return 0;
5946
e02e60c1
JK
5947 /* Prevent to re-select dst_cpu via env's cpus */
5948 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
5949 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
5950 env->flags |= LBF_SOME_PINNED;
5951 env->new_dst_cpu = cpu;
5952 break;
5953 }
88b8dac0 5954 }
e02e60c1 5955
1e3c88bd
PZ
5956 return 0;
5957 }
88b8dac0
SV
5958
 5959 /* Record that we found at least one task that could run on dst_cpu */
8e45cb54 5960 env->flags &= ~LBF_ALL_PINNED;
1e3c88bd 5961
ddcdf6e7 5962 if (task_running(env->src_rq, p)) {
41acab88 5963 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
6fa3eb70
S
5964#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5965 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_RUNNING);
5966 if( mt_lbprof_lt (env->sd->mt_lbprof_nr_balance_failed, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND)){
5967 char strings[128]="";
5968 snprintf(strings, 128, "%d:balance fail:running:%d:%d:%s"
5969 , env->dst_cpu, env->src_cpu, p->pid, p->comm);
5970 trace_sched_lbprof_log(strings);
5971 }
5972#endif
1e3c88bd
PZ
5973 return 0;
5974 }
5975
5976 /*
5977 * Aggressive migration if:
5978 * 1) task is cache cold, or
5979 * 2) too many balance attempts have failed.
5980 */
6fa3eb70
S
5981#if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT)
5982 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd, env->mt_check_cache_in_idle);
5983#else
ddcdf6e7 5984 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
6fa3eb70 5985#endif
1e3c88bd 5986 if (!tsk_cache_hot ||
8e45cb54 5987 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4e2dcb73 5988
1e3c88bd 5989 if (tsk_cache_hot) {
8e45cb54 5990 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
41acab88 5991 schedstat_inc(p, se.statistics.nr_forced_migrations);
1e3c88bd 5992 }
4e2dcb73 5993
1e3c88bd
PZ
5994 return 1;
5995 }
5996
4e2dcb73 5997 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
6fa3eb70
S
5998#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5999 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_CACHEHOT);
6000 if(mt_lbprof_lt (env->sd->mt_lbprof_nr_balance_failed, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND)){
6001 char strings[128]="";
6002 snprintf(strings, 128, "%d:balance fail:cache hot:%d:%d:%s"
6003 , env->dst_cpu, env->src_cpu, p->pid, p->comm);
6004 trace_sched_lbprof_log(strings);
6005 }
6006#endif
4e2dcb73 6007 return 0;
1e3c88bd
PZ
6008}
6009
897c395f
PZ
6010/*
6011 * move_one_task tries to move exactly one task from busiest to this_rq, as
6012 * part of active balancing operations within "domain".
6013 * Returns 1 if successful and 0 otherwise.
6014 *
6015 * Called with both runqueues locked.
6016 */
8e45cb54 6017static int move_one_task(struct lb_env *env)
897c395f
PZ
6018{
6019 struct task_struct *p, *n;
6fa3eb70
S
6020#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
6021 env->mt_check_cache_in_idle = 1;
6022#endif
6023#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
6024 mt_lbprof_stat_set(env->fail_reason, MT_LBPROF_NO_TRIGGER);
6025#endif
897c395f 6026
367456c7 6027 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
6fa3eb70
S
6028#if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6029 if(need_lazy_balance(env->dst_cpu, env->src_cpu, p))
6030 continue;
6031#endif
367456c7
PZ
6032 if (!can_migrate_task(p, env))
6033 continue;
897c395f 6034
367456c7
PZ
6035 move_task(p, env);
6036 /*
6037 * Right now, this is only the second place move_task()
6038 * is called, so we can safely collect move_task()
6039 * stats here rather than inside move_task().
6040 */
6041 schedstat_inc(env->sd, lb_gained[env->idle]);
6042 return 1;
897c395f 6043 }
897c395f
PZ
6044 return 0;
6045}
6046
367456c7
PZ
6047static unsigned long task_h_load(struct task_struct *p);
6048
eb95308e
PZ
6049static const unsigned int sched_nr_migrate_break = 32;
6050
6fa3eb70
S
 6051/* In the second round of load balance, migrate a heavy (large load_weight) task
 6052 as long as RT tasks exist on the busy cpu. */
6053#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
6054 #define over_imbalance(lw, im) \
6055 (((lw)/2 > (im)) && \
6056 ((env->mt_check_cache_in_idle==1) || \
6057 (env->src_rq->rt.rt_nr_running==0) || \
6058 (pulled>0)))
6059#else
6060 #define over_imbalance(lw, im) (((lw) / 2) > (im))
6061#endif
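/*
 * Illustrative numbers, not part of the original file: in the plain variant a
 * task is skipped once half of its weighted load exceeds the remaining
 * imbalance, e.g. with env->imbalance = 300 a task of load 700 (350 > 300) is
 * skipped while one of load 500 (250 <= 300) may still be moved. The MT
 * variant above additionally lets such a heavy task through when this is not
 * the idle-check round, the source rq still has RT tasks and nothing has been
 * pulled yet.
 */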
6062
5d6523eb 6063/*
bd939f45 6064 * move_tasks tries to move up to imbalance weighted load from busiest to
5d6523eb
PZ
6065 * this_rq, as part of a balancing operation within domain "sd".
6066 * Returns 1 if successful and 0 otherwise.
6067 *
6068 * Called with both runqueues locked.
6069 */
6070static int move_tasks(struct lb_env *env)
1e3c88bd 6071{
5d6523eb
PZ
6072 struct list_head *tasks = &env->src_rq->cfs_tasks;
6073 struct task_struct *p;
367456c7
PZ
6074 unsigned long load;
6075 int pulled = 0;
1e3c88bd 6076
bd939f45 6077 if (env->imbalance <= 0)
5d6523eb 6078 return 0;
1e3c88bd 6079
6fa3eb70
S
6080 mt_sched_printf("move_tasks start ");
6081
5d6523eb
PZ
6082 while (!list_empty(tasks)) {
6083 p = list_first_entry(tasks, struct task_struct, se.group_node);
1e3c88bd 6084
367456c7
PZ
6085 env->loop++;
6086 /* We've more or less seen every task there is, call it quits */
5d6523eb 6087 if (env->loop > env->loop_max)
367456c7 6088 break;
5d6523eb
PZ
6089
6090 /* take a breather every nr_migrate tasks */
367456c7 6091 if (env->loop > env->loop_break) {
eb95308e 6092 env->loop_break += sched_nr_migrate_break;
8e45cb54 6093 env->flags |= LBF_NEED_BREAK;
ee00e66f 6094 break;
a195f004 6095 }
6fa3eb70
S
6096#if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6097 if(need_lazy_balance(env->dst_cpu, env->src_cpu, p))
6098 goto next;
6099#endif
d3198084 6100 if (!can_migrate_task(p, env))
367456c7
PZ
6101 goto next;
6102
6103 load = task_h_load(p);
5d6523eb 6104
eb95308e 6105 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
367456c7
PZ
6106 goto next;
6107
6fa3eb70
S
6108 if (over_imbalance(load, env->imbalance))
6109 {
367456c7 6110 goto next;
6fa3eb70 6111 }
1e3c88bd 6112
ddcdf6e7 6113 move_task(p, env);
ee00e66f 6114 pulled++;
bd939f45 6115 env->imbalance -= load;
1e3c88bd
PZ
6116
6117#ifdef CONFIG_PREEMPT
ee00e66f
PZ
6118 /*
6119 * NEWIDLE balancing is a source of latency, so preemptible
6120 * kernels will stop after the first task is pulled to minimize
6121 * the critical section.
6122 */
5d6523eb 6123 if (env->idle == CPU_NEWLY_IDLE)
ee00e66f 6124 break;
1e3c88bd
PZ
6125#endif
6126
ee00e66f
PZ
6127 /*
6128 * We only want to steal up to the prescribed amount of
6129 * weighted load.
6130 */
bd939f45 6131 if (env->imbalance <= 0)
ee00e66f 6132 break;
367456c7
PZ
6133
6134 continue;
6135next:
5d6523eb 6136 list_move_tail(&p->se.group_node, tasks);
1e3c88bd 6137 }
5d6523eb 6138
1e3c88bd 6139 /*
ddcdf6e7
PZ
6140 * Right now, this is one of only two places move_task() is called,
6141 * so we can safely collect move_task() stats here rather than
6142 * inside move_task().
1e3c88bd 6143 */
8e45cb54 6144 schedstat_add(env->sd, lb_gained[env->idle], pulled);
1e3c88bd 6145
6fa3eb70
S
6146 mt_sched_printf("move_tasks end");
6147
5d6523eb 6148 return pulled;
1e3c88bd
PZ
6149}
6150
6fa3eb70
S
6151#ifdef CONFIG_MTK_SCHED_CMP
6152#ifdef CONFIG_MTK_SCHED_CMP_TGS
6153static int cmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
9e3081ca 6154{
6fa3eb70 6155 struct sched_domain *sd = env->sd;
9e3081ca 6156
6fa3eb70 6157 BUG_ON(sd == NULL);
9e3081ca 6158
6fa3eb70
S
6159 if (!(sd->flags & SD_BALANCE_TG))
6160 return 0;
9e3081ca 6161
6fa3eb70
S
6162 if (arch_is_multi_cluster()) {
6163 int src_clid, dst_clid;
6164 int src_nr_cpus;
6165 struct thread_group_info_t *src_tginfo, *dst_tginfo;
6166
6167 src_clid = get_cluster_id(env->src_cpu);
6168 dst_clid = get_cluster_id(env->dst_cpu);
6169 BUG_ON(dst_clid == -1 || src_clid == -1);
6170 BUG_ON(p == NULL || p->group_leader == NULL);
6171 src_tginfo = &p->group_leader->thread_group_info[src_clid];
6172 dst_tginfo = &p->group_leader->thread_group_info[dst_clid];
6173 src_nr_cpus = nr_cpus_in_cluster(src_clid, false);
6174
6175#ifdef CONFIG_MT_SCHED_INFO
6176 mt_sched_printf("check rule0: pid=%d comm=%s load=%ld src:clid=%d tginfo->nr_running=%ld nr_cpus=%d load_avg_ratio=%ld",
6177 p->pid, p->comm, p->se.avg.load_avg_ratio,
6178 src_clid, src_tginfo->nr_running, src_nr_cpus,
6179 src_tginfo->load_avg_ratio);
6180#endif
6181#ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
6182 if ( (!thread_group_empty(p)) &&
6183 (src_tginfo->nr_running <= src_nr_cpus) &&
6184 (src_tginfo->nr_running > dst_tginfo->nr_running)){
6185 mt_sched_printf("hit ruleA: bypass pid=%d comm=%s src:nr_running=%lu nr_cpus=%d dst:nr_running=%lu",
6186 p->pid, p->comm, src_tginfo->nr_running, src_nr_cpus, dst_tginfo->nr_running);
6187 return 0;
6188 }
6189#endif
82958366 6190 }
6fa3eb70 6191 return 1;
9e3081ca
PZ
6192}
6193
6fa3eb70
S
6194static int need_migrate_task_immediately(struct task_struct *p,
6195 struct lb_env *env, struct clb_env *clbenv)
9e3081ca 6196{
6fa3eb70 6197 struct sched_domain *sd = env->sd;
9e3081ca 6198
6fa3eb70
S
6199 BUG_ON(sd == NULL);
6200
6201 if (arch_is_big_little()) {
6202 mt_sched_printf("[%s] b.L arch", __func__);
6203#ifdef CONFIG_MT_SCHED_INFO
6204 mt_sched_printf("check rule0: pid=%d comm=%s src=%d dst=%d p->prio=%d p->se.avg.load_avg_ratio=%ld",
6205 p->pid, p->comm, env->src_cpu, env->dst_cpu, p->prio, p->se.avg.load_avg_ratio);
6206#endif
6207 /* from LITTLE to big */
6208 if (arch_cpu_is_little(env->src_cpu) && arch_cpu_is_big(env->dst_cpu)) {
6209 BUG_ON(env->src_cpu != clbenv->ltarget);
6210 if (p->se.avg.load_avg_ratio >= clbenv->bstats.threshold)
6211 return 1;
6212
6213 /* from big to LITTLE */
6214 } else if (arch_cpu_is_big(env->src_cpu) && arch_cpu_is_little(env->dst_cpu)) {
6215 BUG_ON(env->src_cpu != clbenv->btarget);
6216 if (p->se.avg.load_avg_ratio < clbenv->lstats.threshold)
6217 return 1;
6218 }
6219 return 0;
64660c86 6220 }
48a16753 6221
6fa3eb70
S
6222 if (arch_is_multi_cluster() && (sd->flags & SD_BALANCE_TG)) {
6223 int src_clid, dst_clid;
6224 int src_nr_cpus;
6225 struct thread_group_info_t *src_tginfo, *dst_tginfo;
6226
6227 src_clid = get_cluster_id(env->src_cpu);
6228 dst_clid = get_cluster_id(env->dst_cpu);
6229 BUG_ON(dst_clid == -1 || src_clid == -1);
6230 BUG_ON(p == NULL || p->group_leader == NULL);
6231 src_tginfo = &p->group_leader->thread_group_info[src_clid];
6232 dst_tginfo = &p->group_leader->thread_group_info[dst_clid];
6233 src_nr_cpus = nr_cpus_in_cluster(src_clid, false);
6234 mt_sched_printf("[%s] L.L arch", __func__);
6235
6236 if ((p->se.avg.load_avg_ratio*4 >= NICE_0_LOAD*3) &&
6237 src_tginfo->nr_running > src_nr_cpus &&
6238 src_tginfo->load_avg_ratio*10 > NICE_0_LOAD*src_nr_cpus*9) {
6239 //pr_warn("[%s] hit rule0, candidate_load_move/load_move (%ld/%ld)\n",
6240 // __func__, candidate_load_move, env->imbalance);
6241 return 1;
6242 }
6243 }
6244
6245 return 0;
9e3081ca 6246}
6fa3eb70 6247#endif
9e3081ca 6248
9763b67f 6249/*
6fa3eb70
S
6250 * move_tasks tries to move up to load_move weighted load from busiest to
6251 * this_rq, as part of a balancing operation within domain "sd".
6252 * Returns 1 if successful and 0 otherwise.
6253 *
6254 * Called with both runqueues locked.
9763b67f 6255 */
6fa3eb70 6256static int cmp_move_tasks(struct sched_domain *sd, struct lb_env *env)
9763b67f 6257{
6fa3eb70
S
6258 struct list_head *tasks = &env->src_rq->cfs_tasks;
6259 struct task_struct *p;
6260 unsigned long load = 0;
6261 int pulled = 0;
9763b67f 6262
6fa3eb70
S
6263 long tg_load_move, other_load_move;
6264 struct list_head tg_tasks, other_tasks;
6265 int src_clid, dst_clid;
6266#ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
6267 struct cpumask tmp, *cpus = &tmp;
6268#endif
6269#ifdef MTK_QUICK
6270 int flag = 0;
6271#endif
6272 struct clb_env clbenv;
6273 struct cpumask srcmask, dstmask;
9763b67f 6274
6fa3eb70
S
6275 if (env->imbalance <= 0)
6276 return 0;
9763b67f 6277
6fa3eb70
S
6278 other_load_move = env->imbalance;
6279 INIT_LIST_HEAD(&other_tasks);
9763b67f 6280
6fa3eb70
S
6281// if (sd->flags & SD_BALANCE_TG) {
6282 tg_load_move = env->imbalance;
6283 INIT_LIST_HEAD(&tg_tasks);
6284 src_clid = get_cluster_id(env->src_cpu);
6285 dst_clid = get_cluster_id(env->dst_cpu);
6286 BUG_ON(dst_clid == -1 || src_clid == -1);
a35b6466 6287
6fa3eb70
S
6288#ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
6289 get_cluster_cpus(cpus, src_clid, true);
6290#endif
6291 mt_sched_printf("move_tasks_tg start: src:cpu=%d clid=%d runnable_load=%lu dst:cpu=%d clid=%d runnable_load=%lu imbalance=%ld curr->on_rq=%d",
6292 env->src_cpu, src_clid, cpu_rq(env->src_cpu)->cfs.runnable_load_avg,
6293 env->dst_cpu, dst_clid, cpu_rq(env->dst_cpu)->cfs.runnable_load_avg,
6294 env->imbalance, env->dst_rq->curr->on_rq);
6295// }
6296
6297 mt_sched_printf("max=%d busiest->nr_running=%d",
6298 env->loop_max, cpu_rq(env->src_cpu)->nr_running);
6299
6300 if (arch_is_big_little()) {
6301 get_cluster_cpus(&srcmask, src_clid, true);
6302 get_cluster_cpus(&dstmask, dst_clid, true);
6303 memset(&clbenv, 0, sizeof(clbenv));
6304 clbenv.flags |= HMP_LB;
6305 clbenv.ltarget = arch_cpu_is_little(env->src_cpu) ? env->src_cpu : env->dst_cpu;
6306 clbenv.btarget = arch_cpu_is_big(env->src_cpu) ? env->src_cpu : env->dst_cpu;
6307 clbenv.lcpus = arch_cpu_is_little(env->src_cpu) ? &srcmask : &dstmask;
6308 clbenv.bcpus = arch_cpu_is_big(env->src_cpu) ? &srcmask : &dstmask;
6309 sched_update_clbstats(&clbenv);
6310 }
a35b6466 6311
6fa3eb70
S
6312 while (!list_empty(tasks)) {
6313 struct thread_group_info_t *src_tginfo, *dst_tginfo;
a35b6466 6314
6fa3eb70 6315 p = list_first_entry(tasks, struct task_struct, se.group_node);
9763b67f 6316
6fa3eb70
S
6317#ifdef CONFIG_MT_SCHED_INFO
6318 mt_sched_printf("check: pid=%d comm=%s load_avg_contrib=%lu h_load=%lu runnable_load_avg=%lu loop=%d, env->imbalance=%ld tg_load_move=%ld",
6319 p->pid, p->comm, p->se.avg.load_avg_contrib,
6320 task_cfs_rq(p)->h_load, task_cfs_rq(p)->runnable_load_avg,
6321 env->loop, env->imbalance, tg_load_move);
6322#endif
6323 env->loop++;
6324 /* We've more or less seen every task there is, call it quits */
6325 if (env->loop > env->loop_max)
6326 break;
230059de 6327
6fa3eb70
S
6328#if 0 // TO check
6329 /* take a breather every nr_migrate tasks */
6330 if (env->loop > env->loop_break) {
6331 env->loop_break += sched_nr_migrate_break;
6332 env->flags |= LBF_NEED_BREAK;
6333 break;
6334 }
6335#endif
6336 BUG_ON(p == NULL || p->group_leader == NULL);
6337 src_tginfo = &p->group_leader->thread_group_info[src_clid];
6338 dst_tginfo = &p->group_leader->thread_group_info[dst_clid];
6339
6340 /* rule0 */
6341 if (!can_migrate_task(p, env)) {
6342 mt_sched_printf("can not migrate: pid=%d comm=%s",
6343 p->pid, p->comm);
6344 goto next;
6345 }
230059de 6346
6fa3eb70 6347 load = task_h_load(p);
9e3081ca 6348
6fa3eb70
S
6349 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) {
6350 mt_sched_printf("can not migrate: pid=%d comm=%s sched_feat",
6351 p->pid, p->comm );
6352 goto next;
6353 }
230059de 6354
6fa3eb70
S
6355 if (over_imbalance(load, env->imbalance)) {
6356 mt_sched_printf("can not migrate: pid=%d comm=%s load=%ld imbalance=%ld",
6357 p->pid, p->comm, load, env->imbalance );
6358 goto next;
6359 }
6360
6361 /* meet rule0 , migrate immediately */
6362 if (need_migrate_task_immediately(p, env, &clbenv)) {
6363 pulled++;
6364 env->imbalance -= load;
6365 tg_load_move -= load;
6366 other_load_move -= load;
6367 mt_sched_printf("hit rule0: pid=%d comm=%s load=%ld imbalance=%ld tg_imbalance=%ld other_load_move=%ld",
6368 p->pid, p->comm, load, env->imbalance, tg_load_move, other_load_move);
6369 move_task(p, env);
6370 if (env->imbalance <= 0)
6371 break;
6372 continue;
6373 }
6374
6375 /* for TGS */
6376 if (!cmp_can_migrate_task(p, env))
6377 goto next;
6378
6379 if (sd->flags & SD_BALANCE_TG){
6380 if (over_imbalance(load, tg_load_move)) {
6381 mt_sched_printf("can not migrate: pid=%d comm=%s load=%ld imbalance=%ld",
6382 p->pid, p->comm, load, tg_load_move );
6383 goto next;
6384 }
6385
6386#ifdef MTK_QUICK
6387 if (candidate_load_move <= 0) {
6388 mt_sched_printf("check: pid=%d comm=%s candidate_load_move=%d",
6389 p->pid, p->comm, candidate_load_move);
6390 goto next;
6391 }
6392#endif
6393
6394 /* rule1, single thread */
6395#ifdef CONFIG_MT_SCHED_INFO
6396 mt_sched_printf("check rule1: pid=%d p->comm=%s thread_group_cnt=%lu thread_group_empty(p)=%d",
6397 p->pid, p->comm,
6398 p->group_leader->thread_group_info[0].nr_running +
6399 p->group_leader->thread_group_info[1].nr_running,
6400 thread_group_empty(p));
6401#endif
6402
6403 if (thread_group_empty(p)) {
6404 list_move_tail(&p->se.group_node, &tg_tasks);
6405 tg_load_move -= load;
6406 other_load_move -= load;
6407 mt_sched_printf("hit rule1: pid=%d p->comm=%s load=%ld tg_imbalance=%ld",
6408 p->pid, p->comm, load, tg_load_move);
6409 continue;
6410 }
6411
6412 /* rule2 */
6413#ifdef CONFIG_MT_SCHED_INFO
6414 mt_sched_printf("check rule2: pid=%d p->comm=%s %ld, %ld, %ld, %ld, %ld",
6415 p->pid, p->comm, src_tginfo->nr_running, src_tginfo->cfs_nr_running, dst_tginfo->nr_running,
6416 p->se.avg.load_avg_ratio, src_tginfo->load_avg_ratio);
6417#endif
6418 if ((src_tginfo->nr_running < dst_tginfo->nr_running) &&
6419 ((p->se.avg.load_avg_ratio * src_tginfo->cfs_nr_running) <=
6420 src_tginfo->load_avg_ratio)) {
6421 list_move_tail(&p->se.group_node, &tg_tasks);
6422 tg_load_move -= load;
6423 other_load_move -= load;
6424 mt_sched_printf("hit rule2: pid=%d p->comm=%s load=%ld tg_imbalance=%ld",
6425 p->pid, p->comm, load, tg_load_move);
6426 continue;
6427 }
6428
6429 if (over_imbalance(load, other_load_move))
6430 goto next;
6431/*
6432 if (other_load_move <= 0)
6433 goto next;
6434*/
6435
6436 list_move_tail(&p->se.group_node, &other_tasks);
6437 other_load_move -= load;
6438 continue;
6439 }else{
6440 list_move_tail(&p->se.group_node, &other_tasks);
6441 other_load_move -= load;
6442 continue;
6443 }
6444
6445 // ytchang
6446#if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6447 if(need_lazy_balance(env->dst_cpu, env->src_cpu, p))
6448 goto next;
6449#endif
6450
6451next:
6452 /* original rule */
6453 list_move_tail(&p->se.group_node, tasks);
6454 } // end of while()
6455
6456 if ( sd->flags & SD_BALANCE_TG){
6457 while (!list_empty(&tg_tasks)) {
6458 p = list_first_entry(&tg_tasks, struct task_struct, se.group_node);
6459 list_move_tail(&p->se.group_node, tasks);
6460
6461 if (env->imbalance > 0) {
6462 load = task_h_load(p);
6463 if (over_imbalance(load, env->imbalance)){
6464 mt_sched_printf("overload rule1,2: pid=%d p->comm=%s load=%ld imbalance=%ld",
6465 p->pid, p->comm, load, env->imbalance);
6466#ifdef MTK_QUICK
6467
6468 flag=1;
6469#endif
6470 continue;
6471 }
6472
6473 move_task(p, env);
6474 env->imbalance -= load;
6475 pulled++;
6476
6477 mt_sched_printf("migrate hit rule1,2: pid=%d p->comm=%s load=%ld imbalance=%ld",
6478 p->pid, p->comm, load, env->imbalance);
6479 }
6480 }
6481 }
6482
6483 mt_sched_printf("move_tasks_tg finish rule migrate");
6484
6485 while (!list_empty(&other_tasks)) {
6486 p = list_first_entry(&other_tasks, struct task_struct, se.group_node);
6487 list_move_tail(&p->se.group_node, tasks);
6488
6489#ifdef MTK_QUICK
6490 if (!flag && (env->imbalance > 0)) {
6491#else
6492 if (env->imbalance > 0) {
6493#endif
6494 load = task_h_load(p);
6495
6496 if (over_imbalance(load, env->imbalance)){
6497 mt_sched_printf("overload others: pid=%d p->comm=%s load=%ld imbalance=%ld",
6498 p->pid, p->comm, load, env->imbalance);
6499 continue;
6500 }
6501
6502 move_task(p, env);
6503 env->imbalance -= load;
6504 pulled++;
6505
6506 mt_sched_printf("migrate others: pid=%d p->comm=%s load=%ld imbalance=%ld",
6507 p->pid, p->comm, load, env->imbalance);
6508 }
6509 }
6510
6511 /*
6512 * Right now, this is one of only two places move_task() is called,
6513 * so we can safely collect move_task() stats here rather than
6514 * inside move_task().
6515 */
6516 schedstat_add(env->sd, lb_gained[env->idle], pulled);
6517
6518 mt_sched_printf("move_tasks_tg finish pulled=%d imbalance=%ld", pulled, env->imbalance);
6519
6520 return pulled;
6521}
6522
6523#endif /* CONFIG_MTK_SCHED_CMP */
6524
6525
6526#if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6527static int need_lazy_balance(int dst_cpu, int src_cpu, struct task_struct *p)
6528{
 6529 /* Lazy balance for a small task when:
6530 1. src cpu is buddy cpu
6531 2. src cpu is not busy cpu
6532 3. p is light task
6533 */
6534#ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
6535 if ( PA_ENABLE && cpumask_test_cpu(src_cpu, &buddy_cpu_map) &&
6536 !is_buddy_busy(src_cpu) && is_light_task(p)) {
6537#else
6538 if (cpumask_test_cpu(src_cpu, &buddy_cpu_map) &&
6539 !is_buddy_busy(src_cpu) && is_light_task(p)) {
6540#endif
6541#ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
6542 unsigned int i;
6543 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[src_cpu][dst_cpu]++;
6544 mt_sched_printf("[PA]pid=%d, Lazy balance from CPU%d to CPU%d\n)\n", p->pid, src_cpu, dst_cpu);
6545 for(i=0;i<4;i++) {
6546 if(PA_MON_ENABLE && (strcmp(p->comm, &PA_MON[i][0]) == 0)) {
6547 printk(KERN_EMERG "[PA] %s Lazy balance from CPU%d to CPU%d\n", p->comm, src_cpu, dst_cpu);
6548 // printk(KERN_EMERG "[PA] src_cpu RQ Usage = %u, Period = %u, NR = %u\n",
6549 // per_cpu(BUDDY_CPU_RQ_USAGE, src_cpu),
6550 // per_cpu(BUDDY_CPU_RQ_PERIOD, src_cpu),
6551 // per_cpu(BUDDY_CPU_RQ_NR, src_cpu));
6552 // printk(KERN_EMERG "[PA] Task Usage = %u, Period = %u\n",
6553 // p->se.avg.usage_avg_sum,
6554 // p->se.avg.runnable_avg_period);
6555 }
6556 }
6557#endif
6558 return 1;
6559 }
6560 else
6561 return 0;
6562}
6563#endif
6564#ifdef CONFIG_FAIR_GROUP_SCHED
6565/*
6566 * update tg->load_weight by folding this cpu's load_avg
6567 */
6568static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
6569{
6570 struct sched_entity *se = tg->se[cpu];
6571 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
6572
6573 /* throttled entities do not contribute to load */
6574 if (throttled_hierarchy(cfs_rq))
6575 return;
6576
6577 update_cfs_rq_blocked_load(cfs_rq, 1);
6578
6579 if (se) {
6580 update_entity_load_avg(se, 1);
6581 /*
6582 * We pivot on our runnable average having decayed to zero for
6583 * list removal. This generally implies that all our children
6584 * have also been removed (modulo rounding error or bandwidth
6585 * control); however, such cases are rare and we can fix these
6586 * at enqueue.
6587 *
6588 * TODO: fix up out-of-order children on enqueue.
6589 */
6590 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
6591 list_del_leaf_cfs_rq(cfs_rq);
6592 } else {
6593 struct rq *rq = rq_of(cfs_rq);
6594 update_rq_runnable_avg(rq, rq->nr_running);
6595 }
6596}
6597
6598static void update_blocked_averages(int cpu)
6599{
6600 struct rq *rq = cpu_rq(cpu);
6601 struct cfs_rq *cfs_rq;
6602 unsigned long flags;
6603
6604 raw_spin_lock_irqsave(&rq->lock, flags);
6605 update_rq_clock(rq);
6606 /*
6607 * Iterates the task_group tree in a bottom up fashion, see
6608 * list_add_leaf_cfs_rq() for details.
6609 */
6610 for_each_leaf_cfs_rq(rq, cfs_rq) {
6611 /*
6612 * Note: We may want to consider periodically releasing
6613 * rq->lock about these updates so that creating many task
6614 * groups does not result in continually extending hold time.
6615 */
6616 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
6617 }
6618
6619 raw_spin_unlock_irqrestore(&rq->lock, flags);
6620}
6621
6622/*
6623 * Compute the cpu's hierarchical load factor for each task group.
6624 * This needs to be done in a top-down fashion because the load of a child
6625 * group is a fraction of its parents load.
6626 */
6627static int tg_load_down(struct task_group *tg, void *data)
6628{
6629 unsigned long load;
6630 long cpu = (long)data;
6631
6632 if (!tg->parent) {
6633 /*
 6634 * rq's sched_avg is not updated accordingly; use rq's
 6635 * corresponding cfs_rq runnable load instead.
6636 *
6637 * a003a25b sched: Consider runnable load average...
6638 *
6639
6640 load = cpu_rq(cpu)->avg.load_avg_contrib;
6641
6642 */
6643 load = cpu_rq(cpu)->cfs.runnable_load_avg;
6644 } else {
6645 load = tg->parent->cfs_rq[cpu]->h_load;
6646 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
6647 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
6648 }
6649
6650 tg->cfs_rq[cpu]->h_load = load;
6651
6652 return 0;
6653}
6654
6655static void update_h_load(long cpu)
6656{
6657 rcu_read_lock();
6658 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
6659 rcu_read_unlock();
6660}
6661
6662static unsigned long task_h_load(struct task_struct *p)
6663{
6664 struct cfs_rq *cfs_rq = task_cfs_rq(p);
6665
6666 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
6667 cfs_rq->runnable_load_avg + 1);
6668}
6669#else
6670static inline void update_blocked_averages(int cpu)
6671{
6672}
6673
6674static inline void update_h_load(long cpu)
6675{
6676}
6677
6678static unsigned long task_h_load(struct task_struct *p)
1e3c88bd 6679{
6fa3eb70 6680 return p->se.avg.load_avg_contrib;
1e3c88bd 6681}
230059de 6682#endif
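/*
 * Worked example, added for illustration with assumed numbers and not part of
 * the original file: with the root cfs_rq carrying a runnable load of 2048, a
 * group entity contributing 512 of it gets h_load ~= 2048 * 512 / 2049 ~= 511;
 * a task contributing 256 of that group's 512 runnable load then reports
 * task_h_load() ~= 511 * 256 / 513 ~= 255, i.e. roughly its real share of the
 * cpu. The hypothetical helper below redoes that arithmetic standalone.
 */
#if 0
static unsigned long h_load_example(void)
{
	unsigned long root_h_load = 2048;	/* root cfs_rq runnable load  */
	unsigned long grp_contrib = 512;	/* group se load_avg_contrib  */
	unsigned long grp_runnable = 512;	/* group cfs_rq runnable load */
	unsigned long task_contrib = 256;	/* task se load_avg_contrib   */
	unsigned long grp_h_load = root_h_load * grp_contrib / (root_h_load + 1);

	return grp_h_load * task_contrib / (grp_runnable + 1);	/* == 255 */
}
#endif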
1e3c88bd 6683
1e3c88bd
PZ
6684/********** Helpers for find_busiest_group ************************/
6685/*
6686 * sd_lb_stats - Structure to store the statistics of a sched_domain
6687 * during load balancing.
6688 */
6689struct sd_lb_stats {
6690 struct sched_group *busiest; /* Busiest group in this sd */
6691 struct sched_group *this; /* Local group in this sd */
6692 unsigned long total_load; /* Total load of all groups in sd */
6693 unsigned long total_pwr; /* Total power of all groups in sd */
6694 unsigned long avg_load; /* Average load across all groups in sd */
6695
6696 /** Statistics of this group */
6697 unsigned long this_load;
6698 unsigned long this_load_per_task;
6699 unsigned long this_nr_running;
fab47622 6700 unsigned long this_has_capacity;
aae6d3dd 6701 unsigned int this_idle_cpus;
1e3c88bd
PZ
6702
6703 /* Statistics of the busiest group */
aae6d3dd 6704 unsigned int busiest_idle_cpus;
1e3c88bd
PZ
6705 unsigned long max_load;
6706 unsigned long busiest_load_per_task;
6707 unsigned long busiest_nr_running;
dd5feea1 6708 unsigned long busiest_group_capacity;
fab47622 6709 unsigned long busiest_has_capacity;
aae6d3dd 6710 unsigned int busiest_group_weight;
1e3c88bd
PZ
6711
6712 int group_imb; /* Is there imbalance in this sd */
1e3c88bd
PZ
6713};
6714
6715/*
6716 * sg_lb_stats - stats of a sched_group required for load_balancing
6717 */
6718struct sg_lb_stats {
6719 unsigned long avg_load; /*Avg load across the CPUs of the group */
6720 unsigned long group_load; /* Total load over the CPUs of the group */
6721 unsigned long sum_nr_running; /* Nr tasks running in the group */
6722 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
6723 unsigned long group_capacity;
aae6d3dd
SS
6724 unsigned long idle_cpus;
6725 unsigned long group_weight;
1e3c88bd 6726 int group_imb; /* Is there an imbalance in the group ? */
fab47622 6727 int group_has_capacity; /* Is there extra capacity in the group? */
1e3c88bd
PZ
6728};
6729
1e3c88bd
PZ
6730/**
6731 * get_sd_load_idx - Obtain the load index for a given sched domain.
6732 * @sd: The sched_domain whose load_idx is to be obtained.
6733 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
6734 */
6735static inline int get_sd_load_idx(struct sched_domain *sd,
6736 enum cpu_idle_type idle)
6737{
6738 int load_idx;
6739
6740 switch (idle) {
6741 case CPU_NOT_IDLE:
6742 load_idx = sd->busy_idx;
6743 break;
6744
6745 case CPU_NEWLY_IDLE:
6746 load_idx = sd->newidle_idx;
6747 break;
6748 default:
6749 load_idx = sd->idle_idx;
6750 break;
6751 }
6752
6753 return load_idx;
6754}
6755
15f803c9 6756static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
1e3c88bd 6757{
1399fa78 6758 return SCHED_POWER_SCALE;
1e3c88bd
PZ
6759}
6760
6761unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
6762{
6763 return default_scale_freq_power(sd, cpu);
6764}
6765
15f803c9 6766static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
1e3c88bd 6767{
669c55e9 6768 unsigned long weight = sd->span_weight;
1e3c88bd
PZ
6769 unsigned long smt_gain = sd->smt_gain;
6770
6771 smt_gain /= weight;
6772
6773 return smt_gain;
6774}
6775
6776unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
6777{
6778 return default_scale_smt_power(sd, cpu);
6779}
6780
15f803c9 6781static unsigned long scale_rt_power(int cpu)
1e3c88bd
PZ
6782{
6783 struct rq *rq = cpu_rq(cpu);
b654f7de 6784 u64 total, available, age_stamp, avg;
1e3c88bd 6785
b654f7de
PZ
6786 /*
6787 * Since we're reading these variables without serialization make sure
6788 * we read them once before doing sanity checks on them.
6789 */
6790 age_stamp = ACCESS_ONCE(rq->age_stamp);
6791 avg = ACCESS_ONCE(rq->rt_avg);
6792
6793 total = sched_avg_period() + (rq->clock - age_stamp);
aa483808 6794
b654f7de 6795 if (unlikely(total < avg)) {
aa483808
VP
6796 /* Ensures that power won't end up being negative */
6797 available = 0;
6798 } else {
b654f7de 6799 available = total - avg;
aa483808 6800 }
1e3c88bd 6801
1399fa78
NR
6802 if (unlikely((s64)total < SCHED_POWER_SCALE))
6803 total = SCHED_POWER_SCALE;
1e3c88bd 6804
1399fa78 6805 total >>= SCHED_POWER_SHIFT;
1e3c88bd
PZ
6806
6807 return div_u64(available, total);
6808}
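/*
 * In effect scale_rt_power() returns the fraction of the averaging period
 * left over after RT/IRQ time, expressed against SCHED_POWER_SCALE (1024).
 * A rough worked example: if rt_avg has consumed about a quarter of the
 * period, available/total is ~3/4 and the result is ~768, so the CPU
 * advertises roughly 75% of its nominal power.
 */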
6809
6810static void update_cpu_power(struct sched_domain *sd, int cpu)
6811{
669c55e9 6812 unsigned long weight = sd->span_weight;
1399fa78 6813 unsigned long power = SCHED_POWER_SCALE;
1e3c88bd
PZ
6814 struct sched_group *sdg = sd->groups;
6815
1e3c88bd
PZ
6816 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6817 if (sched_feat(ARCH_POWER))
6818 power *= arch_scale_smt_power(sd, cpu);
6819 else
6820 power *= default_scale_smt_power(sd, cpu);
6821
1399fa78 6822 power >>= SCHED_POWER_SHIFT;
1e3c88bd
PZ
6823 }
6824
9c3f75cb 6825 sdg->sgp->power_orig = power;
9d5efe05
SV
6826
6827 if (sched_feat(ARCH_POWER))
6828 power *= arch_scale_freq_power(sd, cpu);
6829 else
6830 power *= default_scale_freq_power(sd, cpu);
6831
1399fa78 6832 power >>= SCHED_POWER_SHIFT;
9d5efe05 6833
1e3c88bd 6834 power *= scale_rt_power(cpu);
1399fa78 6835 power >>= SCHED_POWER_SHIFT;
1e3c88bd
PZ
6836
6837 if (!power)
6838 power = 1;
6839
e51fd5e2 6840 cpu_rq(cpu)->cpu_power = power;
9c3f75cb 6841 sdg->sgp->power = power;
1e3c88bd
PZ
6842}
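/*
 * Each scaling step above multiplies by a factor expressed against
 * SCHED_POWER_SCALE and then shifts right by SCHED_POWER_SHIFT, so the
 * final value is roughly:
 *
 *	power ~= 1024 * smt_factor * freq_factor * rt_leftover
 *
 * with every factor being a fixed-point fraction of 1024 (the SMT term only
 * applies to SD_SHARE_CPUPOWER domains with more than one sibling).
 */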
6843
029632fb 6844void update_group_power(struct sched_domain *sd, int cpu)
1e3c88bd
PZ
6845{
6846 struct sched_domain *child = sd->child;
6847 struct sched_group *group, *sdg = sd->groups;
6848 unsigned long power;
4ec4412e
VG
6849 unsigned long interval;
6850
6851 interval = msecs_to_jiffies(sd->balance_interval);
6852 interval = clamp(interval, 1UL, max_load_balance_interval);
6853 sdg->sgp->next_update = jiffies + interval;
1e3c88bd
PZ
6854
6855 if (!child) {
6856 update_cpu_power(sd, cpu);
6857 return;
6858 }
6859
6860 power = 0;
6861
74a5ce20
PZ
6862 if (child->flags & SD_OVERLAP) {
6863 /*
6864 * SD_OVERLAP domains cannot assume that child groups
6865 * span the current group.
6866 */
6867
6868 for_each_cpu(cpu, sched_group_cpus(sdg))
6869 power += power_of(cpu);
6870 } else {
6871 /*
6872 * !SD_OVERLAP domains can assume that child groups
6873 * span the current group.
6874 */
6875
6876 group = child->groups;
6877 do {
6878 power += group->sgp->power;
6879 group = group->next;
6880 } while (group != child->groups);
6881 }
1e3c88bd 6882
c3decf0d 6883 sdg->sgp->power_orig = sdg->sgp->power = power;
1e3c88bd
PZ
6884}
6885
9d5efe05
SV
6886/*
6887 * Try and fix up capacity for tiny siblings, this is needed when
6888 * things like SD_ASYM_PACKING need f_b_g to select another sibling
6889 * which on its own isn't powerful enough.
6890 *
6891 * See update_sd_pick_busiest() and check_asym_packing().
6892 */
6893static inline int
6894fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
6895{
6896 /*
1399fa78 6897 * Only siblings can have significantly less than SCHED_POWER_SCALE
9d5efe05 6898 */
a6c75f2f 6899 if (!(sd->flags & SD_SHARE_CPUPOWER))
9d5efe05
SV
6900 return 0;
6901
6902 /*
6903 * If ~90% of the cpu_power is still there, we're good.
6904 */
9c3f75cb 6905 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
9d5efe05
SV
6906 return 1;
6907
6908 return 0;
6909}
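/*
 * The 32/29 check above is just fixed-point arithmetic: it passes when
 * power > (29/32) * power_orig, i.e. when at least ~90.6% of the original
 * cpu_power survives frequency/RT scaling.
 */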
6910
1e3c88bd
PZ
6911/**
6912 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
cd96891d 6913 * @env: The load balancing environment.
1e3c88bd 6914 * @group: sched_group whose statistics are to be updated.
1e3c88bd 6915 * @load_idx: Load index of sched_domain of this_cpu for load calc.
1e3c88bd 6916 * @local_group: Does group contain this_cpu.
1e3c88bd
PZ
6917 * @balance: Should we balance.
6918 * @sgs: variable to hold the statistics for this group.
6919 */
bd939f45
PZ
6920static inline void update_sg_lb_stats(struct lb_env *env,
6921 struct sched_group *group, int load_idx,
b9403130 6922 int local_group, int *balance, struct sg_lb_stats *sgs)
1e3c88bd 6923{
e44bc5c5
PZ
6924 unsigned long nr_running, max_nr_running, min_nr_running;
6925 unsigned long load, max_cpu_load, min_cpu_load;
04f733b4 6926 unsigned int balance_cpu = -1, first_idle_cpu = 0;
dd5feea1 6927 unsigned long avg_load_per_task = 0;
bd939f45 6928 int i;
1e3c88bd 6929
871e35bc 6930 if (local_group)
c1174876 6931 balance_cpu = group_balance_cpu(group);
1e3c88bd
PZ
6932
6933 /* Tally up the load of all CPUs in the group */
1e3c88bd
PZ
6934 max_cpu_load = 0;
6935 min_cpu_load = ~0UL;
2582f0eb 6936 max_nr_running = 0;
e44bc5c5 6937 min_nr_running = ~0UL;
1e3c88bd 6938
b9403130 6939 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
1e3c88bd
PZ
6940 struct rq *rq = cpu_rq(i);
6941
e44bc5c5
PZ
6942 nr_running = rq->nr_running;
6943
1e3c88bd
PZ
6944 /* Bias balancing toward cpus of our domain */
6945 if (local_group) {
c1174876
PZ
6946 if (idle_cpu(i) && !first_idle_cpu &&
6947 cpumask_test_cpu(i, sched_group_mask(group))) {
04f733b4 6948 first_idle_cpu = 1;
1e3c88bd
PZ
6949 balance_cpu = i;
6950 }
04f733b4
PZ
6951
6952 load = target_load(i, load_idx);
1e3c88bd
PZ
6953 } else {
6954 load = source_load(i, load_idx);
e44bc5c5 6955 if (load > max_cpu_load)
1e3c88bd
PZ
6956 max_cpu_load = load;
6957 if (min_cpu_load > load)
6958 min_cpu_load = load;
e44bc5c5
PZ
6959
6960 if (nr_running > max_nr_running)
6961 max_nr_running = nr_running;
6962 if (min_nr_running > nr_running)
6963 min_nr_running = nr_running;
6fa3eb70
S
6964
6965#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
6966 if((load_idx > 0) && (load == cpu_rq(i)->cpu_load[load_idx-1]))
6967 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_HISTORY);
6968#endif
1e3c88bd
PZ
6969 }
6970
6971 sgs->group_load += load;
e44bc5c5 6972 sgs->sum_nr_running += nr_running;
1e3c88bd 6973 sgs->sum_weighted_load += weighted_cpuload(i);
aae6d3dd
SS
6974 if (idle_cpu(i))
6975 sgs->idle_cpus++;
1e3c88bd
PZ
6976 }
6977
6978 /*
6979 * First idle cpu or the first cpu(busiest) in this sched group
6980 * is eligible for doing load balancing at this and above
6981 * domains. In the newly idle case, we will allow all the cpu's
6982 * to do the newly idle load balance.
6983 */
4ec4412e 6984 if (local_group) {
bd939f45 6985 if (env->idle != CPU_NEWLY_IDLE) {
04f733b4 6986 if (balance_cpu != env->dst_cpu) {
4ec4412e
VG
6987 *balance = 0;
6988 return;
6989 }
bd939f45 6990 update_group_power(env->sd, env->dst_cpu);
4ec4412e 6991 } else if (time_after_eq(jiffies, group->sgp->next_update))
bd939f45 6992 update_group_power(env->sd, env->dst_cpu);
1e3c88bd
PZ
6993 }
6994
6995 /* Adjust by relative CPU power of the group */
9c3f75cb 6996 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
1e3c88bd 6997
1e3c88bd
PZ
6998 /*
6999 * Consider the group unbalanced when the imbalance is larger
866ab43e 7000 * than the average weight of a task.
1e3c88bd
PZ
7001 *
7002 * APZ: with cgroup the avg task weight can vary wildly and
7003 * might not be a suitable number - should we keep a
7004 * normalized nr_running number somewhere that negates
7005 * the hierarchy?
7006 */
dd5feea1
SS
7007 if (sgs->sum_nr_running)
7008 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
1e3c88bd 7009
e44bc5c5
PZ
7010 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
7011 (max_nr_running - min_nr_running) > 1)
1e3c88bd
PZ
7012 sgs->group_imb = 1;
7013
9c3f75cb 7014 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
1399fa78 7015 SCHED_POWER_SCALE);
9d5efe05 7016 if (!sgs->group_capacity)
bd939f45 7017 sgs->group_capacity = fix_small_capacity(env->sd, group);
aae6d3dd 7018 sgs->group_weight = group->group_weight;
fab47622
NR
7019
7020 if (sgs->group_capacity > sgs->sum_nr_running)
7021 sgs->group_has_capacity = 1;
1e3c88bd
PZ
7022}
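/*
 * Units reminder for the avg_load computed above:
 *
 *	avg_load = group_load * SCHED_POWER_SCALE / group_power
 *
 * A group running at nominal power (1024) reports its raw summed weighted
 * load unchanged, while a group at half power reports twice the avg_load
 * for the same raw load, making weaker groups look busier.
 */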
7023
532cb4c4
MN
7024/**
7025 * update_sd_pick_busiest - return 1 on busiest group
cd96891d 7026 * @env: The load balancing environment.
532cb4c4
MN
7027 * @sds: sched_domain statistics
7028 * @sg: sched_group candidate to be checked for being the busiest
b6b12294 7029 * @sgs: sched_group statistics
532cb4c4
MN
7030 *
7031 * Determine if @sg is a busier group than the previously selected
7032 * busiest group.
7033 */
bd939f45 7034static bool update_sd_pick_busiest(struct lb_env *env,
532cb4c4
MN
7035 struct sd_lb_stats *sds,
7036 struct sched_group *sg,
bd939f45 7037 struct sg_lb_stats *sgs)
532cb4c4 7038{
6fa3eb70
S
7039 if (sgs->avg_load <= sds->max_load) {
7040 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_PICK_BUSIEST_FAIL_1);
532cb4c4 7041 return false;
6fa3eb70 7042 }
532cb4c4
MN
7043
7044 if (sgs->sum_nr_running > sgs->group_capacity)
7045 return true;
7046
7047 if (sgs->group_imb)
7048 return true;
7049
7050 /*
7051 * ASYM_PACKING needs to move all the work to the lowest
7052 * numbered CPUs in the group, therefore mark all groups
7053 * higher than ourself as busy.
7054 */
bd939f45
PZ
7055 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
7056 env->dst_cpu < group_first_cpu(sg)) {
532cb4c4
MN
7057 if (!sds->busiest)
7058 return true;
7059
7060 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
7061 return true;
7062 }
7063
6fa3eb70 7064 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_PICK_BUSIEST_FAIL_2);
532cb4c4
MN
7065 return false;
7066}
7067
1e3c88bd 7068/**
461819ac 7069 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
cd96891d 7070 * @env: The load balancing environment.
1e3c88bd
PZ
7071 * @balance: Should we balance.
7072 * @sds: variable to hold the statistics for this sched_domain.
7073 */
bd939f45 7074static inline void update_sd_lb_stats(struct lb_env *env,
b9403130 7075 int *balance, struct sd_lb_stats *sds)
1e3c88bd 7076{
bd939f45
PZ
7077 struct sched_domain *child = env->sd->child;
7078 struct sched_group *sg = env->sd->groups;
1e3c88bd
PZ
7079 struct sg_lb_stats sgs;
7080 int load_idx, prefer_sibling = 0;
7081
7082 if (child && child->flags & SD_PREFER_SIBLING)
7083 prefer_sibling = 1;
7084
bd939f45 7085 load_idx = get_sd_load_idx(env->sd, env->idle);
1e3c88bd
PZ
7086
7087 do {
7088 int local_group;
7089
bd939f45 7090 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
1e3c88bd 7091 memset(&sgs, 0, sizeof(sgs));
b9403130 7092 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
1e3c88bd 7093
8f190fb3 7094 if (local_group && !(*balance))
1e3c88bd
PZ
7095 return;
7096
7097 sds->total_load += sgs.group_load;
9c3f75cb 7098 sds->total_pwr += sg->sgp->power;
1e3c88bd
PZ
7099
7100 /*
7101 * In case the child domain prefers tasks go to siblings
532cb4c4 7102 * first, lower the sg capacity to one so that we'll try
75dd321d
NR
7103 * and move all the excess tasks away. We lower the capacity
7104 * of a group only if the local group has the capacity to fit
7105 * these excess tasks, i.e. nr_running < group_capacity. The
7106 * extra check prevents the case where you always pull from the
 7107		 * heaviest group when it is already under-utilized (possible when
 7108		 * a single large-weight task outweighs the tasks on the system).
1e3c88bd 7109 */
75dd321d 7110 if (prefer_sibling && !local_group && sds->this_has_capacity)
1e3c88bd
PZ
7111 sgs.group_capacity = min(sgs.group_capacity, 1UL);
7112
7113 if (local_group) {
7114 sds->this_load = sgs.avg_load;
532cb4c4 7115 sds->this = sg;
1e3c88bd
PZ
7116 sds->this_nr_running = sgs.sum_nr_running;
7117 sds->this_load_per_task = sgs.sum_weighted_load;
fab47622 7118 sds->this_has_capacity = sgs.group_has_capacity;
aae6d3dd 7119 sds->this_idle_cpus = sgs.idle_cpus;
bd939f45 7120 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
1e3c88bd 7121 sds->max_load = sgs.avg_load;
532cb4c4 7122 sds->busiest = sg;
1e3c88bd 7123 sds->busiest_nr_running = sgs.sum_nr_running;
aae6d3dd 7124 sds->busiest_idle_cpus = sgs.idle_cpus;
dd5feea1 7125 sds->busiest_group_capacity = sgs.group_capacity;
1e3c88bd 7126 sds->busiest_load_per_task = sgs.sum_weighted_load;
fab47622 7127 sds->busiest_has_capacity = sgs.group_has_capacity;
aae6d3dd 7128 sds->busiest_group_weight = sgs.group_weight;
1e3c88bd
PZ
7129 sds->group_imb = sgs.group_imb;
7130 }
7131
532cb4c4 7132 sg = sg->next;
bd939f45 7133 } while (sg != env->sd->groups);
532cb4c4
MN
7134}
7135
532cb4c4
MN
7136/**
7137 * check_asym_packing - Check to see if the group is packed into the
7138 * sched domain.
7139 *
7140 * This is primarily intended to be used at the sibling level. Some
7141 * cores like POWER7 prefer to use lower numbered SMT threads. In the
7142 * case of POWER7, it can move to lower SMT modes only when higher
7143 * threads are idle. When in lower SMT modes, the threads will
7144 * perform better since they share less core resources. Hence when we
7145 * have idle threads, we want them to be the higher ones.
7146 *
7147 * This packing function is run on idle threads. It checks to see if
7148 * the busiest CPU in this domain (core in the P7 case) has a higher
7149 * CPU number than the packing function is being run on. Here we are
7150 * assuming lower CPU number will be equivalent to lower a SMT thread
7151 * number.
7152 *
b6b12294
MN
7153 * Returns 1 when packing is required and a task should be moved to
7154 * this CPU. The amount of the imbalance is returned in env->imbalance.
7155 *
cd96891d 7156 * @env: The load balancing environment.
532cb4c4 7157 * @sds: Statistics of the sched_domain which is to be packed
532cb4c4 7158 */
bd939f45 7159static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
532cb4c4
MN
7160{
7161 int busiest_cpu;
7162
bd939f45 7163 if (!(env->sd->flags & SD_ASYM_PACKING))
532cb4c4
MN
7164 return 0;
7165
7166 if (!sds->busiest)
7167 return 0;
7168
7169 busiest_cpu = group_first_cpu(sds->busiest);
bd939f45 7170 if (env->dst_cpu > busiest_cpu)
532cb4c4
MN
7171 return 0;
7172
bd939f45
PZ
7173 env->imbalance = DIV_ROUND_CLOSEST(
7174 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
7175
532cb4c4 7176 return 1;
1e3c88bd
PZ
7177}
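/*
 * The DIV_ROUND_CLOSEST() above converts the busiest group's
 * power-normalized max_load back into an absolute weighted load, which is
 * the unit env->imbalance is expressed in everywhere else.
 */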
7178
7179/**
7180 * fix_small_imbalance - Calculate the minor imbalance that exists
7181 * amongst the groups of a sched_domain, during
7182 * load balancing.
cd96891d 7183 * @env: The load balancing environment.
1e3c88bd 7184 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bd 7185 */
bd939f45
PZ
7186static inline
7187void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd
PZ
7188{
7189 unsigned long tmp, pwr_now = 0, pwr_move = 0;
7190 unsigned int imbn = 2;
dd5feea1 7191 unsigned long scaled_busy_load_per_task;
1e3c88bd
PZ
7192
7193 if (sds->this_nr_running) {
7194 sds->this_load_per_task /= sds->this_nr_running;
7195 if (sds->busiest_load_per_task >
7196 sds->this_load_per_task)
7197 imbn = 1;
bd939f45 7198 } else {
1e3c88bd 7199 sds->this_load_per_task =
bd939f45
PZ
7200 cpu_avg_load_per_task(env->dst_cpu);
7201 }
1e3c88bd 7202
dd5feea1 7203 scaled_busy_load_per_task = sds->busiest_load_per_task
1399fa78 7204 * SCHED_POWER_SCALE;
9c3f75cb 7205 scaled_busy_load_per_task /= sds->busiest->sgp->power;
dd5feea1
SS
7206
7207 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
7208 (scaled_busy_load_per_task * imbn)) {
bd939f45 7209 env->imbalance = sds->busiest_load_per_task;
1e3c88bd
PZ
7210 return;
7211 }
7212
7213 /*
7214 * OK, we don't have enough imbalance to justify moving tasks,
7215 * however we may be able to increase total CPU power used by
7216 * moving them.
7217 */
7218
9c3f75cb 7219 pwr_now += sds->busiest->sgp->power *
1e3c88bd 7220 min(sds->busiest_load_per_task, sds->max_load);
9c3f75cb 7221 pwr_now += sds->this->sgp->power *
1e3c88bd 7222 min(sds->this_load_per_task, sds->this_load);
1399fa78 7223 pwr_now /= SCHED_POWER_SCALE;
1e3c88bd
PZ
7224
7225 /* Amount of load we'd subtract */
1399fa78 7226 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
9c3f75cb 7227 sds->busiest->sgp->power;
1e3c88bd 7228 if (sds->max_load > tmp)
9c3f75cb 7229 pwr_move += sds->busiest->sgp->power *
1e3c88bd
PZ
7230 min(sds->busiest_load_per_task, sds->max_load - tmp);
7231
7232 /* Amount of load we'd add */
9c3f75cb 7233 if (sds->max_load * sds->busiest->sgp->power <
1399fa78 7234 sds->busiest_load_per_task * SCHED_POWER_SCALE)
9c3f75cb
PZ
7235 tmp = (sds->max_load * sds->busiest->sgp->power) /
7236 sds->this->sgp->power;
1e3c88bd 7237 else
1399fa78 7238 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
9c3f75cb
PZ
7239 sds->this->sgp->power;
7240 pwr_move += sds->this->sgp->power *
1e3c88bd 7241 min(sds->this_load_per_task, sds->this_load + tmp);
1399fa78 7242 pwr_move /= SCHED_POWER_SCALE;
1e3c88bd
PZ
7243
7244 /* Move if we gain throughput */
7245 if (pwr_move > pwr_now)
bd939f45 7246 env->imbalance = sds->busiest_load_per_task;
1e3c88bd
PZ
7247}
7248
7249/**
7250 * calculate_imbalance - Calculate the amount of imbalance present within the
7251 * groups of a given sched_domain during load balance.
bd939f45 7252 * @env: load balance environment
1e3c88bd 7253 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bd 7254 */
bd939f45 7255static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd 7256{
dd5feea1
SS
7257 unsigned long max_pull, load_above_capacity = ~0UL;
7258
7259 sds->busiest_load_per_task /= sds->busiest_nr_running;
7260 if (sds->group_imb) {
7261 sds->busiest_load_per_task =
7262 min(sds->busiest_load_per_task, sds->avg_load);
7263 }
7264
1e3c88bd
PZ
7265 /*
7266 * In the presence of smp nice balancing, certain scenarios can have
7267 * max load less than avg load(as we skip the groups at or below
7268 * its cpu_power, while calculating max_load..)
7269 */
7270 if (sds->max_load < sds->avg_load) {
bd939f45
PZ
7271 env->imbalance = 0;
7272 return fix_small_imbalance(env, sds);
1e3c88bd
PZ
7273 }
7274
dd5feea1
SS
7275 if (!sds->group_imb) {
7276 /*
7277 * Don't want to pull so many tasks that a group would go idle.
7278 */
7279 load_above_capacity = (sds->busiest_nr_running -
7280 sds->busiest_group_capacity);
7281
1399fa78 7282 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
dd5feea1 7283
9c3f75cb 7284 load_above_capacity /= sds->busiest->sgp->power;
dd5feea1
SS
7285 }
7286
7287 /*
7288 * We're trying to get all the cpus to the average_load, so we don't
7289 * want to push ourselves above the average load, nor do we wish to
7290 * reduce the max loaded cpu below the average load. At the same time,
7291 * we also don't want to reduce the group load below the group capacity
7292 * (so that we can implement power-savings policies etc). Thus we look
7293 * for the minimum possible imbalance.
7294 * Be careful of negative numbers as they'll appear as very large values
7295 * with unsigned longs.
7296 */
7297 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
1e3c88bd
PZ
7298
7299 /* How much load to actually move to equalise the imbalance */
bd939f45 7300 env->imbalance = min(max_pull * sds->busiest->sgp->power,
9c3f75cb 7301 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
1399fa78 7302 / SCHED_POWER_SCALE;
1e3c88bd
PZ
7303
7304 /*
7305 * if *imbalance is less than the average load per runnable task
25985edc 7306 * there is no guarantee that any tasks will be moved so we'll have
1e3c88bd
PZ
7307 * a think about bumping its value to force at least one task to be
7308 * moved
7309 */
bd939f45
PZ
7310 if (env->imbalance < sds->busiest_load_per_task)
7311 return fix_small_imbalance(env, sds);
1e3c88bd
PZ
7312
7313}
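/*
 * Rough worked example for the imbalance above, assuming both groups run at
 * nominal power (1024) and load_above_capacity does not cap the pull: with
 * max_load = 1536, avg_load = 1024 and this_load = 512, max_pull is 512 and
 * (avg_load - this_load) is also 512, so env->imbalance ends up as 512 in
 * raw weighted-load units.
 */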
fab47622 7314
1e3c88bd
PZ
7315/******* find_busiest_group() helpers end here *********************/
7316
7317/**
7318 * find_busiest_group - Returns the busiest group within the sched_domain
7319 * if there is an imbalance. If there isn't an imbalance, and
7320 * the user has opted for power-savings, it returns a group whose
7321 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
7322 * such a group exists.
7323 *
7324 * Also calculates the amount of weighted load which should be moved
7325 * to restore balance.
7326 *
cd96891d 7327 * @env: The load balancing environment.
1e3c88bd
PZ
7328 * @balance: Pointer to a variable indicating if this_cpu
7329 * is the appropriate cpu to perform load balancing at this_level.
7330 *
7331 * Returns: - the busiest group if imbalance exists.
7332 * - If no imbalance and user has opted for power-savings balance,
7333 * return the least loaded group whose CPUs can be
7334 * put to idle by rebalancing its tasks onto our group.
7335 */
7336static struct sched_group *
b9403130 7337find_busiest_group(struct lb_env *env, int *balance)
1e3c88bd
PZ
7338{
7339 struct sd_lb_stats sds;
7340
7341 memset(&sds, 0, sizeof(sds));
7342
7343 /*
7344 * Compute the various statistics relavent for load balancing at
7345 * this level.
7346 */
b9403130 7347 update_sd_lb_stats(env, balance, &sds);
1e3c88bd 7348
cc57aa8f
PZ
7349 /*
7350 * this_cpu is not the appropriate cpu to perform load balancing at
7351 * this level.
1e3c88bd 7352 */
6fa3eb70
S
7353 if (!(*balance)){
7354 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_BALANCE);
1e3c88bd 7355 goto ret;
6fa3eb70 7356 }
1e3c88bd 7357
bd939f45
PZ
7358 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
7359 check_asym_packing(env, &sds))
532cb4c4
MN
7360 return sds.busiest;
7361
cc57aa8f 7362 /* There is no busy sibling group to pull tasks from */
6fa3eb70
S
7363 if (!sds.busiest || sds.busiest_nr_running == 0){
7364 if(!sds.busiest){
7365 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_BUSIEST_NO_TASK);
7366 }else{
7367 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_NO_BUSIEST);
7368 }
1e3c88bd 7369 goto out_balanced;
6fa3eb70 7370 }
1e3c88bd 7371
1399fa78 7372 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
b0432d8f 7373
866ab43e
PZ
7374 /*
7375 * If the busiest group is imbalanced the below checks don't
 7376	 * work because they assume all things are equal, which typically
7377 * isn't true due to cpus_allowed constraints and the like.
7378 */
7379 if (sds.group_imb)
7380 goto force_balance;
7381
cc57aa8f 7382 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
bd939f45 7383 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
fab47622
NR
7384 !sds.busiest_has_capacity)
7385 goto force_balance;
7386
cc57aa8f
PZ
7387 /*
7388 * If the local group is more busy than the selected busiest group
7389 * don't try and pull any tasks.
7390 */
6fa3eb70
S
7391 if (sds.this_load >= sds.max_load){
7392 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_NO_LARGER_THAN);
1e3c88bd 7393 goto out_balanced;
6fa3eb70 7394 }
1e3c88bd 7395
cc57aa8f
PZ
7396 /*
7397 * Don't pull any tasks if this group is already above the domain
7398 * average load.
7399 */
6fa3eb70
S
7400 if (sds.this_load >= sds.avg_load){
7401 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_NO_LARGER_THAN);
1e3c88bd 7402 goto out_balanced;
6fa3eb70 7403 }
1e3c88bd 7404
bd939f45 7405 if (env->idle == CPU_IDLE) {
aae6d3dd
SS
7406 /*
7407 * This cpu is idle. If the busiest group load doesn't
7408 * have more tasks than the number of available cpu's and
7409 * there is no imbalance between this and busiest group
7410 * wrt to idle cpu's, it is balanced.
7411 */
c186fafe 7412 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
aae6d3dd
SS
7413 sds.busiest_nr_running <= sds.busiest_group_weight)
7414 goto out_balanced;
c186fafe
PZ
7415 } else {
7416 /*
7417 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
7418 * imbalance_pct to be conservative.
7419 */
6fa3eb70
S
7420 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load){
7421 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_CHECK_FAIL);
c186fafe 7422 goto out_balanced;
6fa3eb70 7423 }
aae6d3dd 7424 }
1e3c88bd 7425
fab47622 7426force_balance:
1e3c88bd 7427 /* Looks like there is an imbalance. Compute it */
bd939f45 7428 calculate_imbalance(env, &sds);
1e3c88bd
PZ
7429 return sds.busiest;
7430
7431out_balanced:
1e3c88bd 7432ret:
bd939f45 7433 env->imbalance = 0;
1e3c88bd
PZ
7434 return NULL;
7435}
7436
7437/*
7438 * find_busiest_queue - find the busiest runqueue among the cpus in group.
7439 */
bd939f45 7440static struct rq *find_busiest_queue(struct lb_env *env,
b9403130 7441 struct sched_group *group)
1e3c88bd
PZ
7442{
7443 struct rq *busiest = NULL, *rq;
7444 unsigned long max_load = 0;
7445 int i;
7446
7447 for_each_cpu(i, sched_group_cpus(group)) {
7448 unsigned long power = power_of(i);
1399fa78
NR
7449 unsigned long capacity = DIV_ROUND_CLOSEST(power,
7450 SCHED_POWER_SCALE);
1e3c88bd
PZ
7451 unsigned long wl;
7452
9d5efe05 7453 if (!capacity)
bd939f45 7454 capacity = fix_small_capacity(env->sd, group);
9d5efe05 7455
b9403130 7456 if (!cpumask_test_cpu(i, env->cpus))
1e3c88bd
PZ
7457 continue;
7458
7459 rq = cpu_rq(i);
6e40f5bb 7460 wl = weighted_cpuload(i);
1e3c88bd 7461
6e40f5bb
TG
7462 /*
7463 * When comparing with imbalance, use weighted_cpuload()
7464 * which is not scaled with the cpu power.
7465 */
bd939f45 7466 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
1e3c88bd
PZ
7467 continue;
7468
6e40f5bb
TG
7469 /*
7470 * For the load comparisons with the other cpu's, consider
7471 * the weighted_cpuload() scaled with the cpu power, so that
7472 * the load can be moved away from the cpu that is potentially
7473 * running at a lower capacity.
7474 */
1399fa78 7475 wl = (wl * SCHED_POWER_SCALE) / power;
6e40f5bb 7476
1e3c88bd
PZ
7477 if (wl > max_load) {
7478 max_load = wl;
7479 busiest = rq;
7480 }
7481 }
7482
7483 return busiest;
7484}
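/*
 * The wl scaling above means a raw weighted load of 512 on a CPU whose
 * power is 512 (half of SCHED_POWER_SCALE) is treated as 1024, so it ranks
 * above a CPU carrying the same raw load at full power; load is
 * preferentially pulled off the lower-capacity CPU.
 */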
7485
7486/*
7487 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
7488 * so long as it is large enough.
7489 */
7490#define MAX_PINNED_INTERVAL 512
7491
7492/* Working cpumask for load_balance and load_balance_newidle. */
e6252c3e 7493DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
1e3c88bd 7494
bd939f45 7495static int need_active_balance(struct lb_env *env)
1af3ed3d 7496{
bd939f45
PZ
7497 struct sched_domain *sd = env->sd;
7498
7499 if (env->idle == CPU_NEWLY_IDLE) {
532cb4c4
MN
7500
7501 /*
7502 * ASYM_PACKING needs to force migrate tasks from busy but
7503 * higher numbered CPUs in order to pack all tasks in the
7504 * lowest numbered CPUs.
7505 */
bd939f45 7506 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
532cb4c4 7507 return 1;
1af3ed3d
PZ
7508 }
7509
7510 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
7511}
7512
969c7921
TH
7513static int active_load_balance_cpu_stop(void *data);
7514
1e3c88bd
PZ
7515/*
7516 * Check this_cpu to ensure it is balanced within domain. Attempt to move
7517 * tasks if there is an imbalance.
7518 */
7519static int load_balance(int this_cpu, struct rq *this_rq,
7520 struct sched_domain *sd, enum cpu_idle_type idle,
7521 int *balance)
7522{
88b8dac0 7523 int ld_moved, cur_ld_moved, active_balance = 0;
1e3c88bd 7524 struct sched_group *group;
1e3c88bd
PZ
7525 struct rq *busiest;
7526 unsigned long flags;
e6252c3e 7527 struct cpumask *cpus = __get_cpu_var(load_balance_mask);
1e3c88bd 7528
8e45cb54
PZ
7529 struct lb_env env = {
7530 .sd = sd,
ddcdf6e7
PZ
7531 .dst_cpu = this_cpu,
7532 .dst_rq = this_rq,
88b8dac0 7533 .dst_grpmask = sched_group_cpus(sd->groups),
8e45cb54 7534 .idle = idle,
eb95308e 7535 .loop_break = sched_nr_migrate_break,
b9403130 7536 .cpus = cpus,
6fa3eb70
S
7537#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
7538 .fail_reason= MT_LBPROF_NO_TRIGGER,
7539#endif
8e45cb54
PZ
7540 };
7541
cfc03118
JK
7542 /*
7543 * For NEWLY_IDLE load_balancing, we don't need to consider
7544 * other cpus in our group
7545 */
e02e60c1 7546 if (idle == CPU_NEWLY_IDLE)
cfc03118 7547 env.dst_grpmask = NULL;
cfc03118 7548
1e3c88bd
PZ
7549 cpumask_copy(cpus, cpu_active_mask);
7550
1e3c88bd
PZ
7551 schedstat_inc(sd, lb_count[idle]);
7552
7553redo:
b9403130 7554 group = find_busiest_group(&env, balance);
1e3c88bd
PZ
7555
7556 if (*balance == 0)
7557 goto out_balanced;
7558
7559 if (!group) {
7560 schedstat_inc(sd, lb_nobusyg[idle]);
6fa3eb70
S
7561 if(mt_lbprof_test(env.fail_reason, MT_LBPROF_HISTORY)){
7562 int tmp_cpu;
7563 for_each_cpu(tmp_cpu, cpu_possible_mask){
7564 if (tmp_cpu == this_rq->cpu)
7565 continue;
7566 mt_lbprof_update_state(tmp_cpu, MT_LBPROF_BALANCE_FAIL_STATE);
7567 }
7568 }
1e3c88bd
PZ
7569 goto out_balanced;
7570 }
7571
b9403130 7572 busiest = find_busiest_queue(&env, group);
1e3c88bd
PZ
7573 if (!busiest) {
7574 schedstat_inc(sd, lb_nobusyq[idle]);
6fa3eb70 7575 mt_lbprof_stat_or(env.fail_reason, MT_LBPROF_NOBUSYQ);
1e3c88bd
PZ
7576 goto out_balanced;
7577 }
7578
6fa3eb70
S
7579#ifdef CONFIG_HMP_LAZY_BALANCE
7580
7581#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
7582 if (PA_ENABLE && LB_ENABLE) {
7583#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
7584
7585 if (per_cpu(sd_pack_buddy, this_cpu) == busiest->cpu && !is_buddy_busy(per_cpu(sd_pack_buddy, this_cpu))) {
7586
7587#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
7588 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[this_cpu][busiest->cpu]++;
7589
7590#ifdef CONFIG_HMP_TRACER
7591 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_BALANCE_FORM_CPUX_TO_CPUY, 0, this_cpu, busiest->cpu);
7592#endif /* CONFIG_HMP_TRACER */
7593
7594#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
7595
7596 schedstat_inc(sd, lb_nobusyq[idle]);
7597 goto out_balanced;
7598 }
7599
7600#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
7601 }
7602#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
7603
7604#endif /* CONFIG_HMP_LAZY_BALANCE */
7605
78feefc5 7606 BUG_ON(busiest == env.dst_rq);
1e3c88bd 7607
bd939f45 7608 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
1e3c88bd
PZ
7609
7610 ld_moved = 0;
7611 if (busiest->nr_running > 1) {
7612 /*
7613 * Attempt to move tasks. If find_busiest_group has found
7614 * an imbalance but busiest->nr_running <= 1, the group is
7615 * still unbalanced. ld_moved simply stays zero, so it is
7616 * correctly treated as an imbalance.
7617 */
8e45cb54 7618 env.flags |= LBF_ALL_PINNED;
c82513e5
PZ
7619 env.src_cpu = busiest->cpu;
7620 env.src_rq = busiest;
7621 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
6fa3eb70
S
7622#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7623 env.mt_check_cache_in_idle = 1;
7624#endif
8e45cb54 7625
a35b6466 7626 update_h_load(env.src_cpu);
5d6523eb 7627more_balance:
1e3c88bd 7628 local_irq_save(flags);
78feefc5 7629 double_rq_lock(env.dst_rq, busiest);
6fa3eb70
S
7630#ifdef CONFIG_MTK_SCHED_CMP
7631 env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
7632 mt_sched_printf("1 env.loop_max=%d, busiest->nr_running=%d src=%d, dst=%d, cpus_share_cache=%d",
7633 env.loop_max, busiest->nr_running, env.src_cpu, env.dst_cpu, cpus_share_cache(env.src_cpu, env.dst_cpu));
7634#endif /* CONFIG_MTK_SCHED_CMP */
88b8dac0
SV
7635 /*
7636 * cur_ld_moved - load moved in current iteration
7637 * ld_moved - cumulative load moved across iterations
7638 */
6fa3eb70
S
7639#ifdef CONFIG_MTK_SCHED_CMP
7640 if (!cpus_share_cache(env.src_cpu, env.dst_cpu))
7641 cur_ld_moved = cmp_move_tasks(sd, &env);
7642 else
7643 cur_ld_moved = move_tasks(&env);
7644#else /* !CONFIG_MTK_SCHED_CMP */
88b8dac0 7645 cur_ld_moved = move_tasks(&env);
6fa3eb70 7646#endif /* CONFIG_MTK_SCHED_CMP */
88b8dac0 7647 ld_moved += cur_ld_moved;
78feefc5 7648 double_rq_unlock(env.dst_rq, busiest);
1e3c88bd
PZ
7649 local_irq_restore(flags);
7650
7651 /*
7652 * some other cpu did the load balance for us.
7653 */
88b8dac0
SV
7654 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
7655 resched_cpu(env.dst_cpu);
7656
f1cd0858
JK
7657 if (env.flags & LBF_NEED_BREAK) {
7658 env.flags &= ~LBF_NEED_BREAK;
7659 goto more_balance;
7660 }
7661
88b8dac0
SV
7662 /*
7663 * Revisit (affine) tasks on src_cpu that couldn't be moved to
7664 * us and move them to an alternate dst_cpu in our sched_group
7665 * where they can run. The upper limit on how many times we
7666 * iterate on same src_cpu is dependent on number of cpus in our
7667 * sched_group.
7668 *
7669 * This changes load balance semantics a bit on who can move
7670 * load to a given_cpu. In addition to the given_cpu itself
7671 * (or a ilb_cpu acting on its behalf where given_cpu is
7672 * nohz-idle), we now have balance_cpu in a position to move
7673 * load to given_cpu. In rare situations, this may cause
7674 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
7675 * _independently_ and at _same_ time to move some load to
 7676		 * given_cpu) causing excess load to be moved to given_cpu.
7677 * This however should not happen so much in practice and
7678 * moreover subsequent load balance cycles should correct the
7679 * excess load moved.
7680 */
e02e60c1 7681 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
88b8dac0 7682
78feefc5 7683 env.dst_rq = cpu_rq(env.new_dst_cpu);
88b8dac0
SV
7684 env.dst_cpu = env.new_dst_cpu;
7685 env.flags &= ~LBF_SOME_PINNED;
7686 env.loop = 0;
7687 env.loop_break = sched_nr_migrate_break;
e02e60c1
JK
7688
7689 /* Prevent to re-select dst_cpu via env's cpus */
7690 cpumask_clear_cpu(env.dst_cpu, env.cpus);
7691
88b8dac0
SV
7692 /*
7693 * Go back to "more_balance" rather than "redo" since we
7694 * need to continue with same src_cpu.
7695 */
7696 goto more_balance;
7697 }
1e3c88bd
PZ
7698
7699 /* All tasks on this runqueue were pinned by CPU affinity */
8e45cb54 7700 if (unlikely(env.flags & LBF_ALL_PINNED)) {
6fa3eb70 7701 mt_lbprof_update_state(busiest->cpu, MT_LBPROF_ALLPINNED);
1e3c88bd 7702 cpumask_clear_cpu(cpu_of(busiest), cpus);
bbf18b19
PN
7703 if (!cpumask_empty(cpus)) {
7704 env.loop = 0;
7705 env.loop_break = sched_nr_migrate_break;
1e3c88bd 7706 goto redo;
bbf18b19 7707 }
1e3c88bd
PZ
7708 goto out_balanced;
7709 }
6fa3eb70
S
7710
7711#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
 7712		/* when moving tasks fails, force migration regardless of cache hotness */
7713 /* use mt_check_cache_in_idle */
7714 if (!ld_moved && ((CPU_NEWLY_IDLE == idle) || (CPU_IDLE == idle) ) ) {
7715#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
7716 mt_lbprof_stat_set(env.fail_reason, MT_LBPROF_DO_LB);
7717#endif
7718 env.mt_check_cache_in_idle = 0;
7719 env.loop = 0;
7720 local_irq_save(flags);
7721 double_rq_lock(env.dst_rq, busiest);
7722#ifdef CONFIG_MTK_SCHED_CMP
7723 env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
7724 mt_sched_printf("2 env.loop_max=%d, busiest->nr_running=%d",
7725 env.loop_max, busiest->nr_running);
7726#endif /* CONFIG_MTK_SCHED_CMP */
7727 if (!env.loop)
7728 update_h_load(env.src_cpu);
7729#ifdef CONFIG_MTK_SCHED_CMP_TGS
7730 if (!cpus_share_cache(env.src_cpu, env.dst_cpu))
7731 ld_moved = cmp_move_tasks(sd, &env);
7732 else{
7733 ld_moved = move_tasks(&env);
7734 }
7735#else /* !CONFIG_MTK_SCHED_CMP_TGS */
7736 ld_moved = move_tasks(&env);
7737#endif /* CONFIG_MTK_SCHED_CMP_TGS */
7738 double_rq_unlock(env.dst_rq, busiest);
7739 local_irq_restore(flags);
7740
7741 /*
7742 * some other cpu did the load balance for us.
7743 */
7744 if (ld_moved && this_cpu != smp_processor_id())
7745 resched_cpu(this_cpu);
7746 }
7747#endif
1e3c88bd
PZ
7748 }
7749
7750 if (!ld_moved) {
7751 schedstat_inc(sd, lb_failed[idle]);
6fa3eb70
S
7752 mt_lbprof_stat_or(env.fail_reason, MT_LBPROF_FAILED);
7753 if ( mt_lbprof_test(env.fail_reason, MT_LBPROF_AFFINITY) ) {
7754 mt_lbprof_update_state(busiest->cpu, MT_LBPROF_FAILURE_STATE);
7755 }else if ( mt_lbprof_test(env.fail_reason, MT_LBPROF_CACHEHOT) ) {
7756 mt_lbprof_update_state(busiest->cpu, MT_LBPROF_FAILURE_STATE);
7757 }
7758
58b26c4c
VP
7759 /*
7760 * Increment the failure counter only on periodic balance.
7761 * We do not want newidle balance, which can be very
7762 * frequent, pollute the failure counter causing
7763 * excessive cache_hot migrations and active balances.
7764 */
7765 if (idle != CPU_NEWLY_IDLE)
7766 sd->nr_balance_failed++;
6fa3eb70 7767 mt_lbprof_stat_inc(sd, mt_lbprof_nr_balance_failed);
1e3c88bd 7768
bd939f45 7769 if (need_active_balance(&env)) {
1e3c88bd
PZ
7770 raw_spin_lock_irqsave(&busiest->lock, flags);
7771
969c7921
TH
7772 /* don't kick the active_load_balance_cpu_stop,
7773 * if the curr task on busiest cpu can't be
7774 * moved to this_cpu
1e3c88bd
PZ
7775 */
7776 if (!cpumask_test_cpu(this_cpu,
fa17b507 7777 tsk_cpus_allowed(busiest->curr))) {
1e3c88bd
PZ
7778 raw_spin_unlock_irqrestore(&busiest->lock,
7779 flags);
8e45cb54 7780 env.flags |= LBF_ALL_PINNED;
1e3c88bd
PZ
7781 goto out_one_pinned;
7782 }
7783
969c7921
TH
7784 /*
7785 * ->active_balance synchronizes accesses to
7786 * ->active_balance_work. Once set, it's cleared
7787 * only after active load balance is finished.
7788 */
1e3c88bd
PZ
7789 if (!busiest->active_balance) {
7790 busiest->active_balance = 1;
7791 busiest->push_cpu = this_cpu;
7792 active_balance = 1;
7793 }
7794 raw_spin_unlock_irqrestore(&busiest->lock, flags);
969c7921 7795
bd939f45 7796 if (active_balance) {
969c7921
TH
7797 stop_one_cpu_nowait(cpu_of(busiest),
7798 active_load_balance_cpu_stop, busiest,
7799 &busiest->active_balance_work);
bd939f45 7800 }
1e3c88bd
PZ
7801
7802 /*
7803 * We've kicked active balancing, reset the failure
7804 * counter.
7805 */
7806 sd->nr_balance_failed = sd->cache_nice_tries+1;
7807 }
7808 } else
7809 sd->nr_balance_failed = 0;
7810
7811 if (likely(!active_balance)) {
7812 /* We were unbalanced, so reset the balancing interval */
7813 sd->balance_interval = sd->min_interval;
7814 } else {
7815 /*
7816 * If we've begun active balancing, start to back off. This
7817 * case may not be covered by the all_pinned logic if there
7818 * is only 1 task on the busy runqueue (because we don't call
7819 * move_tasks).
7820 */
7821 if (sd->balance_interval < sd->max_interval)
7822 sd->balance_interval *= 2;
7823 }
7824
1e3c88bd
PZ
7825 goto out;
7826
7827out_balanced:
7828 schedstat_inc(sd, lb_balanced[idle]);
7829
7830 sd->nr_balance_failed = 0;
6fa3eb70 7831 mt_lbprof_stat_set(sd->mt_lbprof_nr_balance_failed, 0);
1e3c88bd
PZ
7832
7833out_one_pinned:
7834 /* tune up the balancing interval */
8e45cb54 7835 if (((env.flags & LBF_ALL_PINNED) &&
5b54b56b 7836 sd->balance_interval < MAX_PINNED_INTERVAL) ||
1e3c88bd
PZ
7837 (sd->balance_interval < sd->max_interval))
7838 sd->balance_interval *= 2;
7839
46e49b38 7840 ld_moved = 0;
1e3c88bd 7841out:
6fa3eb70
S
7842 if (ld_moved){
7843 mt_lbprof_stat_or(env.fail_reason, MT_LBPROF_SUCCESS);
7844 mt_lbprof_stat_set(sd->mt_lbprof_nr_balance_failed, 0);
7845 }
7846
7847#ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
7848 if( CPU_NEWLY_IDLE == idle){
7849 char strings[128]="";
7850 snprintf(strings, 128, "%d:idle balance:%d:0x%x ", this_cpu, ld_moved, env.fail_reason);
7851 mt_lbprof_rqinfo(strings);
7852 trace_sched_lbprof_log(strings);
7853 }else{
7854 char strings[128]="";
7855 snprintf(strings, 128, "%d:periodic balance:%d:0x%x ", this_cpu, ld_moved, env.fail_reason);
7856 mt_lbprof_rqinfo(strings);
7857 trace_sched_lbprof_log(strings);
7858 }
7859#endif
7860
1e3c88bd
PZ
7861 return ld_moved;
7862}
7863
1e3c88bd
PZ
7864/*
7865 * idle_balance is called by schedule() if this_cpu is about to become
7866 * idle. Attempts to pull tasks from other CPUs.
7867 */
029632fb 7868void idle_balance(int this_cpu, struct rq *this_rq)
1e3c88bd
PZ
7869{
7870 struct sched_domain *sd;
7871 int pulled_task = 0;
7872 unsigned long next_balance = jiffies + HZ;
6fa3eb70
S
7873#if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT) || defined(CONFIG_MT_LOAD_BALANCE_PROFILER)
7874 unsigned long counter = 0;
7875#endif
1e3c88bd
PZ
7876
7877 this_rq->idle_stamp = this_rq->clock;
7878
6fa3eb70
S
7879 mt_lbprof_update_state_has_lock(this_cpu, MT_LBPROF_UPDATE_STATE);
7880#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7881 #ifdef CONFIG_LOCAL_TIMERS
7882 counter = localtimer_get_counter();
7883 if ( counter >= 260000 ) // 20ms
7884 goto must_do;
7885 if ( time_before(jiffies + 2, this_rq->next_balance) ) // 20ms
7886 goto must_do;
7887 #endif
7888#endif
7889
7890 if (this_rq->avg_idle < sysctl_sched_migration_cost){
7891#if defined(CONFIG_MT_LOAD_BALANCE_PROFILER)
7892 char strings[128]="";
7893 mt_lbprof_update_state_has_lock(this_cpu, MT_LBPROF_ALLOW_UNBLANCE_STATE);
7894 snprintf(strings, 128, "%d:idle balance bypass: %llu %lu ", this_cpu, this_rq->avg_idle, counter);
7895 mt_lbprof_rqinfo(strings);
7896 trace_sched_lbprof_log(strings);
7897#endif
1e3c88bd 7898 return;
6fa3eb70
S
7899 }
7900
7901#ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7902 must_do:
7903#endif
1e3c88bd 7904
f492e12e
PZ
7905 /*
7906 * Drop the rq->lock, but keep IRQ/preempt disabled.
7907 */
7908 raw_spin_unlock(&this_rq->lock);
7909
6fa3eb70 7910 mt_lbprof_update_status();
48a16753 7911 update_blocked_averages(this_cpu);
dce840a0 7912 rcu_read_lock();
1e3c88bd
PZ
7913 for_each_domain(this_cpu, sd) {
7914 unsigned long interval;
f492e12e 7915 int balance = 1;
1e3c88bd
PZ
7916
7917 if (!(sd->flags & SD_LOAD_BALANCE))
7918 continue;
7919
f492e12e 7920 if (sd->flags & SD_BALANCE_NEWIDLE) {
1e3c88bd 7921 /* If we've pulled tasks over stop searching: */
f492e12e
PZ
7922 pulled_task = load_balance(this_cpu, this_rq,
7923 sd, CPU_NEWLY_IDLE, &balance);
7924 }
1e3c88bd
PZ
7925
7926 interval = msecs_to_jiffies(sd->balance_interval);
7927 if (time_after(next_balance, sd->last_balance + interval))
7928 next_balance = sd->last_balance + interval;
d5ad140b
NR
7929 if (pulled_task) {
7930 this_rq->idle_stamp = 0;
1e3c88bd 7931 break;
d5ad140b 7932 }
1e3c88bd 7933 }
dce840a0 7934 rcu_read_unlock();
f492e12e
PZ
7935
7936 raw_spin_lock(&this_rq->lock);
7937
1e3c88bd
PZ
7938 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
7939 /*
7940 * We are going idle. next_balance may be set based on
7941 * a busy processor. So reset next_balance.
7942 */
7943 this_rq->next_balance = next_balance;
7944 }
7945}
7946
7947/*
969c7921
TH
7948 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
7949 * running tasks off the busiest CPU onto idle CPUs. It requires at
7950 * least 1 task to be running on each physical CPU where possible, and
7951 * avoids physical / logical imbalances.
1e3c88bd 7952 */
969c7921 7953static int active_load_balance_cpu_stop(void *data)
1e3c88bd 7954{
969c7921
TH
7955 struct rq *busiest_rq = data;
7956 int busiest_cpu = cpu_of(busiest_rq);
1e3c88bd 7957 int target_cpu = busiest_rq->push_cpu;
969c7921 7958 struct rq *target_rq = cpu_rq(target_cpu);
1e3c88bd 7959 struct sched_domain *sd;
969c7921
TH
7960
7961 raw_spin_lock_irq(&busiest_rq->lock);
7962
7963 /* make sure the requested cpu hasn't gone down in the meantime */
7964 if (unlikely(busiest_cpu != smp_processor_id() ||
7965 !busiest_rq->active_balance))
7966 goto out_unlock;
1e3c88bd
PZ
7967
7968 /* Is there any task to move? */
7969 if (busiest_rq->nr_running <= 1)
969c7921 7970 goto out_unlock;
1e3c88bd
PZ
7971
7972 /*
7973 * This condition is "impossible", if it occurs
7974 * we need to fix it. Originally reported by
7975 * Bjorn Helgaas on a 128-cpu setup.
7976 */
7977 BUG_ON(busiest_rq == target_rq);
7978
7979 /* move a task from busiest_rq to target_rq */
7980 double_lock_balance(busiest_rq, target_rq);
1e3c88bd
PZ
7981
7982 /* Search for an sd spanning us and the target CPU. */
dce840a0 7983 rcu_read_lock();
1e3c88bd
PZ
7984 for_each_domain(target_cpu, sd) {
7985 if ((sd->flags & SD_LOAD_BALANCE) &&
7986 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
7987 break;
7988 }
7989
7990 if (likely(sd)) {
8e45cb54
PZ
7991 struct lb_env env = {
7992 .sd = sd,
ddcdf6e7
PZ
7993 .dst_cpu = target_cpu,
7994 .dst_rq = target_rq,
7995 .src_cpu = busiest_rq->cpu,
7996 .src_rq = busiest_rq,
8e45cb54
PZ
7997 .idle = CPU_IDLE,
7998 };
7999
1e3c88bd
PZ
8000 schedstat_inc(sd, alb_count);
8001
8e45cb54 8002 if (move_one_task(&env))
1e3c88bd
PZ
8003 schedstat_inc(sd, alb_pushed);
8004 else
8005 schedstat_inc(sd, alb_failed);
8006 }
dce840a0 8007 rcu_read_unlock();
1e3c88bd 8008 double_unlock_balance(busiest_rq, target_rq);
969c7921
TH
8009out_unlock:
8010 busiest_rq->active_balance = 0;
8011 raw_spin_unlock_irq(&busiest_rq->lock);
8012 return 0;
1e3c88bd
PZ
8013}
8014
3451d024 8015#ifdef CONFIG_NO_HZ_COMMON
83cd4fe2
VP
8016/*
8017 * idle load balancing details
83cd4fe2
VP
8018 * - When one of the busy CPUs notice that there may be an idle rebalancing
8019 * needed, they will kick the idle load balancer, which then does idle
8020 * load balancing for all the idle CPUs.
8021 */
1e3c88bd 8022static struct {
83cd4fe2 8023 cpumask_var_t idle_cpus_mask;
0b005cf5 8024 atomic_t nr_cpus;
83cd4fe2
VP
8025 unsigned long next_balance; /* in jiffy units */
8026} nohz ____cacheline_aligned;
1e3c88bd 8027
6fa3eb70 8028
8e7fbcbc 8029static inline int find_new_ilb(int call_cpu)
1e3c88bd 8030{
6fa3eb70
S
8031#ifdef CONFIG_HMP_PACK_SMALL_TASK
8032
8033#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
8034
8035 struct sched_domain *sd;
8036
8037 int ilb_new = nr_cpu_ids;
8038
8039 int ilb_return = 0;
8040
0b005cf5 8041 int ilb = cpumask_first(nohz.idle_cpus_mask);
1e3c88bd 8042
6fa3eb70
S
8043
8044 if(PA_ENABLE)
8045 {
8046 int buddy = per_cpu(sd_pack_buddy, call_cpu);
786d6dc7 8047
6fa3eb70
S
8048 /*
8049 * If we have a pack buddy CPU, we try to run load balance on a CPU
8050 * that is close to the buddy.
8051 */
8052 if (buddy != -1)
8053 for_each_domain(buddy, sd) {
8054 if (sd->flags & SD_SHARE_CPUPOWER)
8055 continue;
1e3c88bd 8056
6fa3eb70
S
8057 ilb_new = cpumask_first_and(sched_domain_span(sd),
8058 nohz.idle_cpus_mask);
83cd4fe2 8059
6fa3eb70
S
8060 if (ilb_new < nr_cpu_ids)
8061 break;
8062
8063 }
8064 }
83cd4fe2 8065
6fa3eb70
S
8066 if (ilb < nr_cpu_ids && idle_cpu(ilb)) {
8067 ilb_return = 1;
8068 }
83cd4fe2 8069
6fa3eb70
S
8070 if (ilb_new < nr_cpu_ids) {
8071 if (idle_cpu(ilb_new)) {
8072 if(PA_ENABLE && ilb_return && ilb_new != ilb) {
8073 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[call_cpu][ilb]++;
83cd4fe2 8074
6fa3eb70
S
8075#ifdef CONFIG_HMP_TRACER
8076 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_WAKE_UP_FORM_CPUX_TO_CPUY, 0, call_cpu, ilb);
8077#endif /* CONFIG_HMP_TRACER */
8078
8079 }
8080 return ilb_new;
8081 }
8082 }
8083
8084 if(ilb_return) {
8085 return ilb;
8086 }
8087
8088 return nr_cpu_ids;
8089
8090#else /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
8091
8092 struct sched_domain *sd;
8093 int ilb = cpumask_first(nohz.idle_cpus_mask);
8094 int buddy = per_cpu(sd_pack_buddy, call_cpu);
8095
8096 /*
8097 * If we have a pack buddy CPU, we try to run load balance on a CPU
8098 * that is close to the buddy.
8099 */
8100 if (buddy != -1)
8101 for_each_domain(buddy, sd) {
8102 if (sd->flags & SD_SHARE_CPUPOWER)
8103 continue;
8104
8105 ilb = cpumask_first_and(sched_domain_span(sd),
8106 nohz.idle_cpus_mask);
8107
8108 if (ilb < nr_cpu_ids)
8109 break;
8110 }
8111
8112 if (ilb < nr_cpu_ids && idle_cpu(ilb))
8113 return ilb;
8114
8115 return nr_cpu_ids;
8116
8117#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
8118
8119#else /* CONFIG_HMP_PACK_SMALL_TASK */
8120
8121 int ilb = cpumask_first(nohz.idle_cpus_mask);
8122#ifdef CONFIG_MTK_SCHED_CMP_TGS
8123 /* Find nohz balancing to occur in the same cluster firstly */
8124 int new_ilb;
8125 struct cpumask tmp;
8126	// Find the first idle cpu among the online cpus of this cluster
8127 get_cluster_cpus(&tmp, get_cluster_id(call_cpu), true);
8128 new_ilb = cpumask_first_and(nohz.idle_cpus_mask, &tmp);
8129 if (new_ilb < nr_cpu_ids && idle_cpu(new_ilb))
8130 {
8131#ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
8132 if(new_ilb != ilb)
8133 {
8134 mt_sched_printf("[PA]find_new_ilb(cpu%x), new_ilb = %d, ilb = %d\n", call_cpu, new_ilb, ilb);
8135 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[call_cpu][ilb]++;
8136 }
8137#endif
8138 return new_ilb;
8139 }
8140#endif /* CONFIG_MTK_SCHED_CMP_TGS */
8141
8142 if (ilb < nr_cpu_ids && idle_cpu(ilb))
8143 return ilb;
8144
8145 return nr_cpu_ids;
8146
8147#endif /* CONFIG_HMP_PACK_SMALL_TASK */
8148
8149}
8150
8151
8152/*
8153 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
8154 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
8155 * CPU (if there is one).
8156 */
8157static void nohz_balancer_kick(int cpu)
8158{
8159 int ilb_cpu;
8160
8161 nohz.next_balance++;
8162
8163 ilb_cpu = find_new_ilb(cpu);
8164
8165 if (ilb_cpu >= nr_cpu_ids)
8166 return;
8167
8168 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
8169 return;
8170 /*
8171 * Use smp_send_reschedule() instead of resched_cpu().
8172 * This way we generate a sched IPI on the target cpu which
8173 * is idle. And the softirq performing nohz idle load balance
1c792db7
SS
8174 * will be run before returning from the IPI.
8175 */
8176 smp_send_reschedule(ilb_cpu);
83cd4fe2
VP
8177 return;
8178}
8179
c1cc017c 8180static inline void nohz_balance_exit_idle(int cpu)
71325960
SS
8181{
8182 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
8183 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
8184 atomic_dec(&nohz.nr_cpus);
8185 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
8186 }
8187}
8188
69e1e811
SS
8189static inline void set_cpu_sd_state_busy(void)
8190{
8191 struct sched_domain *sd;
8192 int cpu = smp_processor_id();
8193
69e1e811 8194 rcu_read_lock();
25f55d9d
VG
8195 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
8196
8197 if (!sd || !sd->nohz_idle)
8198 goto unlock;
8199 sd->nohz_idle = 0;
8200
8201 for (; sd; sd = sd->parent)
69e1e811 8202 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
25f55d9d 8203unlock:
69e1e811
SS
8204 rcu_read_unlock();
8205}
8206
8207void set_cpu_sd_state_idle(void)
8208{
8209 struct sched_domain *sd;
8210 int cpu = smp_processor_id();
8211
69e1e811 8212 rcu_read_lock();
25f55d9d
VG
8213 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
8214
8215 if (!sd || sd->nohz_idle)
8216 goto unlock;
8217 sd->nohz_idle = 1;
8218
8219 for (; sd; sd = sd->parent)
69e1e811 8220 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
25f55d9d 8221unlock:
69e1e811
SS
8222 rcu_read_unlock();
8223}
8224
1e3c88bd 8225/*
c1cc017c 8226 * This routine will record that the cpu is going idle with tick stopped.
0b005cf5 8227 * This info will be used in performing idle load balancing in the future.
1e3c88bd 8228 */
c1cc017c 8229void nohz_balance_enter_idle(int cpu)
1e3c88bd 8230{
71325960
SS
8231 /*
8232 * If this cpu is going down, then nothing needs to be done.
8233 */
8234 if (!cpu_active(cpu))
8235 return;
8236
c1cc017c
AS
8237 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
8238 return;
1e3c88bd 8239
c1cc017c
AS
8240 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
8241 atomic_inc(&nohz.nr_cpus);
8242 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
1e3c88bd 8243}
71325960
SS
8244
8245static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
8246 unsigned long action, void *hcpu)
8247{
8248 switch (action & ~CPU_TASKS_FROZEN) {
8249 case CPU_DYING:
c1cc017c 8250 nohz_balance_exit_idle(smp_processor_id());
71325960
SS
8251 return NOTIFY_OK;
8252 default:
8253 return NOTIFY_DONE;
8254 }
8255}
1e3c88bd
PZ
8256#endif
8257
8258static DEFINE_SPINLOCK(balancing);
8259
49c022e6
PZ
8260/*
8261 * Scale the max load_balance interval with the number of CPUs in the system.
8262 * This trades load-balance latency on larger machines for less cross talk.
8263 */
029632fb 8264void update_max_interval(void)
49c022e6
PZ
8265{
8266 max_load_balance_interval = HZ*num_online_cpus()/10;
8267}
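/*
 * Example: with HZ=100 and 4 CPUs online this gives 40 jiffies, i.e. the
 * per-domain rebalance interval is clamped to at most ~400ms.
 */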
8268
1e3c88bd
PZ
8269/*
8270 * It checks each scheduling domain to see if it is due to be balanced,
8271 * and initiates a balancing operation if so.
8272 *
b9b0853a 8273 * Balancing parameters are set up in init_sched_domains.
1e3c88bd
PZ
8274 */
8275static void rebalance_domains(int cpu, enum cpu_idle_type idle)
8276{
8277 int balance = 1;
8278 struct rq *rq = cpu_rq(cpu);
8279 unsigned long interval;
04f733b4 8280 struct sched_domain *sd;
1e3c88bd
PZ
8281 /* Earliest time when we have to do rebalance again */
8282 unsigned long next_balance = jiffies + 60*HZ;
8283 int update_next_balance = 0;
8284 int need_serialize;
8285
48a16753 8286 update_blocked_averages(cpu);
2069dd75 8287
dce840a0 8288 rcu_read_lock();
1e3c88bd
PZ
8289 for_each_domain(cpu, sd) {
8290 if (!(sd->flags & SD_LOAD_BALANCE))
8291 continue;
8292
8293 interval = sd->balance_interval;
8294 if (idle != CPU_IDLE)
8295 interval *= sd->busy_factor;
8296
8297 /* scale ms to jiffies */
8298 interval = msecs_to_jiffies(interval);
49c022e6 8299 interval = clamp(interval, 1UL, max_load_balance_interval);
1e3c88bd
PZ
8300
8301 need_serialize = sd->flags & SD_SERIALIZE;
8302
8303 if (need_serialize) {
8304 if (!spin_trylock(&balancing))
8305 goto out;
8306 }
8307
8308 if (time_after_eq(jiffies, sd->last_balance + interval)) {
8309 if (load_balance(cpu, rq, sd, idle, &balance)) {
8310 /*
de5eb2dd
JK
8311 * The LBF_SOME_PINNED logic could have changed
8312 * env->dst_cpu, so we can't know our idle
8313 * state even if we migrated tasks. Update it.
1e3c88bd 8314 */
de5eb2dd 8315 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
1e3c88bd
PZ
8316 }
8317 sd->last_balance = jiffies;
8318 }
8319 if (need_serialize)
8320 spin_unlock(&balancing);
8321out:
8322 if (time_after(next_balance, sd->last_balance + interval)) {
8323 next_balance = sd->last_balance + interval;
8324 update_next_balance = 1;
8325 }
8326
8327 /*
8328 * Stop the load balance at this level. There is another
8329 * CPU in our sched group which is doing load balancing more
8330 * actively.
8331 */
8332 if (!balance)
8333 break;
8334 }
dce840a0 8335 rcu_read_unlock();
1e3c88bd
PZ
8336
8337 /*
8338 * next_balance will be updated only when there is a need.
8339 * When the cpu is attached to null domain for ex, it will not be
8340 * updated.
8341 */
8342 if (likely(update_next_balance))
8343 rq->next_balance = next_balance;
8344}
8345
6fa3eb70
S
8346#ifdef CONFIG_NO_HZ_COMMON
8347/*
8348 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
8349 * rebalancing for all the cpus for whom scheduler ticks are stopped.
8350 */
8351static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
8352{
8353 struct rq *this_rq = cpu_rq(this_cpu);
8354 struct rq *rq;
8355 int balance_cpu;
8356
8357 if (idle != CPU_IDLE ||
8358 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
8359 goto end;
8360
8361 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
8362 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
8363 continue;
8364
8365 /*
8366 * If this cpu gets work to do, stop the load balancing
8367 * work being done for other cpus. Next load
8368 * balancing owner will pick it up.
8369 */
8370 if (need_resched())
8371 break;
8372
8373 rq = cpu_rq(balance_cpu);
8374
8375 raw_spin_lock_irq(&rq->lock);
8376 update_rq_clock(rq);
8377 update_idle_cpu_load(rq);
8378 raw_spin_unlock_irq(&rq->lock);
8379
8380 rebalance_domains(balance_cpu, CPU_IDLE);
8381
8382 if (time_after(this_rq->next_balance, rq->next_balance))
8383 this_rq->next_balance = rq->next_balance;
8384 }
8385 nohz.next_balance = this_rq->next_balance;
8386end:
8387 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
8388}
8389
8390/*
8391 * Current heuristic for kicking the idle load balancer in the presence
8392 * of an idle cpu in the system.
8393 * - This rq has more than one task.
8394 * - At any scheduler domain level, this cpu's scheduler group has multiple
8395 * busy cpu's exceeding the group's power.
8396 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
8397 * domain span are idle.
8398 */
8399static inline int nohz_kick_needed(struct rq *rq, int cpu)
8400{
8401 unsigned long now = jiffies;
8402 struct sched_domain *sd;
8403
8404 if (unlikely(idle_cpu(cpu)))
8405 return 0;
8406
8407 /*
8408 * We may be recently in ticked or tickless idle mode. At the first
8409 * busy tick after returning from idle, we will update the busy stats.
8410 */
8411 set_cpu_sd_state_busy();
8412 nohz_balance_exit_idle(cpu);
8413
8414 /*
8415 * None are in tickless mode and hence no need for NOHZ idle load
8416 * balancing.
8417 */
8418 if (likely(!atomic_read(&nohz.nr_cpus)))
8419 return 0;
8420
8421 if (time_before(now, nohz.next_balance))
8422 return 0;
8423
8424#ifdef CONFIG_SCHED_HMP
8425 /*
8426 * Bail out if there are no nohz CPUs in our
8427 * HMP domain, since we will move tasks between
8428 * domains through wakeup and force balancing
8429 * as necessary based upon task load.
8430 */
8431 if (cpumask_first_and(nohz.idle_cpus_mask,
8432 &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)
8433 return 0;
8434#endif
8435
8436 if (rq->nr_running >= 2)
8437 goto need_kick;
8438
8439 rcu_read_lock();
8440 for_each_domain(cpu, sd) {
8441 struct sched_group *sg = sd->groups;
8442 struct sched_group_power *sgp = sg->sgp;
8443 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
8444
8445 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
8446 goto need_kick_unlock;
8447
8448 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
8449 && (cpumask_first_and(nohz.idle_cpus_mask,
8450 sched_domain_span(sd)) < cpu))
8451 goto need_kick_unlock;
8452
8453 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
8454 break;
8455 }
8456 rcu_read_unlock();
8457 return 0;
8458
8459need_kick_unlock:
8460 rcu_read_unlock();
8461need_kick:
8462 return 1;
8463}
8464#else
8465static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
8466#endif
8467
8468#ifdef CONFIG_SCHED_HMP
8469#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
8470
8471/*
 8472	 * Heterogeneous Multi-Processor (HMP) - Declarations and Useful Macros
8473 */
8474
8475/* Function Declaration */
8476static int hmp_up_stable(int cpu);
8477static int hmp_down_stable(int cpu);
8478static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se,
8479 struct clb_env *clbenv);
8480static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se,
8481 struct clb_env *clbenv);
8482
8483#define hmp_caller_is_gb(caller) ((HMP_GB == caller)?1:0)
8484
8485#define hmp_cpu_is_fast(cpu) cpumask_test_cpu(cpu,&hmp_fast_cpu_mask)
8486#define hmp_cpu_is_slow(cpu) cpumask_test_cpu(cpu,&hmp_slow_cpu_mask)
8487#define hmp_cpu_stable(cpu) (hmp_cpu_is_fast(cpu)? \
8488 hmp_up_stable(cpu):hmp_down_stable(cpu))
8489
8490#define hmp_inc(v) ((v) + 1)
8491#define hmp_dec(v) ((v) - 1)
8492#define hmp_pos(v) ((v) < (0) ? (0) : (v))
8493
8494#define task_created(f) ((SD_BALANCE_EXEC == f || SD_BALANCE_FORK == f)?1:0)
8495#define task_cpus_allowed(mask,p) cpumask_intersects(mask,tsk_cpus_allowed(p))
8496#define task_slow_cpu_allowed(p) task_cpus_allowed(&hmp_slow_cpu_mask,p)
8497#define task_fast_cpu_allowed(p) task_cpus_allowed(&hmp_fast_cpu_mask,p)
8498
8499/*
 8500	 * Heterogeneous Multi-Processor (HMP) - Utility Functions
8501 */
8502
8503/*
 8504	 * These functions check the up/down migration delay, which prevents a task
 8505	 * from migrating again in the same direction until the delay has expired.
8506 */
8507static int hmp_up_stable(int cpu)
8508{
8509 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
8510 u64 now = cfs_rq_clock_task(cfs_rq);
8511 if (((now - hmp_last_up_migration(cpu)) >> 10) < hmp_next_up_threshold)
8512 return 0;
8513 return 1;
8514}
8515
8516static int hmp_down_stable(int cpu)
8517{
8518 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
8519 u64 now = cfs_rq_clock_task(cfs_rq);
8520 if (((now - hmp_last_down_migration(cpu)) >> 10) < hmp_next_down_threshold)
8521 return 0;
8522 return 1;
8523}
8524
 8525/* Select the most appropriate CPU from the given hmp cluster */
8526static unsigned int hmp_select_cpu(unsigned int caller, struct task_struct *p,
8527 struct cpumask *mask, int prev)
8528{
8529 int curr = 0;
8530 int target = NR_CPUS;
8531 unsigned long curr_wload = 0;
8532 unsigned long target_wload = 0;
8533 struct cpumask srcp;
8534 cpumask_and(&srcp, cpu_online_mask, mask);
8535 target = cpumask_any_and(&srcp, tsk_cpus_allowed(p));
8536 if (NR_CPUS == target)
8537 goto out;
8538
8539 /*
 8540	 * The RT class is taken into account because the CPU load is multiplied
 8541	 * by the total number of runnable tasks on the CPU, which includes RT tasks.
8542 */
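	/*
	 * Added illustration (hypothetical numbers, not from the original
	 * source): with cfs_load(cpu) = 300, cfs_pending_load(cpu) = 100 and
	 * rq_length(cpu) = 2 (one CFS task plus one RT task), the weighted
	 * load compared below is (300 + 1 + 100) * 2 = 802. The rq_length()
	 * factor is how RT activity inflates a CPU's score.
	 */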
8543 target_wload = hmp_inc(cfs_load(target));
8544 target_wload += cfs_pending_load(target);
8545 target_wload *= rq_length(target);
8546 for_each_cpu(curr, mask) {
8547 /* Check CPU status and task affinity */
8548 if(!cpu_online(curr) || !cpumask_test_cpu(curr, tsk_cpus_allowed(p)))
8549 continue;
8550
8551 /* For global load balancing, unstable CPU will be bypassed */
8552 if(hmp_caller_is_gb(caller) && !hmp_cpu_stable(curr))
8553 continue;
8554
8555 curr_wload = hmp_inc(cfs_load(curr));
8556 curr_wload += cfs_pending_load(curr);
8557 curr_wload *= rq_length(curr);
8558 if(curr_wload < target_wload) {
8559 target_wload = curr_wload;
8560 target = curr;
8561 } else if(curr_wload == target_wload && curr == prev) {
8562 target = curr;
8563 }
8564 }
8565
8566out:
8567 return target;
8568}
8569
8570/*
 8571	 * Heterogeneous Multi-Processor (HMP) - Task Runqueue Selection
8572 */
8573
8574/* This function enhances the original task selection function */
8575static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p,
8576 int prev_cpu, int new_cpu)
8577{
8578#ifdef CONFIG_HMP_TASK_ASSIGNMENT
8579 int step = 0;
8580 struct sched_entity *se = &p->se;
8581 int B_target = NR_CPUS;
8582 int L_target = NR_CPUS;
8583 struct clb_env clbenv;
8584
8585#ifdef CONFIG_HMP_TRACER
8586 int cpu = 0;
8587 for_each_online_cpu(cpu)
8588 trace_sched_cfs_runnable_load(cpu,cfs_load(cpu),cfs_length(cpu));
8589#endif
8590
 8591	/* Error handling: bail out if prev_cpu is out of range */
8592 if (prev_cpu >= NR_CPUS)
8593 return new_cpu;
8594
8595 /*
8596 * Skip all the checks if only one CPU is online.
8597 * Otherwise, select the most appropriate CPU from cluster.
8598 */
8599 if (num_online_cpus() == 1)
8600 goto out;
8601 B_target = hmp_select_cpu(HMP_SELECT_RQ,p,&hmp_fast_cpu_mask,prev_cpu);
8602 L_target = hmp_select_cpu(HMP_SELECT_RQ,p,&hmp_slow_cpu_mask,prev_cpu);
8603
8604 /*
8605 * Only one cluster exists or only one cluster is allowed for this task
8606 * Case 1: return the runqueue whose load is minimum
8607 * Case 2: return original CFS runqueue selection result
8608 */
8609#ifdef CONFIG_HMP_DISCARD_CFS_SELECTION_RESULT
8610 if(NR_CPUS == B_target && NR_CPUS == L_target)
8611 goto out;
8612 if(NR_CPUS == B_target)
8613 goto select_slow;
8614 if(NR_CPUS == L_target)
8615 goto select_fast;
8616#else
8617 if(NR_CPUS == B_target || NR_CPUS == L_target)
8618 goto out;
8619#endif
8620
8621 /*
8622 * Two clusters exist and both clusters are allowed for this task
8623 * Step 1: Move newly created task to the cpu where no tasks are running
8624 * Step 2: Migrate heavy-load task to big
8625 * Step 3: Migrate light-load task to LITTLE
8626 * Step 4: Make sure the task stays in its previous hmp domain
8627 */
8628 step = 1;
8629 if (task_created(sd_flag) && !task_low_priority(p->prio)) {
8630 if (!rq_length(B_target))
8631 goto select_fast;
8632 if (!rq_length(L_target))
8633 goto select_slow;
8634 }
8635 memset(&clbenv, 0, sizeof(clbenv));
8636 clbenv.flags |= HMP_SELECT_RQ;
8637 clbenv.lcpus = &hmp_slow_cpu_mask;
8638 clbenv.bcpus = &hmp_fast_cpu_mask;
8639 clbenv.ltarget = L_target;
8640 clbenv.btarget = B_target;
8641 sched_update_clbstats(&clbenv);
8642 step = 2;
8643 if (hmp_up_migration(L_target, &B_target, se, &clbenv))
8644 goto select_fast;
8645 step = 3;
8646 if (hmp_down_migration(B_target, &L_target, se, &clbenv))
8647 goto select_slow;
8648 step = 4;
8649 if (hmp_cpu_is_slow(prev_cpu))
8650 goto select_slow;
8651 goto select_fast;
8652
8653select_fast:
8654 new_cpu = B_target;
8655 goto out;
8656select_slow:
8657 new_cpu = L_target;
8658 goto out;
8659
8660out:
8661
 8662	/* This happens when num_online_cpus == 1 */
 8663	if (new_cpu >= nr_cpu_ids)
 8664	{
 8665		/* BUG_ON(1); */
8666 new_cpu = prev_cpu;
8667 }
8668
8669 cfs_nr_pending(new_cpu)++;
8670 cfs_pending_load(new_cpu) += se_load(se);
8671#ifdef CONFIG_HMP_TRACER
8672 trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
8673 trace_sched_hmp_select_task_rq(p,step,sd_flag,prev_cpu,new_cpu,
8674 se_load(se),&clbenv.bstats,&clbenv.lstats);
8675#endif
8676#ifdef CONFIG_MET_SCHED_HMP
8677 HmpLoad(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
8678#endif
8679#endif /* CONFIG_HMP_TASK_ASSIGNMENT */
8680 return new_cpu;
8681}
8682
8683/*
 8684	 * Heterogeneous Multi-Processor (HMP) - Task Dynamic Migration Threshold
8685 */
8686
8687/*
 8688	 * If the workload between clusters is not balanced, adjust the migration
 8689	 * thresholds in an attempt to move tasks to the cluster whose workload
 8690	 * is lighter.
8691 */
8692
8693/*
 8694	 * According to ARM's cpu_efficiency table, the computing powers of CA15 and
 8695	 * CA7 are 3891 and 2048 respectively (a ratio of roughly 1.9:1); the
 8696	 * HMP_RATIO macro below approximates big as 1.7x the computing power of LITTLE.
8697 */
8698
8699#define HMP_RATIO(v) ((v)*17/10)
8700
8701#define hmp_fast_cpu_has_spare_cycles(B,cpu_load) (cpu_load < \
8702 (HMP_RATIO(B->cpu_capacity) - (B->cpu_capacity >> 2)))
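/*
 * Added worked example (illustrative numbers, not from the original source):
 * with B->cpu_capacity = 1024, HMP_RATIO(1024) = 1024 * 17 / 10 = 1740 and the
 * check above becomes cpu_load < 1740 - 256 = 1484, i.e. the big CPU is
 * treated as having spare cycles while its load stays below roughly 1.45x of
 * its nominal capacity.
 */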
8703
8704#define hmp_task_fast_cpu_afford(B,se,cpu) (B->acap > 0 \
8705 && hmp_fast_cpu_has_spare_cycles(B,se_load(se) + cfs_load(cpu)))
8706
8707#define hmp_fast_cpu_oversubscribed(caller,B,se,cpu) \
8708 (hmp_caller_is_gb(caller)? \
8709 !hmp_fast_cpu_has_spare_cycles(B,cfs_load(cpu)): \
8710 !hmp_task_fast_cpu_afford(B,se,cpu))
8711
8712#define hmp_task_slow_cpu_afford(L,se) \
8713 (L->acap > 0 && L->acap >= se_load(se))
8714
 8715/* Macros used by the low-priority task filter */
8716#define hmp_low_prio_task_up_rejected(p,B,L) \
8717 (task_low_priority(p->prio) && \
8718 (B->ntask >= B->ncpu || 0 != L->nr_normal_prio_task) && \
8719 (p->se.avg.load_avg_ratio < 800))
8720
8721#define hmp_low_prio_task_down_allowed(p,B,L) \
8722 (task_low_priority(p->prio) && !B->nr_dequeuing_low_prio && \
8723 B->ntask >= B->ncpu && 0 != L->nr_normal_prio_task && \
8724 (p->se.avg.load_avg_ratio < 800))
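/*
 * Added note (assumption: load_avg_ratio is scaled against NICE_0_LOAD, so a
 * fully busy task reads close to 1024): the "< 800" test in the two macros
 * above therefore only lets the low-priority filter act on tasks below
 * roughly 78% utilization.
 */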
8725
8726/* Migration check result */
8727#define HMP_BIG_NOT_OVERSUBSCRIBED (0x01)
8728#define HMP_BIG_CAPACITY_INSUFFICIENT (0x02)
8729#define HMP_LITTLE_CAPACITY_INSUFFICIENT (0x04)
8730#define HMP_LOW_PRIORITY_FILTER (0x08)
8731#define HMP_BIG_BUSY_LITTLE_IDLE (0x10)
8732#define HMP_BIG_IDLE (0x20)
8733#define HMP_MIGRATION_APPROVED (0x100)
8734#define HMP_TASK_UP_MIGRATION (0x200)
8735#define HMP_TASK_DOWN_MIGRATION (0x400)
8736
8737/* Migration statistics */
8738#ifdef CONFIG_HMP_TRACER
8739struct hmp_statisic hmp_stats;
8740#endif
8741
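/*
 * Added sketch of the computation below, derived from the code and shown only
 * for readability. With diff = hmp_up_threshold - hmp_down_threshold:
 *
 *   L->threshold = down + diff * (L_acap + 1) * (L_atask + 1) /
 *                  ((B_acap + L_acap + 1) * (B_atask + L_atask + 1))
 *   B->threshold = up   - diff * (B_acap + 1) * (B_atask + 1) /
 *                  ((B_acap + L_acap + 1) * (B_atask + L_atask + 1))
 *
 * where the *_acap / *_atask terms are the normalized available capacity and
 * available task headroom of each cluster (the big cluster's values are first
 * scaled by HMP_RATIO). The more headroom a cluster has, the further its
 * threshold moves so that it attracts more tasks.
 */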
8742static inline void hmp_dynamic_threshold(struct clb_env *clbenv)
8743{
8744 struct clb_stats *L = &clbenv->lstats;
8745 struct clb_stats *B = &clbenv->bstats;
8746 unsigned int hmp_threshold_diff = hmp_up_threshold - hmp_down_threshold;
8747 unsigned int B_normalized_acap = hmp_pos(HMP_RATIO(B->scaled_acap));
8748 unsigned int B_normalized_atask = hmp_pos(HMP_RATIO(B->scaled_atask));
8749 unsigned int L_normalized_acap = hmp_pos(L->scaled_acap);
8750 unsigned int L_normalized_atask = hmp_pos(L->scaled_atask);
8751
8752#ifdef CONFIG_HMP_DYNAMIC_THRESHOLD
8753 L->threshold = hmp_threshold_diff;
8754 L->threshold *= hmp_inc(L_normalized_acap) * hmp_inc(L_normalized_atask);
8755 L->threshold /= hmp_inc(B_normalized_acap + L_normalized_acap);
8756 L->threshold /= hmp_inc(B_normalized_atask + L_normalized_atask);
8757 L->threshold = hmp_down_threshold + L->threshold;
8758
8759 B->threshold = hmp_threshold_diff;
8760 B->threshold *= hmp_inc(B_normalized_acap) * hmp_inc(B_normalized_atask);
8761 B->threshold /= hmp_inc(B_normalized_acap + L_normalized_acap);
8762 B->threshold /= hmp_inc(B_normalized_atask + L_normalized_atask);
8763 B->threshold = hmp_up_threshold - B->threshold;
8764#else /* !CONFIG_HMP_DYNAMIC_THRESHOLD */
8765 clbenv->lstats.threshold = hmp_down_threshold; // down threshold
8766 clbenv->bstats.threshold = hmp_up_threshold; // up threshold
8767#endif /* CONFIG_HMP_DYNAMIC_THRESHOLD */
8768
8769 mt_sched_printf("[%s]\tup/dl:%4d/%4d bcpu(%d):%d/%d, lcpu(%d):%d/%d\n", __func__,
8770 B->threshold, L->threshold,
8771 clbenv->btarget, clbenv->bstats.cpu_capacity, clbenv->bstats.cpu_power,
8772 clbenv->ltarget, clbenv->lstats.cpu_capacity, clbenv->lstats.cpu_power);
8773}
8774
8775/*
8776 * Check whether this task should be migrated to big
 8777	 * The flow is briefly summarized below:
 8778	 * 1) Migration stabilizing
 8779	 * 1.5) Keep all CPUs busy
 8780	 * 2) Filter low-priority tasks
8781 * 3) Check CPU capacity
8782 * 4) Check dynamic migration threshold
8783 */
8784static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se,
8785 struct clb_env *clbenv)
8786{
8787 struct task_struct *p = task_of(se);
8788 struct clb_stats *L, *B;
8789 struct mcheck *check;
8790 int curr_cpu = cpu;
8791 unsigned int caller = clbenv->flags;
8792
8793 L = &clbenv->lstats;
8794 B = &clbenv->bstats;
8795 check = &clbenv->mcheck;
8796
8797 check->status = clbenv->flags;
8798 check->status |= HMP_TASK_UP_MIGRATION;
8799 check->result = 0;
8800
8801 /*
8802 * No migration is needed if
8803 * 1) There is only one cluster
8804 * 2) Task is already in big cluster
8805 * 3) It violates task affinity
8806 */
8807 if (!L->ncpu || !B->ncpu
8808 || cpumask_test_cpu(curr_cpu, clbenv->bcpus)
8809 || !cpumask_intersects(clbenv->bcpus, tsk_cpus_allowed(p)))
8810 goto out;
8811
8812 /*
8813 * [1] Migration stabilizing
8814 * Let the task load settle before doing another up migration.
 8815	 * This prevents a bunch of tasks from migrating to an unstable CPU.
8816 */
8817 if (!hmp_up_stable(*target_cpu))
8818 goto out;
8819
 8820	/* [2] Filter low-priority tasks */
8821#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
8822 if (hmp_low_prio_task_up_rejected(p,B,L)) {
8823 check->status |= HMP_LOW_PRIORITY_FILTER;
8824 goto trace;
8825 }
8826#endif
8827
 8828	/* [2.5] If big is idle, just go to big */
 8829	if (rq_length(*target_cpu) == 0)
8830 {
8831 check->status |= HMP_BIG_IDLE;
8832 check->status |= HMP_MIGRATION_APPROVED;
8833 check->result = 1;
8834 goto trace;
8835 }
8836
8837 /*
8838 * [3] Check CPU capacity
8839 * Forbid up-migration if big CPU can't handle this task
8840 */
8841 if (!hmp_task_fast_cpu_afford(B,se,*target_cpu)) {
8842 check->status |= HMP_BIG_CAPACITY_INSUFFICIENT;
8843 goto trace;
8844 }
8845
8846 /*
8847 * [4] Check dynamic migration threshold
8848 * Migrate task from LITTLE to big if load is greater than up-threshold
8849 */
8850 if (se_load(se) > B->threshold) {
8851 check->status |= HMP_MIGRATION_APPROVED;
8852 check->result = 1;
8853 }
8854
8855trace:
8856#ifdef CONFIG_HMP_TRACER
8857 if(check->result && hmp_caller_is_gb(caller))
8858 hmp_stats.nr_force_up++;
8859 trace_sched_hmp_stats(&hmp_stats);
8860 trace_sched_dynamic_threshold(task_of(se),B->threshold,check->status,
8861 curr_cpu,*target_cpu,se_load(se),B,L);
8862#endif
8863#ifdef CONFIG_MET_SCHED_HMP
8864 TaskTh(B->threshold,L->threshold);
8865 HmpStat(&hmp_stats);
8866#endif
8867out:
8868 return check->result;
8869}
8870
8871/*
8872 * Check whether this task should be migrated to LITTLE
 8873	 * The flow is briefly summarized below:
 8874	 * 1) Migration stabilizing
 8875	 * 1.5) Keep all CPUs busy
 8876	 * 2) Filter low-priority tasks
8877 * 3) Check CPU capacity
8878 * 4) Check dynamic migration threshold
8879 */
8880static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se,
8881 struct clb_env *clbenv)
8882{
8883 struct task_struct *p = task_of(se);
8884 struct clb_stats *L, *B;
8885 struct mcheck *check;
8886 int curr_cpu = cpu;
8887 unsigned int caller = clbenv->flags;
8888
8889 L = &clbenv->lstats;
8890 B = &clbenv->bstats;
8891 check = &clbenv->mcheck;
8892
8893 check->status = caller;
8894 check->status |= HMP_TASK_DOWN_MIGRATION;
8895 check->result = 0;
8896
8897 /*
8898 * No migration is needed if
8899 * 1) There is only one cluster
8900 * 2) Task is already in LITTLE cluster
8901 * 3) It violates task affinity
8902 */
8903 if (!L->ncpu || !B->ncpu
8904 || cpumask_test_cpu(curr_cpu, clbenv->lcpus)
8905 || !cpumask_intersects(clbenv->lcpus, tsk_cpus_allowed(p)))
8906 goto out;
8907
8908 /*
8909 * [1] Migration stabilizing
8910 * Let the task load settle before doing another down migration.
 8911	 * This prevents a bunch of tasks from migrating to an unstable CPU.
8912 */
8913 if (!hmp_down_stable(*target_cpu))
8914 goto out;
8915
 8916	/* [1.5] If big is busy and LITTLE is idle, just go to LITTLE */
 8917	if (rq_length(*target_cpu) == 0 && caller == HMP_SELECT_RQ && rq_length(curr_cpu) > 0)
8918 {
8919 check->status |= HMP_BIG_BUSY_LITTLE_IDLE;
8920 check->status |= HMP_MIGRATION_APPROVED;
8921 check->result = 1;
8922 goto trace;
8923 }
8924
 8925	/* [2] Filter low-priority tasks */
8926#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
8927 if (hmp_low_prio_task_down_allowed(p,B,L)) {
8928 cfs_nr_dequeuing_low_prio(curr_cpu)++;
8929 check->status |= HMP_LOW_PRIORITY_FILTER;
8930 check->status |= HMP_MIGRATION_APPROVED;
8931 check->result = 1;
8932 goto trace;
8933 }
8934#endif
8935
8936 /*
8937 * [3] Check CPU capacity
8938 * Forbid down-migration if either of the following conditions is true
8939 * 1) big cpu is not oversubscribed (if big CPU seems to have spare
8940 * cycles, do not force this task to run on LITTLE CPU, but
8941 * keep it staying in its previous cluster instead)
8942 * 2) LITTLE cpu doesn't have available capacity for this new task
8943 */
8944 if (!hmp_fast_cpu_oversubscribed(caller,B,se,curr_cpu)) {
8945 check->status |= HMP_BIG_NOT_OVERSUBSCRIBED;
8946 goto trace;
8947 }
8948
8949 if (!hmp_task_slow_cpu_afford(L,se)) {
8950 check->status |= HMP_LITTLE_CAPACITY_INSUFFICIENT;
8951 goto trace;
8952 }
8953
8954 /*
8955 * [4] Check dynamic migration threshold
8956 * Migrate task from big to LITTLE if load ratio is less than
8957 * or equal to down-threshold
8958 */
8959 if (L->threshold >= se_load(se)) {
8960 check->status |= HMP_MIGRATION_APPROVED;
8961 check->result = 1;
8962 }
8963
8964trace:
8965#ifdef CONFIG_HMP_TRACER
8966 if (check->result && hmp_caller_is_gb(caller))
8967 hmp_stats.nr_force_down++;
8968 trace_sched_hmp_stats(&hmp_stats);
8969 trace_sched_dynamic_threshold(task_of(se),L->threshold,check->status,
8970 curr_cpu,*target_cpu,se_load(se),B,L);
8971#endif
8972#ifdef CONFIG_MET_SCHED_HMP
8973 TaskTh(B->threshold,L->threshold);
8974 HmpStat(&hmp_stats);
8975#endif
8976out:
8977 return check->result;
8978}
8979#else /* CONFIG_SCHED_HMP_ENHANCEMENT */
8980/* Check if task should migrate to a faster cpu */
8981static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se)
8982{
8983 struct task_struct *p = task_of(se);
8984 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
8985 u64 now;
8986
8987 if (target_cpu)
8988 *target_cpu = NR_CPUS;
8989
8990 if (hmp_cpu_is_fastest(cpu))
8991 return 0;
8992
8993#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
8994 /* Filter by task priority */
8995 if (p->prio >= hmp_up_prio)
8996 return 0;
8997#endif
8998 if (se->avg.load_avg_ratio < hmp_up_threshold)
8999 return 0;
9000
9001 /* Let the task load settle before doing another up migration */
9002 now = cfs_rq_clock_task(cfs_rq);
9003 if (((now - se->avg.hmp_last_up_migration) >> 10)
9004 < hmp_next_up_threshold)
9005 return 0;
9006
9007 /* Target domain load < 94% */
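	/*
	 * Added clarification (assuming NICE_0_LOAD is 1024 in this
	 * configuration): NICE_0_LOAD - 64 = 960, i.e. ~93.75% of full scale,
	 * which is where the "94%" above comes from.
	 */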
9008 if (hmp_domain_min_load(hmp_faster_domain(cpu), target_cpu)
9009 > NICE_0_LOAD-64)
9010 return 0;
9011
9012 if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus,
9013 tsk_cpus_allowed(p)))
9014 return 1;
9015
9016 return 0;
9017}
9018
9019/* Check if task should migrate to a slower cpu */
9020static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
9021{
9022 struct task_struct *p = task_of(se);
9023 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
9024 u64 now;
9025
9026 if (hmp_cpu_is_slowest(cpu))
9027 return 0;
9028
9029#ifdef CONFIG_SCHED_HMP_PRIO_FILTER
9030 /* Filter by task priority */
9031 if ((p->prio >= hmp_up_prio) &&
9032 cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
9033 tsk_cpus_allowed(p))) {
9034 return 1;
9035 }
9036#endif
9037
9038 /* Let the task load settle before doing another down migration */
9039 now = cfs_rq_clock_task(cfs_rq);
9040 if (((now - se->avg.hmp_last_down_migration) >> 10)
9041 < hmp_next_down_threshold)
9042 return 0;
9043
9044 if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
9045 tsk_cpus_allowed(p))
9046 && se->avg.load_avg_ratio < hmp_down_threshold) {
9047 return 1;
9048 }
9049 return 0;
9050}
9051#endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
9052
9053/*
9054 * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
9055 * Ideally this function should be merged with can_migrate_task() to avoid
9056 * redundant code.
9057 */
9058static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
9059{
9060 int tsk_cache_hot = 0;
9061
9062 /*
9063 * We do not migrate tasks that are:
9064 * 1) running (obviously), or
 9065	 * 2) not allowed to run on this CPU due to cpus_allowed
9066 */
9067 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
9068 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
9069 return 0;
9070 }
9071 env->flags &= ~LBF_ALL_PINNED;
9072
9073 if (task_running(env->src_rq, p)) {
9074 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
9075 return 0;
9076 }
9077
9078 /*
9079 * Aggressive migration if:
9080 * 1) task is cache cold, or
9081 * 2) too many balance attempts have failed.
9082 */
9083
9084#if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT)
9085 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd, env->mt_check_cache_in_idle);
9086#else
9087 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
9088#endif
9089 if (!tsk_cache_hot ||
9090 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
9091#ifdef CONFIG_SCHEDSTATS
9092 if (tsk_cache_hot) {
9093 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
9094 schedstat_inc(p, se.statistics.nr_forced_migrations);
9095 }
9096#endif
9097 return 1;
9098 }
9099
9100 return 1;
9101}
9102
9103/*
9104 * move_specific_task tries to move a specific task.
9105 * Returns 1 if successful and 0 otherwise.
9106 * Called with both runqueues locked.
9107 */
9108static int move_specific_task(struct lb_env *env, struct task_struct *pm)
9109{
9110 struct task_struct *p, *n;
9111
9112 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
9113 if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
9114 env->dst_cpu))
9115 continue;
9116
9117 if (!hmp_can_migrate_task(p, env))
9118 continue;
9119 /* Check if we found the right task */
9120 if (p != pm)
9121 continue;
9122
9123 move_task(p, env);
9124 /*
9125 * Right now, this is only the third place move_task()
9126 * is called, so we can safely collect move_task()
9127 * stats here rather than inside move_task().
9128 */
9129 schedstat_inc(env->sd, lb_gained[env->idle]);
9130 return 1;
9131 }
9132 return 0;
9133}
9134
9135/*
9136 * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
9137 * migrate a specific task from one runqueue to another.
9138 * hmp_force_up_migration uses this to push a currently running task
9139 * off a runqueue.
9140 * Based on active_load_balance_stop_cpu and can potentially be merged.
9141 */
9142static int hmp_active_task_migration_cpu_stop(void *data)
9143{
9144 struct rq *busiest_rq = data;
9145 struct task_struct *p = busiest_rq->migrate_task;
9146 int busiest_cpu = cpu_of(busiest_rq);
9147 int target_cpu = busiest_rq->push_cpu;
9148 struct rq *target_rq = cpu_rq(target_cpu);
9149 struct sched_domain *sd;
9150
9151 raw_spin_lock_irq(&busiest_rq->lock);
9152 /* make sure the requested cpu hasn't gone down in the meantime */
9153 if (unlikely(busiest_cpu != smp_processor_id() ||
9154 !busiest_rq->active_balance)) {
9155 goto out_unlock;
9156 }
9157 /* Is there any task to move? */
9158 if (busiest_rq->nr_running <= 1)
9159 goto out_unlock;
9160 /* Task has migrated meanwhile, abort forced migration */
9161 if (task_rq(p) != busiest_rq)
9162 goto out_unlock;
9163 /*
9164 * This condition is "impossible", if it occurs
9165 * we need to fix it. Originally reported by
9166 * Bjorn Helgaas on a 128-cpu setup.
9167 */
9168 BUG_ON(busiest_rq == target_rq);
9169
9170 /* move a task from busiest_rq to target_rq */
9171 double_lock_balance(busiest_rq, target_rq);
9172
9173 /* Search for an sd spanning us and the target CPU. */
9174 rcu_read_lock();
9175 for_each_domain(target_cpu, sd) {
9176 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9177 break;
9178 }
9179
9180 if (likely(sd)) {
9181 struct lb_env env = {
9182 .sd = sd,
9183 .dst_cpu = target_cpu,
9184 .dst_rq = target_rq,
9185 .src_cpu = busiest_rq->cpu,
9186 .src_rq = busiest_rq,
9187 .idle = CPU_IDLE,
9188 };
9189
9190 schedstat_inc(sd, alb_count);
9191
9192 if (move_specific_task(&env, p))
9193 schedstat_inc(sd, alb_pushed);
9194 else
9195 schedstat_inc(sd, alb_failed);
9196 }
9197 rcu_read_unlock();
9198 double_unlock_balance(busiest_rq, target_rq);
9199out_unlock:
9200 busiest_rq->active_balance = 0;
9201 raw_spin_unlock_irq(&busiest_rq->lock);
9202 return 0;
9203}
9204
9205static DEFINE_SPINLOCK(hmp_force_migration);
9206#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
9207/*
 9208 * Heterogeneous Multi-Processor (HMP) Global Load Balance
1e3c88bd 9209 */
83cd4fe2 9210
6fa3eb70
S
9211/*
9212 * According to Linaro's comment, we should only check the currently running
9213 * tasks because selecting other tasks for migration will require extensive
 9214 * bookkeeping.
9215 */
9216#ifdef CONFIG_HMP_GLOBAL_BALANCE
9217static void hmp_force_down_migration(int this_cpu)
9218{
9219 int curr_cpu, target_cpu;
9220 struct sched_entity *se;
9221 struct rq *target;
9222 unsigned long flags;
9223 unsigned int force;
9224 struct task_struct *p;
9225 struct clb_env clbenv;
83cd4fe2 9226
6fa3eb70
S
9227 /* Migrate light task from big to LITTLE */
9228 for_each_cpu(curr_cpu, &hmp_fast_cpu_mask) {
9229 /* Check whether CPU is online */
9230 if(!cpu_online(curr_cpu))
83cd4fe2
VP
9231 continue;
9232
6fa3eb70
S
9233 force = 0;
9234 target = cpu_rq(curr_cpu);
9235 raw_spin_lock_irqsave(&target->lock, flags);
9236 se = target->cfs.curr;
9237 if (!se) {
9238 raw_spin_unlock_irqrestore(&target->lock, flags);
9239 continue;
9240 }
5ed4f1d9 9241
6fa3eb70
S
9242 /* Find task entity */
9243 if (!entity_is_task(se)) {
9244 struct cfs_rq *cfs_rq;
9245 cfs_rq = group_cfs_rq(se);
9246 while (cfs_rq) {
9247 se = cfs_rq->curr;
9248 cfs_rq = group_cfs_rq(se);
9249 }
9250 }
83cd4fe2 9251
6fa3eb70
S
9252 p = task_of(se);
9253 target_cpu = hmp_select_cpu(HMP_GB,p,&hmp_slow_cpu_mask,-1);
9254 if(NR_CPUS == target_cpu) {
9255 raw_spin_unlock_irqrestore(&target->lock, flags);
9256 continue;
9257 }
83cd4fe2 9258
6fa3eb70
S
9259 /* Collect cluster information */
9260 memset(&clbenv, 0, sizeof(clbenv));
9261 clbenv.flags |= HMP_GB;
9262 clbenv.btarget = curr_cpu;
9263 clbenv.ltarget = target_cpu;
9264 clbenv.lcpus = &hmp_slow_cpu_mask;
9265 clbenv.bcpus = &hmp_fast_cpu_mask;
9266 sched_update_clbstats(&clbenv);
9267
9268 /* Check migration threshold */
9269 if (!target->active_balance &&
9270 hmp_down_migration(curr_cpu, &target_cpu, se, &clbenv)) {
9271 target->active_balance = 1;
9272 target->push_cpu = target_cpu;
9273 target->migrate_task = p;
9274 force = 1;
9275 trace_sched_hmp_migrate(p, target->push_cpu, 1);
9276 hmp_next_down_delay(&p->se, target->push_cpu);
9277 }
9278 raw_spin_unlock_irqrestore(&target->lock, flags);
9279 if (force) {
9280 stop_one_cpu_nowait(cpu_of(target),
9281 hmp_active_task_migration_cpu_stop,
9282 target, &target->active_balance_work);
9283 }
83cd4fe2 9284 }
83cd4fe2 9285}
6fa3eb70
S
9286#endif /* CONFIG_HMP_GLOBAL_BALANCE */
9287#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9288u32 AVOID_FORCE_UP_MIGRATION_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
9289#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
83cd4fe2 9290
6fa3eb70 9291static void hmp_force_up_migration(int this_cpu)
83cd4fe2 9292{
6fa3eb70
S
9293 int curr_cpu, target_cpu;
9294 struct sched_entity *se;
9295 struct rq *target;
9296 unsigned long flags;
9297 unsigned int force;
9298 struct task_struct *p;
9299 struct clb_env clbenv;
9300#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9301 int push_cpu;
9302#endif
83cd4fe2 9303
6fa3eb70
S
9304 if (!spin_trylock(&hmp_force_migration))
9305 return;
0b005cf5 9306
6fa3eb70
S
9307#ifdef CONFIG_HMP_TRACER
9308 for_each_online_cpu(curr_cpu)
9309 trace_sched_cfs_runnable_load(curr_cpu,cfs_load(curr_cpu),
9310 cfs_length(curr_cpu));
9311#endif
1c792db7 9312
6fa3eb70
S
9313 /* Migrate heavy task from LITTLE to big */
9314 for_each_cpu(curr_cpu, &hmp_slow_cpu_mask) {
9315 /* Check whether CPU is online */
9316 if(!cpu_online(curr_cpu))
9317 continue;
83cd4fe2 9318
6fa3eb70
S
9319 force = 0;
9320 target = cpu_rq(curr_cpu);
9321 raw_spin_lock_irqsave(&target->lock, flags);
9322 se = target->cfs.curr;
9323 if (!se) {
9324 raw_spin_unlock_irqrestore(&target->lock, flags);
9325 continue;
9326 }
83cd4fe2 9327
6fa3eb70
S
9328 /* Find task entity */
9329 if (!entity_is_task(se)) {
9330 struct cfs_rq *cfs_rq;
9331 cfs_rq = group_cfs_rq(se);
9332 while (cfs_rq) {
9333 se = cfs_rq->curr;
9334 cfs_rq = group_cfs_rq(se);
9335 }
9336 }
9337
9338 p = task_of(se);
9339 target_cpu = hmp_select_cpu(HMP_GB,p,&hmp_fast_cpu_mask,-1);
9340 if(NR_CPUS == target_cpu) {
9341 raw_spin_unlock_irqrestore(&target->lock, flags);
9342 continue;
9343 }
83cd4fe2 9344
6fa3eb70
S
9345 /* Collect cluster information */
9346 memset(&clbenv, 0, sizeof(clbenv));
9347 clbenv.flags |= HMP_GB;
9348 clbenv.ltarget = curr_cpu;
9349 clbenv.btarget = target_cpu;
9350 clbenv.lcpus = &hmp_slow_cpu_mask;
9351 clbenv.bcpus = &hmp_fast_cpu_mask;
9352 sched_update_clbstats(&clbenv);
9353
9354#ifdef CONFIG_HMP_LAZY_BALANCE
9355#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9356 if (PA_ENABLE && LB_ENABLE) {
9357#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9358 if (is_light_task(p) && !is_buddy_busy(per_cpu(sd_pack_buddy, curr_cpu))) {
9359#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9360 push_cpu = hmp_select_cpu(HMP_GB,p,&hmp_fast_cpu_mask,-1);
9361 if (hmp_cpu_is_fast(push_cpu)) {
9362 AVOID_FORCE_UP_MIGRATION_FROM_CPUX_TO_CPUY_COUNT[curr_cpu][push_cpu]++;
9363#ifdef CONFIG_HMP_TRACER
9364 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_FORCE_UP_FORM_CPUX_TO_CPUY, p->pid, curr_cpu, push_cpu);
9365#endif /* CONFIG_HMP_TRACER */
9366 }
9367#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9368 goto out_force_up;
9369 }
9370#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9371 }
9372#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9373#endif /* CONFIG_HMP_LAZY_BALANCE */
9374
9375 /* Check migration threshold */
9376 if (!target->active_balance &&
9377 hmp_up_migration(curr_cpu, &target_cpu, se, &clbenv)) {
9378 target->active_balance = 1;
9379 target->push_cpu = target_cpu;
9380 target->migrate_task = p;
9381 force = 1;
9382 trace_sched_hmp_migrate(p, target->push_cpu, 1);
9383 hmp_next_up_delay(&p->se, target->push_cpu);
9384 }
0b005cf5 9385
6fa3eb70
S
9386#ifdef CONFIG_HMP_LAZY_BALANCE
9387out_force_up:
9388#endif /* CONFIG_HMP_LAZY_BALANCE */
0b005cf5 9389
6fa3eb70
S
9390 raw_spin_unlock_irqrestore(&target->lock, flags);
9391 if (force) {
9392 stop_one_cpu_nowait(cpu_of(target),
9393 hmp_active_task_migration_cpu_stop,
9394 target, &target->active_balance_work);
9395 }
83cd4fe2 9396 }
067491b7 9397
6fa3eb70
S
9398#ifdef CONFIG_HMP_GLOBAL_BALANCE
9399 hmp_force_down_migration(this_cpu);
9400#endif
9401#ifdef CONFIG_HMP_TRACER
9402 trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
9403#endif
9404 spin_unlock(&hmp_force_migration);
9405}
9406#else /* CONFIG_SCHED_HMP_ENHANCEMENT */
9407/*
9408 * hmp_force_up_migration checks runqueues for tasks that need to
9409 * be actively migrated to a faster cpu.
9410 */
9411static void hmp_force_up_migration(int this_cpu)
9412{
9413 int cpu, target_cpu;
9414 struct sched_entity *curr;
9415 struct rq *target;
9416 unsigned long flags;
9417 unsigned int force;
9418 struct task_struct *p;
9419
9420 if (!spin_trylock(&hmp_force_migration))
9421 return;
9422 for_each_online_cpu(cpu) {
9423 force = 0;
9424 target = cpu_rq(cpu);
9425 raw_spin_lock_irqsave(&target->lock, flags);
9426 curr = target->cfs.curr;
9427 if (!curr) {
9428 raw_spin_unlock_irqrestore(&target->lock, flags);
9429 continue;
9430 }
9431 if (!entity_is_task(curr)) {
9432 struct cfs_rq *cfs_rq;
9433
9434 cfs_rq = group_cfs_rq(curr);
9435 while (cfs_rq) {
9436 curr = cfs_rq->curr;
9437 cfs_rq = group_cfs_rq(curr);
9438 }
9439 }
9440 p = task_of(curr);
9441 if (hmp_up_migration(cpu, &target_cpu, curr)) {
9442 if (!target->active_balance) {
9443 target->active_balance = 1;
9444 target->push_cpu = target_cpu;
9445 target->migrate_task = p;
9446 force = 1;
9447 trace_sched_hmp_migrate(p, target->push_cpu, 1);
9448 hmp_next_up_delay(&p->se, target->push_cpu);
9449 }
9450 }
9451 if (!force && !target->active_balance) {
9452 /*
9453 * For now we just check the currently running task.
9454 * Selecting the lightest task for offloading will
 9455		 * require extensive bookkeeping.
9456 */
9457 target->push_cpu = hmp_offload_down(cpu, curr);
9458 if (target->push_cpu < NR_CPUS) {
9459 target->active_balance = 1;
9460 target->migrate_task = p;
9461 force = 1;
9462 trace_sched_hmp_migrate(p, target->push_cpu, 2);
9463 hmp_next_down_delay(&p->se, target->push_cpu);
9464 }
9465 }
9466 raw_spin_unlock_irqrestore(&target->lock, flags);
9467 if (force)
9468 stop_one_cpu_nowait(cpu_of(target),
9469 hmp_active_task_migration_cpu_stop,
9470 target, &target->active_balance_work);
9471 }
9472 spin_unlock(&hmp_force_migration);
83cd4fe2 9473}
6fa3eb70 9474#endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
83cd4fe2 9475#else
6fa3eb70
S
9476static void hmp_force_up_migration(int this_cpu) { }
9477#endif /* CONFIG_SCHED_HMP */
83cd4fe2
VP
9478
9479/*
9480 * run_rebalance_domains is triggered when needed from the scheduler tick.
9481 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
9482 */
1e3c88bd
PZ
9483static void run_rebalance_domains(struct softirq_action *h)
9484{
9485 int this_cpu = smp_processor_id();
9486 struct rq *this_rq = cpu_rq(this_cpu);
6eb57e0d 9487 enum cpu_idle_type idle = this_rq->idle_balance ?
1e3c88bd
PZ
9488 CPU_IDLE : CPU_NOT_IDLE;
9489
6fa3eb70
S
9490 hmp_force_up_migration(this_cpu);
9491
1e3c88bd
PZ
9492 rebalance_domains(this_cpu, idle);
9493
1e3c88bd 9494 /*
83cd4fe2 9495 * If this cpu has a pending nohz_balance_kick, then do the
1e3c88bd
PZ
9496 * balancing on behalf of the other idle cpus whose ticks are
9497 * stopped.
9498 */
83cd4fe2 9499 nohz_idle_balance(this_cpu, idle);
1e3c88bd
PZ
9500}
9501
9502static inline int on_null_domain(int cpu)
9503{
90a6501f 9504 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
1e3c88bd
PZ
9505}
9506
9507/*
9508 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
1e3c88bd 9509 */
029632fb 9510void trigger_load_balance(struct rq *rq, int cpu)
1e3c88bd 9511{
1e3c88bd
PZ
9512 /* Don't need to rebalance while attached to NULL domain */
9513 if (time_after_eq(jiffies, rq->next_balance) &&
9514 likely(!on_null_domain(cpu)))
9515 raise_softirq(SCHED_SOFTIRQ);
3451d024 9516#ifdef CONFIG_NO_HZ_COMMON
1c792db7 9517 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
83cd4fe2
VP
9518 nohz_balancer_kick(cpu);
9519#endif
1e3c88bd
PZ
9520}
9521
0bcdcf28
CE
9522static void rq_online_fair(struct rq *rq)
9523{
6fa3eb70
S
9524#ifdef CONFIG_SCHED_HMP
9525 hmp_online_cpu(rq->cpu);
9526#endif
0bcdcf28
CE
9527 update_sysctl();
9528}
9529
9530static void rq_offline_fair(struct rq *rq)
9531{
6fa3eb70
S
9532#ifdef CONFIG_SCHED_HMP
9533 hmp_offline_cpu(rq->cpu);
9534#endif
0bcdcf28 9535 update_sysctl();
a4c96ae3
PB
9536
9537 /* Ensure any throttled groups are reachable by pick_next_task */
9538 unthrottle_offline_cfs_rqs(rq);
0bcdcf28
CE
9539}
9540
55e12e5e 9541#endif /* CONFIG_SMP */
e1d1484f 9542
bf0f6f24
IM
9543/*
9544 * scheduler tick hitting a task of our scheduling class:
9545 */
8f4d37ec 9546static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
bf0f6f24
IM
9547{
9548 struct cfs_rq *cfs_rq;
9549 struct sched_entity *se = &curr->se;
9550
9551 for_each_sched_entity(se) {
9552 cfs_rq = cfs_rq_of(se);
8f4d37ec 9553 entity_tick(cfs_rq, se, queued);
bf0f6f24 9554 }
18bf2805 9555
cbee9f88
PZ
9556 if (sched_feat_numa(NUMA))
9557 task_tick_numa(rq, curr);
3d59eebc 9558
18bf2805 9559 update_rq_runnable_avg(rq, 1);
bf0f6f24
IM
9560}
9561
9562/*
cd29fe6f
PZ
9563 * called on fork with the child task as argument from the parent's context
9564 * - child not yet on the tasklist
9565 * - preemption disabled
bf0f6f24 9566 */
cd29fe6f 9567static void task_fork_fair(struct task_struct *p)
bf0f6f24 9568{
4fc420c9
DN
9569 struct cfs_rq *cfs_rq;
9570 struct sched_entity *se = &p->se, *curr;
00bf7bfc 9571 int this_cpu = smp_processor_id();
cd29fe6f
PZ
9572 struct rq *rq = this_rq();
9573 unsigned long flags;
9574
05fa785c 9575 raw_spin_lock_irqsave(&rq->lock, flags);
bf0f6f24 9576
861d034e
PZ
9577 update_rq_clock(rq);
9578
4fc420c9
DN
9579 cfs_rq = task_cfs_rq(current);
9580 curr = cfs_rq->curr;
9581
51f52947
DN
9582 /*
9583 * Not only the cpu but also the task_group of the parent might have
9584 * been changed after parent->se.parent,cfs_rq were copied to
9585 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
9586 * of child point to valid ones.
9587 */
9588 rcu_read_lock();
9589 __set_task_cpu(p, this_cpu);
9590 rcu_read_unlock();
bf0f6f24 9591
7109c442 9592 update_curr(cfs_rq);
cd29fe6f 9593
b5d9d734
MG
9594 if (curr)
9595 se->vruntime = curr->vruntime;
aeb73b04 9596 place_entity(cfs_rq, se, 1);
4d78e7b6 9597
cd29fe6f 9598 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
87fefa38 9599 /*
edcb60a3
IM
9600 * Upon rescheduling, sched_class::put_prev_task() will place
9601 * 'current' within the tree based on its new key value.
9602 */
4d78e7b6 9603 swap(curr->vruntime, se->vruntime);
aec0a514 9604 resched_task(rq->curr);
4d78e7b6 9605 }
bf0f6f24 9606
88ec22d3
PZ
9607 se->vruntime -= cfs_rq->min_vruntime;
9608
05fa785c 9609 raw_spin_unlock_irqrestore(&rq->lock, flags);
bf0f6f24
IM
9610}
9611
cb469845
SR
9612/*
9613 * Priority of the task has changed. Check to see if we preempt
9614 * the current task.
9615 */
da7a735e
PZ
9616static void
9617prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
cb469845 9618{
da7a735e
PZ
9619 if (!p->se.on_rq)
9620 return;
9621
cb469845
SR
9622 /*
9623 * Reschedule if we are currently running on this runqueue and
9624 * our priority decreased, or if we are not currently running on
9625 * this runqueue and our priority is higher than the current's
9626 */
da7a735e 9627 if (rq->curr == p) {
cb469845
SR
9628 if (p->prio > oldprio)
9629 resched_task(rq->curr);
9630 } else
15afe09b 9631 check_preempt_curr(rq, p, 0);
cb469845
SR
9632}
9633
da7a735e
PZ
9634static void switched_from_fair(struct rq *rq, struct task_struct *p)
9635{
9636 struct sched_entity *se = &p->se;
9637 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9638
9639 /*
84bb5b64 9640 * Ensure the task's vruntime is normalized, so that when it's
da7a735e
PZ
9641 * switched back to the fair class the enqueue_entity(.flags=0) will
9642 * do the right thing.
9643 *
84bb5b64
GM
9644 * If it's on_rq, then the dequeue_entity(.flags=0) will already
9645 * have normalized the vruntime, if it's !on_rq, then only when
da7a735e
PZ
9646 * the task is sleeping will it still have non-normalized vruntime.
9647 */
84bb5b64 9648 if (!p->on_rq && p->state != TASK_RUNNING) {
da7a735e
PZ
9649 /*
9650 * Fix up our vruntime so that the current sleep doesn't
9651 * cause 'unlimited' sleep bonus.
9652 */
9653 place_entity(cfs_rq, se, 0);
9654 se->vruntime -= cfs_rq->min_vruntime;
9655 }
9ee474f5 9656
6fa3eb70 9657#ifdef CONFIG_SMP
9ee474f5
PT
9658 /*
9659 * Remove our load from contribution when we leave sched_fair
9660 * and ensure we don't carry in an old decay_count if we
9661 * switch back.
9662 */
9663 if (p->se.avg.decay_count) {
9664 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
9665 __synchronize_entity_decay(&p->se);
9666 subtract_blocked_load_contrib(cfs_rq,
9667 p->se.avg.load_avg_contrib);
9668 }
9669#endif
da7a735e
PZ
9670}
9671
cb469845
SR
9672/*
9673 * We switched to the sched_fair class.
9674 */
da7a735e 9675static void switched_to_fair(struct rq *rq, struct task_struct *p)
cb469845 9676{
da7a735e
PZ
9677 if (!p->se.on_rq)
9678 return;
9679
cb469845
SR
9680 /*
9681 * We were most likely switched from sched_rt, so
9682 * kick off the schedule if running, otherwise just see
9683 * if we can still preempt the current task.
9684 */
da7a735e 9685 if (rq->curr == p)
cb469845 9686 resched_task(rq->curr);
6fa3eb70
S
 9687	else {
 9688		/*
 9689		 * When task p changes priority from RT to a normal priority in
 9690		 * switched_from_rt(), pull_rt_task() may be called and
 9691		 * double_lock_balance() can temporarily unlock rq. Task p might
 9692		 * then migrate to another CPU and thus no longer be on this rq.
 9693		 * In that case there is no need to check preemption here,
 9694		 * because the enqueue path on the new CPU will perform the
 9695		 * preempt check for task p anyway.
 9696		 * So bypass check_preempt_curr() unless p is still on this rq.
 9697		 */
9698 if (rq == task_rq(p)) {
9699 check_preempt_curr(rq, p, 0);
9700 }
9701 }
cb469845
SR
9702}
9703
83b699ed
SV
9704/* Account for a task changing its policy or group.
9705 *
9706 * This routine is mostly called to set cfs_rq->curr field when a task
9707 * migrates between groups/classes.
9708 */
9709static void set_curr_task_fair(struct rq *rq)
9710{
9711 struct sched_entity *se = &rq->curr->se;
9712
ec12cb7f
PT
9713 for_each_sched_entity(se) {
9714 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9715
9716 set_next_entity(cfs_rq, se);
9717 /* ensure bandwidth has been allocated on our new cfs_rq */
9718 account_cfs_rq_runtime(cfs_rq, 0);
9719 }
83b699ed
SV
9720}
9721
029632fb
PZ
9722void init_cfs_rq(struct cfs_rq *cfs_rq)
9723{
9724 cfs_rq->tasks_timeline = RB_ROOT;
029632fb
PZ
9725 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
9726#ifndef CONFIG_64BIT
9727 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
9728#endif
6fa3eb70 9729#ifdef CONFIG_SMP
9ee474f5 9730 atomic64_set(&cfs_rq->decay_counter, 1);
6fa3eb70 9731 atomic_long_set(&cfs_rq->removed_load, 0);
9ee474f5 9732#endif
029632fb
PZ
9733}
9734
810b3817 9735#ifdef CONFIG_FAIR_GROUP_SCHED
b2b5ce02 9736static void task_move_group_fair(struct task_struct *p, int on_rq)
810b3817 9737{
aff3e498 9738 struct cfs_rq *cfs_rq;
b2b5ce02
PZ
9739 /*
9740 * If the task was not on the rq at the time of this cgroup movement
9741 * it must have been asleep, sleeping tasks keep their ->vruntime
9742 * absolute on their old rq until wakeup (needed for the fair sleeper
9743 * bonus in place_entity()).
9744 *
9745 * If it was on the rq, we've just 'preempted' it, which does convert
9746 * ->vruntime to a relative base.
9747 *
9748 * Make sure both cases convert their relative position when migrating
9749 * to another cgroup's rq. This does somewhat interfere with the
9750 * fair sleeper stuff for the first placement, but who cares.
9751 */
7ceff013
DN
9752 /*
9753 * When !on_rq, vruntime of the task has usually NOT been normalized.
9754 * But there are some cases where it has already been normalized:
9755 *
9756 * - Moving a forked child which is waiting for being woken up by
9757 * wake_up_new_task().
62af3783
DN
9758 * - Moving a task which has been woken up by try_to_wake_up() and
9759 * waiting for actually being woken up by sched_ttwu_pending().
7ceff013
DN
9760 *
9761 * To prevent boost or penalty in the new cfs_rq caused by delta
9762 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
9763 */
62af3783 9764 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
7ceff013
DN
9765 on_rq = 1;
9766
b2b5ce02
PZ
9767 if (!on_rq)
9768 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
9769 set_task_rq(p, task_cpu(p));
aff3e498
PT
9770 if (!on_rq) {
9771 cfs_rq = cfs_rq_of(&p->se);
9772 p->se.vruntime += cfs_rq->min_vruntime;
9773#ifdef CONFIG_SMP
9774 /*
9775 * migrate_task_rq_fair() will have removed our previous
9776 * contribution, but we must synchronize for ongoing future
9777 * decay.
9778 */
9779 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
9780 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
9781#endif
9782 }
810b3817 9783}
029632fb
PZ
9784
9785void free_fair_sched_group(struct task_group *tg)
9786{
9787 int i;
9788
9789 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
9790
9791 for_each_possible_cpu(i) {
9792 if (tg->cfs_rq)
9793 kfree(tg->cfs_rq[i]);
9794 if (tg->se)
9795 kfree(tg->se[i]);
9796 }
9797
9798 kfree(tg->cfs_rq);
9799 kfree(tg->se);
9800}
9801
9802int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9803{
9804 struct cfs_rq *cfs_rq;
9805 struct sched_entity *se;
9806 int i;
9807
9808 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
9809 if (!tg->cfs_rq)
9810 goto err;
9811 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
9812 if (!tg->se)
9813 goto err;
9814
9815 tg->shares = NICE_0_LOAD;
9816
9817 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
9818
9819 for_each_possible_cpu(i) {
9820 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
9821 GFP_KERNEL, cpu_to_node(i));
9822 if (!cfs_rq)
9823 goto err;
9824
9825 se = kzalloc_node(sizeof(struct sched_entity),
9826 GFP_KERNEL, cpu_to_node(i));
9827 if (!se)
9828 goto err_free_rq;
9829
9830 init_cfs_rq(cfs_rq);
9831 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
9832 }
9833
9834 return 1;
9835
9836err_free_rq:
9837 kfree(cfs_rq);
9838err:
9839 return 0;
9840}
9841
9842void unregister_fair_sched_group(struct task_group *tg, int cpu)
9843{
9844 struct rq *rq = cpu_rq(cpu);
9845 unsigned long flags;
9846
9847 /*
9848 * Only empty task groups can be destroyed; so we can speculatively
9849 * check on_list without danger of it being re-added.
9850 */
9851 if (!tg->cfs_rq[cpu]->on_list)
9852 return;
9853
9854 raw_spin_lock_irqsave(&rq->lock, flags);
9855 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
9856 raw_spin_unlock_irqrestore(&rq->lock, flags);
9857}
9858
9859void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
9860 struct sched_entity *se, int cpu,
9861 struct sched_entity *parent)
9862{
9863 struct rq *rq = cpu_rq(cpu);
9864
9865 cfs_rq->tg = tg;
9866 cfs_rq->rq = rq;
029632fb
PZ
9867 init_cfs_rq_runtime(cfs_rq);
9868
9869 tg->cfs_rq[cpu] = cfs_rq;
9870 tg->se[cpu] = se;
9871
9872 /* se could be NULL for root_task_group */
9873 if (!se)
9874 return;
9875
9876 if (!parent)
9877 se->cfs_rq = &rq->cfs;
9878 else
9879 se->cfs_rq = parent->my_q;
9880
9881 se->my_q = cfs_rq;
5ba45423
PT
9882 /* guarantee group entities always have weight */
9883 update_load_set(&se->load, NICE_0_LOAD);
029632fb
PZ
9884 se->parent = parent;
9885}
9886
9887static DEFINE_MUTEX(shares_mutex);
9888
9889int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9890{
9891 int i;
9892 unsigned long flags;
9893
9894 /*
9895 * We can't change the weight of the root cgroup.
9896 */
9897 if (!tg->se[0])
9898 return -EINVAL;
9899
9900 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
9901
9902 mutex_lock(&shares_mutex);
9903 if (tg->shares == shares)
9904 goto done;
9905
9906 tg->shares = shares;
9907 for_each_possible_cpu(i) {
9908 struct rq *rq = cpu_rq(i);
9909 struct sched_entity *se;
9910
9911 se = tg->se[i];
9912 /* Propagate contribution to hierarchy */
9913 raw_spin_lock_irqsave(&rq->lock, flags);
17bc14b7 9914 for_each_sched_entity(se)
029632fb
PZ
9915 update_cfs_shares(group_cfs_rq(se));
9916 raw_spin_unlock_irqrestore(&rq->lock, flags);
9917 }
9918
9919done:
9920 mutex_unlock(&shares_mutex);
9921 return 0;
9922}
9923#else /* CONFIG_FAIR_GROUP_SCHED */
9924
9925void free_fair_sched_group(struct task_group *tg) { }
9926
9927int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9928{
9929 return 1;
9930}
9931
9932void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
9933
9934#endif /* CONFIG_FAIR_GROUP_SCHED */
9935
810b3817 9936
6d686f45 9937static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
0d721cea
PW
9938{
9939 struct sched_entity *se = &task->se;
0d721cea
PW
9940 unsigned int rr_interval = 0;
9941
9942 /*
9943 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
9944 * idle runqueue:
9945 */
0d721cea 9946 if (rq->cfs.load.weight)
a59f4e07 9947 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
0d721cea
PW
9948
9949 return rr_interval;
9950}
9951
bf0f6f24
IM
9952/*
9953 * All the scheduling class methods:
9954 */
029632fb 9955const struct sched_class fair_sched_class = {
5522d5d5 9956 .next = &idle_sched_class,
bf0f6f24
IM
9957 .enqueue_task = enqueue_task_fair,
9958 .dequeue_task = dequeue_task_fair,
9959 .yield_task = yield_task_fair,
d95f4122 9960 .yield_to_task = yield_to_task_fair,
bf0f6f24 9961
2e09bf55 9962 .check_preempt_curr = check_preempt_wakeup,
bf0f6f24
IM
9963
9964 .pick_next_task = pick_next_task_fair,
9965 .put_prev_task = put_prev_task_fair,
9966
681f3e68 9967#ifdef CONFIG_SMP
4ce72a2c 9968 .select_task_rq = select_task_rq_fair,
0a74bef8 9969 .migrate_task_rq = migrate_task_rq_fair,
6fa3eb70 9970
0bcdcf28
CE
9971 .rq_online = rq_online_fair,
9972 .rq_offline = rq_offline_fair,
88ec22d3
PZ
9973
9974 .task_waking = task_waking_fair,
681f3e68 9975#endif
bf0f6f24 9976
83b699ed 9977 .set_curr_task = set_curr_task_fair,
bf0f6f24 9978 .task_tick = task_tick_fair,
cd29fe6f 9979 .task_fork = task_fork_fair,
cb469845
SR
9980
9981 .prio_changed = prio_changed_fair,
da7a735e 9982 .switched_from = switched_from_fair,
cb469845 9983 .switched_to = switched_to_fair,
810b3817 9984
0d721cea
PW
9985 .get_rr_interval = get_rr_interval_fair,
9986
810b3817 9987#ifdef CONFIG_FAIR_GROUP_SCHED
b2b5ce02 9988 .task_move_group = task_move_group_fair,
810b3817 9989#endif
bf0f6f24
IM
9990};
9991
9992#ifdef CONFIG_SCHED_DEBUG
029632fb 9993void print_cfs_stats(struct seq_file *m, int cpu)
bf0f6f24 9994{
bf0f6f24
IM
9995 struct cfs_rq *cfs_rq;
9996
5973e5b9 9997 rcu_read_lock();
c3b64f1e 9998 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
5cef9eca 9999 print_cfs_rq(m, cpu, cfs_rq);
5973e5b9 10000 rcu_read_unlock();
bf0f6f24
IM
10001}
10002#endif
029632fb
PZ
10003
10004__init void init_sched_fair_class(void)
10005{
10006#ifdef CONFIG_SMP
10007 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10008
3451d024 10009#ifdef CONFIG_NO_HZ_COMMON
554cecaf 10010 nohz.next_balance = jiffies;
029632fb 10011 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
71325960 10012 cpu_notifier(sched_ilb_notifier, 0);
029632fb 10013#endif
6fa3eb70
S
10014
10015 cmp_cputopo_domain_setup();
10016#ifdef CONFIG_SCHED_HMP
10017 hmp_cpu_mask_setup();
10018#endif
029632fb 10019#endif /* SMP */
6fa3eb70
S
10020}
10021
10022#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
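/*
 * Added note on the scale math below (based on how the callers pass
 * arguments, and assuming SCHED_FREQSCALE_SHIFT is 10 as in the common
 * big.LITTLE MP code): 'max' arrives pre-shifted right by
 * SCHED_FREQSCALE_SHIFT while 'curr' is the raw frequency, so curr / max is
 * effectively (curr / real_max) << SCHED_FREQSCALE_SHIFT, i.e. a load scale
 * in the 0..1024 range. For example, curr = 1000000 kHz with a 2000000 kHz
 * maximum (max = 2000000 >> 10 = 1953) gives 1000000 / 1953 = 512.
 */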
10023static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
10024{
10025 u32 result = curr / max;
10026 return result;
10027}
10028
10029#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
10030DEFINE_PER_CPU(u32, FREQ_CPU);
10031#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
10032
10033/* Called when the CPU Frequency is changed.
10034 * Once for each CPU.
10035 */
10036static int cpufreq_callback(struct notifier_block *nb,
10037 unsigned long val, void *data)
10038{
10039 struct cpufreq_freqs *freq = data;
10040 int cpu = freq->cpu;
10041 struct cpufreq_extents *extents;
10042#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10043 struct cpumask* mask;
10044 int id;
10045#endif
10046
10047 if (freq->flags & CPUFREQ_CONST_LOOPS)
10048 return NOTIFY_OK;
10049
10050 if (val != CPUFREQ_POSTCHANGE)
10051 return NOTIFY_OK;
10052
10053 /* if dynamic load scale is disabled, set the load scale to 1.0 */
10054 if (!hmp_data.freqinvar_load_scale_enabled) {
10055 freq_scale[cpu].curr_scale = 1024;
10056 return NOTIFY_OK;
10057 }
10058
10059 extents = &freq_scale[cpu];
10060#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
 10061	if (extents->max < extents->const_max) {
 10062		extents->throttling = 1;
 10063	}
 10064	else {
 10065		extents->throttling = 0;
 10066	}
10067#endif
10068 if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
10069 /* If our governor was recognised as a single-freq governor,
10070 * use 1.0
10071 */
10072 extents->curr_scale = 1024;
10073 } else {
10074#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10075 extents->curr_scale = cpufreq_calc_scale(extents->min,
10076 extents->const_max, freq->new);
10077#else
10078 extents->curr_scale = cpufreq_calc_scale(extents->min,
10079 extents->max, freq->new);
10080#endif
10081 }
10082
10083#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10084 mask = arch_cpu_is_big(cpu)?&hmp_fast_cpu_mask:&hmp_slow_cpu_mask;
10085 for_each_cpu(id, mask)
10086 freq_scale[id].curr_scale = extents->curr_scale;
10087#endif
10088
10089#if NR_CPUS == 4
10090#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10091 switch (cpu) {
10092 case 0:
10093 case 2:
10094 (extents + 1)->curr_scale = extents->curr_scale;
10095 break;
10096
10097 case 1:
10098 case 3:
10099 (extents - 1)->curr_scale = extents->curr_scale;
10100 break;
10101
10102 default:
10103
10104 break;
10105 }
10106#endif
10107#endif
10108
10109#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
10110 per_cpu(FREQ_CPU, cpu) = freq->new;
10111#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
10112 return NOTIFY_OK;
10113}
10114
10115/* Called when the CPUFreq governor is changed.
10116 * Only called for the CPUs which are actually changed by the
10117 * userspace.
10118 */
10119static int cpufreq_policy_callback(struct notifier_block *nb,
10120 unsigned long event, void *data)
10121{
10122 struct cpufreq_policy *policy = data;
10123 struct cpufreq_extents *extents;
10124 int cpu, singleFreq = 0;
10125 static const char performance_governor[] = "performance";
10126 static const char powersave_governor[] = "powersave";
10127
10128 if (event == CPUFREQ_START)
10129 return 0;
10130
10131 if (event != CPUFREQ_INCOMPATIBLE)
10132 return 0;
10133
10134 /* CPUFreq governors do not accurately report the range of
10135 * CPU Frequencies they will choose from.
10136 * We recognise performance and powersave governors as
10137 * single-frequency only.
10138 */
10139 if (!strncmp(policy->governor->name, performance_governor,
10140 strlen(performance_governor)) ||
10141 !strncmp(policy->governor->name, powersave_governor,
10142 strlen(powersave_governor)))
10143 singleFreq = 1;
10144
10145 /* Make sure that all CPUs impacted by this policy are
10146 * updated since we will only get a notification when the
10147 * user explicitly changes the policy on a CPU.
10148 */
10149 for_each_cpu(cpu, policy->cpus) {
10150 extents = &freq_scale[cpu];
10151 extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
10152 extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
10153#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10154 extents->const_max = policy->cpuinfo.max_freq >> SCHED_FREQSCALE_SHIFT;
10155#endif
10156 if (!hmp_data.freqinvar_load_scale_enabled) {
10157 extents->curr_scale = 1024;
10158 } else if (singleFreq) {
10159 extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
10160 extents->curr_scale = 1024;
10161 } else {
10162 extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
10163#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10164 extents->curr_scale = cpufreq_calc_scale(extents->min,
10165 extents->const_max, policy->cur);
10166#else
10167 extents->curr_scale = cpufreq_calc_scale(extents->min,
10168 extents->max, policy->cur);
10169#endif
10170 }
10171 }
10172
10173 return 0;
10174}
10175
10176static struct notifier_block cpufreq_notifier = {
10177 .notifier_call = cpufreq_callback,
10178};
10179static struct notifier_block cpufreq_policy_notifier = {
10180 .notifier_call = cpufreq_policy_callback,
10181};
10182
10183static int __init register_sched_cpufreq_notifier(void)
10184{
10185 int ret = 0;
10186
10187 /* init safe defaults since there are no policies at registration */
10188 for (ret = 0; ret < CONFIG_NR_CPUS; ret++) {
10189 /* safe defaults */
10190 freq_scale[ret].max = 1024;
10191 freq_scale[ret].min = 1024;
10192 freq_scale[ret].curr_scale = 1024;
10193 }
10194
10195 pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
10196 ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
10197 CPUFREQ_POLICY_NOTIFIER);
029632fb 10198
6fa3eb70
S
10199 if (ret != -EINVAL)
10200 ret = cpufreq_register_notifier(&cpufreq_notifier,
10201 CPUFREQ_TRANSITION_NOTIFIER);
10202
10203 return ret;
10204}
10205
10206core_initcall(register_sched_cpufreq_notifier);
10207#endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
10208
10209#ifdef CONFIG_HEVTASK_INTERFACE
10210/*
 10211 * This allows printing both to /proc/task_detect and
 10212 * to the console.
 10213 */
10214#ifndef CONFIG_KGDB_KDB
10215#define SEQ_printf(m, x...) \
10216 do { \
10217 if (m) \
10218 seq_printf(m, x); \
10219 else \
10220 printk(x); \
10221 } while (0)
10222#else
10223#define SEQ_printf(m, x...) \
10224 do { \
10225 if (m) \
10226 seq_printf(m, x); \
10227 else if (__get_cpu_var(kdb_in_use) == 1) \
10228 kdb_printf(x); \
10229 else \
10230 printk(x); \
10231 } while (0)
10232#endif
10233
10234static int task_detect_show(struct seq_file *m, void *v)
10235{
10236 struct task_struct *p;
10237 unsigned long flags;
10238 unsigned int i;
10239
10240#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
 10241	for (i = 0; i < NR_CPUS; i++) {
 10242		SEQ_printf(m, "%5d ", freq_scale[i].curr_scale);
 10243	}
10244#endif
10245
10246 SEQ_printf(m, "\n%lu\n ",jiffies_to_cputime(jiffies));
10247
 10248	for (i = 0; i < NR_CPUS; i++) {
 10249		raw_spin_lock_irqsave(&cpu_rq(i)->lock, flags);
 10250		if (cpu_online(i)) {
 10251			list_for_each_entry(p, &cpu_rq(i)->cfs_tasks, se.group_node) {
 10252				SEQ_printf(m, "%lu %5d %5d %lu (%15s)\n ",
 10253					p->se.avg.load_avg_ratio, p->pid, task_cpu(p),
 10254					(p->utime + p->stime), p->comm);
 10255			}
 10256		}
 10257		raw_spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
 10258
 10259	}
10260
10261 return 0;
10262}
10263
10264static int task_detect_open(struct inode *inode, struct file *filp)
10265{
10266 return single_open(filp, task_detect_show, NULL);
029632fb 10267}
6fa3eb70
S
10268
10269static const struct file_operations task_detect_fops = {
10270 .open = task_detect_open,
10271 .read = seq_read,
10272 .llseek = seq_lseek,
10273 .release = single_release,
10274};
10275
10276static int __init init_task_detect_procfs(void)
10277{
10278 struct proc_dir_entry *pe;
10279
10280 pe = proc_create("task_detect", 0444, NULL, &task_detect_fops);
10281 if (!pe)
10282 return -ENOMEM;
10283 return 0;
10284}
10285
10286__initcall(init_task_detect_procfs);
10287#endif /* CONFIG_HEVTASK_INTERFACE */