1 /*
2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
3 *
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
5 *
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
8 *
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
11 *
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
15 *
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
18 *
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
21 */
22
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/slab.h>
27 #include <linux/profile.h>
28 #include <linux/interrupt.h>
29 #include <linux/mempolicy.h>
30 #include <linux/migrate.h>
31 #include <linux/task_work.h>
32
33 #include <trace/events/sched.h>
34 #ifdef CONFIG_HMP_VARIABLE_SCALE
35 #include <linux/sysfs.h>
36 #include <linux/vmalloc.h>
37 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
38 /* Include cpufreq header to add a notifier so that cpu frequency
39 * scaling can track the current CPU frequency
40 */
41 #include <linux/cpufreq.h>
42 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
43 #endif /* CONFIG_HMP_VARIABLE_SCALE */
44
45 #include "sched.h"
46
47 #include <mtlbprof/mtlbprof.h>
48
49
50 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
51 #ifdef CONFIG_LOCAL_TIMERS
52 unsigned long localtimer_get_counter(void);
53 #endif
54 #endif
55
56 #ifdef CONFIG_HEVTASK_INTERFACE
57 #include <linux/proc_fs.h>
58 #include <linux/seq_file.h>
59 #ifdef CONFIG_KGDB_KDB
60 #include <linux/kdb.h>
61 #endif
62 #endif
63
64 /*
65 * Targeted preemption latency for CPU-bound tasks:
66 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
67 *
68 * NOTE: this latency value is not the same as the concept of
69 * 'timeslice length' - timeslices in CFS are of variable length
70 * and have no persistent notion like in traditional, time-slice
71 * based scheduling concepts.
72 *
73 * (to see the precise effective timeslice length of your workload,
74 * run vmstat and monitor the context-switches (cs) field)
75 */
76 unsigned int sysctl_sched_latency = 6000000ULL;
77 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
78
79 /*
80 * The initial- and re-scaling of tunables is configurable
81 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
82 *
83 * Options are:
84 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
85 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
86 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
87 */
88 enum sched_tunable_scaling sysctl_sched_tunable_scaling
89 = SCHED_TUNABLESCALING_LOG;
90
91 /*
92 * Minimal preemption granularity for CPU-bound tasks:
93 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
94 */
95 unsigned int sysctl_sched_min_granularity = 750000ULL;
96 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
97
98 /*
99 * sched_nr_latency is kept at sysctl_sched_latency / sysctl_sched_min_granularity
100 */
101 static unsigned int sched_nr_latency = 8;
102
103 /*
104 * After fork, child runs first. If set to 0 (default) then
105 * parent will (try to) run first.
106 */
107 unsigned int sysctl_sched_child_runs_first __read_mostly;
108
109 /*
110 * SCHED_OTHER wake-up granularity.
111 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
112 *
113 * This option delays the preemption effects of decoupled workloads
114 * and reduces their over-scheduling. Synchronous workloads will still
115 * have immediate wakeup/sleep latencies.
116 */
117 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
118 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
119
120 const_debug unsigned int sysctl_sched_migration_cost = 100000UL;
121
122 /*
123 * The exponential sliding window over which load is averaged for shares
124 * distribution.
125 * (default: 10msec)
126 */
127 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
128
129 #ifdef CONFIG_CFS_BANDWIDTH
130 /*
131 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
132 * each time a cfs_rq requests quota.
133 *
134 * Note: in the case that the slice exceeds the runtime remaining (either due
135 * to consumption or the quota being specified to be smaller than the slice)
136 * we will always only issue the remaining available time.
137 *
138 * default: 5 msec, units: microseconds
139 */
140 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
141 #endif
142 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
143 static int need_lazy_balance(int dst_cpu, int src_cpu, struct task_struct *p);
144 #endif
145
146 /*
147 * Increase the granularity value when there are more CPUs,
148 * because with more CPUs the 'effective latency' as visible
149 * to users decreases. But the relationship is not linear,
150 * so pick a second-best guess by going with the log2 of the
151 * number of CPUs.
152 *
153 * This idea comes from the SD scheduler of Con Kolivas:
154 */
155 static int get_update_sysctl_factor(void)
156 {
157 unsigned int cpus = min_t(int, num_online_cpus(), 8);
158 unsigned int factor;
159
160 switch (sysctl_sched_tunable_scaling) {
161 case SCHED_TUNABLESCALING_NONE:
162 factor = 1;
163 break;
164 case SCHED_TUNABLESCALING_LINEAR:
165 factor = cpus;
166 break;
167 case SCHED_TUNABLESCALING_LOG:
168 default:
169 factor = 1 + ilog2(cpus);
170 break;
171 }
172
173 return factor;
174 }
175
176 static void update_sysctl(void)
177 {
178 unsigned int factor = get_update_sysctl_factor();
179
180 #define SET_SYSCTL(name) \
181 (sysctl_##name = (factor) * normalized_sysctl_##name)
182 SET_SYSCTL(sched_min_granularity);
183 SET_SYSCTL(sched_latency);
184 SET_SYSCTL(sched_wakeup_granularity);
185 #undef SET_SYSCTL
186 }
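/*
 * Worked example of the scaling above: on a 4-CPU system with the default
 * SCHED_TUNABLESCALING_LOG policy, factor = 1 + ilog2(4) = 3, so the
 * effective values become sched_latency = 18ms, sched_min_granularity =
 * 2.25ms and sched_wakeup_granularity = 3ms (num_online_cpus() is capped
 * at 8 for this computation).
 */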
187
188 void sched_init_granularity(void)
189 {
190 update_sysctl();
191 }
192 #if defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK) || defined (CONFIG_HMP_PACK_SMALL_TASK)
193 /*
194 * Save the id of the optimal CPU that should be used to pack small tasks
195 * The value -1 is used when no buddy has been found
196 */
197 DEFINE_PER_CPU(int, sd_pack_buddy) = {-1};
198
199 #ifdef CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK
200 struct cpumask buddy_cpu_map = {{0}};
201 #endif
202
203 /* Look for the best buddy CPU that can be used to pack small tasks
204 * We make the assumption that it is not worth packing on CPUs that share the
205 * same powerline. We look for the first sched_domain without the
206 * SD_SHARE_POWERLINE flag. Then we look for the sched_group with the lowest
207 * power per core, based on the assumption that its power efficiency is
208 * better. */
209 void update_packing_domain(int cpu)
210 {
211 struct sched_domain *sd;
212 int id = -1;
213
214 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
215 pr_info("[PACK] update_packing_domain() CPU%d\n", cpu);
216 #endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */
217 mt_sched_printf("[PACK] update_packing_domain() CPU%d", cpu);
218
219 sd = highest_flag_domain(cpu, SD_SHARE_POWERLINE);
220 if (!sd)
221 {
222 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
223 }
224 else
225 if (cpumask_first(sched_domain_span(sd)) == cpu || !sd->parent)
226 sd = sd->parent;
227
228 while (sd) {
229 struct sched_group *sg = sd->groups;
230 struct sched_group *pack = sg;
231 struct sched_group *tmp = sg->next;
232
233 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
234 pr_info("[PACK] sd = 0x%08x, flags = %d\n", (unsigned int)sd, sd->flags);
235 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
236
237 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
238 pr_info("[PACK] sg = 0x%08x\n", (unsigned int)sg);
239 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
240
241 /* 1st CPU of the sched domain is a good candidate */
242 if (id == -1)
243 id = cpumask_first(sched_domain_span(sd));
244
245 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
246 pr_info("[PACK] First cpu in this sd id = %d\n", id);
247 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
248
249 /* Find sched group of candidate */
250 tmp = sd->groups;
251 do {
252 if (cpumask_test_cpu(id, sched_group_cpus(tmp))) {
253 sg = tmp;
254 break;
255 }
256 } while (tmp = tmp->next, tmp != sd->groups);
257
258 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
259 pr_info("[PACK] pack = 0x%08x\n", (unsigned int)sg);
260 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
261
262 pack = sg;
263 tmp = sg->next;
264
265 /* loop the sched groups to find the best one */
266 // Stop looking for the best group within the same load-balance domain
267 //while (tmp != sg) {
268 while (tmp != sg && !(sd->flags & SD_LOAD_BALANCE)) {
269 if (tmp->sgp->power * sg->group_weight <
270 sg->sgp->power * tmp->group_weight) {
271
272 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
273 pr_info("[PACK] Now sg power = %u, weight = %u, mask = %lu\n", sg->sgp->power, sg->group_weight, sg->cpumask[0]);
274 pr_info("[PACK] Better sg power = %u, weight = %u, mask = %lu\n", tmp->sgp->power, tmp->group_weight, tmp->cpumask[0]);
275 #endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */
276
277 pack = tmp;
278 }
279 tmp = tmp->next;
280 }
281
282 /* we have found a better group */
283 if (pack != sg) {
284 id = cpumask_first(sched_group_cpus(pack));
285
286 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
287 pr_info("[PACK] Better sg, first cpu id = %d\n", id);
288 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
289
290 }
291
292 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
293 if(sd->parent) {
294 pr_info("[PACK] cpu = %d, id = %d, sd->parent = 0x%08x, flags = %d, SD_LOAD_BALANCE = %d\n", cpu, id, (unsigned int)sd->parent, sd->parent->flags, SD_LOAD_BALANCE);
295 pr_info("[PACK] %d\n", (id != cpu));
296 pr_info("[PACK] 0x%08x\n", (unsigned int)(sd->parent));
297 pr_info("[PACK] %d\n", (sd->parent->flags & SD_LOAD_BALANCE));
298 }
299 else {
300 pr_info("[PACK] cpu = %d, id = %d, sd->parent = 0x%08x\n", cpu, id, (unsigned int)sd->parent);
301 }
302 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
303
304
305 /* Look for another CPU than itself */
306 if ((id != cpu) ||
307 ((sd->parent) && (sd->parent->flags & SD_LOAD_BALANCE))) {
308
309 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
310 pr_info("[PACK] Break\n");
311 #endif /*CONFIG_HMP_PACK_BUDDY_INFO */
312
313 break;
314 }
315 sd = sd->parent;
316 }
317
318 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
319 pr_info("[PACK] CPU%d packing on CPU%d\n", cpu, id);
320 #endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */
321 mt_sched_printf("[PACK] CPU%d packing on CPU%d", cpu, id);
322
323 #ifdef CONFIG_HMP_PACK_SMALL_TASK
324 per_cpu(sd_pack_buddy, cpu) = id;
325 #else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */
326 if(per_cpu(sd_pack_buddy, cpu) != -1)
327 cpu_clear(per_cpu(sd_pack_buddy, cpu), buddy_cpu_map);
328 per_cpu(sd_pack_buddy, cpu) = id;
329 if(id != -1)
330 cpumask_set_cpu(id, &buddy_cpu_map);
331 #endif
332 }
333
334 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
335 DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_USAGE);
336 DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_PERIOD);
337 DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_NR);
338 DEFINE_PER_CPU(u32, TASK_USGAE);
339 DEFINE_PER_CPU(u32, TASK_PERIOD);
340 u32 PACK_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
341 u32 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
342 u32 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
343 u32 TASK_PACK_CPU_COUNT[4][NR_CPUS] = {{0}};
344 u32 PA_ENABLE = 1;
345 u32 PA_MON_ENABLE = 0;
346 char PA_MON[4][TASK_COMM_LEN]={{0}};
347 #endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER */
348
349 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
350 DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_USAGE);
351 DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_PERIOD);
352 DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_NR);
353 DEFINE_PER_CPU(u32, TASK_USGAE);
354 DEFINE_PER_CPU(u32, TASK_PERIOD);
355 u32 PACK_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
356 u32 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
357 u32 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
358 u32 HMP_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
359 u32 PA_ENABLE = 1;
360 u32 LB_ENABLE = 1;
361 u32 PA_MON_ENABLE = 0;
362 char PA_MON[TASK_COMM_LEN];
363
364 #ifdef CONFIG_HMP_TRACER
365 #define POWER_AWARE_ACTIVE_MODULE_PACK_FORM_CPUX_TO_CPUY (0)
366 #define POWER_AWARE_ACTIVE_MODULE_AVOID_WAKE_UP_FORM_CPUX_TO_CPUY (1)
367 #define POWER_AWARE_ACTIVE_MODULE_AVOID_BALANCE_FORM_CPUX_TO_CPUY (2)
368 #define POWER_AWARE_ACTIVE_MODULE_AVOID_FORCE_UP_FORM_CPUX_TO_CPUY (3)
369 #endif /* CONFIG_HMP_TRACER */
370
371 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
372
373
374 static inline bool is_buddy_busy(int cpu)
375 {
376 #ifdef CONFIG_HMP_PACK_SMALL_TASK
377 struct rq *rq;
378
379 if (cpu < 0)
380 return 0;
381
382 rq = cpu_rq(cpu);
383 #else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */
384 struct rq *rq = cpu_rq(cpu);
385 #endif
386 /*
387 * A busy buddy is a CPU with a high load or a small load with a lot of
388 * running tasks.
389 */
390
391 #if defined (CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER)
392 per_cpu(BUDDY_CPU_RQ_USAGE, cpu) = rq->avg.usage_avg_sum;
393 per_cpu(BUDDY_CPU_RQ_PERIOD, cpu) = rq->avg.runnable_avg_period;
394 per_cpu(BUDDY_CPU_RQ_NR, cpu) = rq->nr_running;
395 #endif /*(CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER) */
396
397 return ((rq->avg.usage_avg_sum << rq->nr_running) >
398 rq->avg.runnable_avg_period);
399
400 }
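/*
 * Example of the test above: with one running task the buddy is reported
 * busy once usage_avg_sum exceeds half of runnable_avg_period (~50%
 * utilization); with two running tasks the threshold drops to ~25%, and
 * so on, halving for each additional running task.
 */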
401
402 static inline bool is_light_task(struct task_struct *p)
403 {
404 #if defined (CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER)
405 per_cpu(TASK_USGAE, task_cpu(p)) = p->se.avg.usage_avg_sum;
406 per_cpu(TASK_PERIOD, task_cpu(p)) = p->se.avg.runnable_avg_period;
407 #endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER || CONFIG_HMP_POWER_AWARE_CONTROLLER*/
408
409 /* A light task runs less than 25% of the time on average */
410 return ((p->se.avg.usage_avg_sum << 2) < p->se.avg.runnable_avg_period);
411 }
412
413
414 static int check_pack_buddy(int cpu, struct task_struct *p)
415 {
416 #ifdef CONFIG_HMP_PACK_SMALL_TASK
417 int buddy;
418
419 if(cpu >= NR_CPUS || cpu < 0)
420 return false;
421 buddy = per_cpu(sd_pack_buddy, cpu);
422 #else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */
423 int buddy = cpu;
424 #endif
425
426 /* No pack buddy for this CPU */
427 if (buddy == -1)
428 return false;
429
430 /*
431 * If a task is waiting to run on the CPU which is its own buddy, fall
432 * back to the default behavior and look for a better CPU if one is
433 * available. The threshold has been set to 37.5%.
434 */
435 #ifdef CONFIG_HMP_PACK_SMALL_TASK
436 if ((buddy == cpu)
437 && ((p->se.avg.usage_avg_sum << 3) < (p->se.avg.runnable_avg_sum * 5)))
438 return false;
439 #endif
440
441 /* buddy is not an allowed CPU */
442 if (!cpumask_test_cpu(buddy, tsk_cpus_allowed(p)))
443 return false;
444
445 /*
446 * If the task is a small one and the buddy is not overloaded,
447 * we use the buddy CPU.
448 */
449 if (!is_light_task(p) || is_buddy_busy(buddy))
450 return false;
451
452 return true;
453 }
454 #endif /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK || CONFIG_HMP_PACK_SMALL_TASK*/
455
456 #if BITS_PER_LONG == 32
457 # define WMULT_CONST (~0UL)
458 #else
459 # define WMULT_CONST (1UL << 32)
460 #endif
461
462 #define WMULT_SHIFT 32
463
464 /*
465 * Shift right and round:
466 */
467 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
468
469 /*
470 * delta *= weight / lw
471 */
472 static unsigned long
473 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
474 struct load_weight *lw)
475 {
476 u64 tmp;
477
478 /*
479 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
480 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
481 * 2^SCHED_LOAD_RESOLUTION.
482 */
483 if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
484 tmp = (u64)delta_exec * scale_load_down(weight);
485 else
486 tmp = (u64)delta_exec;
487
488 if (!lw->inv_weight) {
489 unsigned long w = scale_load_down(lw->weight);
490
491 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
492 lw->inv_weight = 1;
493 else if (unlikely(!w))
494 lw->inv_weight = WMULT_CONST;
495 else
496 lw->inv_weight = WMULT_CONST / w;
497 }
498
499 /*
500 * Check whether we'd overflow the 64-bit multiplication:
501 */
502 if (unlikely(tmp > WMULT_CONST))
503 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
504 WMULT_SHIFT/2);
505 else
506 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
507
508 return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
509 }
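/*
 * The reciprocal lw->inv_weight = WMULT_CONST / lw->weight lets the scaling
 * be done with a multiply and shifts instead of a per-call 64-bit division.
 * Numerically the result is ~delta_exec * weight / lw->weight; for example
 * delta_exec = 1000000 ns with weight == NICE_0_LOAD and lw->weight ==
 * 2 * NICE_0_LOAD yields ~500000 ns.
 */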
510
511
512 const struct sched_class fair_sched_class;
513
514 /**************************************************************
515 * CFS operations on generic schedulable entities:
516 */
517
518 #ifdef CONFIG_FAIR_GROUP_SCHED
519
520 /* cpu runqueue to which this cfs_rq is attached */
521 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
522 {
523 return cfs_rq->rq;
524 }
525
526 /* An entity is a task if it doesn't "own" a runqueue */
527 #define entity_is_task(se) (!se->my_q)
528
529 static inline struct task_struct *task_of(struct sched_entity *se)
530 {
531 #ifdef CONFIG_SCHED_DEBUG
532 WARN_ON_ONCE(!entity_is_task(se));
533 #endif
534 return container_of(se, struct task_struct, se);
535 }
536
537 /* Walk up scheduling entities hierarchy */
538 #define for_each_sched_entity(se) \
539 for (; se; se = se->parent)
540
541 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
542 {
543 return p->se.cfs_rq;
544 }
545
546 /* runqueue on which this entity is (to be) queued */
547 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
548 {
549 return se->cfs_rq;
550 }
551
552 /* runqueue "owned" by this group */
553 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
554 {
555 return grp->my_q;
556 }
557
558 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
559 int force_update);
560
561 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
562 {
563 if (!cfs_rq->on_list) {
564 /*
565 * Ensure we either appear before our parent (if already
566 * enqueued) or force our parent to appear after us when it is
567 * enqueued. The fact that we always enqueue bottom-up
568 * reduces this to two cases.
569 */
570 if (cfs_rq->tg->parent &&
571 cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
572 list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
573 &rq_of(cfs_rq)->leaf_cfs_rq_list);
574 } else {
575 list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
576 &rq_of(cfs_rq)->leaf_cfs_rq_list);
577 }
578
579 cfs_rq->on_list = 1;
580 /* We should have no load, but we need to update last_decay. */
581 update_cfs_rq_blocked_load(cfs_rq, 0);
582 }
583 }
584
585 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
586 {
587 if (cfs_rq->on_list) {
588 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
589 cfs_rq->on_list = 0;
590 }
591 }
592
593 /* Iterate through all leaf cfs_rq's on a runqueue */
594 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
595 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
596
597 /* Do the two (enqueued) entities belong to the same group ? */
598 static inline int
599 is_same_group(struct sched_entity *se, struct sched_entity *pse)
600 {
601 if (se && pse)
602 {
603 if (se->cfs_rq == pse->cfs_rq)
604 return 1;
605 }
606
607 return 0;
608 }
609
610 static inline struct sched_entity *parent_entity(struct sched_entity *se)
611 {
612 return se->parent;
613 }
614
615 /* return depth at which a sched entity is present in the hierarchy */
616 static inline int depth_se(struct sched_entity *se)
617 {
618 int depth = 0;
619
620 for_each_sched_entity(se)
621 depth++;
622
623 return depth;
624 }
625
626 static void
627 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
628 {
629 int se_depth, pse_depth;
630
631 /*
632 * preemption test can be made between sibling entities who are in the
633 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
634 * both tasks until we find their ancestors who are siblings of common
635 * parent.
636 */
637
638 /* First walk up until both entities are at same depth */
639 se_depth = depth_se(*se);
640 pse_depth = depth_se(*pse);
641
642 while (se_depth > pse_depth) {
643 se_depth--;
644 *se = parent_entity(*se);
645 }
646
647 while (pse_depth > se_depth) {
648 pse_depth--;
649 *pse = parent_entity(*pse);
650 }
651
652 while (!is_same_group(*se, *pse)) {
653 *se = parent_entity(*se);
654 *pse = parent_entity(*pse);
655 }
656 }
657
658 #else /* !CONFIG_FAIR_GROUP_SCHED */
659
660 static inline struct task_struct *task_of(struct sched_entity *se)
661 {
662 return container_of(se, struct task_struct, se);
663 }
664
665 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
666 {
667 return container_of(cfs_rq, struct rq, cfs);
668 }
669
670 #define entity_is_task(se) 1
671
672 #define for_each_sched_entity(se) \
673 for (; se; se = NULL)
674
675 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
676 {
677 return &task_rq(p)->cfs;
678 }
679
680 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
681 {
682 struct task_struct *p = task_of(se);
683 struct rq *rq = task_rq(p);
684
685 return &rq->cfs;
686 }
687
688 /* runqueue "owned" by this group */
689 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
690 {
691 return NULL;
692 }
693
694 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
695 {
696 }
697
698 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
699 {
700 }
701
702 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
703 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
704
705 static inline int
706 is_same_group(struct sched_entity *se, struct sched_entity *pse)
707 {
708 return 1;
709 }
710
711 static inline struct sched_entity *parent_entity(struct sched_entity *se)
712 {
713 return NULL;
714 }
715
716 static inline void
717 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
718 {
719 }
720
721 #endif /* CONFIG_FAIR_GROUP_SCHED */
722
723 static __always_inline
724 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
725
726 /**************************************************************
727 * Scheduling class tree data structure manipulation methods:
728 */
729
730 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
731 {
732 s64 delta = (s64)(vruntime - max_vruntime);
733 if (delta > 0)
734 max_vruntime = vruntime;
735
736 return max_vruntime;
737 }
738
739 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
740 {
741 s64 delta = (s64)(vruntime - min_vruntime);
742 if (delta < 0)
743 min_vruntime = vruntime;
744
745 return min_vruntime;
746 }
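/*
 * Both helpers above compare through a signed delta, so they remain correct
 * even when vruntime wraps around the unsigned 64-bit range.
 */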
747
748 static inline int entity_before(struct sched_entity *a,
749 struct sched_entity *b)
750 {
751 return (s64)(a->vruntime - b->vruntime) < 0;
752 }
753
754 static void update_min_vruntime(struct cfs_rq *cfs_rq)
755 {
756 u64 vruntime = cfs_rq->min_vruntime;
757
758 if (cfs_rq->curr)
759 vruntime = cfs_rq->curr->vruntime;
760
761 if (cfs_rq->rb_leftmost) {
762 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
763 struct sched_entity,
764 run_node);
765
766 if (!cfs_rq->curr)
767 vruntime = se->vruntime;
768 else
769 vruntime = min_vruntime(vruntime, se->vruntime);
770 }
771
772 /* ensure we never gain time by being placed backwards. */
773 cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
774 #ifndef CONFIG_64BIT
775 smp_wmb();
776 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
777 #endif
778 }
779
780 /*
781 * Enqueue an entity into the rb-tree:
782 */
783 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
784 {
785 struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
786 struct rb_node *parent = NULL;
787 struct sched_entity *entry;
788 int leftmost = 1;
789
790 /*
791 * Find the right place in the rbtree:
792 */
793 while (*link) {
794 parent = *link;
795 entry = rb_entry(parent, struct sched_entity, run_node);
796 /*
797 * We don't care about collisions. Nodes with
798 * the same key stay together.
799 */
800 if (entity_before(se, entry)) {
801 link = &parent->rb_left;
802 } else {
803 link = &parent->rb_right;
804 leftmost = 0;
805 }
806 }
807
808 /*
809 * Maintain a cache of leftmost tree entries (it is frequently
810 * used):
811 */
812 if (leftmost)
813 cfs_rq->rb_leftmost = &se->run_node;
814
815 rb_link_node(&se->run_node, parent, link);
816 rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
817 }
818
819 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
820 {
821 if (cfs_rq->rb_leftmost == &se->run_node) {
822 struct rb_node *next_node;
823
824 next_node = rb_next(&se->run_node);
825 cfs_rq->rb_leftmost = next_node;
826 }
827
828 rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
829 }
830
831 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
832 {
833 struct rb_node *left = cfs_rq->rb_leftmost;
834
835 if (!left)
836 return NULL;
837
838 return rb_entry(left, struct sched_entity, run_node);
839 }
840
841 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
842 {
843 struct rb_node *next = rb_next(&se->run_node);
844
845 if (!next)
846 return NULL;
847
848 return rb_entry(next, struct sched_entity, run_node);
849 }
850
851 #ifdef CONFIG_SCHED_DEBUG
852 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
853 {
854 struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
855
856 if (!last)
857 return NULL;
858
859 return rb_entry(last, struct sched_entity, run_node);
860 }
861
862 /**************************************************************
863 * Scheduling class statistics methods:
864 */
865
866 int sched_proc_update_handler(struct ctl_table *table, int write,
867 void __user *buffer, size_t *lenp,
868 loff_t *ppos)
869 {
870 int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
871 int factor = get_update_sysctl_factor();
872
873 if (ret || !write)
874 return ret;
875
876 sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
877 sysctl_sched_min_granularity);
878
879 #define WRT_SYSCTL(name) \
880 (normalized_sysctl_##name = sysctl_##name / (factor))
881 WRT_SYSCTL(sched_min_granularity);
882 WRT_SYSCTL(sched_latency);
883 WRT_SYSCTL(sched_wakeup_granularity);
884 #undef WRT_SYSCTL
885
886 return 0;
887 }
888 #endif
889
890 /*
891 * delta /= w
892 */
893 static inline unsigned long
894 calc_delta_fair(unsigned long delta, struct sched_entity *se)
895 {
896 if (unlikely(se->load.weight != NICE_0_LOAD))
897 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
898
899 return delta;
900 }
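/*
 * Example: a nice-0 task (weight == NICE_0_LOAD) takes the fast path and its
 * vruntime advances at wall-clock rate; a task with twice that weight has its
 * delta halved, so its vruntime advances half as fast.
 */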
901
902 /*
903 * The idea is to set a period in which each task runs once.
904 *
905 * When there are too many tasks (sched_nr_latency) we have to stretch
906 * this period because otherwise the slices get too small.
907 *
908 * p = (nr <= nl) ? l : l*nr/nl
909 */
910 static u64 __sched_period(unsigned long nr_running)
911 {
912 u64 period = sysctl_sched_latency;
913 unsigned long nr_latency = sched_nr_latency;
914
915 if (unlikely(nr_running > nr_latency)) {
916 period = sysctl_sched_min_granularity;
917 period *= nr_running;
918 }
919
920 return period;
921 }
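/*
 * Example with the unscaled defaults above (sched_latency = 6ms,
 * min_granularity = 0.75ms, sched_nr_latency = 8): up to 8 runnable tasks
 * share a 6ms period; with 16 runnable tasks the period stretches to
 * 16 * 0.75ms = 12ms.
 */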
922
923 /*
924 * We calculate the wall-time slice from the period by taking a part
925 * proportional to the weight.
926 *
927 * s = p*P[w/rw]
928 */
929 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
930 {
931 u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
932
933 for_each_sched_entity(se) {
934 struct load_weight *load;
935 struct load_weight lw;
936
937 cfs_rq = cfs_rq_of(se);
938 load = &cfs_rq->load;
939
940 if (unlikely(!se->on_rq)) {
941 lw = cfs_rq->load;
942
943 update_load_add(&lw, se->load.weight);
944 load = &lw;
945 }
946 slice = calc_delta_mine(slice, se->load.weight, load);
947 }
948 return slice;
949 }
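/*
 * Example (plain tasks, no group hierarchy): two nice-0 tasks on a 6ms
 * period each receive a 3ms slice; if one of them instead has twice the
 * weight it gets 4ms and the other 2ms, i.e. s = p * w / rw as described
 * above.
 */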
950
951 /*
952 * We calculate the vruntime slice of a to-be-inserted task.
953 *
954 * vs = s/w
955 */
956 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
957 {
958 return calc_delta_fair(sched_slice(cfs_rq, se), se);
959 }
960
961
962 #ifdef CONFIG_SMP
963 static inline void __update_task_entity_contrib(struct sched_entity *se);
964
965 static long __update_task_entity_ratio(struct sched_entity *se);
966
967 #define LOAD_AVG_PERIOD 32
968 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
969 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_AVG_MAX */
970 #define LOAD_AVG_VARIABLE_PERIOD 512
971 static unsigned int init_task_load_period = 4000;
972
973 /* Give a new task initial runnable values so it appears heavily loaded during its infancy */
974 void init_task_runnable_average(struct task_struct *p)
975 {
976 u32 slice;
977
978 p->se.avg.decay_count = 0;
979 slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
980 p->se.avg.runnable_avg_sum = (init_task_load_period) ? 0 : slice;
981 p->se.avg.runnable_avg_period = (init_task_load_period)?(init_task_load_period):slice;
982 __update_task_entity_contrib(&p->se);
983
984 #ifdef CONFIG_MTK_SCHED_CMP
985 /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */
986 p->se.avg.usage_avg_sum = (init_task_load_period) ? 0 : slice;
987 #endif
988 __update_task_entity_ratio(&p->se);
989 trace_sched_task_entity_avg(0, p, &p->se.avg);
990 }
991 #else
992 void init_task_runnable_average(struct task_struct *p)
993 {
994 }
995 #endif
996
997 /*
998 * Update the current task's runtime statistics. Skip current tasks that
999 * are not in our scheduling class.
1000 */
1001 static inline void
1002 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
1003 unsigned long delta_exec)
1004 {
1005 unsigned long delta_exec_weighted;
1006
1007 schedstat_set(curr->statistics.exec_max,
1008 max((u64)delta_exec, curr->statistics.exec_max));
1009
1010 curr->sum_exec_runtime += delta_exec;
1011 schedstat_add(cfs_rq, exec_clock, delta_exec);
1012 delta_exec_weighted = calc_delta_fair(delta_exec, curr);
1013
1014 curr->vruntime += delta_exec_weighted;
1015 update_min_vruntime(cfs_rq);
1016 }
1017
1018 static void update_curr(struct cfs_rq *cfs_rq)
1019 {
1020 struct sched_entity *curr = cfs_rq->curr;
1021 u64 now = rq_of(cfs_rq)->clock_task;
1022 unsigned long delta_exec;
1023
1024 if (unlikely(!curr))
1025 return;
1026
1027 /*
1028 * Get the amount of time the current task was running
1029 * since the last time we changed load (this cannot
1030 * overflow on 32 bits):
1031 */
1032 delta_exec = (unsigned long)(now - curr->exec_start);
1033 if (!delta_exec)
1034 return;
1035
1036 __update_curr(cfs_rq, curr, delta_exec);
1037 curr->exec_start = now;
1038
1039 if (entity_is_task(curr)) {
1040 struct task_struct *curtask = task_of(curr);
1041
1042 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
1043 cpuacct_charge(curtask, delta_exec);
1044 account_group_exec_runtime(curtask, delta_exec);
1045 }
1046
1047 account_cfs_rq_runtime(cfs_rq, delta_exec);
1048 }
1049
1050 static inline void
1051 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1052 {
1053 schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
1054 }
1055
1056 /*
1057 * Task is being enqueued - update stats:
1058 */
1059 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1060 {
1061 /*
1062 * Are we enqueueing a waiting task? (for current tasks
1063 * a dequeue/enqueue event is a NOP)
1064 */
1065 if (se != cfs_rq->curr)
1066 update_stats_wait_start(cfs_rq, se);
1067 }
1068
1069 static void
1070 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
1071 {
1072 schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
1073 rq_of(cfs_rq)->clock - se->statistics.wait_start));
1074 schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
1075 schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
1076 rq_of(cfs_rq)->clock - se->statistics.wait_start);
1077 #ifdef CONFIG_SCHEDSTATS
1078 if (entity_is_task(se)) {
1079 trace_sched_stat_wait(task_of(se),
1080 rq_of(cfs_rq)->clock - se->statistics.wait_start);
1081 }
1082 #endif
1083 schedstat_set(se->statistics.wait_start, 0);
1084 }
1085
1086 static inline void
1087 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1088 {
1089 /*
1090 * Mark the end of the wait period if dequeueing a
1091 * waiting task:
1092 */
1093 if (se != cfs_rq->curr)
1094 update_stats_wait_end(cfs_rq, se);
1095 }
1096
1097 /*
1098 * We are picking a new current task - update its stats:
1099 */
1100 static inline void
1101 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1102 {
1103 /*
1104 * We are starting a new run period:
1105 */
1106 se->exec_start = rq_of(cfs_rq)->clock_task;
1107 }
1108
1109 /**************************************************
1110 * Scheduling class queueing methods:
1111 */
1112
1113 #ifdef CONFIG_NUMA_BALANCING
1114 /*
1115 * numa task sample period in ms
1116 */
1117 unsigned int sysctl_numa_balancing_scan_period_min = 100;
1118 unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
1119 unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
1120
1121 /* Portion of address space to scan in MB */
1122 unsigned int sysctl_numa_balancing_scan_size = 256;
1123
1124 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1125 unsigned int sysctl_numa_balancing_scan_delay = 1000;
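/*
 * With the defaults above: an initial scan delay of 1s, 256MB of address
 * space scanned per pass, a scan period that adapts between 100ms and 5s,
 * and a periodic reset of the scan period every 60s.
 */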
1126
1127 static void task_numa_placement(struct task_struct *p)
1128 {
1129 int seq;
1130
1131 if (!p->mm) /* for example, ksmd faulting in a user's mm */
1132 return;
1133 seq = ACCESS_ONCE(p->mm->numa_scan_seq);
1134 if (p->numa_scan_seq == seq)
1135 return;
1136 p->numa_scan_seq = seq;
1137
1138 /* FIXME: Scheduling placement policy hints go here */
1139 }
1140
1141 /*
1142 * Got a PROT_NONE fault for a page on @node.
1143 */
1144 void task_numa_fault(int node, int pages, bool migrated)
1145 {
1146 struct task_struct *p = current;
1147
1148 if (!sched_feat_numa(NUMA))
1149 return;
1150
1151 /* FIXME: Allocate task-specific structure for placement policy here */
1152
1153 /*
1154 * If pages are properly placed (did not migrate) then scan slower.
1155 * This is reset periodically in case of phase changes
1156 */
1157 if (!migrated)
1158 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
1159 p->numa_scan_period + jiffies_to_msecs(10));
1160
1161 task_numa_placement(p);
1162 }
1163
1164 static void reset_ptenuma_scan(struct task_struct *p)
1165 {
1166 ACCESS_ONCE(p->mm->numa_scan_seq)++;
1167 p->mm->numa_scan_offset = 0;
1168 }
1169
1170 /*
1171 * The expensive part of numa migration is done from task_work context.
1172 * Triggered from task_tick_numa().
1173 */
1174 void task_numa_work(struct callback_head *work)
1175 {
1176 unsigned long migrate, next_scan, now = jiffies;
1177 struct task_struct *p = current;
1178 struct mm_struct *mm = p->mm;
1179 struct vm_area_struct *vma;
1180 unsigned long start, end;
1181 long pages;
1182
1183 WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
1184
1185 work->next = work; /* protect against double add */
1186 /*
1187 * Who cares about NUMA placement when they're dying.
1188 *
1189 * NOTE: make sure not to dereference p->mm before this check,
1190 * exit_task_work() happens _after_ exit_mm() so we could be called
1191 * without p->mm even though we still had it when we enqueued this
1192 * work.
1193 */
1194 if (p->flags & PF_EXITING)
1195 return;
1196
1197 /*
1198 * We do not care about task placement until a task runs on a node
1199 * other than the first one used by the address space. This is
1200 * largely because migrations are driven by what CPU the task
1201 * is running on. If it's never scheduled on another node, it'll
1202 * not migrate so why bother trapping the fault.
1203 */
1204 if (mm->first_nid == NUMA_PTE_SCAN_INIT)
1205 mm->first_nid = numa_node_id();
1206 if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
1207 /* Are we running on a new node yet? */
1208 if (numa_node_id() == mm->first_nid &&
1209 !sched_feat_numa(NUMA_FORCE))
1210 return;
1211
1212 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
1213 }
1214
1215 /*
1216 * Reset the scan period if enough time has gone by. Objective is that
1217 * scanning will be reduced if pages are properly placed. As tasks
1218 * can enter different phases this needs to be re-examined. Lacking
1219 * proper tracking of reference behaviour, this blunt hammer is used.
1220 */
1221 migrate = mm->numa_next_reset;
1222 if (time_after(now, migrate)) {
1223 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
1224 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
1225 xchg(&mm->numa_next_reset, next_scan);
1226 }
1227
1228 /*
1229 * Enforce maximal scan/migration frequency..
1230 */
1231 migrate = mm->numa_next_scan;
1232 if (time_before(now, migrate))
1233 return;
1234
1235 if (p->numa_scan_period == 0)
1236 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
1237
1238 next_scan = now + msecs_to_jiffies(p->numa_scan_period);
1239 if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
1240 return;
1241
1242 /*
1243 * Do not set pte_numa if the current running node is rate-limited.
1244 * This loses statistics on the fault but if we are unwilling to
1245 * migrate to this node, it is less likely we can do useful work
1246 */
1247 if (migrate_ratelimited(numa_node_id()))
1248 return;
1249
1250 start = mm->numa_scan_offset;
1251 pages = sysctl_numa_balancing_scan_size;
1252 pages <<= 20 - PAGE_SHIFT; /* MB in pages */
1253 if (!pages)
1254 return;
1255
1256 down_read(&mm->mmap_sem);
1257 vma = find_vma(mm, start);
1258 if (!vma) {
1259 reset_ptenuma_scan(p);
1260 start = 0;
1261 vma = mm->mmap;
1262 }
1263 for (; vma; vma = vma->vm_next) {
1264 if (!vma_migratable(vma))
1265 continue;
1266
1267 /* Skip small VMAs. They are not likely to be of relevance */
1268 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
1269 continue;
1270
1271 /*
1272 * Skip inaccessible VMAs to avoid any confusion between
1273 * PROT_NONE and NUMA hinting ptes
1274 */
1275 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
1276 continue;
1277
1278 do {
1279 start = max(start, vma->vm_start);
1280 end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
1281 end = min(end, vma->vm_end);
1282 pages -= change_prot_numa(vma, start, end);
1283
1284 start = end;
1285 if (pages <= 0)
1286 goto out;
1287 } while (end != vma->vm_end);
1288 }
1289
1290 out:
1291 /*
1292 * It is possible to reach the end of the VMA list but the last few VMAs are
1293 * not guaranteed to be vma_migratable. If they are not, we would find the
1294 * !migratable VMA on the next scan but not reset the scanner to the start
1295 * so check it now.
1296 */
1297 if (vma)
1298 mm->numa_scan_offset = start;
1299 else
1300 reset_ptenuma_scan(p);
1301 up_read(&mm->mmap_sem);
1302 }
1303
1304 /*
1305 * Drive the periodic memory faults..
1306 */
1307 void task_tick_numa(struct rq *rq, struct task_struct *curr)
1308 {
1309 struct callback_head *work = &curr->numa_work;
1310 u64 period, now;
1311
1312 /*
1313 * We don't care about NUMA placement if we don't have memory.
1314 */
1315 if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
1316 return;
1317
1318 /*
1319 * Using runtime rather than walltime has the dual advantage that
1320 * we (mostly) drive the selection from busy threads and that the
1321 * task needs to have done some actual work before we bother with
1322 * NUMA placement.
1323 */
1324 now = curr->se.sum_exec_runtime;
1325 period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
1326
1327 if (now - curr->node_stamp > period) {
1328 if (!curr->node_stamp)
1329 curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
1330 curr->node_stamp = now;
1331
1332 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1333 init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
1334 task_work_add(curr, work, true);
1335 }
1336 }
1337 }
1338 #else
1339 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1340 {
1341 }
1342 #endif /* CONFIG_NUMA_BALANCING */
1343
1344 static void
1345 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1346 {
1347 update_load_add(&cfs_rq->load, se->load.weight);
1348 if (!parent_entity(se))
1349 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1350 #ifdef CONFIG_SMP
1351 if (entity_is_task(se))
1352 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
1353 #endif
1354 cfs_rq->nr_running++;
1355 }
1356
1357 static void
1358 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1359 {
1360 update_load_sub(&cfs_rq->load, se->load.weight);
1361 if (!parent_entity(se))
1362 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1363 if (entity_is_task(se))
1364 list_del_init(&se->group_node);
1365 cfs_rq->nr_running--;
1366 }
1367
1368 #ifdef CONFIG_FAIR_GROUP_SCHED
1369 # ifdef CONFIG_SMP
1370 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1371 {
1372 long tg_weight;
1373
1374 /*
1375 * Use this CPU's actual weight instead of the last load_contribution
1376 * to gain a more accurate current total weight. See
1377 * update_cfs_rq_load_contribution().
1378 */
1379 tg_weight = atomic_long_read(&tg->load_avg);
1380 tg_weight -= cfs_rq->tg_load_contrib;
1381 tg_weight += cfs_rq->load.weight;
1382
1383 return tg_weight;
1384 }
1385
1386 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1387 {
1388 long tg_weight, load, shares;
1389
1390 tg_weight = calc_tg_weight(tg, cfs_rq);
1391 load = cfs_rq->load.weight;
1392
1393 shares = (tg->shares * load);
1394 if (tg_weight)
1395 shares /= tg_weight;
1396
1397 if (shares < MIN_SHARES)
1398 shares = MIN_SHARES;
1399 if (shares > tg->shares)
1400 shares = tg->shares;
1401
1402 return shares;
1403 }
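/*
 * Example: with tg->shares = 1024 and this cfs_rq contributing roughly a
 * quarter of the group's total weight, the group entity is given about
 * 1024 * 1/4 = 256 shares, clamped to the [MIN_SHARES, tg->shares] range.
 */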
1404 # else /* CONFIG_SMP */
1405 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1406 {
1407 return tg->shares;
1408 }
1409 # endif /* CONFIG_SMP */
1410 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1411 unsigned long weight)
1412 {
1413 if (se->on_rq) {
1414 /* commit outstanding execution time */
1415 if (cfs_rq->curr == se)
1416 update_curr(cfs_rq);
1417 account_entity_dequeue(cfs_rq, se);
1418 }
1419
1420 update_load_set(&se->load, weight);
1421
1422 if (se->on_rq)
1423 account_entity_enqueue(cfs_rq, se);
1424 }
1425
1426 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1427
1428 static void update_cfs_shares(struct cfs_rq *cfs_rq)
1429 {
1430 struct task_group *tg;
1431 struct sched_entity *se;
1432 long shares;
1433
1434 tg = cfs_rq->tg;
1435 se = tg->se[cpu_of(rq_of(cfs_rq))];
1436 if (!se || throttled_hierarchy(cfs_rq))
1437 return;
1438 #ifndef CONFIG_SMP
1439 if (likely(se->load.weight == tg->shares))
1440 return;
1441 #endif
1442 shares = calc_cfs_shares(cfs_rq, tg);
1443
1444 reweight_entity(cfs_rq_of(se), se, shares);
1445 }
1446 #else /* CONFIG_FAIR_GROUP_SCHED */
1447 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1448 {
1449 }
1450 #endif /* CONFIG_FAIR_GROUP_SCHED */
1451
1452 #ifdef CONFIG_SMP
1453 /*
1454 * We choose a half-life close to 1 scheduling period.
1455 * Note: The tables below are dependent on this value.
1456 */
1457 //#define LOAD_AVG_PERIOD 32
1458 //#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1459 //#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1460
1461 /* Precomputed fixed inverse multiplies for multiplication by y^n */
1462 static const u32 runnable_avg_yN_inv[] = {
1463 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1464 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1465 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1466 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1467 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1468 0x85aac367, 0x82cd8698,
1469 };
1470
1471 /*
1472 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1473 * over-estimates when re-combining.
1474 */
1475 static const u32 runnable_avg_yN_sum[] = {
1476 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1477 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1478 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1479 };
1480
1481 /*
1482 * Approximate:
1483 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1484 */
1485 static __always_inline u64 decay_load(u64 val, u64 n)
1486 {
1487 unsigned int local_n;
1488
1489 if (!n)
1490 return val;
1491 else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1492 return 0;
1493
1494 /* after bounds checking we can collapse to 32-bit */
1495 local_n = n;
1496
1497 /*
1498 * As y^PERIOD = 1/2, we can combine
1499 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1500 * With a look-up table which covers k^n (n<PERIOD)
1501 *
1502 * To achieve constant time decay_load.
1503 */
1504 if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1505 val >>= local_n / LOAD_AVG_PERIOD;
1506 local_n %= LOAD_AVG_PERIOD;
1507 }
1508
1509 val *= runnable_avg_yN_inv[local_n];
1510 /* We don't use SRR here since we always want to round down. */
1511 return val >> 32;
1512 }
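/*
 * Example: runnable_avg_yN_inv[n] ~= 2^32 * y^n, so decay_load(1024, 16)
 * returns (1024 * 0xb504f333) >> 32 ~= 724, i.e. 1024 / sqrt(2), and
 * decay_load(1024, 32) ~= 512 as expected from y^32 = 1/2.
 */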
1513
1514 /*
1515 * For updates fully spanning n periods, the contribution to runnable
1516 * average will be: \Sum 1024*y^n
1517 *
1518 * We can compute this reasonably efficiently by combining:
1519 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1520 */
1521 static u32 __compute_runnable_contrib(u64 n)
1522 {
1523 u32 contrib = 0;
1524
1525 if (likely(n <= LOAD_AVG_PERIOD))
1526 return runnable_avg_yN_sum[n];
1527 else if (unlikely(n >= LOAD_AVG_MAX_N))
1528 return LOAD_AVG_MAX;
1529
1530 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1531 do {
1532 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1533 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1534
1535 n -= LOAD_AVG_PERIOD;
1536 } while (n > LOAD_AVG_PERIOD);
1537
1538 contrib = decay_load(contrib, n);
1539 return contrib + runnable_avg_yN_sum[n];
1540 }
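/*
 * Example: __compute_runnable_contrib(32) returns runnable_avg_yN_sum[32]
 * = 23371, roughly half of LOAD_AVG_MAX (47742), which is consistent with
 * y^32 = 1/2.
 */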
1541
1542 #ifdef CONFIG_HMP_VARIABLE_SCALE
1543
1544 #define HMP_VARIABLE_SCALE_SHIFT 16ULL
1545 struct hmp_global_attr {
1546 struct attribute attr;
1547 ssize_t (*show)(struct kobject *kobj,
1548 struct attribute *attr, char *buf);
1549 ssize_t (*store)(struct kobject *a, struct attribute *b,
1550 const char *c, size_t count);
1551 int *value;
1552 int (*to_sysfs)(int);
1553 int (*from_sysfs)(int);
1554 };
1555
1556 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1557 #define HMP_DATA_SYSFS_MAX 5
1558 #else
1559 #define HMP_DATA_SYSFS_MAX 4
1560 #endif
1561
1562 struct hmp_data_struct {
1563 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1564 int freqinvar_load_scale_enabled;
1565 #endif
1566 int multiplier; /* used to scale the time delta */
1567 struct attribute_group attr_group;
1568 struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
1569 struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
1570 } hmp_data;
1571
1572 static u64 hmp_variable_scale_convert(u64 delta);
1573 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1574 /* Frequency-Invariant Load Modification:
1575 * Loads are calculated as in PJT's patch however we also scale the current
1576 * contribution in line with the frequency of the CPU that the task was
1577 * executed on.
1578 * In this version, we use a simple linear scale derived from the maximum
1579 * frequency reported by CPUFreq. As an example:
1580 *
1581 * Consider that we ran a task for 100% of the previous interval.
1582 *
1583 * Our CPU was under asynchronous frequency control through one of the
1584 * CPUFreq governors.
1585 *
1586 * The CPUFreq governor reports that it is able to scale the CPU between
1587 * 500MHz and 1GHz.
1588 *
1589 * During the period, the CPU was running at 1GHz.
1590 *
1591 * In this case, our load contribution for that period is calculated as
1592 * 1 * (number_of_active_microseconds)
1593 *
1594 * This results in our task being able to accumulate maximum load as normal.
1595 *
1596 *
1597 * Consider now that our CPU was executing at 500MHz.
1598 *
1599 * We now scale the load contribution such that it is calculated as
1600 * 0.5 * (number_of_active_microseconds)
1601 *
1602 * Our task can only record 50% maximum load during this period.
1603 *
1604 * This represents the task consuming 50% of the CPU's *possible* compute
1605 * capacity. However the task did consume 100% of the CPU's *available*
1606 * compute capacity which is the value seen by the CPUFreq governor and
1607 * user-side CPU Utilization tools.
1608 *
1609 * Restricting tracked load to be scaled by the CPU's frequency accurately
1610 * represents the consumption of possible compute capacity and allows the
1611 * HMP migration's simple threshold migration strategy to interact more
1612 * predictably with CPUFreq's asynchronous compute capacity changes.
1613 */
1614 #define SCHED_FREQSCALE_SHIFT 10
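/*
 * With SCHED_FREQSCALE_SHIFT = 10 the scale factor is expressed in 1/1024
 * units: curr_scale == 1024 is presumed to mean the CPU is running at its
 * maximum frequency, while e.g. curr_scale == 512 (half the maximum) halves
 * every load contribution accumulated below, matching the 500MHz/1GHz
 * example in the comment above.
 */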
1615 struct cpufreq_extents {
1616 u32 curr_scale;
1617 u32 min;
1618 u32 max;
1619 u32 flags;
1620 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
1621 u32 const_max;
1622 u32 throttling;
1623 #endif
1624 };
1625 /* Flag set when the governor in use only allows one frequency.
1626 * Disables scaling.
1627 */
1628 #define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
1629
1630 static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
1631 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1632 #endif /* CONFIG_HMP_VARIABLE_SCALE */
1633
1634 #ifdef CONFIG_MTK_SCHED_CMP
1635 int get_cluster_id(unsigned int cpu)
1636 {
1637 return arch_get_cluster_id(cpu);
1638 }
1639
1640 void get_cluster_cpus(struct cpumask *cpus, int cluster_id,
1641 bool exclusive_offline)
1642 {
1643 struct cpumask cls_cpus;
1644
1645 arch_get_cluster_cpus(&cls_cpus, cluster_id);
1646 if (exclusive_offline) {
1647 cpumask_and(cpus, cpu_online_mask, &cls_cpus);
1648 } else
1649 cpumask_copy(cpus, &cls_cpus);
1650 }
1651
1652 static int nr_cpus_in_cluster(int cluster_id, bool exclusive_offline)
1653 {
1654 struct cpumask cls_cpus;
1655 int nr_cpus;
1656
1657 arch_get_cluster_cpus(&cls_cpus, cluster_id);
1658 if (exclusive_offline) {
1659 struct cpumask online_cpus;
1660 cpumask_and(&online_cpus, cpu_online_mask, &cls_cpus);
1661 nr_cpus = cpumask_weight(&online_cpus);
1662 } else
1663 nr_cpus = cpumask_weight(&cls_cpus);
1664
1665 return nr_cpus;
1666 }
1667 #endif /* CONFIG_MTK_SCHED_CMP */
1668
1669 void sched_get_big_little_cpus(struct cpumask *big, struct cpumask *little)
1670 {
1671 arch_get_big_little_cpus(big, little);
1672 }
1673 EXPORT_SYMBOL(sched_get_big_little_cpus);
1674
1675 /*
1676 * generic entry point for cpu mask construction, dedicated for
1677 * mediatek scheduler.
1678 */
1679 static __init __inline void cmp_cputopo_domain_setup(void)
1680 {
1681 WARN(smp_processor_id() != 0, "%s is supposed to run on CPU0 "
1682 "during kernel init", __func__);
1683 #ifdef CONFIG_MTK_CPU_TOPOLOGY
1684 /*
1685 * sched_init
1686 * |-> cmp_cputopo_domain_setup()
1687 * ...
1688 * rest_init
1689 * ^ fork kernel_init
1690 * |-> kernel_init_freeable
1691 * ...
1692 * |-> arch_build_cpu_topology_domain
1693 *
1694 * Here we build up the CPU topology and domains before the scheduler runs.
1695 */
1696 pr_debug("[CPUTOPO][%s] build CPU topology and cluster.\n", __func__);
1697 arch_build_cpu_topology_domain();
1698 #endif
1699 }
1700
1701 #ifdef CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY
1702 static u64 __inline variable_scale_convert(u64 delta)
1703 {
1704 u64 high = delta >> 32ULL;
1705 u64 low = delta & 0xffffffffULL;
1706 low *= LOAD_AVG_VARIABLE_PERIOD;
1707 high *= LOAD_AVG_VARIABLE_PERIOD;
1708 return (low >> 16ULL) + (high << (32ULL - 16ULL));
1709 }
1710 #endif
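/*
 * With LOAD_AVG_VARIABLE_PERIOD = 512 and the 16-bit shift above, the
 * conversion amounts to delta * 512 / 65536, i.e. delta / 128, applied
 * separately to the high and low 32-bit halves to avoid overflow.
 */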
1711
1712 /* We can represent the historical contribution to runnable average as the
1713 * coefficients of a geometric series. To do this we sub-divide our runnable
1714 * history into segments of approximately 1ms (1024us); label the segment that
1715 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1716 *
1717 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1718 * p0 p1 p2
1719 * (now) (~1ms ago) (~2ms ago)
1720 *
1721 * Let u_i denote the fraction of p_i that the entity was runnable.
1722 *
1723 * We then designate the fractions u_i as our co-efficients, yielding the
1724 * following representation of historical load:
1725 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1726 *
1727 * We choose y based on the width of a reasonable scheduling period, fixing:
1728 * y^32 = 0.5
1729 *
1730 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1731 * approximately half as much as the contribution to load within the last ms
1732 * (u_0).
1733 *
1734 * When a period "rolls over" and we have new u_0`, multiplying the previous
1735 * sum again by y is sufficient to update:
1736 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1737 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1738 */
1739 static __always_inline int __update_entity_runnable_avg(u64 now,
1740 struct sched_avg *sa,
1741 int runnable,
1742 int running,
1743 int cpu)
1744 {
1745 u64 delta, periods, lru;
1746 u32 runnable_contrib;
1747 int delta_w, decayed = 0;
1748 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1749 u64 scaled_delta;
1750 u32 scaled_runnable_contrib;
1751 int scaled_delta_w;
1752 u32 curr_scale = 1024;
1753 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1754 u64 scaled_delta;
1755 u32 scaled_runnable_contrib;
1756 int scaled_delta_w;
1757 u32 curr_scale = CPUPOWER_FREQSCALE_DEFAULT;
1758 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1759
1760 delta = now - sa->last_runnable_update;
1761 lru = sa->last_runnable_update;
1762 /*
1763 * This should only happen when time goes backwards, which it
1764 * unfortunately does during sched clock init when we swap over to TSC.
1765 */
1766 if ((s64)delta < 0) {
1767 sa->last_runnable_update = now;
1768 return 0;
1769 }
1770
1771 #ifdef CONFIG_HMP_VARIABLE_SCALE
1772 delta = hmp_variable_scale_convert(delta);
1773 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1774 delta = variable_scale_convert(delta);
1775 #endif
1776 /*
1777 * Use 1024ns as the unit of measurement since it's a reasonable
1778 * approximation of 1us and fast to compute.
1779 */
1780 delta >>= 10;
1781 if (!delta)
1782 return 0;
1783 sa->last_runnable_update = now;
1784
1785 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1786 WARN(cpu < 0, "[%s] CPU %d < 0 !!!\n", __func__, cpu);
1787 /* retrieve scale factor for load */
1788 if (cpu >= 0 && cpu < nr_cpu_ids && hmp_data.freqinvar_load_scale_enabled)
1789 curr_scale = freq_scale[cpu].curr_scale;
1790 mt_sched_printf("[%s] cpu=%d delta=%llu now=%llu last=%llu curr_scale=%u",
1791 __func__, cpu, delta, now, lru, curr_scale);
1792 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1793 WARN(cpu < 0, "[%s] CPU %d < 0 !!!\n", __func__, cpu);
1794 /* retrieve scale factor for load */
1795 if (cpu >= 0 && cpu < nr_cpu_ids)
1796 curr_scale = (topology_cpu_capacity(cpu) << CPUPOWER_FREQSCALE_SHIFT)
1797 / (topology_max_cpu_capacity(cpu)+1);
1798 mt_sched_printf("[%s] cpu=%d delta=%llu now=%llu last=%llu curr_scale=%u",
1799 __func__, cpu, delta, now, lru, curr_scale);
1800 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1801
1802 /* delta_w is the amount already accumulated against our next period */
1803 delta_w = sa->runnable_avg_period % 1024;
1804 if (delta + delta_w >= 1024) {
1805 /* period roll-over */
1806 decayed = 1;
1807
1808 /*
1809 * Now that we know we're crossing a period boundary, figure
1810 * out how much from delta we need to complete the current
1811 * period and accrue it.
1812 */
1813 delta_w = 1024 - delta_w;
1814 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1815 /* scale runnable time if necessary */
1816 scaled_delta_w = (delta_w * curr_scale)
1817 >> SCHED_FREQSCALE_SHIFT;
1818 if (runnable)
1819 sa->runnable_avg_sum += scaled_delta_w;
1820 if (running)
1821 sa->usage_avg_sum += scaled_delta_w;
1822 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1823 /* scale runnable time if necessary */
1824 scaled_delta_w = (delta_w * curr_scale)
1825 >> CPUPOWER_FREQSCALE_SHIFT;
1826 if (runnable)
1827 sa->runnable_avg_sum += scaled_delta_w;
1828 if (running)
1829 sa->usage_avg_sum += scaled_delta_w;
1830 #else
1831 if (runnable)
1832 sa->runnable_avg_sum += delta_w;
1833 if (running)
1834 sa->usage_avg_sum += delta_w;
1835 #endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1836 sa->runnable_avg_period += delta_w;
1837
1838 delta -= delta_w;
1839
1840 /* Figure out how many additional periods this update spans */
1841 periods = delta / 1024;
1842 delta %= 1024;
1843 /* decay the load we have accumulated so far */
1844 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1845 periods + 1);
1846 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1847 periods + 1);
1848 sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
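/*
 * Illustration (assuming the usual PELT constant y^32 == 1/2): an
 * entity that misses 32 full periods (~32ms) has the sums above
 * roughly halved before the new contribution is added.
 */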
1849 /* add the contribution from this period */
1850 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1851 runnable_contrib = __compute_runnable_contrib(periods);
1852 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1853 /* Apply load scaling if necessary.
1854 * Note that multiplying the whole series is the same as
1855 * multiplying each term.
1856 */
1857 scaled_runnable_contrib = (runnable_contrib * curr_scale)
1858 >> SCHED_FREQSCALE_SHIFT;
1859 if (runnable)
1860 sa->runnable_avg_sum += scaled_runnable_contrib;
1861 if (running)
1862 sa->usage_avg_sum += scaled_runnable_contrib;
1863 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1864 /* Apply load scaling if necessary.
1865 * Note that multiplying the whole series is the same as
1866 * multiplying each term.
1867 */
1868 scaled_runnable_contrib = (runnable_contrib * curr_scale)
1869 >> CPUPOWER_FREQSCALE_SHIFT;
1870 if (runnable)
1871 sa->runnable_avg_sum += scaled_runnable_contrib;
1872 if (running)
1873 sa->usage_avg_sum += scaled_runnable_contrib;
1874 #else
1875 if (runnable)
1876 sa->runnable_avg_sum += runnable_contrib;
1877 if (running)
1878 sa->usage_avg_sum += runnable_contrib;
1879 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1880 sa->runnable_avg_period += runnable_contrib;
1881 }
1882
1883 /* Remainder of delta accrued against u_0` */
1884 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1885 /* scale if necessary */
1886 scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
1887 if (runnable)
1888 sa->runnable_avg_sum += scaled_delta;
1889 if (running)
1890 sa->usage_avg_sum += scaled_delta;
1891 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1892 /* scale if necessary */
1893 scaled_delta = ((delta * curr_scale) >> CPUPOWER_FREQSCALE_SHIFT);
1894 if (runnable)
1895 sa->runnable_avg_sum += scaled_delta;
1896 if (running)
1897 sa->usage_avg_sum += scaled_delta;
1898 #else
1899 if (runnable)
1900 sa->runnable_avg_sum += delta;
1901 if (running)
1902 sa->usage_avg_sum += delta;
1903 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1904 sa->runnable_avg_period += delta;
1905
1906 return decayed;
1907 }
1908
1909 /* Synchronize an entity's decay with its parenting cfs_rq.*/
1910 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1911 {
1912 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1913 u64 decays = atomic64_read(&cfs_rq->decay_counter);
1914
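/* Full ~1ms decay periods the cfs_rq has advanced past our last snapshot. */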
1915 decays -= se->avg.decay_count;
1916 if (!decays)
1917 return 0;
1918
1919 se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1920 se->avg.decay_count = 0;
1921
1922 return decays;
1923 }
1924
1925 #ifdef CONFIG_FAIR_GROUP_SCHED
1926 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1927 int force_update)
1928 {
1929 struct task_group *tg = cfs_rq->tg;
1930 long tg_contrib;
1931
1932 tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1933 tg_contrib -= cfs_rq->tg_load_contrib;
1934
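/*
 * Only fold the delta into the shared tg->load_avg when it is at
 * least 1/8th of our current contribution (or when forced); this
 * bounds the error while limiting cross-CPU atomic traffic.
 */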
1935 if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1936 atomic_long_add(tg_contrib, &tg->load_avg);
1937 cfs_rq->tg_load_contrib += tg_contrib;
1938 }
1939 }
1940
1941 /*
1942 * Aggregate cfs_rq runnable averages into an equivalent task_group
1943 * representation for computing load contributions.
1944 */
1945 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1946 struct cfs_rq *cfs_rq)
1947 {
1948 struct task_group *tg = cfs_rq->tg;
1949 long contrib, usage_contrib;
1950
1951 /* The fraction of a cpu used by this cfs_rq */
1952 contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1953 sa->runnable_avg_period + 1);
1954 contrib -= cfs_rq->tg_runnable_contrib;
1955
1956 usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT,
1957 sa->runnable_avg_period + 1);
1958 usage_contrib -= cfs_rq->tg_usage_contrib;
1959
1960 /*
1961 * contrib/usage at this point represent deltas, only update if they
1962 * are substantive.
1963 */
1964 if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
1965 (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
1966 atomic_add(contrib, &tg->runnable_avg);
1967 cfs_rq->tg_runnable_contrib += contrib;
1968
1969 atomic_add(usage_contrib, &tg->usage_avg);
1970 cfs_rq->tg_usage_contrib += usage_contrib;
1971 }
1972 }
1973
1974 static inline void __update_group_entity_contrib(struct sched_entity *se)
1975 {
1976 struct cfs_rq *cfs_rq = group_cfs_rq(se);
1977 struct task_group *tg = cfs_rq->tg;
1978 int runnable_avg;
1979
1980 u64 contrib;
1981
1982 contrib = cfs_rq->tg_load_contrib * tg->shares;
1983 se->avg.load_avg_contrib = div_u64(contrib,
1984 atomic_long_read(&tg->load_avg) + 1);
1985
1986 /*
1987 * For group entities we need to compute a correction term in the case
1988 * that they are consuming <1 cpu so that we would contribute the same
1989 * load as a task of equal weight.
1990 *
1991 * Explicitly co-ordinating this measurement would be expensive, but
1992 * fortunately the sum of each cpus contribution forms a usable
1993 * lower-bound on the true value.
1994 *
1995 * Consider the aggregate of 2 contributions. Either they are disjoint
1996 * (and the sum represents the true value) or they overlap and we are
1997 * understating by the aggregate of their overlap.
1998 *
1999 * Extending this to N cpus, for a given overlap, the maximum amount we
2000 * understate is then n_i(n_i+1)/2 * w_i where n_i is the number of
2001 * cpus that overlap for this interval and w_i is the interval width.
2002 *
2003 * On a small machine the first term is well-bounded, which bounds the
2004 * total error since w_i is a subset of the period. On a larger
2005 * machine, while this first term can be larger, a w_i of consequential
2006 * size is guaranteed to see n_i*w_i quickly converge to our upper
2007 * bound of 1-cpu.
2008 */
2009 runnable_avg = atomic_read(&tg->runnable_avg);
2010 if (runnable_avg < NICE_0_LOAD) {
2011 se->avg.load_avg_contrib *= runnable_avg;
2012 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2013 }
2014 }
2015 #else
2016 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2017 int force_update) {}
2018 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2019 struct cfs_rq *cfs_rq) {}
2020 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2021 #endif
2022
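/*
 * A task's contribution to its cfs_rq's load, roughly:
 *   load_avg_contrib = load.weight * runnable_avg_sum /
 *                      (runnable_avg_period + 1)
 */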
2023 static inline void __update_task_entity_contrib(struct sched_entity *se)
2024 {
2025 u32 contrib;
2026
2027 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2028 contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2029 contrib /= (se->avg.runnable_avg_period + 1);
2030 se->avg.load_avg_contrib = scale_load(contrib);
2031 }
2032
2033 /* Compute the current contribution to load_avg by se, return any delta */
2034 static long __update_entity_load_avg_contrib(struct sched_entity *se)
2035 {
2036 long old_contrib = se->avg.load_avg_contrib;
2037
2038 if (entity_is_task(se)) {
2039 __update_task_entity_contrib(se);
2040 } else {
2041 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
2042 __update_group_entity_contrib(se);
2043 }
2044
2045 return se->avg.load_avg_contrib - old_contrib;
2046 }
2047
2048 #if defined(CONFIG_MTK_SCHED_CMP) || defined(CONFIG_SCHED_HMP_ENHANCEMENT)
2049 /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */
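/*
 * Unlike load_avg_contrib, load_avg_ratio is weight-independent: the
 * runnable fraction scaled against NICE_0_LOAD, which is what the HMP
 * code (see se_load() below) uses to compare tasks across CPUs.
 */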
2050 static long __update_task_entity_ratio(struct sched_entity *se)
2051 {
2052 long old_ratio = se->avg.load_avg_ratio;
2053 u32 ratio;
2054
2055 ratio = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
2056 ratio /= (se->avg.runnable_avg_period + 1);
2057 se->avg.load_avg_ratio = scale_load(ratio);
2058
2059 return se->avg.load_avg_ratio - old_ratio;
2060 }
2061 #else
2062 static inline long __update_task_entity_ratio(struct sched_entity *se) { return 0; }
2063 #endif
2064
2065 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
2066 long load_contrib)
2067 {
2068 if (likely(load_contrib < cfs_rq->blocked_load_avg))
2069 cfs_rq->blocked_load_avg -= load_contrib;
2070 else
2071 cfs_rq->blocked_load_avg = 0;
2072 }
2073
2074 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2075 unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL);
2076 #endif
2077
2078 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
2079 /* Schedule entity */
2080 #define se_pid(se) ((se != NULL && entity_is_task(se)) ? \
2081 container_of(se, struct task_struct, se)->pid : -1)
2082 #define se_load(se) se->avg.load_avg_ratio
2083 #define se_contrib(se) se->avg.load_avg_contrib
2084
2085 /* CPU related : load information */
2086 #define cfs_pending_load(cpu) cpu_rq(cpu)->cfs.avg.pending_load
2087 #define cfs_load(cpu) cpu_rq(cpu)->cfs.avg.load_avg_ratio
2088 #define cfs_contrib(cpu) cpu_rq(cpu)->cfs.avg.load_avg_contrib
2089
2090 /* CPU related : the number of tasks */
2091 #define cfs_nr_normal_prio(cpu) cpu_rq(cpu)->cfs.avg.nr_normal_prio
2092 #define cfs_nr_pending(cpu) cpu_rq(cpu)->cfs.avg.nr_pending
2093 #define cfs_length(cpu) cpu_rq(cpu)->cfs.h_nr_running
2094 #define rq_length(cpu) (cpu_rq(cpu)->nr_running + cfs_nr_pending(cpu))
2095
2096 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2097 #define task_low_priority(prio) ((prio >= hmp_up_prio) ? 1 : 0)
2098 #define cfs_nr_dequeuing_low_prio(cpu) \
2099 cpu_rq(cpu)->cfs.avg.nr_dequeuing_low_prio
2100 #define cfs_reset_nr_dequeuing_low_prio(cpu) \
2101 (cfs_nr_dequeuing_low_prio(cpu) = 0)
2102 #else
2103 #define task_low_priority(prio) (0)
2104 #define cfs_reset_nr_dequeuing_low_prio(cpu)
2105 #endif /* CONFIG_SCHED_HMP_PRIO_FILTER */
2106 #endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
2107
2108 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2109
2110 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2111 int group_leader_is_empty(struct task_struct *p)
2112 {
2113 struct task_struct *tg = p->group_leader;
2114
2115 if (SIGNAL_GROUP_EXIT & p->signal->flags) {
2116 // pr_warn("[%s] (0x%p/0x%p)(#%d/%s) leader: pid(%d) state(%d) exit_state(%d)signal_flags=%x p->signal->flags=%x group_exit_code=%x\n", __func__,
2117 // p, tg, get_nr_threads(p), thread_group_empty(p) ? "empty" : "not empty",
2118 // p->tgid, tg->state, tg->exit_state, tg->state, p->signal->flags, p->signal->group_exit_code);
2119 return 1;
2120 }
2121
2122 /* Workaround: the leader may already have been freed; 0x6b is the slab POISON_FREE pattern. */
2123 if (tg->state == 0x6b6b6b6b) {
2124 // pr_warn("[%s] (0x%p/0x%p)(#%d/%s) leader: state(%d) exit_state(%d)\n", __func__,
2125 // p, tg, get_nr_threads(p), thread_group_empty(p) ? "empty" : "not empty",
2126 // tg->state, tg->exit_state);
2127 return 1;
2128 }
2129
2130 return 0;
2131 }
2132
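/*
 * Fold a task's load_avg_ratio delta into its thread-group leader's
 * per-cluster statistics, under thread_group_info_lock.
 */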
2133 static inline void update_tg_info(struct cfs_rq *cfs_rq, struct sched_entity *se, long ratio_delta)
2134 {
2135 struct task_struct *p = task_of(se);
2136 struct task_struct *tg = p->group_leader;
2137 int id;
2138 unsigned long flags;
2139
2140 if (group_leader_is_empty(p))
2141 return;
2142 id = get_cluster_id(cfs_rq->rq->cpu);
2143 if (unlikely(WARN_ON(id < 0)))
2144 return;
2145
2146 raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags);
2147 tg->thread_group_info[id].load_avg_ratio += ratio_delta;
2148 raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags);
2149
2150 #ifdef CONFIG_MT_SCHED_INFO
2151 mt_sched_printf("update_tg_info %d:%s %d:%s %ld %ld %d %d %lu:%lu:%lu update",
2152 tg->pid, tg->comm, p->pid, p->comm,
2153 se->avg.load_avg_ratio, ratio_delta,
2154 cfs_rq->rq->cpu, id,
2155 tg->thread_group_info[id].nr_running,
2156 tg->thread_group_info[id].cfs_nr_running,
2157 tg->thread_group_info[id].load_avg_ratio);
2158 /*
2159 mt_sched_printf("update %d:%s %d:%s %ld %ld %d %d %lu %lu %lu, %lu %lu %lu",
2160 tg->pid, tg->comm, p->pid, p->comm,
2161 se->avg.load_avg_ratio, ratio_delta,
2162 id, cfs_rq->rq->cpu,
2163 tg->thread_group_info[0].nr_running,
2164 tg->thread_group_info[0].cfs_nr_running,
2165 tg->thread_group_info[0].load_avg_ratio,
2166 tg->thread_group_info[1].nr_running,
2167 tg->thread_group_info[1].cfs_nr_running,
2168 tg->thread_group_info[1].load_avg_ratio);
2169 */
2170 #endif
2171
2172 }
2173 #endif
2174
2175 /* Update a sched_entity's runnable average */
2176 static inline void update_entity_load_avg(struct sched_entity *se,
2177 int update_cfs_rq)
2178 {
2179 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2180 long contrib_delta;
2181 u64 now;
2182 long ratio_delta = 0;
2183 int cpu = -1; /* not used in normal case */
2184
2185 #if defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE) \
2186 || defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
2187 cpu = cfs_rq->rq->cpu;
2188 #endif
2189
2190 /*
2191 * For a group entity we need to use their owned cfs_rq_clock_task() in
2192 * case they are the parent of a throttled hierarchy.
2193 */
2194 if (entity_is_task(se))
2195 now = cfs_rq_clock_task(cfs_rq);
2196 else
2197 now = cfs_rq_clock_task(group_cfs_rq(se));
2198
2199 if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
2200 cfs_rq->curr == se, cpu)) {
2201 #if 0
2202 if (entity_is_task(se)) {
2203 ratio_delta = __update_task_entity_ratio(se);
2204 if (update_cfs_rq)
2205 {
2206 cpu = cfs_rq->rq->cpu;
2207 cpu_rq(cpu)->cfs.avg.load_avg_ratio += ratio_delta;
2208 #ifdef CONFIG_HMP_TRACER
2209 trace_sched_cfs_load_update(task_of(se),se_load(se),ratio_delta, cpu);
2210 #endif /* CONFIG_HMP_TRACER */
2211 }
2212
2213 trace_sched_task_entity_avg(2, task_of(se), &se->avg);
2214 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2215 if (se->on_rq) {
2216 update_tg_info(cfs_rq, se, ratio_delta);
2217 }
2218 #endif
2219 }
2220 #endif
2221 return;
2222 }
2223
2224 contrib_delta = __update_entity_load_avg_contrib(se);
2225
2226 /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */
2227 if (entity_is_task(se)) {
2228 ratio_delta = __update_task_entity_ratio(se);
2229 /*
2230 * The ratio is re-estimated only for task entities. As for
2231 * contrib, mark the tracer here for the task entity; group
2232 * entities are handled in __update_group_entity_contrib().
2233 *
2234 * Track running usage in passing.
2235 */
2236 trace_sched_task_entity_avg(3, task_of(se), &se->avg);
2237 }
2238
2239 if (!update_cfs_rq)
2240 return;
2241
2242 if (se->on_rq) {
2243 cfs_rq->runnable_load_avg += contrib_delta;
2244 if (entity_is_task(se)) {
2245 cpu = cfs_rq->rq->cpu;
2246 cpu_rq(cpu)->cfs.avg.load_avg_ratio += ratio_delta;
2247 cpu_rq(cpu)->cfs.avg.load_avg_contrib += contrib_delta;
2248 #ifdef CONFIG_HMP_TRACER
2249 trace_sched_cfs_load_update(task_of(se),se_load(se),ratio_delta,cpu);
2250 #endif /* CONFIG_HMP_TRACER */
2251 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2252 update_tg_info(cfs_rq, se, ratio_delta);
2253 #endif
2254 }
2255 }
2256 else
2257 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2258 }
2259
2260
2261 /*
2262 * Decay the load contributed by all blocked children and account this so that
2263 * their contribution may appropriately discounted when they wake up.
2264 */
2265 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2266 {
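/* clock_task is in ns; one decay period is 2^20 ns (~1ms) */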
2267 u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
2268 u64 decays;
2269
2270 decays = now - cfs_rq->last_decay;
2271 if (!decays && !force_update)
2272 return;
2273
2274 if (atomic_long_read(&cfs_rq->removed_load)) {
2275 unsigned long removed_load;
2276 removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
2277 subtract_blocked_load_contrib(cfs_rq, removed_load);
2278 }
2279
2280 if (decays) {
2281 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
2282 decays);
2283 atomic64_add(decays, &cfs_rq->decay_counter);
2284 cfs_rq->last_decay = now;
2285 }
2286
2287 __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2288 }
2289
2290 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2291 {
2292 u32 contrib;
2293 int cpu = -1; /* not used in normal case */
2294
2295 #if defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE) \
2296 || defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
2297 cpu = rq->cpu;
2298 #endif
2299 __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
2300 runnable, cpu);
2301 __update_tg_runnable_avg(&rq->avg, &rq->cfs);
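/* Runnable ratio of the whole rq, scaled to 0..1024, for tracing. */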
2302 contrib = rq->avg.runnable_avg_sum * scale_load_down(1024);
2303 contrib /= (rq->avg.runnable_avg_period + 1);
2304 trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib));
2305 trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
2306 }
2307
2308 /* Add the load generated by se into cfs_rq's child load-average */
2309 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2310 struct sched_entity *se,
2311 int wakeup)
2312 {
2313 int cpu = cfs_rq->rq->cpu;
2314
2315 /*
2316 * We track migrations using entity decay_count <= 0, on a wake-up
2317 * migration we use a negative decay count to track the remote decays
2318 * accumulated while sleeping.
2319 *
2320 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
2321 * are seen by enqueue_entity_load_avg() as a migration with an already
2322 * constructed load_avg_contrib.
2323 */
2324 if (unlikely(se->avg.decay_count <= 0)) {
2325 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
2326 if (se->avg.decay_count) {
2327 /*
2328 * In a wake-up migration we have to approximate the
2329 * time sleeping. This is because we can't synchronize
2330 * clock_task between the two cpus, and it is not
2331 * guaranteed to be read-safe. Instead, we can
2332 * approximate this using our carried decays, which are
2333 * explicitly atomically readable.
2334 */
2335 se->avg.last_runnable_update -= (-se->avg.decay_count)
2336 << 20;
2337 update_entity_load_avg(se, 0);
2338 /* Indicate that we're now synchronized and on-rq */
2339 se->avg.decay_count = 0;
2340 #ifdef CONFIG_MTK_SCHED_CMP
2341 } else {
2342 if (entity_is_task(se))
2343 trace_sched_task_entity_avg(1, task_of(se), &se->avg);
2344 #endif
2345 }
2346 wakeup = 0;
2347 } else {
2348 __synchronize_entity_decay(se);
2349 }
2350
2351 /* migrated tasks did not contribute to our blocked load */
2352 if (wakeup) {
2353 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
2354 update_entity_load_avg(se, 0);
2355 }
2356
2357 cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
2358 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2359 if (entity_is_task(se)) {
2360 update_tg_info(cfs_rq, se, se->avg.load_avg_ratio);
2361 }
2362 #endif
2363
2364 if (entity_is_task(se)) {
2365 cpu_rq(cpu)->cfs.avg.load_avg_contrib += se->avg.load_avg_contrib;
2366 cpu_rq(cpu)->cfs.avg.load_avg_ratio += se->avg.load_avg_ratio;
2367 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
2368 cfs_nr_pending(cpu) = 0;
2369 cfs_pending_load(cpu) = 0;
2370 #endif
2371 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2372 if (!task_low_priority(task_of(se)->prio))
2373 cfs_nr_normal_prio(cpu)++;
2374 #endif
2375 #ifdef CONFIG_HMP_TRACER
2376 trace_sched_cfs_enqueue_task(task_of(se),se_load(se),cpu);
2377 #endif
2378 }
2379
2380 /* we force update consideration on load-balancer moves */
2381 update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2382 }
2383
2384 /*
2385 * Remove se's load from this cfs_rq child load-average, if the entity is
2386 * transitioning to a blocked state we track its projected decay using
2387 * blocked_load_avg.
2388 */
2389 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2390 struct sched_entity *se,
2391 int sleep)
2392 {
2393 int cpu = cfs_rq->rq->cpu;
2394
2395 update_entity_load_avg(se, 1);
2396 /* we force update consideration on load-balancer moves */
2397 update_cfs_rq_blocked_load(cfs_rq, !sleep);
2398
2399 cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
2400 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2401 if (entity_is_task(se)) {
2402 update_tg_info(cfs_rq, se, -se->avg.load_avg_ratio);
2403 }
2404 #endif
2405
2406 if (entity_is_task(se)) {
2407 cpu_rq(cpu)->cfs.avg.load_avg_contrib -= se->avg.load_avg_contrib;
2408 cpu_rq(cpu)->cfs.avg.load_avg_ratio -= se->avg.load_avg_ratio;
2409 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2410 cfs_reset_nr_dequeuing_low_prio(cpu);
2411 if (!task_low_priority(task_of(se)->prio))
2412 cfs_nr_normal_prio(cpu)--;
2413 #endif
2414 #ifdef CONFIG_HMP_TRACER
2415 trace_sched_cfs_dequeue_task(task_of(se),se_load(se),cpu);
2416 #endif
2417 }
2418
2419 if (sleep) {
2420 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
2421 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
2422 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
2423 }
2424
2425 /*
2426 * Update the rq's load with the elapsed running time before entering
2427 * idle. if the last scheduled task is not a CFS task, idle_enter will
2428 * be the only way to update the runnable statistic.
2429 */
2430 void idle_enter_fair(struct rq *this_rq)
2431 {
2432 update_rq_runnable_avg(this_rq, 1);
2433 }
2434
2435 /*
2436 * Update the rq's load with the elapsed idle time before a task is
2437 * scheduled. If the newly scheduled task is not a CFS task, idle_exit will
2438 * be the only way to update the runnable statistic.
2439 */
2440 void idle_exit_fair(struct rq *this_rq)
2441 {
2442 update_rq_runnable_avg(this_rq, 0);
2443 }
2444
2445 #else
2446 static inline void update_entity_load_avg(struct sched_entity *se,
2447 int update_cfs_rq) {}
2448 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2449 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2450 struct sched_entity *se,
2451 int wakeup) {}
2452 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2453 struct sched_entity *se,
2454 int sleep) {}
2455 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2456 int force_update) {}
2457 #endif
2458
2459 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2460 {
2461 #ifdef CONFIG_SCHEDSTATS
2462 struct task_struct *tsk = NULL;
2463
2464 if (entity_is_task(se))
2465 tsk = task_of(se);
2466
2467 if (se->statistics.sleep_start) {
2468 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
2469
2470 if ((s64)delta < 0)
2471 delta = 0;
2472
2473 if (unlikely(delta > se->statistics.sleep_max))
2474 se->statistics.sleep_max = delta;
2475
2476 se->statistics.sleep_start = 0;
2477 se->statistics.sum_sleep_runtime += delta;
2478
2479 if (tsk) {
2480 account_scheduler_latency(tsk, delta >> 10, 1);
2481 trace_sched_stat_sleep(tsk, delta);
2482 }
2483 }
2484 if (se->statistics.block_start) {
2485 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
2486
2487 if ((s64)delta < 0)
2488 delta = 0;
2489
2490 if (unlikely(delta > se->statistics.block_max))
2491 se->statistics.block_max = delta;
2492
2493 se->statistics.block_start = 0;
2494 se->statistics.sum_sleep_runtime += delta;
2495
2496 if (tsk) {
2497 if (tsk->in_iowait) {
2498 se->statistics.iowait_sum += delta;
2499 se->statistics.iowait_count++;
2500 trace_sched_stat_iowait(tsk, delta);
2501 }
2502
2503 trace_sched_stat_blocked(tsk, delta);
2504
2505 /*
2506 * Blocking time is in units of nanosecs, so shift by
2507 * 20 to get a milliseconds-range estimation of the
2508 * amount of time that the task spent sleeping:
2509 */
2510 if (unlikely(prof_on == SLEEP_PROFILING)) {
2511 profile_hits(SLEEP_PROFILING,
2512 (void *)get_wchan(tsk),
2513 delta >> 20);
2514 }
2515 account_scheduler_latency(tsk, delta >> 10, 0);
2516 }
2517 }
2518 #endif
2519 }
2520
2521 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
2522 {
2523 #ifdef CONFIG_SCHED_DEBUG
2524 s64 d = se->vruntime - cfs_rq->min_vruntime;
2525
2526 if (d < 0)
2527 d = -d;
2528
2529 if (d > 3*sysctl_sched_latency)
2530 schedstat_inc(cfs_rq, nr_spread_over);
2531 #endif
2532 }
2533
2534 static void
2535 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
2536 {
2537 u64 vruntime = cfs_rq->min_vruntime;
2538
2539 /*
2540 * The 'current' period is already promised to the current tasks,
2541 * however the extra weight of the new task will slow them down a
2542 * little, place the new task so that it fits in the slot that
2543 * stays open at the end.
2544 */
2545 if (initial && sched_feat(START_DEBIT))
2546 vruntime += sched_vslice(cfs_rq, se);
2547
2548 /* sleeps up to a single latency don't count. */
2549 if (!initial) {
2550 unsigned long thresh = sysctl_sched_latency;
2551
2552 /*
2553 * Halve their sleep time's effect, to allow
2554 * for a gentler effect of sleepers:
2555 */
2556 if (sched_feat(GENTLE_FAIR_SLEEPERS))
2557 thresh >>= 1;
2558
2559 vruntime -= thresh;
2560 }
2561
2562 /* ensure we never gain time by being placed backwards. */
2563 se->vruntime = max_vruntime(se->vruntime, vruntime);
2564 }
2565
2566 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
2567
2568 static void
2569 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2570 {
2571 /*
2572 * Update the normalized vruntime before updating min_vruntime
2573 * through calling update_curr().
2574 */
2575 if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
2576 se->vruntime += cfs_rq->min_vruntime;
2577
2578 /*
2579 * Update run-time statistics of the 'current'.
2580 */
2581 update_curr(cfs_rq);
2582 enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
2583 account_entity_enqueue(cfs_rq, se);
2584 update_cfs_shares(cfs_rq);
2585
2586 if (flags & ENQUEUE_WAKEUP) {
2587 place_entity(cfs_rq, se, 0);
2588 enqueue_sleeper(cfs_rq, se);
2589 }
2590
2591 update_stats_enqueue(cfs_rq, se);
2592 check_spread(cfs_rq, se);
2593 if (se != cfs_rq->curr)
2594 __enqueue_entity(cfs_rq, se);
2595 se->on_rq = 1;
2596
2597 if (cfs_rq->nr_running == 1) {
2598 list_add_leaf_cfs_rq(cfs_rq);
2599 check_enqueue_throttle(cfs_rq);
2600 }
2601 }
2602
2603 static void __clear_buddies_last(struct sched_entity *se)
2604 {
2605 for_each_sched_entity(se) {
2606 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2607 if (cfs_rq->last == se)
2608 cfs_rq->last = NULL;
2609 else
2610 break;
2611 }
2612 }
2613
2614 static void __clear_buddies_next(struct sched_entity *se)
2615 {
2616 for_each_sched_entity(se) {
2617 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2618 if (cfs_rq->next == se)
2619 cfs_rq->next = NULL;
2620 else
2621 break;
2622 }
2623 }
2624
2625 static void __clear_buddies_skip(struct sched_entity *se)
2626 {
2627 for_each_sched_entity(se) {
2628 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2629 if (cfs_rq->skip == se)
2630 cfs_rq->skip = NULL;
2631 else
2632 break;
2633 }
2634 }
2635
2636 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
2637 {
2638 if (cfs_rq->last == se)
2639 __clear_buddies_last(se);
2640
2641 if (cfs_rq->next == se)
2642 __clear_buddies_next(se);
2643
2644 if (cfs_rq->skip == se)
2645 __clear_buddies_skip(se);
2646 }
2647
2648 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2649
2650 static void
2651 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2652 {
2653 /*
2654 * Update run-time statistics of the 'current'.
2655 */
2656 update_curr(cfs_rq);
2657 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
2658
2659 update_stats_dequeue(cfs_rq, se);
2660 if (flags & DEQUEUE_SLEEP) {
2661 #ifdef CONFIG_SCHEDSTATS
2662 if (entity_is_task(se)) {
2663 struct task_struct *tsk = task_of(se);
2664
2665 if (tsk->state & TASK_INTERRUPTIBLE)
2666 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
2667 if (tsk->state & TASK_UNINTERRUPTIBLE)
2668 se->statistics.block_start = rq_of(cfs_rq)->clock;
2669 }
2670 #endif
2671 }
2672
2673 clear_buddies(cfs_rq, se);
2674
2675 if (se != cfs_rq->curr)
2676 __dequeue_entity(cfs_rq, se);
2677 se->on_rq = 0;
2678 account_entity_dequeue(cfs_rq, se);
2679
2680 /*
2681 * Normalize the entity after updating the min_vruntime because the
2682 * update can refer to the ->curr item and we need to reflect this
2683 * movement in our normalized position.
2684 */
2685 if (!(flags & DEQUEUE_SLEEP))
2686 se->vruntime -= cfs_rq->min_vruntime;
2687
2688 /* return excess runtime on last dequeue */
2689 return_cfs_rq_runtime(cfs_rq);
2690
2691 update_min_vruntime(cfs_rq);
2692 update_cfs_shares(cfs_rq);
2693 }
2694
2695 /*
2696 * Preempt the current task with a newly woken task if needed:
2697 */
2698 static void
2699 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2700 {
2701 unsigned long ideal_runtime, delta_exec;
2702 struct sched_entity *se;
2703 s64 delta;
2704
2705 ideal_runtime = sched_slice(cfs_rq, curr);
2706 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2707 if (delta_exec > ideal_runtime) {
2708 resched_task(rq_of(cfs_rq)->curr);
2709 /*
2710 * The current task ran long enough, ensure it doesn't get
2711 * re-elected due to buddy favours.
2712 */
2713 clear_buddies(cfs_rq, curr);
2714 return;
2715 }
2716
2717 /*
2718 * Ensure that a task that missed wakeup preemption by a
2719 * narrow margin doesn't have to wait for a full slice.
2720 * This also mitigates buddy induced latencies under load.
2721 */
2722 if (delta_exec < sysctl_sched_min_granularity)
2723 return;
2724
2725 se = __pick_first_entity(cfs_rq);
2726 delta = curr->vruntime - se->vruntime;
2727
2728 if (delta < 0)
2729 return;
2730
2731 if (delta > ideal_runtime)
2732 resched_task(rq_of(cfs_rq)->curr);
2733 }
2734
2735 static void
2736 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
2737 {
2738 /* 'current' is not kept within the tree. */
2739 if (se->on_rq) {
2740 /*
2741 * Any task has to be enqueued before it get to execute on
2742 * a CPU. So account for the time it spent waiting on the
2743 * runqueue.
2744 */
2745 update_stats_wait_end(cfs_rq, se);
2746 __dequeue_entity(cfs_rq, se);
2747 update_entity_load_avg(se, 1);
2748 }
2749
2750 update_stats_curr_start(cfs_rq, se);
2751 cfs_rq->curr = se;
2752 #ifdef CONFIG_SCHEDSTATS
2753 /*
2754 * Track our maximum slice length, if the CPU's load is at
2755 * least twice that of our own weight (i.e. don't track it
2756 * when there are only lesser-weight tasks around):
2757 */
2758 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
2759 se->statistics.slice_max = max(se->statistics.slice_max,
2760 se->sum_exec_runtime - se->prev_sum_exec_runtime);
2761 }
2762 #endif
2763 se->prev_sum_exec_runtime = se->sum_exec_runtime;
2764 }
2765
2766 static int
2767 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2768
2769 /*
2770 * Pick the next process, keeping these things in mind, in this order:
2771 * 1) keep things fair between processes/task groups
2772 * 2) pick the "next" process, since someone really wants that to run
2773 * 3) pick the "last" process, for cache locality
2774 * 4) do not run the "skip" process, if something else is available
2775 */
2776 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2777 {
2778 struct sched_entity *se = __pick_first_entity(cfs_rq);
2779 struct sched_entity *left = se;
2780
2781 /*
2782 * Avoid running the skip buddy, if running something else can
2783 * be done without getting too unfair.
2784 */
2785 if (cfs_rq->skip == se) {
2786 struct sched_entity *second = __pick_next_entity(se);
2787 if (second && wakeup_preempt_entity(second, left) < 1)
2788 se = second;
2789 }
2790
2791 /*
2792 * Prefer last buddy, try to return the CPU to a preempted task.
2793 */
2794 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
2795 se = cfs_rq->last;
2796
2797 /*
2798 * Someone really wants this to run. If it's not unfair, run it.
2799 */
2800 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
2801 se = cfs_rq->next;
2802
2803 clear_buddies(cfs_rq, se);
2804
2805 return se;
2806 }
2807
2808 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2809
2810 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2811 {
2812 /*
2813 * If still on the runqueue then deactivate_task()
2814 * was not called and update_curr() has to be done:
2815 */
2816 if (prev->on_rq)
2817 update_curr(cfs_rq);
2818
2819 /* throttle cfs_rqs exceeding runtime */
2820 check_cfs_rq_runtime(cfs_rq);
2821
2822 check_spread(cfs_rq, prev);
2823 if (prev->on_rq) {
2824 update_stats_wait_start(cfs_rq, prev);
2825 /* Put 'current' back into the tree. */
2826 __enqueue_entity(cfs_rq, prev);
2827 /* in !on_rq case, update occurred at dequeue */
2828 update_entity_load_avg(prev, 1);
2829 }
2830 cfs_rq->curr = NULL;
2831 }
2832
2833 static void
2834 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
2835 {
2836 /*
2837 * Update run-time statistics of the 'current'.
2838 */
2839 update_curr(cfs_rq);
2840
2841 /*
2842 * Ensure that runnable average is periodically updated.
2843 */
2844 update_entity_load_avg(curr, 1);
2845 update_cfs_rq_blocked_load(cfs_rq, 1);
2846 update_cfs_shares(cfs_rq);
2847
2848 #ifdef CONFIG_SCHED_HRTICK
2849 /*
2850 * queued ticks are scheduled to match the slice, so don't bother
2851 * validating it and just reschedule.
2852 */
2853 if (queued) {
2854 resched_task(rq_of(cfs_rq)->curr);
2855 return;
2856 }
2857 /*
2858 * don't let the period tick interfere with the hrtick preemption
2859 */
2860 if (!sched_feat(DOUBLE_TICK) &&
2861 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
2862 return;
2863 #endif
2864
2865 if (cfs_rq->nr_running > 1)
2866 check_preempt_tick(cfs_rq, curr);
2867 }
2868
2869
2870 /**************************************************
2871 * CFS bandwidth control machinery
2872 */
2873
2874 #ifdef CONFIG_CFS_BANDWIDTH
2875
2876 #ifdef HAVE_JUMP_LABEL
2877 static struct static_key __cfs_bandwidth_used;
2878
2879 static inline bool cfs_bandwidth_used(void)
2880 {
2881 return static_key_false(&__cfs_bandwidth_used);
2882 }
2883
2884 void cfs_bandwidth_usage_inc(void)
2885 {
2886 static_key_slow_inc(&__cfs_bandwidth_used);
2887 }
2888
2889 void cfs_bandwidth_usage_dec(void)
2890 {
2891 static_key_slow_dec(&__cfs_bandwidth_used);
2892 }
2893 #else /* HAVE_JUMP_LABEL */
2894 static bool cfs_bandwidth_used(void)
2895 {
2896 return true;
2897 }
2898
2899 void cfs_bandwidth_usage_inc(void) {}
2900 void cfs_bandwidth_usage_dec(void) {}
2901 #endif /* HAVE_JUMP_LABEL */
2902
2903 /*
2904 * default period for cfs group bandwidth.
2905 * default: 0.1s, units: nanoseconds
2906 */
2907 static inline u64 default_cfs_period(void)
2908 {
2909 return 100000000ULL;
2910 }
2911
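/* Runtime, in ns, that a cfs_rq pulls from the global pool in one go. */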
2912 static inline u64 sched_cfs_bandwidth_slice(void)
2913 {
2914 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2915 }
2916
2917 /*
2918 * Replenish runtime according to assigned quota and update expiration time.
2919 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2920 * additional synchronization around rq->lock.
2921 *
2922 * requires cfs_b->lock
2923 */
2924 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2925 {
2926 u64 now;
2927
2928 if (cfs_b->quota == RUNTIME_INF)
2929 return;
2930
2931 now = sched_clock_cpu(smp_processor_id());
2932 cfs_b->runtime = cfs_b->quota;
2933 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2934 }
2935
2936 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2937 {
2938 return &tg->cfs_bandwidth;
2939 }
2940
2941 /* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2942 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2943 {
2944 if (unlikely(cfs_rq->throttle_count))
2945 return cfs_rq->throttled_clock_task;
2946
2947 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2948 }
2949
2950 /* returns 0 on failure to allocate runtime */
2951 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2952 {
2953 struct task_group *tg = cfs_rq->tg;
2954 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2955 u64 amount = 0, min_amount, expires;
2956
2957 /* note: this is a positive sum as runtime_remaining <= 0 */
2958 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2959
2960 raw_spin_lock(&cfs_b->lock);
2961 if (cfs_b->quota == RUNTIME_INF)
2962 amount = min_amount;
2963 else {
2964 /*
2965 * If the bandwidth pool has become inactive, then at least one
2966 * period must have elapsed since the last consumption.
2967 * Refresh the global state and ensure the bandwidth timer becomes
2968 * active.
2969 */
2970 if (!cfs_b->timer_active) {
2971 __refill_cfs_bandwidth_runtime(cfs_b);
2972 __start_cfs_bandwidth(cfs_b);
2973 }
2974
2975 if (cfs_b->runtime > 0) {
2976 amount = min(cfs_b->runtime, min_amount);
2977 cfs_b->runtime -= amount;
2978 cfs_b->idle = 0;
2979 }
2980 }
2981 expires = cfs_b->runtime_expires;
2982 raw_spin_unlock(&cfs_b->lock);
2983
2984 cfs_rq->runtime_remaining += amount;
2985 /*
2986 * we may have advanced our local expiration to account for allowed
2987 * spread between our sched_clock and the one on which runtime was
2988 * issued.
2989 */
2990 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2991 cfs_rq->runtime_expires = expires;
2992
2993 return cfs_rq->runtime_remaining > 0;
2994 }
2995
2996 /*
2997 * Note: This depends on the synchronization provided by sched_clock and the
2998 * fact that rq->clock snapshots this value.
2999 */
3000 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3001 {
3002 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3003 struct rq *rq = rq_of(cfs_rq);
3004
3005 /* if the deadline is ahead of our clock, nothing to do */
3006 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
3007 return;
3008
3009 if (cfs_rq->runtime_remaining < 0)
3010 return;
3011
3012 /*
3013 * If the local deadline has passed we have to consider the
3014 * possibility that our sched_clock is 'fast' and the global deadline
3015 * has not truly expired.
3016 *
3017 * Fortunately we can determine whether this is the case by checking
3018 * whether the global deadline has advanced.
3019 */
3020
3021 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
3022 /* extend local deadline, drift is bounded above by 2 ticks */
3023 cfs_rq->runtime_expires += TICK_NSEC;
3024 } else {
3025 /* global deadline is ahead, expiration has passed */
3026 cfs_rq->runtime_remaining = 0;
3027 }
3028 }
3029
3030 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
3031 unsigned long delta_exec)
3032 {
3033 /* dock delta_exec before expiring quota (as it could span periods) */
3034 cfs_rq->runtime_remaining -= delta_exec;
3035 expire_cfs_rq_runtime(cfs_rq);
3036
3037 if (likely(cfs_rq->runtime_remaining > 0))
3038 return;
3039
3040 /*
3041 * if we're unable to extend our runtime we resched so that the active
3042 * hierarchy can be throttled
3043 */
3044 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3045 resched_task(rq_of(cfs_rq)->curr);
3046 }
3047
3048 static __always_inline
3049 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
3050 {
3051 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3052 return;
3053
3054 __account_cfs_rq_runtime(cfs_rq, delta_exec);
3055 }
3056
3057 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3058 {
3059 return cfs_bandwidth_used() && cfs_rq->throttled;
3060 }
3061
3062 /* check whether cfs_rq, or any parent, is throttled */
3063 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3064 {
3065 return cfs_bandwidth_used() && cfs_rq->throttle_count;
3066 }
3067
3068 /*
3069 * Ensure that neither of the group entities corresponding to src_cpu or
3070 * dest_cpu are members of a throttled hierarchy when performing group
3071 * load-balance operations.
3072 */
3073 static inline int throttled_lb_pair(struct task_group *tg,
3074 int src_cpu, int dest_cpu)
3075 {
3076 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3077
3078 src_cfs_rq = tg->cfs_rq[src_cpu];
3079 dest_cfs_rq = tg->cfs_rq[dest_cpu];
3080
3081 return throttled_hierarchy(src_cfs_rq) ||
3082 throttled_hierarchy(dest_cfs_rq);
3083 }
3084
3085 /* updated child weight may affect parent so we have to do this bottom up */
3086 static int tg_unthrottle_up(struct task_group *tg, void *data)
3087 {
3088 struct rq *rq = data;
3089 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3090
3091 cfs_rq->throttle_count--;
3092 #ifdef CONFIG_SMP
3093 if (!cfs_rq->throttle_count) {
3094 /* adjust cfs_rq_clock_task() */
3095 cfs_rq->throttled_clock_task_time += rq->clock_task -
3096 cfs_rq->throttled_clock_task;
3097 }
3098 #endif
3099
3100 return 0;
3101 }
3102
3103 static int tg_throttle_down(struct task_group *tg, void *data)
3104 {
3105 struct rq *rq = data;
3106 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3107
3108 /* group is entering throttled state, stop time */
3109 if (!cfs_rq->throttle_count)
3110 cfs_rq->throttled_clock_task = rq->clock_task;
3111 cfs_rq->throttle_count++;
3112
3113 return 0;
3114 }
3115
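/*
 * Dequeue the hierarchy above a cfs_rq whose runtime has run out and
 * mark it throttled until the period timer replenishes it.
 */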
3116 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3117 {
3118 struct rq *rq = rq_of(cfs_rq);
3119 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3120 struct sched_entity *se;
3121 long task_delta, dequeue = 1;
3122
3123 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3124
3125 /* freeze hierarchy runnable averages while throttled */
3126 rcu_read_lock();
3127 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3128 rcu_read_unlock();
3129
3130 task_delta = cfs_rq->h_nr_running;
3131 for_each_sched_entity(se) {
3132 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3133 /* throttled entity or throttle-on-deactivate */
3134 if (!se->on_rq)
3135 break;
3136
3137 if (dequeue)
3138 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3139 qcfs_rq->h_nr_running -= task_delta;
3140
3141 if (qcfs_rq->load.weight)
3142 dequeue = 0;
3143 }
3144
3145 if (!se)
3146 rq->nr_running -= task_delta;
3147
3148 cfs_rq->throttled = 1;
3149 cfs_rq->throttled_clock = rq->clock;
3150 raw_spin_lock(&cfs_b->lock);
3151 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3152 if (!cfs_b->timer_active)
3153 __start_cfs_bandwidth(cfs_b);
3154 raw_spin_unlock(&cfs_b->lock);
3155 }
3156
3157 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3158 {
3159 struct rq *rq = rq_of(cfs_rq);
3160 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3161 struct sched_entity *se;
3162 int enqueue = 1;
3163 long task_delta;
3164
3165 se = cfs_rq->tg->se[cpu_of(rq)];
3166
3167 cfs_rq->throttled = 0;
3168 raw_spin_lock(&cfs_b->lock);
3169 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
3170 list_del_rcu(&cfs_rq->throttled_list);
3171 raw_spin_unlock(&cfs_b->lock);
3172
3173 update_rq_clock(rq);
3174 /* update hierarchical throttle state */
3175 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3176
3177 if (!cfs_rq->load.weight)
3178 return;
3179
3180 task_delta = cfs_rq->h_nr_running;
3181 for_each_sched_entity(se) {
3182 if (se->on_rq)
3183 enqueue = 0;
3184
3185 cfs_rq = cfs_rq_of(se);
3186 if (enqueue)
3187 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3188 cfs_rq->h_nr_running += task_delta;
3189
3190 if (cfs_rq_throttled(cfs_rq))
3191 break;
3192 }
3193
3194 if (!se)
3195 rq->nr_running += task_delta;
3196
3197 /* determine whether we need to wake up potentially idle cpu */
3198 if (rq->curr == rq->idle && rq->cfs.nr_running)
3199 resched_task(rq->curr);
3200 }
3201
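/*
 * Distribute up to @remaining runtime among throttled cfs_rqs,
 * unthrottling those that become runnable again; returns the runtime
 * left over.
 */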
3202 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3203 u64 remaining, u64 expires)
3204 {
3205 struct cfs_rq *cfs_rq;
3206 u64 runtime = remaining;
3207
3208 rcu_read_lock();
3209 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3210 throttled_list) {
3211 struct rq *rq = rq_of(cfs_rq);
3212
3213 raw_spin_lock(&rq->lock);
3214 if (!cfs_rq_throttled(cfs_rq))
3215 goto next;
3216
3217 runtime = -cfs_rq->runtime_remaining + 1;
3218 if (runtime > remaining)
3219 runtime = remaining;
3220 remaining -= runtime;
3221
3222 cfs_rq->runtime_remaining += runtime;
3223 cfs_rq->runtime_expires = expires;
3224
3225 /* we check whether we're throttled above */
3226 if (cfs_rq->runtime_remaining > 0)
3227 unthrottle_cfs_rq(cfs_rq);
3228
3229 next:
3230 raw_spin_unlock(&rq->lock);
3231
3232 if (!remaining)
3233 break;
3234 }
3235 rcu_read_unlock();
3236
3237 return remaining;
3238 }
3239
3240 /*
3241 * Responsible for refilling a task_group's bandwidth and unthrottling its
3242 * cfs_rqs as appropriate. If there has been no activity within the last
3243 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3244 * used to track this state.
3245 */
3246 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3247 {
3248 u64 runtime, runtime_expires;
3249 int idle = 1, throttled;
3250
3251 raw_spin_lock(&cfs_b->lock);
3252 /* no need to continue the timer with no bandwidth constraint */
3253 if (cfs_b->quota == RUNTIME_INF)
3254 goto out_unlock;
3255
3256 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3257 /* idle depends on !throttled (for the case of a large deficit) */
3258 idle = cfs_b->idle && !throttled;
3259 cfs_b->nr_periods += overrun;
3260
3261 /* if we're going inactive then everything else can be deferred */
3262 if (idle)
3263 goto out_unlock;
3264
3265 /*
3266 * if we have relooped after returning idle once, we need to update our
3267 * status as actually running, so that other cpus doing
3268 * __start_cfs_bandwidth will stop trying to cancel us.
3269 */
3270 cfs_b->timer_active = 1;
3271
3272 __refill_cfs_bandwidth_runtime(cfs_b);
3273
3274 if (!throttled) {
3275 /* mark as potentially idle for the upcoming period */
3276 cfs_b->idle = 1;
3277 goto out_unlock;
3278 }
3279
3280 /* account preceding periods in which throttling occurred */
3281 cfs_b->nr_throttled += overrun;
3282
3283 /*
3284 * There are throttled entities so we must first use the new bandwidth
3285 * to unthrottle them before making it generally available. This
3286 * ensures that all existing debts will be paid before a new cfs_rq is
3287 * allowed to run.
3288 */
3289 runtime = cfs_b->runtime;
3290 runtime_expires = cfs_b->runtime_expires;
3291 cfs_b->runtime = 0;
3292
3293 /*
3294 * This check is repeated as we are holding onto the new bandwidth
3295 * while we unthrottle. This can potentially race with an unthrottled
3296 * group trying to acquire new bandwidth from the global pool.
3297 */
3298 while (throttled && runtime > 0) {
3299 raw_spin_unlock(&cfs_b->lock);
3300 /* we can't nest cfs_b->lock while distributing bandwidth */
3301 runtime = distribute_cfs_runtime(cfs_b, runtime,
3302 runtime_expires);
3303 raw_spin_lock(&cfs_b->lock);
3304
3305 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3306 }
3307
3308 /* return (any) remaining runtime */
3309 cfs_b->runtime = runtime;
3310 /*
3311 * While we are ensured activity in the period following an
3312 * unthrottle, this also covers the case in which the new bandwidth is
3313 * insufficient to cover the existing bandwidth deficit. (Forcing the
3314 * timer to remain active while there are any throttled entities.)
3315 */
3316 cfs_b->idle = 0;
3317 out_unlock:
3318 if (idle)
3319 cfs_b->timer_active = 0;
3320 raw_spin_unlock(&cfs_b->lock);
3321
3322 return idle;
3323 }
3324
3325 /* a cfs_rq won't donate quota below this amount */
3326 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3327 /* minimum remaining period time to redistribute slack quota */
3328 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3329 /* how long we wait to gather additional slack before distributing */
3330 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
3331
3332 /*
3333 * Are we near the end of the current quota period?
3334 *
3335 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3336 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3337 * migrate_hrtimers, base is never cleared, so we are fine.
3338 */
3339 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3340 {
3341 struct hrtimer *refresh_timer = &cfs_b->period_timer;
3342 u64 remaining;
3343
3344 /* if the call-back is running a quota refresh is already occurring */
3345 if (hrtimer_callback_running(refresh_timer))
3346 return 1;
3347
3348 /* is a quota refresh about to occur? */
3349 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3350 if (remaining < min_expire)
3351 return 1;
3352
3353 return 0;
3354 }
3355
3356 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3357 {
3358 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3359
3360 /* if there's a quota refresh soon don't bother with slack */
3361 if (runtime_refresh_within(cfs_b, min_left))
3362 return;
3363
3364 start_bandwidth_timer(&cfs_b->slack_timer,
3365 ns_to_ktime(cfs_bandwidth_slack_period));
3366 }
3367
3368 /* we know any runtime found here is valid as update_curr() precedes return */
3369 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3370 {
3371 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3372 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3373
3374 if (slack_runtime <= 0)
3375 return;
3376
3377 raw_spin_lock(&cfs_b->lock);
3378 if (cfs_b->quota != RUNTIME_INF &&
3379 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3380 cfs_b->runtime += slack_runtime;
3381
3382 /* we are under rq->lock, defer unthrottling using a timer */
3383 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3384 !list_empty(&cfs_b->throttled_cfs_rq))
3385 start_cfs_slack_bandwidth(cfs_b);
3386 }
3387 raw_spin_unlock(&cfs_b->lock);
3388
3389 /* even if it's not valid for return we don't want to try again */
3390 cfs_rq->runtime_remaining -= slack_runtime;
3391 }
3392
3393 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3394 {
3395 if (!cfs_bandwidth_used())
3396 return;
3397
3398 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3399 return;
3400
3401 __return_cfs_rq_runtime(cfs_rq);
3402 }
3403
3404 /*
3405 * This is done with a timer (instead of inline with bandwidth return) since
3406 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3407 */
3408 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3409 {
3410 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3411 u64 expires;
3412
3413 /* confirm we're still not at a refresh boundary */
3414 raw_spin_lock(&cfs_b->lock);
3415 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3416 raw_spin_unlock(&cfs_b->lock);
3417 return;
3418 }
3419
3420 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
3421 runtime = cfs_b->runtime;
3422 cfs_b->runtime = 0;
3423 }
3424 expires = cfs_b->runtime_expires;
3425 raw_spin_unlock(&cfs_b->lock);
3426
3427 if (!runtime)
3428 return;
3429
3430 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3431
3432 raw_spin_lock(&cfs_b->lock);
3433 if (expires == cfs_b->runtime_expires)
3434 cfs_b->runtime = runtime;
3435 raw_spin_unlock(&cfs_b->lock);
3436 }
3437
3438 /*
3439 * When a group wakes up we want to make sure that its quota is not already
3440 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
3441 * runtime, as update_curr() throttling cannot trigger until it's on-rq.
3442 */
3443 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3444 {
3445 if (!cfs_bandwidth_used())
3446 return;
3447
3448 /* an active group must be handled by the update_curr()->put() path */
3449 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3450 return;
3451
3452 /* ensure the group is not already throttled */
3453 if (cfs_rq_throttled(cfs_rq))
3454 return;
3455
3456 /* update runtime allocation */
3457 account_cfs_rq_runtime(cfs_rq, 0);
3458 if (cfs_rq->runtime_remaining <= 0)
3459 throttle_cfs_rq(cfs_rq);
3460 }
3461
3462 /* conditionally throttle active cfs_rq's from put_prev_entity() */
3463 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3464 {
3465 if (!cfs_bandwidth_used())
3466 return;
3467
3468 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3469 return;
3470
3471 /*
3472 * it's possible for a throttled entity to be forced into a running
3473 * state (e.g. set_curr_task), in this case we're finished.
3474 */
3475 if (cfs_rq_throttled(cfs_rq))
3476 return;
3477
3478 throttle_cfs_rq(cfs_rq);
3479 }
3480
3481 static inline u64 default_cfs_period(void);
3482 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
3483 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
3484
3485 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
3486 {
3487 struct cfs_bandwidth *cfs_b =
3488 container_of(timer, struct cfs_bandwidth, slack_timer);
3489 do_sched_cfs_slack_timer(cfs_b);
3490
3491 return HRTIMER_NORESTART;
3492 }
3493
3494 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3495 {
3496 struct cfs_bandwidth *cfs_b =
3497 container_of(timer, struct cfs_bandwidth, period_timer);
3498 ktime_t now;
3499 int overrun;
3500 int idle = 0;
3501
3502 for (;;) {
3503 now = hrtimer_cb_get_time(timer);
3504 overrun = hrtimer_forward(timer, now, cfs_b->period);
3505
3506 if (!overrun)
3507 break;
3508
3509 idle = do_sched_cfs_period_timer(cfs_b, overrun);
3510 }
3511
3512 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3513 }
3514
3515 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3516 {
3517 raw_spin_lock_init(&cfs_b->lock);
3518 cfs_b->runtime = 0;
3519 cfs_b->quota = RUNTIME_INF;
3520 cfs_b->period = ns_to_ktime(default_cfs_period());
3521
3522 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
3523 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3524 cfs_b->period_timer.function = sched_cfs_period_timer;
3525 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3526 cfs_b->slack_timer.function = sched_cfs_slack_timer;
3527 }
3528
3529 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3530 {
3531 cfs_rq->runtime_enabled = 0;
3532 INIT_LIST_HEAD(&cfs_rq->throttled_list);
3533 }
3534
3535 /* requires cfs_b->lock, may release to reprogram timer */
3536 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3537 {
3538 /*
3539 * The timer may be active because we're trying to set a new bandwidth
3540 * period or because we're racing with the tear-down path
3541 * (timer_active==0 becomes visible before the hrtimer call-back
3542 * terminates). In either case we ensure that it's re-programmed
3543 */
3544 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3545 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3546 /* bounce the lock to allow do_sched_cfs_period_timer to run */
3547 raw_spin_unlock(&cfs_b->lock);
3548 cpu_relax();
3549 raw_spin_lock(&cfs_b->lock);
3550 /* if someone else restarted the timer then we're done */
3551 if (cfs_b->timer_active)
3552 return;
3553 }
3554
3555 cfs_b->timer_active = 1;
3556 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
3557 }
3558
3559 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3560 {
3561 hrtimer_cancel(&cfs_b->period_timer);
3562 hrtimer_cancel(&cfs_b->slack_timer);
3563 }
3564
3565 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3566 {
3567 struct cfs_rq *cfs_rq;
3568
3569 for_each_leaf_cfs_rq(rq, cfs_rq) {
3570 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3571
3572 if (!cfs_rq->runtime_enabled)
3573 continue;
3574
3575 /*
3576 * clock_task is not advancing so we just need to make sure
3577 * there's some valid quota amount
3578 */
3579 cfs_rq->runtime_remaining = cfs_b->quota;
3580 if (cfs_rq_throttled(cfs_rq))
3581 unthrottle_cfs_rq(cfs_rq);
3582 }
3583 }
3584
3585 #else /* CONFIG_CFS_BANDWIDTH */
3586 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3587 {
3588 return rq_of(cfs_rq)->clock_task;
3589 }
3590
3591 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
3592 unsigned long delta_exec) {}
3593 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3594 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3595 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3596
3597 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3598 {
3599 return 0;
3600 }
3601
3602 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3603 {
3604 return 0;
3605 }
3606
3607 static inline int throttled_lb_pair(struct task_group *tg,
3608 int src_cpu, int dest_cpu)
3609 {
3610 return 0;
3611 }
3612
3613 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3614
3615 #ifdef CONFIG_FAIR_GROUP_SCHED
3616 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3617 #endif
3618
3619 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3620 {
3621 return NULL;
3622 }
3623 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3624 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
3625
3626 #endif /* CONFIG_CFS_BANDWIDTH */
3627
3628 /**************************************************
3629 * CFS operations on tasks:
3630 */
3631
3632 #ifdef CONFIG_SCHED_HRTICK
3633 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3634 {
3635 struct sched_entity *se = &p->se;
3636 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3637
3638 WARN_ON(task_rq(p) != rq);
3639
3640 if (cfs_rq->nr_running > 1) {
3641 u64 slice = sched_slice(cfs_rq, se);
3642 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
3643 s64 delta = slice - ran;
3644
3645 if (delta < 0) {
3646 if (rq->curr == p)
3647 resched_task(p);
3648 return;
3649 }
3650
3651 /*
3652 * Don't schedule slices shorter than 10000ns, that just
3653 * doesn't make sense. Rely on vruntime for fairness.
3654 */
3655 if (rq->curr != p)
3656 delta = max_t(s64, 10000LL, delta);
3657
3658 hrtick_start(rq, delta);
3659 }
3660 }
3661
3662 /*
3663 * called from enqueue/dequeue and updates the hrtick when the
3664 * current task is from our class and nr_running is low enough
3665 * to matter.
3666 */
3667 static void hrtick_update(struct rq *rq)
3668 {
3669 struct task_struct *curr = rq->curr;
3670
3671 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
3672 return;
3673
3674 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
3675 hrtick_start_fair(rq, curr);
3676 }
3677 #else /* !CONFIG_SCHED_HRTICK */
3678 static inline void
3679 hrtick_start_fair(struct rq *rq, struct task_struct *p)
3680 {
3681 }
3682
3683 static inline void hrtick_update(struct rq *rq)
3684 {
3685 }
3686 #endif
3687
3688 #if defined(CONFIG_SCHED_HMP) || defined(CONFIG_MTK_SCHED_CMP)
3689
3690 /* CPU cluster statistics for task migration control */
3691 #define HMP_GB (0x1000)
3692 #define HMP_SELECT_RQ (0x2000)
3693 #define HMP_LB (0x4000)
3694 #define HMP_MAX_LOAD (NICE_0_LOAD - 1)
3695
3696
3697 struct clb_env {
3698 struct clb_stats bstats;
3699 struct clb_stats lstats;
3700 int btarget, ltarget;
3701
3702 struct cpumask *bcpus;
3703 struct cpumask *lcpus;
3704
3705 unsigned int flags;
3706 struct mcheck {
3707 int status; /* Details of this migration check */
3708 int result; /* Indicate whether we should perform this task migration */
3709 } mcheck;
3710 };
3711
3712 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu);
3713
3714 static void collect_cluster_stats(struct clb_stats *clbs,
3715 struct cpumask *cluster_cpus, int target)
3716 {
3717 #define HMP_RESOLUTION_SCALING (4)
3718 #define hmp_scale_down(w) ((w) >> HMP_RESOLUTION_SCALING)
3719
3720 /* Update cluster statistics */
3721 int cpu;
3722 for_each_cpu(cpu, cluster_cpus) {
3723 if (cpu_online(cpu)) {
3724 clbs->ncpu++;
3725 clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running;
3726 clbs->load_avg += cpu_rq(cpu)->cfs.avg.load_avg_ratio;
3727 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
3728 clbs->nr_normal_prio_task += cfs_nr_normal_prio(cpu);
3729 clbs->nr_dequeuing_low_prio += cfs_nr_dequeuing_low_prio(cpu);
3730 #endif
3731 }
3732 }
3733
3734 if (!clbs->ncpu || NR_CPUS == target || !cpumask_test_cpu(target, cluster_cpus))
3735 return;
3736
3737 clbs->cpu_power = (int) arch_scale_freq_power(NULL, target);
3738
3739 /* Scale current CPU compute capacity in accordance with frequency */
3740 clbs->cpu_capacity = HMP_MAX_LOAD;
3741 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
3742 if (hmp_data.freqinvar_load_scale_enabled) {
3743 cpu = cpumask_any(cluster_cpus);
3744 if (freq_scale[cpu].throttling == 1) {
3745 clbs->cpu_capacity *= freq_scale[cpu].curr_scale;
3746 } else {
3747 clbs->cpu_capacity *= freq_scale[cpu].max;
3748 }
3749 clbs->cpu_capacity >>= SCHED_FREQSCALE_SHIFT;
3750
3751 if (clbs->cpu_capacity > HMP_MAX_LOAD){
3752 clbs->cpu_capacity = HMP_MAX_LOAD;
3753 }
3754 }
3755 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
3756 if (topology_cpu_inv_power_en()) {
3757 cpu = cpumask_any(cluster_cpus);
3758 if (topology_cpu_throttling(cpu))
3759 clbs->cpu_capacity *=
3760 (topology_cpu_capacity(cpu) << CPUPOWER_FREQSCALE_SHIFT)
3761 / (topology_max_cpu_capacity(cpu)+1);
3762 else
3763 clbs->cpu_capacity *= topology_max_cpu_capacity(cpu);
3764 clbs->cpu_capacity >>= CPUPOWER_FREQSCALE_SHIFT;
3765
3766 if (clbs->cpu_capacity > HMP_MAX_LOAD){
3767 clbs->cpu_capacity = HMP_MAX_LOAD;
3768 }
3769 }
3770 #endif
3771
3772 /*
3773 * Calculate available CPU capacity
3774 * Calculate available task space
3775 *
3776 * Why should the load ratio be multiplied by the number of tasks?
3777 * The task is the basic scheduling unit, so the scheduler should take
3778 * it into account. Considering the task load alone is not enough;
3779 * multiplying by the number of tasks adjusts the load ratio to a more
3780 * reasonable value.
3781 */
3782 clbs->load_avg /= clbs->ncpu;
3783 clbs->acap = clbs->cpu_capacity - cpu_rq(target)->cfs.avg.load_avg_ratio;
3784 clbs->scaled_acap = hmp_scale_down(clbs->acap);
3785 clbs->scaled_atask = cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.load_avg_ratio;
3786 clbs->scaled_atask = clbs->cpu_capacity - clbs->scaled_atask;
3787 clbs->scaled_atask = hmp_scale_down(clbs->scaled_atask);
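/*
 * Example (illustrative numbers only, assuming NICE_0_LOAD == 1024 so
 * HMP_MAX_LOAD == 1023): with cpu_capacity = 1023, a target runqueue
 * load_avg_ratio of 400 and h_nr_running = 2,
 *   acap         = 1023 - 400            = 623, scaled_acap  = 623 >> 4 = 38
 *   scaled_atask = (1023 - 2 * 400) >> 4 = 223 >> 4           = 13
 * i.e. both values are head-room estimates shifted down by
 * HMP_RESOLUTION_SCALING to keep the later threshold arithmetic small.
 */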
3788
3789 mt_sched_printf("[%s] cpu/cluster:%d/%02lx load/len:%lu/%u stats:%d,%d,%d,%d,%d,%d,%d,%d\n", __func__,
3790 target, *cpumask_bits(cluster_cpus),
3791 cpu_rq(target)->cfs.avg.load_avg_ratio, cpu_rq(target)->cfs.h_nr_running,
3792 clbs->ncpu, clbs->ntask, clbs->load_avg, clbs->cpu_capacity,
3793 clbs->acap, clbs->scaled_acap, clbs->scaled_atask, clbs->threshold);
3794 }
3795
3796 //#define USE_HMP_DYNAMIC_THRESHOLD
3797 #if defined(CONFIG_SCHED_HMP) && defined(USE_HMP_DYNAMIC_THRESHOLD)
3798 static inline void hmp_dynamic_threshold(struct clb_env *clbenv);
3799 #endif
3800
3801 /*
3802 * Task Dynamic Migration Threshold Adjustment.
3803 *
3804 * If the workload between clusters is not balanced, adjust migration
3805 * threshold in an attempt to move task precisely.
3806 *
3807 * Diff. = Max Threshold - Min Threshold
3808 *
3809 * Dynamic UP-Threshold =
3810 * B_nacap B_natask
3811 * Max Threshold - Diff. x ----------------- x -------------------
3812 * B_nacap + L_nacap B_natask + L_natask
3813 *
3814 *
3815 * Dynamic Down-Threshold =
3816 * L_nacap L_natask
3817 * Min Threshold + Diff. x ----------------- x -------------------
3818 * B_nacap + L_nacap B_natask + L_natask
3819 */
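/*
 * Illustrative corner cases (assuming HMP_MAX_LOAD == 1023 and the implicit
 * Min Threshold of 0 used by adj_threshold() below): when the big cluster is
 * idle and the little cluster is saturated, B_nacap ~= B_natask ~= 1023 while
 * L_nacap ~= L_natask ~= 0, so the up-threshold collapses towards 0
 * (up-migration becomes easy) and the down-threshold also stays at 0
 * (down-migration becomes hard). In the opposite case the up-threshold
 * saturates at 1023 and the down-threshold rises towards 1023, favouring
 * down-migration instead.
 */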
3820 static void adj_threshold(struct clb_env *clbenv)
3821 {
3822 #define TSKLD_SHIFT (2)
3823 #define POSITIVE(x) ((int)(x) < 0 ? 0 : (x))
3824
3825 int bcpu, lcpu;
3826 unsigned long b_cap=0, l_cap=0;
3827 unsigned long b_load=0, l_load=0;
3828 unsigned long b_task=0, l_task=0;
3829 int b_nacap, l_nacap, b_natask, l_natask;
3830
3831 #if defined(CONFIG_SCHED_HMP) && defined(USE_HMP_DYNAMIC_THRESHOLD)
3832 hmp_dynamic_threshold(clbenv);
3833 return;
3834 #endif
3835
3836 bcpu = clbenv->btarget;
3837 lcpu = clbenv->ltarget;
3838 if (bcpu < nr_cpu_ids) {
3839 b_load = cpu_rq(bcpu)->cfs.avg.load_avg_ratio;
3840 b_task = cpu_rq(bcpu)->cfs.h_nr_running;
3841 }
3842 if (lcpu < nr_cpu_ids) {
3843 l_load = cpu_rq(lcpu)->cfs.avg.load_avg_ratio;
3844 l_task = cpu_rq(lcpu)->cfs.h_nr_running;
3845 }
3846
3847 #ifdef CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY
3848 if (bcpu < nr_cpu_ids) {
3849 b_cap = topology_cpu_capacity(bcpu);
3850 }
3851 if (lcpu < nr_cpu_ids) {
3852 l_cap = topology_cpu_capacity(lcpu);
3853 }
3854
3855 b_nacap = POSITIVE(b_cap - b_load);
3856 b_natask = POSITIVE(b_cap - ((b_task * b_load) >> TSKLD_SHIFT));
3857 l_nacap = POSITIVE(l_cap - l_load);
3858 l_natask = POSITIVE(l_cap - ((l_task * l_load) >> TSKLD_SHIFT));
3859 #else /* !CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY */
3860 b_cap = clbenv->bstats.cpu_power;
3861 l_cap = clbenv->lstats.cpu_power;
3862 b_nacap = POSITIVE(clbenv->bstats.scaled_acap *
3863 clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1));
3864 b_natask = POSITIVE(clbenv->bstats.scaled_atask *
3865 clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1));
3866 l_nacap = POSITIVE(clbenv->lstats.scaled_acap);
3867 l_natask = POSITIVE(clbenv->lstats.scaled_atask);
3868
3869 #endif /* CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY */
3870
3871 clbenv->bstats.threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask /
3872 ((b_nacap + l_nacap) * (b_natask + l_natask)+1);
3873 clbenv->lstats.threshold = HMP_MAX_LOAD * l_nacap * l_natask /
3874 ((b_nacap + l_nacap) * (b_natask + l_natask)+1);
3875
3876 mt_sched_printf("[%s]\tup/dl:%4d/%4d L(%d:%4lu,%4lu/%4lu) b(%d:%4lu,%4lu/%4lu)\n", __func__,
3877 clbenv->bstats.threshold, clbenv->lstats.threshold,
3878 lcpu, l_load, l_task, l_cap,
3879 bcpu, b_load, b_task, b_cap);
3880 }
3881
3882 static void sched_update_clbstats(struct clb_env *clbenv)
3883 {
3884 collect_cluster_stats(&clbenv->bstats, clbenv->bcpus, clbenv->btarget);
3885 collect_cluster_stats(&clbenv->lstats, clbenv->lcpus, clbenv->ltarget);
3886 adj_threshold(clbenv);
3887 }
3888 #endif /* CONFIG_SCHED_HMP || CONFIG_MTK_SCHED_CMP */
3889
3890
3891 #ifdef CONFIG_SCHED_HMP
3892 /*
3893 * Heterogeneous multiprocessor (HMP) optimizations
3894 *
3895 * The cpu types are distinguished using a list of hmp_domains
3896 * which each represent one cpu type using a cpumask.
3897 * The list is assumed ordered by compute capacity with the
3898 * fastest domain first.
3899 */
3900 DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
3901 /* We need to know which cpus are fast and slow. */
3902 extern struct cpumask hmp_fast_cpu_mask;
3903 extern struct cpumask hmp_slow_cpu_mask;
3904
3905 extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
3906
3907 /* Setup hmp_domains */
3908 static int __init hmp_cpu_mask_setup(void)
3909 {
3910 char buf[64];
3911 struct hmp_domain *domain;
3912 struct list_head *pos;
3913 int dc, cpu;
3914
3915 #if defined(CONFIG_SCHED_HMP_ENHANCEMENT) || \
3916 defined(CONFIG_MT_RT_SCHED) || defined(CONFIG_MT_RT_SCHED_LOG)
3917 cpumask_clear(&hmp_fast_cpu_mask);
3918 cpumask_clear(&hmp_slow_cpu_mask);
3919 #endif
3920
3921 pr_debug("Initializing HMP scheduler:\n");
3922
3923 /* Initialize hmp_domains using platform code */
3924 arch_get_hmp_domains(&hmp_domains);
3925 if (list_empty(&hmp_domains)) {
3926 pr_debug("HMP domain list is empty!\n");
3927 return 0;
3928 }
3929
3930 /* Print hmp_domains */
3931 dc = 0;
3932 list_for_each(pos, &hmp_domains) {
3933 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3934 cpulist_scnprintf(buf, 64, &domain->possible_cpus);
3935 pr_debug(" HMP domain %d: %s\n", dc, buf);
3936
3937 /*
3938 * According to the description of "arch_get_hmp_domains", the
3939 * fastest domain is at the head of the list. Thus, the fast-cpu mask
3940 * should be initialized first, followed by the slow-cpu mask.
3941 */
3942 #if defined(CONFIG_SCHED_HMP_ENHANCEMENT) || \
3943 defined(CONFIG_MT_RT_SCHED) || defined(CONFIG_MT_RT_SCHED_LOG)
3944 if(cpumask_empty(&hmp_fast_cpu_mask)) {
3945 cpumask_copy(&hmp_fast_cpu_mask,&domain->possible_cpus);
3946 for_each_cpu(cpu, &hmp_fast_cpu_mask)
3947 pr_debug(" HMP fast cpu : %d\n",cpu);
3948 } else if (cpumask_empty(&hmp_slow_cpu_mask)){
3949 cpumask_copy(&hmp_slow_cpu_mask,&domain->possible_cpus);
3950 for_each_cpu(cpu, &hmp_slow_cpu_mask)
3951 pr_debug(" HMP slow cpu : %d\n",cpu);
3952 }
3953 #endif
3954
3955 for_each_cpu_mask(cpu, domain->possible_cpus) {
3956 per_cpu(hmp_cpu_domain, cpu) = domain;
3957 }
3958 dc++;
3959 }
3960
3961 return 1;
3962 }
3963
3964 static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu)
3965 {
3966 struct hmp_domain *domain;
3967 struct list_head *pos;
3968
3969 list_for_each(pos, &hmp_domains) {
3970 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3971 if(cpumask_test_cpu(cpu, &domain->possible_cpus))
3972 return domain;
3973 }
3974 return NULL;
3975 }
3976
3977 static void hmp_online_cpu(int cpu)
3978 {
3979 struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
3980
3981 if(domain)
3982 cpumask_set_cpu(cpu, &domain->cpus);
3983 }
3984
3985 static void hmp_offline_cpu(int cpu)
3986 {
3987 struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
3988
3989 if(domain)
3990 cpumask_clear_cpu(cpu, &domain->cpus);
3991 }
3992
3993 /*
3994 * Migration thresholds should be in the range [0..1023]
3995 * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
3996 * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
3997 * The default values (512, 256) offer good responsiveness, but may need
3998 * tweaking to suit particular needs.
3999 *
4000 * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio)
4001 * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
4002 * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
4003 */
4004 #ifdef CONFIG_HMP_DYNAMIC_THRESHOLD
4005 unsigned int hmp_up_threshold = 1023;
4006 unsigned int hmp_down_threshold = 0;
4007 #else
4008 unsigned int hmp_up_threshold = 512;
4009 unsigned int hmp_down_threshold = 256;
4010 #endif
4011
4012 unsigned int hmp_next_up_threshold = 4096;
4013 unsigned int hmp_next_down_threshold = 4096;
4014 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
4015 #define hmp_last_up_migration(cpu) \
4016 cpu_rq(cpu)->cfs.avg.hmp_last_up_migration
4017 #define hmp_last_down_migration(cpu) \
4018 cpu_rq(cpu)->cfs.avg.hmp_last_down_migration
4019 static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p,
4020 int prev_cpu, int new_cpu);
4021 #else
4022 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se);
4023 static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
4024 #endif
4025 static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
4026 int *min_cpu);
4027
4028 /* Check if cpu is in fastest hmp_domain */
4029 static inline unsigned int hmp_cpu_is_fastest(int cpu)
4030 {
4031 struct list_head *pos;
4032
4033 pos = &hmp_cpu_domain(cpu)->hmp_domains;
4034 return pos == hmp_domains.next;
4035 }
4036
4037 /* Check if cpu is in slowest hmp_domain */
4038 static inline unsigned int hmp_cpu_is_slowest(int cpu)
4039 {
4040 struct list_head *pos;
4041
4042 pos = &hmp_cpu_domain(cpu)->hmp_domains;
4043 return list_is_last(pos, &hmp_domains);
4044 }
4045
4046 /* Next (slower) hmp_domain relative to cpu */
4047 static inline struct hmp_domain *hmp_slower_domain(int cpu)
4048 {
4049 struct list_head *pos;
4050
4051 pos = &hmp_cpu_domain(cpu)->hmp_domains;
4052 return list_entry(pos->next, struct hmp_domain, hmp_domains);
4053 }
4054
4055 /* Previous (faster) hmp_domain relative to cpu */
4056 static inline struct hmp_domain *hmp_faster_domain(int cpu)
4057 {
4058 struct list_head *pos;
4059
4060 pos = &hmp_cpu_domain(cpu)->hmp_domains;
4061 return list_entry(pos->prev, struct hmp_domain, hmp_domains);
4062 }
4063
4064 /*
4065 * Selects a cpu in previous (faster) hmp_domain
4066 * Note that cpumask_any_and() returns the first cpu in the cpumask
4067 */
4068 static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
4069 int cpu)
4070 {
4071 int lowest_cpu=NR_CPUS;
4072 __always_unused int lowest_ratio = hmp_domain_min_load(hmp_faster_domain(cpu), &lowest_cpu);
4073 /*
4074 * If the lowest-loaded CPU in the domain is allowed by the task affinity,
4075 * select that one; otherwise select one which is allowed.
4076 */
4077 if(lowest_cpu < nr_cpu_ids && cpumask_test_cpu(lowest_cpu,tsk_cpus_allowed(tsk)))
4078 return lowest_cpu;
4079 else
4080 return cpumask_any_and(&hmp_faster_domain(cpu)->cpus,
4081 tsk_cpus_allowed(tsk));
4082 }
4083
4084 /*
4085 * Selects a cpu in next (slower) hmp_domain
4086 * Note that cpumask_any_and() returns the first cpu in the cpumask
4087 */
4088 static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
4089 int cpu)
4090 {
4091 int lowest_cpu=NR_CPUS;
4092 __always_unused int lowest_ratio = hmp_domain_min_load(hmp_slower_domain(cpu), &lowest_cpu);
4093 /*
4094 * If the lowest-loaded CPU in the domain is allowed by the task affinity,
4095 * select that one; otherwise select one which is allowed.
4096 */
4097 if(lowest_cpu < nr_cpu_ids && cpumask_test_cpu(lowest_cpu,tsk_cpus_allowed(tsk)))
4098 return lowest_cpu;
4099 else
4100 return cpumask_any_and(&hmp_slower_domain(cpu)->cpus,
4101 tsk_cpus_allowed(tsk));
4102 }
4103
4104 static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
4105 {
4106 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
4107 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4108 hmp_last_up_migration(cpu) = cfs_rq_clock_task(cfs_rq);
4109 hmp_last_down_migration(cpu) = 0;
4110 #else
4111 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4112
4113 se->avg.hmp_last_up_migration = cfs_rq_clock_task(cfs_rq);
4114 se->avg.hmp_last_down_migration = 0;
4115 #endif
4116 }
4117
4118 static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
4119 {
4120 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
4121 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4122 hmp_last_down_migration(cpu) = cfs_rq_clock_task(cfs_rq);
4123 hmp_last_up_migration(cpu) = 0;
4124 #else
4125 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4126
4127 se->avg.hmp_last_down_migration = cfs_rq_clock_task(cfs_rq);
4128 se->avg.hmp_last_up_migration = 0;
4129 #endif
4130 }
4131
4132 #ifdef CONFIG_HMP_VARIABLE_SCALE
4133 /*
4134 * Heterogeneous multiprocessor (HMP) optimizations
4135 *
4136 * These functions allow changing the ramp-up speed of the load_avg_ratio;
4137 * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms.
4138 * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
4139 *
4140 * These functions also allow changing the up and down thresholds of HMP
4141 * using /sys/kernel/hmp/{up,down}_threshold.
4142 * Both must be between 0 and 1023. The threshold that is compared
4143 * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
4144 *
4145 * For instance, if load_avg_period_ms = 64 and up_threshold = 512, an idle
4146 * task with a load of 0 will reach the threshold after 64ms of busy-looping.
4147 *
4148 * Changing load_avg_period_ms has the same effect as changing the
4149 * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
4150 * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the latter
4151 * could trigger overflows.
4152 * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
4153 * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
4154 * could overflow for a weight > 2^12 even though the load_avg_contrib
4155 * should still be a 32-bit result. This would not happen when multiplying
4156 * the delta time by 1/22 and setting load_avg_period_ms = 706.
4157 */
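/*
 * Illustrative usage from user space (paths as created by hmp_attr_init()
 * below; the values are examples only):
 *
 *   echo 700 > /sys/kernel/hmp/up_threshold
 *   echo 300 > /sys/kernel/hmp/down_threshold
 *   echo 16  > /sys/kernel/hmp/load_avg_period_ms
 *
 * frequency_invariant_load_scale is also exposed in the same directory when
 * CONFIG_HMP_FREQUENCY_INVARIANT_SCALE is set.
 */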
4158
4159 /*
4160 * Scaling the delta time ends up increasing or decreasing the
4161 * ramp-up speed of the per-entity load_avg_ratio.
4162 * The scale factor hmp_data.multiplier is a fixed-point
4163 * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
4164 */
4165 static u64 hmp_variable_scale_convert(u64 delta)
4166 {
4167 u64 high = delta >> 32ULL;
4168 u64 low = delta & 0xffffffffULL;
4169 low *= hmp_data.multiplier;
4170 high *= hmp_data.multiplier;
4171 return (low >> HMP_VARIABLE_SCALE_SHIFT)
4172 + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
4173 }
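/*
 * Worked example (assuming HMP_VARIABLE_SCALE_SHIFT == 16): writing
 * load_avg_period_ms = 16 makes hmp_data.multiplier equal to
 * (32 << 16) / 16 == 2 << 16, so a delta of 1000000 ns is converted to
 * 2000000 ns and the load signal ramps twice as fast as the default
 * LOAD_AVG_PERIOD = 32 ms behaviour.
 */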
4174
4175 static ssize_t hmp_show(struct kobject *kobj,
4176 struct attribute *attr, char *buf)
4177 {
4178 ssize_t ret = 0;
4179 struct hmp_global_attr *hmp_attr =
4180 container_of(attr, struct hmp_global_attr, attr);
4181 int temp = *(hmp_attr->value);
4182 if (hmp_attr->to_sysfs != NULL)
4183 temp = hmp_attr->to_sysfs(temp);
4184 ret = sprintf(buf, "%d\n", temp);
4185 return ret;
4186 }
4187
4188 static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
4189 const char *buf, size_t count)
4190 {
4191 int temp;
4192 ssize_t ret = count;
4193 struct hmp_global_attr *hmp_attr =
4194 container_of(attr, struct hmp_global_attr, attr);
4195 char *str = vmalloc(count + 1);
4196 if (str == NULL)
4197 return -ENOMEM;
4198 memcpy(str, buf, count);
4199 str[count] = 0;
4200 if (sscanf(str, "%d", &temp) < 1)
4201 ret = -EINVAL;
4202 else {
4203 if (hmp_attr->from_sysfs != NULL)
4204 temp = hmp_attr->from_sysfs(temp);
4205 if (temp < 0)
4206 ret = -EINVAL;
4207 else
4208 *(hmp_attr->value) = temp;
4209 }
4210 vfree(str);
4211 return ret;
4212 }
4213
4214 static int hmp_period_tofrom_sysfs(int value)
4215 {
4216 return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
4217 }
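/*
 * Note: hmp_period_tofrom_sysfs() is its own inverse (up to integer
 * truncation). For example, assuming HMP_VARIABLE_SCALE_SHIFT == 16:
 * f(64) = (32 << 16) / 64 = 32768 and f(32768) = 64 again, which is why the
 * same helper is registered as both the to_sysfs and from_sysfs conversion
 * for load_avg_period_ms.
 */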
4218
4219 /* max value for threshold is 1024 */
4220 static int hmp_theshold_from_sysfs(int value)
4221 {
4222 if (value > 1024)
4223 return -1;
4224 return value;
4225 }
4226 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
4227 /* freqinvar control is only 0,1 off/on */
4228 static int hmp_freqinvar_from_sysfs(int value)
4229 {
4230 if (value < 0 || value > 1)
4231 return -1;
4232 return value;
4233 }
4234 #endif
4235 static void hmp_attr_add(
4236 const char *name,
4237 int *value,
4238 int (*to_sysfs)(int),
4239 int (*from_sysfs)(int))
4240 {
4241 int i = 0;
4242 while (hmp_data.attributes[i] != NULL) {
4243 i++;
4244 if (i >= HMP_DATA_SYSFS_MAX)
4245 return;
4246 }
4247 hmp_data.attr[i].attr.mode = 0644;
4248 hmp_data.attr[i].show = hmp_show;
4249 hmp_data.attr[i].store = hmp_store;
4250 hmp_data.attr[i].attr.name = name;
4251 hmp_data.attr[i].value = value;
4252 hmp_data.attr[i].to_sysfs = to_sysfs;
4253 hmp_data.attr[i].from_sysfs = from_sysfs;
4254 hmp_data.attributes[i] = &hmp_data.attr[i].attr;
4255 hmp_data.attributes[i + 1] = NULL;
4256 }
4257
4258 static int hmp_attr_init(void)
4259 {
4260 int ret;
4261 memset(&hmp_data, 0, sizeof(hmp_data));
4262 /*
4263 * By default load_avg_period_ms == LOAD_AVG_PERIOD, meaning no change.
4264 * However, LOAD_AVG_PERIOD is too short to trigger the heavy-task
4265 * indicator, so we change it to LOAD_AVG_VARIABLE_PERIOD.
4266 */
4267 hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_VARIABLE_PERIOD);
4268
4269 hmp_attr_add("load_avg_period_ms",
4270 &hmp_data.multiplier,
4271 hmp_period_tofrom_sysfs,
4272 hmp_period_tofrom_sysfs);
4273 hmp_attr_add("up_threshold",
4274 &hmp_up_threshold,
4275 NULL,
4276 hmp_theshold_from_sysfs);
4277 hmp_attr_add("down_threshold",
4278 &hmp_down_threshold,
4279 NULL,
4280 hmp_theshold_from_sysfs);
4281 hmp_attr_add("init_task_load_period",
4282 &init_task_load_period,
4283 NULL,
4284 NULL);
4285 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
4286 /* default frequency-invariant scaling ON */
4287 hmp_data.freqinvar_load_scale_enabled = 1;
4288 hmp_attr_add("frequency_invariant_load_scale",
4289 &hmp_data.freqinvar_load_scale_enabled,
4290 NULL,
4291 hmp_freqinvar_from_sysfs);
4292 #endif
4293 hmp_data.attr_group.name = "hmp";
4294 hmp_data.attr_group.attrs = hmp_data.attributes;
4295 ret = sysfs_create_group(kernel_kobj,
4296 &hmp_data.attr_group);
4297 return ret;
4298 }
4299 late_initcall(hmp_attr_init);
4300 #endif /* CONFIG_HMP_VARIABLE_SCALE */
4301
4302 static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
4303 int *min_cpu)
4304 {
4305 int cpu;
4306 int min_cpu_runnable_temp = NR_CPUS;
4307 unsigned long min_runnable_load = INT_MAX;
4308 unsigned long contrib;
4309
4310 for_each_cpu_mask(cpu, hmpd->cpus) {
4311 /* don't use the divisor in the loop, just at the end */
4312 contrib = cpu_rq(cpu)->avg.runnable_avg_sum * scale_load_down(1024);
4313 if (contrib < min_runnable_load) {
4314 min_runnable_load = contrib;
4315 min_cpu_runnable_temp = cpu;
4316 }
4317 }
4318
4319 if (min_cpu)
4320 *min_cpu = min_cpu_runnable_temp;
4321
4322 /* domain will often have at least one empty CPU */
4323 return min_runnable_load ? min_runnable_load / (LOAD_AVG_MAX + 1) : 0;
4324 }
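/*
 * Scale of the returned value (sketch, assuming scale_load_down(1024) == 1024):
 * an idle CPU contributes ~0, while a fully busy CPU has
 * runnable_avg_sum ~= LOAD_AVG_MAX and therefore returns roughly
 * LOAD_AVG_MAX * 1024 / (LOAD_AVG_MAX + 1) ~= 1023, i.e. about NICE_0_LOAD.
 * This is why callers such as hmp_offload_down() compare the result against
 * NICE_0_LOAD >> 1 as an approximately 50% utilization mark.
 */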
4325
4326 /*
4327 * Calculate the task starvation
4328 * This is the ratio of actually running time vs. runnable time.
4329 * If the two are equal the task is getting the cpu time it needs or
4330 * it is alone on the cpu and the cpu is fully utilized.
4331 */
4332 static inline unsigned int hmp_task_starvation(struct sched_entity *se)
4333 {
4334 u32 starvation;
4335
4336 starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD);
4337 starvation /= (se->avg.runnable_avg_sum + 1);
4338
4339 return scale_load(starvation);
4340 }
4341
4342 static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se)
4343 {
4344 int min_usage;
4345 int dest_cpu = NR_CPUS;
4346
4347 if (hmp_cpu_is_slowest(cpu))
4348 return NR_CPUS;
4349
4350 /* Is the current domain fully loaded? */
4351 /* load < ~50% */
4352 min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL);
4353 if (min_usage < (NICE_0_LOAD>>1))
4354 return NR_CPUS;
4355
4356 /* Is the task alone on the cpu? */
4357 if (cpu_rq(cpu)->cfs.nr_running < 2)
4358 return NR_CPUS;
4359
4360 /* Is the task actually starving? */
4361 /* running < 75% of its runnable time (i.e. waiting >= 25%) = starving */
4362 if (hmp_task_starvation(se) > 768)
4363 return NR_CPUS;
4364
4365 /* Does the slower domain have spare cycles? */
4366 min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu);
4367 /* load > 50% */
4368 if (min_usage > NICE_0_LOAD/2)
4369 return NR_CPUS;
4370
4371 if (cpumask_test_cpu(dest_cpu, &hmp_slower_domain(cpu)->cpus))
4372 return dest_cpu;
4373
4374 return NR_CPUS;
4375 }
4376 #endif /* CONFIG_SCHED_HMP */
4377
4378
4379 #ifdef CONFIG_MTK_SCHED_CMP
4380 /* Load thresholds used by CMP task-group scheduling for big/little placement at wakeup */
4381 unsigned int cmp_up_threshold = 512;
4382 unsigned int cmp_down_threshold = 256;
4383 #endif /* CONFIG_MTK_SCHED_CMP */
4384
4385 #ifdef CONFIG_MTK_SCHED_CMP_TGS
4386 static void sched_tg_enqueue_fair(struct rq *rq, struct task_struct *p)
4387 {
4388 int id;
4389 unsigned long flags;
4390 struct task_struct *tg = p->group_leader;
4391
4392 if (group_leader_is_empty(p))
4393 return;
4394 id = get_cluster_id(rq->cpu);
4395 if (unlikely(WARN_ON(id < 0)))
4396 return;
4397
4398 raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags);
4399 tg->thread_group_info[id].cfs_nr_running++;
4400 raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags);
4401 }
4402
4403 static void sched_tg_dequeue_fair(struct rq *rq, struct task_struct *p)
4404 {
4405 int id;
4406 unsigned long flags;
4407 struct task_struct *tg = p->group_leader;
4408
4409 if (group_leader_is_empty(p))
4410 return;
4411 id = get_cluster_id(rq->cpu);
4412 if (unlikely(WARN_ON(id < 0)))
4413 return;
4414
4415 raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags);
4416 tg->thread_group_info[id].cfs_nr_running--;
4417 raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags);
4418 }
4419
4420 #endif
4421 /*
4422 * The enqueue_task method is called before nr_running is
4423 * increased. Here we update the fair scheduling stats and
4424 * then put the task into the rbtree:
4425 */
4426 static void
4427 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4428 {
4429 struct cfs_rq *cfs_rq;
4430 struct sched_entity *se = &p->se;
4431
4432 for_each_sched_entity(se) {
4433 if (se->on_rq)
4434 break;
4435 cfs_rq = cfs_rq_of(se);
4436 enqueue_entity(cfs_rq, se, flags);
4437
4438 /*
4439 * end evaluation on encountering a throttled cfs_rq
4440 *
4441 * note: in the case of encountering a throttled cfs_rq we will
4442 * post the final h_nr_running increment below.
4443 */
4444 if (cfs_rq_throttled(cfs_rq))
4445 break;
4446 cfs_rq->h_nr_running++;
4447
4448 flags = ENQUEUE_WAKEUP;
4449 }
4450
4451 for_each_sched_entity(se) {
4452 cfs_rq = cfs_rq_of(se);
4453 cfs_rq->h_nr_running++;
4454
4455 if (cfs_rq_throttled(cfs_rq))
4456 break;
4457
4458 update_cfs_shares(cfs_rq);
4459 update_entity_load_avg(se, 1);
4460 }
4461
4462 if (!se) {
4463 update_rq_runnable_avg(rq, rq->nr_running);
4464 inc_nr_running(rq);
4465 #ifndef CONFIG_CFS_BANDWIDTH
4466 BUG_ON(rq->cfs.nr_running > rq->cfs.h_nr_running);
4467 #endif
4468 }
4469 hrtick_update(rq);
4470 #ifdef CONFIG_HMP_TRACER
4471 trace_sched_runqueue_length(rq->cpu,rq->nr_running);
4472 trace_sched_cfs_length(rq->cpu,rq->cfs.h_nr_running);
4473 #endif
4474 #ifdef CONFIG_MET_SCHED_HMP
4475 RqLen(rq->cpu,rq->nr_running);
4476 CfsLen(rq->cpu,rq->cfs.h_nr_running);
4477 #endif
4478
4479 #ifdef CONFIG_MTK_SCHED_CMP_TGS
4480 sched_tg_enqueue_fair(rq, p);
4481 #endif
4482 }
4483
4484 static void set_next_buddy(struct sched_entity *se);
4485
4486 /*
4487 * The dequeue_task method is called before nr_running is
4488 * decreased. We remove the task from the rbtree and
4489 * update the fair scheduling stats:
4490 */
4491 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4492 {
4493 struct cfs_rq *cfs_rq;
4494 struct sched_entity *se = &p->se;
4495 int task_sleep = flags & DEQUEUE_SLEEP;
4496
4497 for_each_sched_entity(se) {
4498 cfs_rq = cfs_rq_of(se);
4499 dequeue_entity(cfs_rq, se, flags);
4500
4501 /*
4502 * end evaluation on encountering a throttled cfs_rq
4503 *
4504 * note: in the case of encountering a throttled cfs_rq we will
4505 * post the final h_nr_running decrement below.
4506 */
4507 if (cfs_rq_throttled(cfs_rq))
4508 break;
4509 cfs_rq->h_nr_running--;
4510
4511 /* Don't dequeue parent if it has other entities besides us */
4512 if (cfs_rq->load.weight) {
4513 /*
4514 * Bias pick_next to pick a task from this cfs_rq, as
4515 * p is sleeping when it is within its sched_slice.
4516 */
4517 if (task_sleep && parent_entity(se))
4518 set_next_buddy(parent_entity(se));
4519
4520 /* avoid re-evaluating load for this entity */
4521 se = parent_entity(se);
4522 break;
4523 }
4524 flags |= DEQUEUE_SLEEP;
4525 }
4526
4527 for_each_sched_entity(se) {
4528 cfs_rq = cfs_rq_of(se);
4529 cfs_rq->h_nr_running--;
4530
4531 if (cfs_rq_throttled(cfs_rq))
4532 break;
4533
4534 update_cfs_shares(cfs_rq);
4535 update_entity_load_avg(se, 1);
4536 }
4537
4538 if (!se) {
4539 dec_nr_running(rq);
4540 #ifndef CONFIG_CFS_BANDWIDTH
4541 BUG_ON(rq->cfs.nr_running > rq->cfs.h_nr_running);
4542 #endif
4543 update_rq_runnable_avg(rq, 1);
4544 }
4545 hrtick_update(rq);
4546 #ifdef CONFIG_HMP_TRACER
4547 trace_sched_runqueue_length(rq->cpu,rq->nr_running);
4548 trace_sched_cfs_length(rq->cpu,rq->cfs.h_nr_running);
4549 #endif
4550 #ifdef CONFIG_MET_SCHED_HMP
4551 RqLen(rq->cpu,rq->nr_running);
4552 CfsLen(rq->cpu,rq->cfs.h_nr_running);
4553 #endif
4554
4555 #ifdef CONFIG_MTK_SCHED_CMP_TGS
4556 sched_tg_dequeue_fair(rq, p);
4557 #endif
4558 }
4559
4560 #ifdef CONFIG_SMP
4561 /* Used instead of source_load when we know the type == 0 */
4562 static unsigned long weighted_cpuload(const int cpu)
4563 {
4564 return cpu_rq(cpu)->cfs.runnable_load_avg;
4565 }
4566
4567 /*
4568 * Return a low guess at the load of a migration-source cpu weighted
4569 * according to the scheduling class and "nice" value.
4570 *
4571 * We want to under-estimate the load of migration sources, to
4572 * balance conservatively.
4573 */
4574 static unsigned long source_load(int cpu, int type)
4575 {
4576 struct rq *rq = cpu_rq(cpu);
4577 unsigned long total = weighted_cpuload(cpu);
4578
4579 if (type == 0 || !sched_feat(LB_BIAS))
4580 return total;
4581
4582 return min(rq->cpu_load[type-1], total);
4583 }
4584
4585 /*
4586 * Return a high guess at the load of a migration-target cpu weighted
4587 * according to the scheduling class and "nice" value.
4588 */
4589 static unsigned long target_load(int cpu, int type)
4590 {
4591 struct rq *rq = cpu_rq(cpu);
4592 unsigned long total = weighted_cpuload(cpu);
4593
4594 if (type == 0 || !sched_feat(LB_BIAS))
4595 return total;
4596
4597 return max(rq->cpu_load[type-1], total);
4598 }
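/*
 * Example of the bias (illustrative numbers): with LB_BIAS enabled,
 * cpu_load[type-1] == 800 and an instantaneous runnable_load_avg of 600,
 * source_load() reports min(800, 600) == 600 (a low guess for the cpu we
 * might pull from) while target_load() reports max(800, 600) == 800
 * (a high guess for the cpu we might push to), which makes the balancer
 * conservative about moving tasks.
 */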
4599
4600 static unsigned long power_of(int cpu)
4601 {
4602 return cpu_rq(cpu)->cpu_power;
4603 }
4604
4605 static unsigned long cpu_avg_load_per_task(int cpu)
4606 {
4607 struct rq *rq = cpu_rq(cpu);
4608 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
4609 unsigned long load_avg = rq->cfs.runnable_load_avg;
4610
4611 if (nr_running)
4612 return load_avg / nr_running;
4613
4614 return 0;
4615 }
4616
4617
4618 static void task_waking_fair(struct task_struct *p)
4619 {
4620 struct sched_entity *se = &p->se;
4621 struct cfs_rq *cfs_rq = cfs_rq_of(se);
4622 u64 min_vruntime;
4623
4624 #ifndef CONFIG_64BIT
4625 u64 min_vruntime_copy;
4626
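/*
 * On 32-bit, a 64-bit min_vruntime read can tear. The update side (see
 * update_min_vruntime()) writes min_vruntime, issues an smp_wmb() and then
 * writes min_vruntime_copy, so re-reading here until both values match
 * yields a consistent snapshot.
 */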
4627 do {
4628 min_vruntime_copy = cfs_rq->min_vruntime_copy;
4629 smp_rmb();
4630 min_vruntime = cfs_rq->min_vruntime;
4631 } while (min_vruntime != min_vruntime_copy);
4632 #else
4633 min_vruntime = cfs_rq->min_vruntime;
4634 #endif
4635
4636 se->vruntime -= min_vruntime;
4637 }
4638
4639 #ifdef CONFIG_FAIR_GROUP_SCHED
4640 /*
4641 * effective_load() calculates the load change as seen from the root_task_group
4642 *
4643 * Adding load to a group doesn't make a group heavier, but can cause movement
4644 * of group shares between cpus. Assuming the shares were perfectly aligned one
4645 * can calculate the shift in shares.
4646 *
4647 * Calculate the effective load difference if @wl is added (subtracted) to @tg
4648 * on this @cpu and results in a total addition (subtraction) of @wg to the
4649 * total group weight.
4650 *
4651 * Given a runqueue weight distribution (rw_i) we can compute a shares
4652 * distribution (s_i) using:
4653 *
4654 * s_i = rw_i / \Sum rw_j (1)
4655 *
4656 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4657 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4658 * shares distribution (s_i):
4659 *
4660 * rw_i = { 2, 4, 1, 0 }
4661 * s_i = { 2/7, 4/7, 1/7, 0 }
4662 *
4663 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4664 * task used to run on and the CPU the waker is running on), we need to
4665 * compute the effect of waking a task on either CPU and, in case of a sync
4666 * wakeup, compute the effect of the current task going to sleep.
4667 *
4668 * So for a change of @wl to the local @cpu with an overall group weight change
4669 * of @wl we can compute the new shares distribution (s'_i) using:
4670 *
4671 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
4672 *
4673 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4674 * differences in waking a task to CPU 0. The additional task changes the
4675 * weight and shares distributions like:
4676 *
4677 * rw'_i = { 3, 4, 1, 0 }
4678 * s'_i = { 3/8, 4/8, 1/8, 0 }
4679 *
4680 * We can then compute the difference in effective weight by using:
4681 *
4682 * dw_i = S * (s'_i - s_i) (3)
4683 *
4684 * Where 'S' is the group weight as seen by its parent.
4685 *
4686 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4687 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4688 * 4/7) times the weight of the group.
4689 */
4690 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4691 {
4692 struct sched_entity *se = tg->se[cpu];
4693
4694 if (!tg->parent) /* the trivial, non-cgroup case */
4695 return wl;
4696
4697 for_each_sched_entity(se) {
4698 long w, W;
4699
4700 tg = se->my_q->tg;
4701
4702 /*
4703 * W = @wg + \Sum rw_j
4704 */
4705 W = wg + calc_tg_weight(tg, se->my_q);
4706
4707 /*
4708 * w = rw_i + @wl
4709 */
4710 w = se->my_q->load.weight + wl;
4711
4712 /*
4713 * wl = S * s'_i; see (2)
4714 */
4715 if (W > 0 && w < W)
4716 wl = (w * tg->shares) / W;
4717 else
4718 wl = tg->shares;
4719
4720 /*
4721 * Per the above, wl is the new se->load.weight value; since
4722 * those are clipped to [MIN_SHARES, ...) do so now. See
4723 * calc_cfs_shares().
4724 */
4725 if (wl < MIN_SHARES)
4726 wl = MIN_SHARES;
4727
4728 /*
4729 * wl = dw_i = S * (s'_i - s_i); see (3)
4730 */
4731 wl -= se->load.weight;
4732
4733 /*
4734 * Recursively apply this logic to all parent groups to compute
4735 * the final effective load change on the root group. Since
4736 * only the @tg group gets extra weight, all parent groups can
4737 * only redistribute existing shares. @wl is the shift in shares
4738 * resulting from this level per the above.
4739 */
4740 wg = 0;
4741 }
4742
4743 return wl;
4744 }
4745 #else
4746
4747 static inline unsigned long effective_load(struct task_group *tg, int cpu,
4748 unsigned long wl, unsigned long wg)
4749 {
4750 return wl;
4751 }
4752
4753 #endif
4754
4755 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4756 {
4757 s64 this_load, load;
4758 int idx, this_cpu, prev_cpu;
4759 unsigned long tl_per_task;
4760 struct task_group *tg;
4761 unsigned long weight;
4762 int balanced;
4763
4764 idx = sd->wake_idx;
4765 this_cpu = smp_processor_id();
4766 prev_cpu = task_cpu(p);
4767 load = source_load(prev_cpu, idx);
4768 this_load = target_load(this_cpu, idx);
4769
4770 /*
4771 * If sync wakeup then subtract the (maximum possible)
4772 * effect of the currently running task from the load
4773 * of the current CPU:
4774 */
4775 if (sync) {
4776 tg = task_group(current);
4777 weight = current->se.load.weight;
4778
4779 this_load += effective_load(tg, this_cpu, -weight, -weight);
4780 load += effective_load(tg, prev_cpu, 0, -weight);
4781 }
4782
4783 tg = task_group(p);
4784 weight = p->se.load.weight;
4785
4786 /*
4787 * In low-load situations, where prev_cpu is idle and this_cpu is idle
4788 * due to the sync cause above having dropped this_load to 0, we'll
4789 * always have an imbalance, but there's really nothing you can do
4790 * about that, so that's good too.
4791 *
4792 * Otherwise check if either cpus are near enough in load to allow this
4793 * task to be woken on this_cpu.
4794 */
4795 if (this_load > 0) {
4796 s64 this_eff_load, prev_eff_load;
4797
4798 this_eff_load = 100;
4799 this_eff_load *= power_of(prev_cpu);
4800 this_eff_load *= this_load +
4801 effective_load(tg, this_cpu, weight, weight);
4802
4803 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4804 prev_eff_load *= power_of(this_cpu);
4805 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4806
4807 balanced = this_eff_load <= prev_eff_load;
4808 } else
4809 balanced = true;
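/*
 * Worked example (illustrative, assuming sd->imbalance_pct == 125 and equal
 * cpu power): prev_eff_load carries a factor of 100 + (125 - 100) / 2 == 112,
 * so the wakeup is considered "balanced" as long as this_cpu's post-wakeup
 * effective load is no more than about 12% above prev_cpu's.
 */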
4810
4811 /*
4812 * If the currently running task will sleep within
4813 * a reasonable amount of time then attract this newly
4814 * woken task:
4815 */
4816 if (sync && balanced)
4817 return 1;
4818
4819 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4820 tl_per_task = cpu_avg_load_per_task(this_cpu);
4821
4822 if (balanced ||
4823 (this_load <= load &&
4824 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
4825 /*
4826 * This domain has SD_WAKE_AFFINE and
4827 * p is cache cold in this domain, and
4828 * there is no bad imbalance.
4829 */
4830 schedstat_inc(sd, ttwu_move_affine);
4831 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4832
4833 return 1;
4834 }
4835 return 0;
4836 }
4837
4838 /*
4839 * find_idlest_group finds and returns the least busy CPU group within the
4840 * domain.
4841 */
4842 static struct sched_group *
4843 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4844 int this_cpu, int load_idx)
4845 {
4846 struct sched_group *idlest = NULL, *group = sd->groups;
4847 unsigned long min_load = ULONG_MAX, this_load = 0;
4848 int imbalance = 100 + (sd->imbalance_pct-100)/2;
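/*
 * Example (illustrative, sd->imbalance_pct == 125): imbalance == 112, so a
 * remote group is only reported as idlest when
 * 100 * this_load >= 112 * min_load, i.e. when it is at least roughly 11%
 * lighter than the local group; otherwise we return NULL and stay local.
 */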
4849
4850 do {
4851 unsigned long load, avg_load;
4852 int local_group;
4853 int i;
4854
4855 /* Skip over this group if it has no CPUs allowed */
4856 if (!cpumask_intersects(sched_group_cpus(group),
4857 tsk_cpus_allowed(p)))
4858 continue;
4859
4860 local_group = cpumask_test_cpu(this_cpu,
4861 sched_group_cpus(group));
4862
4863 /* Tally up the load of all CPUs in the group */
4864 avg_load = 0;
4865
4866 for_each_cpu(i, sched_group_cpus(group)) {
4867 /* Bias balancing toward cpus of our domain */
4868 if (local_group)
4869 load = source_load(i, load_idx);
4870 else
4871 load = target_load(i, load_idx);
4872
4873 avg_load += load;
4874
4875 mt_sched_printf("find_idlest_group cpu=%d avg=%lu",
4876 i, avg_load);
4877 }
4878
4879 /* Adjust by relative CPU power of the group */
4880 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
4881
4882 if (local_group) {
4883 this_load = avg_load;
4884 mt_sched_printf("find_idlest_group this_load=%lu",
4885 this_load);
4886 } else if (avg_load < min_load) {
4887 min_load = avg_load;
4888 idlest = group;
4889 mt_sched_printf("find_idlest_group min_load=%lu",
4890 min_load);
4891 }
4892 } while (group = group->next, group != sd->groups);
4893
4894 if (!idlest || 100*this_load < imbalance*min_load){
4895 mt_sched_printf("find_idlest_group fail this_load=%lu min_load=%lu, imbalance=%d",
4896 this_load, min_load, imbalance);
4897 return NULL;
4898 }
4899 return idlest;
4900 }
4901
4902 /*
4903 * find_idlest_cpu - find the idlest cpu among the cpus in group.
4904 */
4905 static int
4906 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4907 {
4908 unsigned long load, min_load = ULONG_MAX;
4909 int idlest = -1;
4910 int i;
4911
4912 /* Traverse only the allowed CPUs */
4913 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4914 load = weighted_cpuload(i);
4915
4916 if (load < min_load || (load == min_load && i == this_cpu)) {
4917 min_load = load;
4918 idlest = i;
4919 }
4920 }
4921
4922 return idlest;
4923 }
4924
4925 /*
4926 * Try and locate an idle CPU in the sched_domain.
4927 */
4928 static int select_idle_sibling(struct task_struct *p, int target)
4929 {
4930 struct sched_domain *sd;
4931 struct sched_group *sg;
4932 int i = task_cpu(p);
4933
4934 if (idle_cpu(target))
4935 return target;
4936
4937 /*
4938 * If the previous cpu is cache affine and idle, don't be stupid.
4939 */
4940 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4941 return i;
4942
4943 /*
4944 * Otherwise, iterate the domains and find an eligible idle cpu.
4945 */
4946 sd = rcu_dereference(per_cpu(sd_llc, target));
4947 for_each_lower_domain(sd) {
4948 sg = sd->groups;
4949 do {
4950 if (!cpumask_intersects(sched_group_cpus(sg),
4951 tsk_cpus_allowed(p)))
4952 goto next;
4953
4954 for_each_cpu(i, sched_group_cpus(sg)) {
4955 if (i == target || !idle_cpu(i))
4956 goto next;
4957 }
4958
4959 target = cpumask_first_and(sched_group_cpus(sg),
4960 tsk_cpus_allowed(p));
4961 goto done;
4962 next:
4963 sg = sg->next;
4964 } while (sg != sd->groups);
4965 }
4966 done:
4967 return target;
4968 }
4969
4970 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
4971 /*
4972 * @p: the task to be placed.
4973 * @clid: the CPU cluster id in which to search for the target CPU
4974 * @target: the appropriate CPU for task p, updated by this function.
4975 *
4976 * Return:
4977 *
4978 * 1 on success
4979 * 0 if target CPU is not found in this CPU cluster
4980 */
4981 static int cmp_find_idle_cpu(struct task_struct *p, int clid, int *target)
4982 {
4983 struct cpumask cls_cpus;
4984 int j;
4985
4986 get_cluster_cpus(&cls_cpus, clid, true);
4987 *target = cpumask_any_and(&cls_cpus, tsk_cpus_allowed(p));
4988 for_each_cpu(j, &cls_cpus) {
4989 if (idle_cpu(j) && cpumask_test_cpu(j, tsk_cpus_allowed(p))) {
4990 *target = j;
4991 break;
4992 }
4993 }
4994 if (*target >= nr_cpu_ids)
4995 return 0; /* task is not allowed in this CPU cluster */
4996 mt_sched_printf("wakeup %d %s cpu=%d, max_clid/max_idle_clid=%d",
4997 p->pid, p->comm, *target, clid);
4998
4999 return 1;
5000 }
5001
5002 #if !defined(CONFIG_SCHED_HMP)
5003 #define TGS_WAKEUP_EXPERIMENT
5004 #endif
5005 static int cmp_select_task_rq_fair(struct task_struct *p, int sd_flag, int *cpu)
5006 {
5007 int i, j;
5008 int max_cnt=0, tskcnt;
5009 int tgs_clid=-1;
5010 int idle_cnt, max_idle_cnt=0;
5011 int in_prev=0, prev_cluster=0;
5012 struct cpumask cls_cpus;
5013 int num_cluster;
5014
5015 num_cluster = arch_get_nr_clusters();
5016 for (i = 0; i < num_cluster; i++) {
5017 tskcnt= p->group_leader->thread_group_info[i].nr_running;
5018 idle_cnt = 0;
5019 get_cluster_cpus(&cls_cpus, i, true);
5020
5021 for_each_cpu(j, &cls_cpus) {
5022 #ifdef TGS_WAKEUP_EXPERIMENT
5023 if (arch_is_big_little()) {
5024 int bcpu = arch_cpu_is_big(j);
5025 if (bcpu && p->se.avg.load_avg_ratio >= cmp_up_threshold) {
5026 in_prev = 0;
5027 tgs_clid = i;
5028 mt_sched_printf("[heavy task] wakeup load=%ld up_th=%u pid=%d name=%s cpu=%d, tgs_clid=%d in_prev=%d",
5029 p->se.avg.load_avg_ratio, cmp_up_threshold, p->pid, p->comm, *cpu, tgs_clid, in_prev);
5030 goto find_idle_cpu;
5031 }
5032 if (!bcpu && p->se.avg.load_avg_ratio < cmp_down_threshold) {
5033 in_prev = 0;
5034 tgs_clid = i;
5035 mt_sched_printf("[light task] wakeup load=%ld down_th=%u pid=%d name=%s cpu=%d, tgs_clid=%d in_prev=%d",
5036 p->se.avg.load_avg_ratio, cmp_down_threshold, p->pid, p->comm, *cpu, tgs_clid, in_prev);
5037 goto find_idle_cpu;
5038 }
5039 }
5040 #endif
5041 if (idle_cpu(j))
5042 idle_cnt++;
5043 }
5044 mt_sched_printf("wakeup load=%ld pid=%d name=%s clid=%d idle_cnt=%d tskcnt=%d max_cnt=%d, cls_cpus=%02lx, onlineCPU=%02lx",
5045 p->se.avg.load_avg_ratio, p->pid, p->comm, i, idle_cnt, tskcnt, max_cnt,
5046 *cpumask_bits(&cls_cpus), *cpumask_bits(cpu_online_mask));
5047
5048 if (idle_cnt == 0)
5049 continue;
5050
5051 if (i == get_cluster_id(*cpu))
5052 prev_cluster = 1;
5053
5054 if (tskcnt > 0) {
5055 if ( (tskcnt > max_cnt) || ((tskcnt == max_cnt) && prev_cluster)) {
5056 in_prev = prev_cluster;
5057 tgs_clid = i;
5058 max_cnt = tskcnt;
5059 }
5060 } else if (0 == max_cnt) {
5061 if ((idle_cnt > max_idle_cnt) || ((idle_cnt == max_idle_cnt) && prev_cluster)) {
5062 in_prev = prev_cluster;
5063 tgs_clid = i ;
5064 max_idle_cnt = idle_cnt;
5065 }
5066
5067 }
5068 mt_sched_printf("wakeup %d %s i=%d idle_cnt=%d tgs_clid=%d max_cnt=%d max_idle_cnt=%d in_prev=%d",
5069 p->pid, p->comm, i, idle_cnt, tgs_clid, max_cnt, max_idle_cnt, in_prev);
5070 }
5071
5072 #ifdef TGS_WAKEUP_EXPERIMENT
5073 find_idle_cpu:
5074 #endif
5075 mt_sched_printf("wakeup %d %s cpu=%d, tgs_clid=%d in_prev=%d",
5076 p->pid, p->comm, *cpu, tgs_clid, in_prev);
5077
5078 if (tgs_clid != -1 && !in_prev && cmp_find_idle_cpu(p, tgs_clid, cpu))
5079 return 1;
5080
5081 return 0;
5082 }
5083 #endif
5084
5085 #ifdef CONFIG_MTK_SCHED_TRACERS
5086 #define LB_RESET 0
5087 #define LB_AFFINITY 0x10
5088 #define LB_BUDDY 0x20
5089 #define LB_FORK 0x30
5090 #define LB_CMP_SHIFT 8
5091 #define LB_CMP 0x4000
5092 #define LB_SMP_SHIFT 16
5093 #define LB_SMP 0x500000
5094 #define LB_HMP_SHIFT 24
5095 #define LB_HMP 0x60000000
5096 #endif
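/*
 * The policy word passed to trace_sched_select_task_rq() packs the cpu chosen
 * by each stage next to that stage's LB_* tag. For example (illustrative), if
 * the SMP path picks CPU 2 and the HMP stage then moves the task to CPU 6:
 *   policy = LB_SMP | (2 << LB_SMP_SHIFT) | LB_HMP | (6 << LB_HMP_SHIFT)
 *          = 0x66520000
 */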
5097
5098 /*
5099 * select_task_rq_fair: balance the current task (running on cpu) in domains
5100 * that have the 'flag' flag set. In practice, this is SD_BALANCE_WAKE,
5101 * SD_BALANCE_FORK and SD_BALANCE_EXEC.
5102 *
5103 * Balance, ie. select the least loaded group.
5104 *
5105 * Returns the target CPU number, or the same CPU if no balancing is needed.
5106 *
5107 * preempt must be disabled.
5108 */
5109 static int
5110 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
5111 {
5112 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5113 int cpu = smp_processor_id();
5114 int prev_cpu = task_cpu(p);
5115 int new_cpu = cpu;
5116 int want_affine = 0;
5117 int sync = wake_flags & WF_SYNC;
5118 #if defined(CONFIG_SCHED_HMP) && !defined(CONFIG_SCHED_HMP_ENHANCEMENT)
5119 int target_cpu = nr_cpu_ids;
5120 #endif
5121 #ifdef CONFIG_MTK_SCHED_TRACERS
5122 int policy = 0;
5123 #endif
5124 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
5125 int cmp_cpu;
5126 int cmp_cpu_found=0;
5127 #endif
5128 #ifdef CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK
5129 int buddy_cpu = per_cpu(sd_pack_buddy, cpu);
5130 #endif
5131
5132 if (p->nr_cpus_allowed == 1)
5133 {
5134 #ifdef CONFIG_MTK_SCHED_TRACERS
5135 trace_sched_select_task_rq(p, (LB_AFFINITY | prev_cpu), prev_cpu, prev_cpu);
5136 #endif
5137 return prev_cpu;
5138 }
5139
5140 #ifdef CONFIG_HMP_PACK_SMALL_TASK
5141 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
5142 if (check_pack_buddy(cpu, p) && PA_ENABLE) {
5143 PACK_FROM_CPUX_TO_CPUY_COUNT[cpu][per_cpu(sd_pack_buddy, cpu)]++;
5144
5145 #ifdef CONFIG_HMP_TRACER
5146 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_PACK_FORM_CPUX_TO_CPUY, p->pid, cpu, per_cpu(sd_pack_buddy, cpu));
5147 #endif /* CONFIG_HMP_TRACER */
5148
5149 if(PA_MON_ENABLE) {
5150 if(strcmp(p->comm, PA_MON) == 0 && cpu != per_cpu(sd_pack_buddy, cpu)) {
5151 printk(KERN_EMERG "[PA] %s PACK From CPU%d to CPU%d\n", p->comm, cpu, per_cpu(sd_pack_buddy, cpu));
5152 printk(KERN_EMERG "[PA] Buddy RQ Usage = %u, Period = %u, NR = %u\n",
5153 per_cpu(BUDDY_CPU_RQ_USAGE, per_cpu(sd_pack_buddy, cpu)),
5154 per_cpu(BUDDY_CPU_RQ_PERIOD, per_cpu(sd_pack_buddy, cpu)),
5155 per_cpu(BUDDY_CPU_RQ_NR, per_cpu(sd_pack_buddy, cpu)));
5156 printk(KERN_EMERG "[PA] Task Usage = %u, Period = %u\n",
5157 per_cpu(TASK_USGAE, cpu),
5158 per_cpu(TASK_PERIOD, cpu));
5159 }
5160 }
5161 #else /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5162 if (check_pack_buddy(cpu, p)) {
5163 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5164 #ifdef CONFIG_MTK_SCHED_TRACERS
5165 new_cpu = per_cpu(sd_pack_buddy, cpu);
5166 trace_sched_select_task_rq(p, (LB_BUDDY | new_cpu), prev_cpu, new_cpu);
5167 #endif
5168 return per_cpu(sd_pack_buddy, cpu);
5169 }
5170 #elif defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK)
5171 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
5172 if (PA_ENABLE && (sd_flag & SD_BALANCE_WAKE) && (check_pack_buddy(buddy_cpu, p))) {
5173 #else
5174 if ((sd_flag & SD_BALANCE_WAKE) && (check_pack_buddy(buddy_cpu, p))) {
5175 #endif
5176 struct thread_group_info_t *src_tginfo, *dst_tginfo;
5177 src_tginfo = &p->group_leader->thread_group_info[get_cluster_id(prev_cpu)]; /* compare with previous cpu (not current cpu) */
5178 dst_tginfo = &p->group_leader->thread_group_info[get_cluster_id(buddy_cpu)];
5179 if((get_cluster_id(prev_cpu) == get_cluster_id(buddy_cpu)) ||
5180 (src_tginfo->nr_running < dst_tginfo->nr_running))
5181 {
5182 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
5183 PACK_FROM_CPUX_TO_CPUY_COUNT[cpu][buddy_cpu]++;
5184 mt_sched_printf("[PA]pid=%d, Pack to CPU%d(CPU%d's buddy)\n", p->pid,buddy_cpu,cpu);
5185 if(PA_MON_ENABLE) {
5186 u8 i=0;
5187 for(i=0;i<4; i++) {
5188 if(strcmp(p->comm, &PA_MON[i][0]) == 0) {
5189 TASK_PACK_CPU_COUNT[i][buddy_cpu]++;
5190 printk(KERN_EMERG "[PA] %s PACK to CPU%d(CPU%d's buddy), pre(cpu%d)\n", p->comm, buddy_cpu,cpu, prev_cpu);
5191 printk(KERN_EMERG "[PA] Buddy RQ Usage = %u, Period = %u, NR = %u\n",
5192 per_cpu(BUDDY_CPU_RQ_USAGE, buddy_cpu),
5193 per_cpu(BUDDY_CPU_RQ_PERIOD, buddy_cpu),
5194 per_cpu(BUDDY_CPU_RQ_NR, buddy_cpu));
5195 printk(KERN_EMERG "[PA] Task Usage = %u, Period = %u\n",
5196 per_cpu(TASK_USGAE, cpu),
5197 per_cpu(TASK_PERIOD, cpu));
5198 break;
5199 }
5200 }
5201 }
5202 #endif //CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
5203 #ifdef CONFIG_MTK_SCHED_TRACERS
5204 trace_sched_select_task_rq(p, (LB_BUDDY | buddy_cpu), prev_cpu, buddy_cpu);
5205 #endif
5206 return buddy_cpu;
5207 }
5208 }
5209 #endif /* CONFIG_HMP_PACK_SMALL_TASK */
5210
5211 #ifdef CONFIG_SCHED_HMP
5212 /* always put non-kernel forking tasks on a big domain */
5213 if (p->mm && (sd_flag & SD_BALANCE_FORK)) {
5214 if(hmp_cpu_is_fastest(prev_cpu)) {
5215 struct hmp_domain *hmpdom = list_entry(&hmp_cpu_domain(prev_cpu)->hmp_domains, struct hmp_domain, hmp_domains);
5216 __always_unused int lowest_ratio = hmp_domain_min_load(hmpdom, &new_cpu);
5217 if(new_cpu < nr_cpu_ids && cpumask_test_cpu(new_cpu,tsk_cpus_allowed(p)))
5218 {
5219 #ifdef CONFIG_MTK_SCHED_TRACERS
5220 trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);
5221 #endif
5222 return new_cpu;
5223 }
5224 else
5225 {
5226 new_cpu = cpumask_any_and(&hmp_faster_domain(cpu)->cpus,
5227 tsk_cpus_allowed(p));
5228 if(new_cpu < nr_cpu_ids)
5229 {
5230 #ifdef CONFIG_MTK_SCHED_TRACERS
5231 trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);
5232 #endif
5233 return new_cpu;
5234 }
5235 }
5236 } else {
5237 new_cpu = hmp_select_faster_cpu(p, prev_cpu);
5238 if (new_cpu < nr_cpu_ids)
5239 {
5240 #ifdef CONFIG_MTK_SCHED_TRACERS
5241 trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);
5242 #endif
5243 return new_cpu;
5244 }
5245 }
5246 /* restore new_cpu to its default value */
5247 if (new_cpu >= nr_cpu_ids)
5248 new_cpu = cpu;
5249 }
5250 #endif
5251
5252 if (sd_flag & SD_BALANCE_WAKE) {
5253 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
5254 want_affine = 1;
5255 new_cpu = prev_cpu;
5256 }
5257
5258 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
5259 cmp_cpu = prev_cpu;
5260 cmp_cpu_found = cmp_select_task_rq_fair(p, sd_flag, &cmp_cpu);
5261 if (cmp_cpu_found && (cmp_cpu < nr_cpu_ids)) {
5262 cpu = cmp_cpu;
5263 new_cpu = cmp_cpu;
5264 #ifdef CONFIG_MTK_SCHED_TRACERS
5265 policy |= (new_cpu << LB_CMP_SHIFT);
5266 policy |= LB_CMP;
5267 #endif
5268 mt_sched_printf("wakeup %d %s sd_flag=%x cmp_cpu_found=%d, cpu=%d, want_affine=%d ",
5269 p->pid, p->comm, sd_flag, cmp_cpu_found, cpu, want_affine);
5270 goto cmp_found;
5271 }
5272 #endif
5273 rcu_read_lock();
5274 for_each_domain(cpu, tmp) {
5275 mt_sched_printf("wakeup %d %s tmp->flags=%x, cpu=%d, prev_cpu=%d, new_cpu=%d",
5276 p->pid, p->comm, tmp->flags, cpu, prev_cpu, new_cpu);
5277
5278 if (!(tmp->flags & SD_LOAD_BALANCE))
5279 continue;
5280
5281 /*
5282 * If both cpu and prev_cpu are part of this domain,
5283 * cpu is a valid SD_WAKE_AFFINE target.
5284 */
5285 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
5286 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
5287 affine_sd = tmp;
5288 break;
5289 }
5290
5291 if (tmp->flags & sd_flag)
5292 sd = tmp;
5293 }
5294
5295 if (affine_sd) {
5296 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
5297 prev_cpu = cpu;
5298
5299 new_cpu = select_idle_sibling(p, prev_cpu);
5300 goto unlock;
5301 }
5302
5303 mt_sched_printf("wakeup %d %s sd=%p", p->pid, p->comm, sd);
5304
5305 while (sd) {
5306 int load_idx = sd->forkexec_idx;
5307 struct sched_group *group;
5308 int weight;
5309
5310 mt_sched_printf("wakeup %d %s find_idlest_group cpu=%d sd->flags=%x sd_flag=%x",
5311 p->pid, p->comm, cpu, sd->flags, sd_flag);
5312
5313 if (!(sd->flags & sd_flag)) {
5314 sd = sd->child;
5315 continue;
5316 }
5317
5318 if (sd_flag & SD_BALANCE_WAKE)
5319 load_idx = sd->wake_idx;
5320
5321 mt_sched_printf("wakeup %d %s find_idlest_group cpu=%d",
5322 p->pid, p->comm, cpu);
5323 group = find_idlest_group(sd, p, cpu, load_idx);
5324 if (!group) {
5325 sd = sd->child;
5326 mt_sched_printf("wakeup %d %s find_idlest_group child",
5327 p->pid, p->comm);
5328 continue;
5329 }
5330
5331 new_cpu = find_idlest_cpu(group, p, cpu);
5332 if (new_cpu == -1 || new_cpu == cpu) {
5333 /* Now try balancing at a lower domain level of cpu */
5334 sd = sd->child;
5335 mt_sched_printf("wakeup %d %s find_idlest_cpu sd->child=%p",
5336 p->pid, p->comm, sd);
5337 continue;
5338 }
5339
5340 /* Now try balancing at a lower domain level of new_cpu */
5341 mt_sched_printf("wakeup %d %s find_idlest_cpu cpu=%d sd=%p",
5342 p->pid, p->comm, new_cpu, sd);
5343 cpu = new_cpu;
5344 weight = sd->span_weight;
5345 sd = NULL;
5346 for_each_domain(cpu, tmp) {
5347 if (weight <= tmp->span_weight)
5348 break;
5349 if (tmp->flags & sd_flag)
5350 sd = tmp;
5351 mt_sched_printf("wakeup %d %s sd=%p weight=%d, tmp->span_weight=%d",
5352 p->pid, p->comm, sd, weight, tmp->span_weight);
5353 }
5354 /* while loop will break here if sd == NULL */
5355 }
5356
5357 #ifdef CONFIG_MTK_SCHED_TRACERS
5358 policy |= (new_cpu << LB_SMP_SHIFT);
5359 policy |= LB_SMP;
5360 #endif
5361
5362 unlock:
5363 rcu_read_unlock();
5364 mt_sched_printf("wakeup %d %s new_cpu=%x", p->pid, p->comm, new_cpu);
5365
5366 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
5367 cmp_found:
5368 #endif
5369
5370 #ifdef CONFIG_SCHED_HMP
5371 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
5372 new_cpu = hmp_select_task_rq_fair(sd_flag, p, prev_cpu, new_cpu);
5373 #ifdef CONFIG_MTK_SCHED_TRACERS
5374 policy |= (new_cpu << LB_HMP_SHIFT);
5375 policy |= LB_HMP;
5376 #endif
5377
5378 #else
5379 if (hmp_up_migration(prev_cpu, &target_cpu, &p->se)) {
5380 new_cpu = hmp_select_faster_cpu(p, prev_cpu);
5381 hmp_next_up_delay(&p->se, new_cpu);
5382 trace_sched_hmp_migrate(p, new_cpu, 0);
5383 return new_cpu;
5384 }
5385 if (hmp_down_migration(prev_cpu, &p->se)) {
5386 new_cpu = hmp_select_slower_cpu(p, prev_cpu);
5387 hmp_next_down_delay(&p->se, new_cpu);
5388 trace_sched_hmp_migrate(p, new_cpu, 0);
5389 return new_cpu;
5390 }
5391 /* Make sure that the task stays in its previous hmp domain */
5392 if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus))
5393 return prev_cpu;
5394 #endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
5395 #endif /* CONFIG_SCHED_HMP */
5396
5397 #ifdef CONFIG_MTK_SCHED_TRACERS
5398 trace_sched_select_task_rq(p, policy, prev_cpu, new_cpu);
5399 #endif
5400
5401 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
5402 if(PA_MON_ENABLE) {
5403 if(strcmp(p->comm, PA_MON) == 0 && cpu != new_cpu) {
5404 printk(KERN_EMERG "[PA] %s Select From CPU%d to CPU%d\n", p->comm, cpu, new_cpu);
5405 }
5406 }
5407 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5408
5409 return new_cpu;
5410 }
5411
5412 /*
5413 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5414 * cfs_rq_of(p) references at time of call are still valid and identify the
5415 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
5416 * other assumptions, including the state of rq->lock, should be made.
5417 */
5418 static void
5419 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
5420 {
5421 struct sched_entity *se = &p->se;
5422 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5423
5424 /*
5425 * Load tracking: accumulate removed load so that it can be processed
5426 * when we next update owning cfs_rq under rq->lock. Tasks contribute
5427 * to blocked load iff they have a positive decay-count. It can never
5428 * be negative here since on-rq tasks have decay-count == 0.
5429 */
5430 if (se->avg.decay_count) {
5431 se->avg.decay_count = -__synchronize_entity_decay(se);
5432 atomic_long_add(se->avg.load_avg_contrib,
5433 &cfs_rq->removed_load);
5434 }
5435 }
5436 #endif /* CONFIG_SMP */
5437
5438 static unsigned long
5439 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
5440 {
5441 unsigned long gran = sysctl_sched_wakeup_granularity;
5442
5443 /*
5444 	 * Since it's curr that is running now, convert the gran from
5445 	 * real-time to virtual-time in its units.
5446 *
5447 * By using 'se' instead of 'curr' we penalize light tasks, so
5448 * they get preempted easier. That is, if 'se' < 'curr' then
5449 * the resulting gran will be larger, therefore penalizing the
5450 * lighter, if otoh 'se' > 'curr' then the resulting gran will
5451 * be smaller, again penalizing the lighter task.
5452 *
5453 * This is especially important for buddies when the leftmost
5454 * task is higher priority than the buddy.
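	 *
	 * Illustrative numbers (assuming the usual prio_to_weight[] values):
	 * for a nice-0 'se' (weight 1024) the granularity is unchanged, while
	 * for a nice+5 'se' (weight 335) calc_delta_fair() returns a roughly
	 * 3x larger vruntime granularity, so the lighter task needs a much
	 * bigger vruntime lead before it is allowed to preempt.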
5455 */
5456 return calc_delta_fair(gran, se);
5457 }
5458
5459 /*
5460 * Should 'se' preempt 'curr'.
5461 *
5462 * |s1
5463 * |s2
5464 * |s3
5465 * g
5466 * |<--->|c
5467 *
5468 * w(c, s1) = -1
5469 * w(c, s2) = 0
5470 * w(c, s3) = 1
5471 *
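 * That is: wakeup_preempt_entity() returns -1 when 'se' does not have a
 * smaller vruntime than 'curr', 0 when its vruntime lead is within the
 * scaled wakeup granularity g, and 1 when the lead exceeds g and 'se'
 * should preempt 'curr'.
 *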
5472 */
5473 static int
5474 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5475 {
5476 s64 gran, vdiff = curr->vruntime - se->vruntime;
5477
5478 if (vdiff <= 0)
5479 return -1;
5480
5481 gran = wakeup_gran(curr, se);
5482 if (vdiff > gran)
5483 return 1;
5484
5485 return 0;
5486 }
5487
5488 static void set_last_buddy(struct sched_entity *se)
5489 {
5490 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5491 return;
5492
5493 for_each_sched_entity(se)
5494 cfs_rq_of(se)->last = se;
5495 }
5496
5497 static void set_next_buddy(struct sched_entity *se)
5498 {
5499 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5500 return;
5501
5502 for_each_sched_entity(se)
5503 cfs_rq_of(se)->next = se;
5504 }
5505
5506 static void set_skip_buddy(struct sched_entity *se)
5507 {
5508 for_each_sched_entity(se)
5509 cfs_rq_of(se)->skip = se;
5510 }
5511
5512 /*
5513 * Preempt the current task with a newly woken task if needed:
5514 */
5515 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
5516 {
5517 struct task_struct *curr = rq->curr;
5518 struct sched_entity *se = &curr->se, *pse = &p->se;
5519 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5520 int scale = cfs_rq->nr_running >= sched_nr_latency;
5521 int next_buddy_marked = 0;
5522
5523 if (unlikely(se == pse))
5524 return;
5525
5526 /*
5527 * This is possible from callers such as move_task(), in which we
5528 	 * unconditionally check_preempt_curr() after an enqueue (which may have
5529 	 * led to a throttle).  This both saves work and prevents false
5530 * next-buddy nomination below.
5531 */
5532 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5533 return;
5534
5535 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
5536 set_next_buddy(pse);
5537 next_buddy_marked = 1;
5538 }
5539
5540 /*
5541 * We can come here with TIF_NEED_RESCHED already set from new task
5542 * wake up path.
5543 *
5544 * Note: this also catches the edge-case of curr being in a throttled
5545 * group (e.g. via set_curr_task), since update_curr() (in the
5546 * enqueue of curr) will have resulted in resched being set. This
5547 * prevents us from potentially nominating it as a false LAST_BUDDY
5548 * below.
5549 */
5550 if (test_tsk_need_resched(curr))
5551 return;
5552
5553 /* Idle tasks are by definition preempted by non-idle tasks. */
5554 if (unlikely(curr->policy == SCHED_IDLE) &&
5555 likely(p->policy != SCHED_IDLE))
5556 goto preempt;
5557
5558 /*
5559 * Batch and idle tasks do not preempt non-idle tasks (their preemption
5560 * is driven by the tick):
5561 */
5562 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
5563 return;
5564
5565 find_matching_se(&se, &pse);
5566 update_curr(cfs_rq_of(se));
5567 BUG_ON(!pse);
5568 if (wakeup_preempt_entity(se, pse) == 1) {
5569 /*
5570 * Bias pick_next to pick the sched entity that is
5571 * triggering this preemption.
5572 */
5573 if (!next_buddy_marked)
5574 set_next_buddy(pse);
5575 goto preempt;
5576 }
5577
5578 return;
5579
5580 preempt:
5581 resched_task(curr);
5582 /*
5583 * Only set the backward buddy when the current task is still
5584 * on the rq. This can happen when a wakeup gets interleaved
5585 * with schedule on the ->pre_schedule() or idle_balance()
5586 	 * point, either of which can drop the rq lock.
5587 *
5588 * Also, during early boot the idle thread is in the fair class,
5589 	 * for obvious reasons it's a bad idea to schedule back to it.
5590 */
5591 if (unlikely(!se->on_rq || curr == rq->idle))
5592 return;
5593
5594 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5595 set_last_buddy(se);
5596 }
5597
5598 static struct task_struct *pick_next_task_fair(struct rq *rq)
5599 {
5600 struct task_struct *p;
5601 struct cfs_rq *cfs_rq = &rq->cfs;
5602 struct sched_entity *se;
5603
5604 	/* in case nr_running != 0 but h_nr_running == 0 */
5605 if (!cfs_rq->nr_running || !cfs_rq->h_nr_running)
5606 return NULL;
5607
5608 do {
5609 se = pick_next_entity(cfs_rq);
5610 set_next_entity(cfs_rq, se);
5611 cfs_rq = group_cfs_rq(se);
5612 } while (cfs_rq);
5613
5614 p = task_of(se);
5615 if (hrtick_enabled(rq))
5616 hrtick_start_fair(rq, p);
5617
5618 return p;
5619 }
5620
5621 /*
5622 * Account for a descheduled task:
5623 */
5624 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
5625 {
5626 struct sched_entity *se = &prev->se;
5627 struct cfs_rq *cfs_rq;
5628
5629 for_each_sched_entity(se) {
5630 cfs_rq = cfs_rq_of(se);
5631 put_prev_entity(cfs_rq, se);
5632 }
5633 }
5634
5635 /*
5636 * sched_yield() is very simple
5637 *
5638 * The magic of dealing with the ->skip buddy is in pick_next_entity.
5639 */
5640 static void yield_task_fair(struct rq *rq)
5641 {
5642 struct task_struct *curr = rq->curr;
5643 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5644 struct sched_entity *se = &curr->se;
5645
5646 /*
5647 * Are we the only task in the tree?
5648 */
5649 if (unlikely(rq->nr_running == 1))
5650 return;
5651
5652 clear_buddies(cfs_rq, se);
5653
5654 if (curr->policy != SCHED_BATCH) {
5655 update_rq_clock(rq);
5656 /*
5657 * Update run-time statistics of the 'current'.
5658 */
5659 update_curr(cfs_rq);
5660 /*
5661 * Tell update_rq_clock() that we've just updated,
5662 * so we don't do microscopic update in schedule()
5663 * and double the fastpath cost.
5664 */
5665 rq->skip_clock_update = 1;
5666 }
5667
5668 set_skip_buddy(se);
5669 }
5670
5671 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5672 {
5673 struct sched_entity *se = &p->se;
5674
5675 /* throttled hierarchies are not runnable */
5676 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
5677 return false;
5678
5679 /* Tell the scheduler that we'd really like pse to run next. */
5680 set_next_buddy(se);
5681
5682 yield_task_fair(rq);
5683
5684 return true;
5685 }
5686
5687 #ifdef CONFIG_SMP
5688 /**************************************************
5689 * Fair scheduling class load-balancing methods.
5690 *
5691 * BASICS
5692 *
5693 * The purpose of load-balancing is to achieve the same basic fairness the
5694 * per-cpu scheduler provides, namely provide a proportional amount of compute
5695 * time to each task. This is expressed in the following equation:
5696 *
5697 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
5698 *
5699 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
5700 * W_i,0 is defined as:
5701 *
5702 * W_i,0 = \Sum_j w_i,j (2)
5703 *
5704 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
5705 * is derived from the nice value as per prio_to_weight[].
5706 *
5707 * The weight average is an exponential decay average of the instantaneous
5708 * weight:
5709 *
5710 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
5711 *
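 * For instance, with n = 2 the previous average keeps 3/4 of its weight
 * and the instantaneous weight contributes the remaining 1/4:
 *
 *   W'_i,2 = 3/4 * W_i,2 + 1/4 * W_i,0
 *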
5712 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
5713 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
5714 * can also include other factors [XXX].
5715 *
5716 * To achieve this balance we define a measure of imbalance which follows
5717 * directly from (1):
5718 *
5719 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
5720 *
5721  * We then move tasks around to minimize the imbalance. In the continuous
5722 * function space it is obvious this converges, in the discrete case we get
5723 * a few fun cases generally called infeasible weight scenarios.
5724 *
5725 * [XXX expand on:
5726 * - infeasible weights;
5727 * - local vs global optima in the discrete case. ]
5728 *
5729 *
5730 * SCHED DOMAINS
5731 *
5732 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
5733 * for all i,j solution, we create a tree of cpus that follows the hardware
5734 * topology where each level pairs two lower groups (or better). This results
5735 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
5736 * tree to only the first of the previous level and we decrease the frequency
5737 * of load-balance at each level inv. proportional to the number of cpus in
5738 * the groups.
5739 *
5740 * This yields:
5741 *
5742 * log_2 n 1 n
5743 * \Sum { --- * --- * 2^i } = O(n) (5)
5744 * i = 0 2^i 2^i
5745 * `- size of each group
5746 * | | `- number of cpus doing load-balance
5747 * | `- freq
5748 * `- sum over all levels
5749 *
5750 * Coupled with a limit on how many tasks we can migrate every balance pass,
5751 * this makes (5) the runtime complexity of the balancer.
5752 *
5753 * An important property here is that each CPU is still (indirectly) connected
5754 * to every other cpu in at most O(log n) steps:
5755 *
5756 * The adjacency matrix of the resulting graph is given by:
5757 *
5758 * log_2 n
5759 * A_i,j = \Union (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
5760 * k = 0
5761 *
5762 * And you'll find that:
5763 *
5764 * A^(log_2 n)_i,j != 0 for all i,j (7)
5765 *
5766 * Showing there's indeed a path between every cpu in at most O(log n) steps.
5767 * The task movement gives a factor of O(m), giving a convergence complexity
5768 * of:
5769 *
5770 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
5771 *
5772 *
5773 * WORK CONSERVING
5774 *
5775 * In order to avoid CPUs going idle while there's still work to do, new idle
5776 * balancing is more aggressive and has the newly idle cpu iterate up the domain
5777 * tree itself instead of relying on other CPUs to bring it work.
5778 *
5779 * This adds some complexity to both (5) and (8) but it reduces the total idle
5780 * time.
5781 *
5782 * [XXX more?]
5783 *
5784 *
5785 * CGROUPS
5786 *
5787 * Cgroups make a horror show out of (2), instead of a simple sum we get:
5788 *
5789 * s_k,i
5790 * W_i,0 = \Sum_j \Prod_k w_k * ----- (9)
5791 * S_k
5792 *
5793 * Where
5794 *
5795 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
5796 *
5797 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
5798 *
5799  * The big problem is S_k, it's a global sum needed to compute a local (W_i)
5800 * property.
5801 *
5802 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5803 * rewrite all of this once again.]
5804 */
5805
5806 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5807
5808 #define LBF_ALL_PINNED 0x01
5809 #define LBF_NEED_BREAK 0x02
5810 #define LBF_SOME_PINNED 0x04
5811
5812 struct lb_env {
5813 struct sched_domain *sd;
5814
5815 struct rq *src_rq;
5816 int src_cpu;
5817
5818 int dst_cpu;
5819 struct rq *dst_rq;
5820
5821 struct cpumask *dst_grpmask;
5822 int new_dst_cpu;
5823 enum cpu_idle_type idle;
5824 long imbalance;
5825 /* The set of CPUs under consideration for load-balancing */
5826 struct cpumask *cpus;
5827
5828 unsigned int flags;
5829
5830 unsigned int loop;
5831 unsigned int loop_break;
5832 unsigned int loop_max;
5833 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
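	/*
	 * When set, keep honouring the cache-hot check in task_hot() even
	 * if the pulling cpu is idle; move_one_task() sets this for active
	 * balancing.
	 */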
5834 int mt_check_cache_in_idle;
5835 #endif
5836 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
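	/* bitmask of MT_LBPROF_* reasons recorded when balancing fails */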
5837 unsigned int fail_reason;
5838 #endif
5839 };
5840
5841 /*
5842 * move_task - move a task from one runqueue to another runqueue.
5843 * Both runqueues must be locked.
5844 */
5845 static void move_task(struct task_struct *p, struct lb_env *env)
5846 {
5847 deactivate_task(env->src_rq, p, 0);
5848 set_task_cpu(p, env->dst_cpu);
5849 activate_task(env->dst_rq, p, 0);
5850 check_preempt_curr(env->dst_rq, p, 0);
5851
5852 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
5853 if(PA_MON_ENABLE) {
5854 if(strcmp(p->comm, PA_MON) == 0) {
5855 printk(KERN_EMERG "[PA] %s Balance From CPU%d to CPU%d\n", p->comm, env->src_rq->cpu, env->dst_rq->cpu);
5856 }
5857 }
5858 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5859
5860 }
5861
5862 /*
5863 * Is this task likely cache-hot:
5864 */
5865 #if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT)
5866 static int
5867 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd, int mt_check_cache_in_idle)
5868 #else
5869 static int
5870 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
5871 #endif
5872 {
5873 s64 delta;
5874
5875 if (p->sched_class != &fair_sched_class)
5876 return 0;
5877
5878 if (unlikely(p->policy == SCHED_IDLE))
5879 return 0;
5880
5881 /*
5882 * Buddy candidates are cache hot:
5883 */
5884 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
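	/*
	 * MT enhancement: an idle pulling cpu may treat the task as cache
	 * cold when the source runqueue still has at least two runnable
	 * tasks, so the idle cpu can pull it.
	 */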
5885 	if (!mt_check_cache_in_idle) {
5886 		if (!this_rq()->nr_running && (task_rq(p)->nr_running >= 2))
5887 			return 0;
5888 	}
5889 #endif
5890 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
5891 (&p->se == cfs_rq_of(&p->se)->next ||
5892 &p->se == cfs_rq_of(&p->se)->last))
5893 return 1;
5894
5895 if (sysctl_sched_migration_cost == -1)
5896 return 1;
5897 if (sysctl_sched_migration_cost == 0)
5898 return 0;
5899
5900 delta = now - p->se.exec_start;
5901
5902 return delta < (s64)sysctl_sched_migration_cost;
5903 }
5904
5905 /*
5906 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
5907 */
5908 static
5909 int can_migrate_task(struct task_struct *p, struct lb_env *env)
5910 {
5911 int tsk_cache_hot = 0;
5912 /*
5913 * We do not migrate tasks that are:
5914 * 1) throttled_lb_pair, or
5915 * 2) cannot be migrated to this CPU due to cpus_allowed, or
5916 * 3) running (obviously), or
5917 * 4) are cache-hot on their current CPU.
5918 */
5919 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
5920 return 0;
5921
5922 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
5923 int cpu;
5924
5925 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
5926 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5927 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_AFFINITY);
5928 if(mt_lbprof_lt (env->sd->mt_lbprof_nr_balance_failed, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND)){
5929 char strings[128]="";
5930 snprintf(strings, 128, "%d:balance fail:affinity:%d:%d:%s:0x%lu"
5931 , env->dst_cpu, env->src_cpu, p->pid, p->comm, p->cpus_allowed.bits[0]);
5932 trace_sched_lbprof_log(strings);
5933 }
5934 #endif
5935
5936 /*
5937 * Remember if this task can be migrated to any other cpu in
5938 * our sched_group. We may want to revisit it if we couldn't
5939 * meet load balance goals by pulling other tasks on src_cpu.
5940 *
5941 * Also avoid computing new_dst_cpu if we have already computed
5942 * one in current iteration.
5943 */
5944 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
5945 return 0;
5946
5947 		/* Prevent re-selecting dst_cpu via env's cpus */
5948 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
5949 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
5950 env->flags |= LBF_SOME_PINNED;
5951 env->new_dst_cpu = cpu;
5952 break;
5953 }
5954 }
5955
5956 return 0;
5957 }
5958
5959 	/* Record that we found at least one task that could run on dst_cpu */
5960 env->flags &= ~LBF_ALL_PINNED;
5961
5962 if (task_running(env->src_rq, p)) {
5963 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
5964 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5965 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_RUNNING);
5966 if( mt_lbprof_lt (env->sd->mt_lbprof_nr_balance_failed, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND)){
5967 char strings[128]="";
5968 snprintf(strings, 128, "%d:balance fail:running:%d:%d:%s"
5969 , env->dst_cpu, env->src_cpu, p->pid, p->comm);
5970 trace_sched_lbprof_log(strings);
5971 }
5972 #endif
5973 return 0;
5974 }
5975
5976 /*
5977 * Aggressive migration if:
5978 * 1) task is cache cold, or
5979 * 2) too many balance attempts have failed.
5980 */
5981 #if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT)
5982 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd, env->mt_check_cache_in_idle);
5983 #else
5984 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
5985 #endif
5986 if (!tsk_cache_hot ||
5987 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5988
5989 if (tsk_cache_hot) {
5990 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5991 schedstat_inc(p, se.statistics.nr_forced_migrations);
5992 }
5993
5994 return 1;
5995 }
5996
5997 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
5998 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5999 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_CACHEHOT);
6000 if(mt_lbprof_lt (env->sd->mt_lbprof_nr_balance_failed, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND)){
6001 char strings[128]="";
6002 snprintf(strings, 128, "%d:balance fail:cache hot:%d:%d:%s"
6003 , env->dst_cpu, env->src_cpu, p->pid, p->comm);
6004 trace_sched_lbprof_log(strings);
6005 }
6006 #endif
6007 return 0;
6008 }
6009
6010 /*
6011 * move_one_task tries to move exactly one task from busiest to this_rq, as
6012 * part of active balancing operations within "domain".
6013 * Returns 1 if successful and 0 otherwise.
6014 *
6015 * Called with both runqueues locked.
6016 */
6017 static int move_one_task(struct lb_env *env)
6018 {
6019 struct task_struct *p, *n;
6020 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
6021 env->mt_check_cache_in_idle = 1;
6022 #endif
6023 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
6024 mt_lbprof_stat_set(env->fail_reason, MT_LBPROF_NO_TRIGGER);
6025 #endif
6026
6027 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
6028 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6029 if(need_lazy_balance(env->dst_cpu, env->src_cpu, p))
6030 continue;
6031 #endif
6032 if (!can_migrate_task(p, env))
6033 continue;
6034
6035 move_task(p, env);
6036 /*
6037 * Right now, this is only the second place move_task()
6038 * is called, so we can safely collect move_task()
6039 * stats here rather than inside move_task().
6040 */
6041 schedstat_inc(env->sd, lb_gained[env->idle]);
6042 return 1;
6043 }
6044 return 0;
6045 }
6046
6047 static unsigned long task_h_load(struct task_struct *p);
6048
6049 static const unsigned int sched_nr_migrate_break = 32;
6050
6051 /* In the second-round load balance we migrate a heavy load_weight task even
6052    when it exceeds the imbalance, as long as RT tasks exist on the busy cpu. */
6053 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
6054 #define over_imbalance(lw, im) \
6055 (((lw)/2 > (im)) && \
6056 ((env->mt_check_cache_in_idle==1) || \
6057 (env->src_rq->rt.rt_nr_running==0) || \
6058 (pulled>0)))
6059 #else
6060 #define over_imbalance(lw, im) (((lw) / 2) > (im))
6061 #endif
6062
6063 /*
6064 * move_tasks tries to move up to imbalance weighted load from busiest to
6065 * this_rq, as part of a balancing operation within domain "sd".
6066 * Returns 1 if successful and 0 otherwise.
6067 *
6068 * Called with both runqueues locked.
6069 */
6070 static int move_tasks(struct lb_env *env)
6071 {
6072 struct list_head *tasks = &env->src_rq->cfs_tasks;
6073 struct task_struct *p;
6074 unsigned long load;
6075 int pulled = 0;
6076
6077 if (env->imbalance <= 0)
6078 return 0;
6079
6080 mt_sched_printf("move_tasks start ");
6081
6082 while (!list_empty(tasks)) {
6083 p = list_first_entry(tasks, struct task_struct, se.group_node);
6084
6085 env->loop++;
6086 /* We've more or less seen every task there is, call it quits */
6087 if (env->loop > env->loop_max)
6088 break;
6089
6090 /* take a breather every nr_migrate tasks */
6091 if (env->loop > env->loop_break) {
6092 env->loop_break += sched_nr_migrate_break;
6093 env->flags |= LBF_NEED_BREAK;
6094 break;
6095 }
6096 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6097 if(need_lazy_balance(env->dst_cpu, env->src_cpu, p))
6098 goto next;
6099 #endif
6100 if (!can_migrate_task(p, env))
6101 goto next;
6102
6103 load = task_h_load(p);
6104
6105 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
6106 goto next;
6107
6108 		if (over_imbalance(load, env->imbalance))
6109 			goto next;
6112
6113 move_task(p, env);
6114 pulled++;
6115 env->imbalance -= load;
6116
6117 #ifdef CONFIG_PREEMPT
6118 /*
6119 * NEWIDLE balancing is a source of latency, so preemptible
6120 * kernels will stop after the first task is pulled to minimize
6121 * the critical section.
6122 */
6123 if (env->idle == CPU_NEWLY_IDLE)
6124 break;
6125 #endif
6126
6127 /*
6128 * We only want to steal up to the prescribed amount of
6129 * weighted load.
6130 */
6131 if (env->imbalance <= 0)
6132 break;
6133
6134 continue;
6135 next:
6136 list_move_tail(&p->se.group_node, tasks);
6137 }
6138
6139 /*
6140 * Right now, this is one of only two places move_task() is called,
6141 * so we can safely collect move_task() stats here rather than
6142 * inside move_task().
6143 */
6144 schedstat_add(env->sd, lb_gained[env->idle], pulled);
6145
6146 mt_sched_printf("move_tasks end");
6147
6148 return pulled;
6149 }
6150
6151 #ifdef CONFIG_MTK_SCHED_CMP
6152 #ifdef CONFIG_MTK_SCHED_CMP_TGS
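/*
 * Thread-group (TGS) gate for load balancing: return 0 to veto the
 * migration when this domain does not do SD_BALANCE_TG, or (on a
 * multi-cluster system) when the task's thread group still fits on the
 * source cluster and has more runners there than on the destination
 * cluster.
 */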
6153 static int cmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
6154 {
6155 struct sched_domain *sd = env->sd;
6156
6157 BUG_ON(sd == NULL);
6158
6159 if (!(sd->flags & SD_BALANCE_TG))
6160 return 0;
6161
6162 if (arch_is_multi_cluster()) {
6163 int src_clid, dst_clid;
6164 int src_nr_cpus;
6165 struct thread_group_info_t *src_tginfo, *dst_tginfo;
6166
6167 src_clid = get_cluster_id(env->src_cpu);
6168 dst_clid = get_cluster_id(env->dst_cpu);
6169 BUG_ON(dst_clid == -1 || src_clid == -1);
6170 BUG_ON(p == NULL || p->group_leader == NULL);
6171 src_tginfo = &p->group_leader->thread_group_info[src_clid];
6172 dst_tginfo = &p->group_leader->thread_group_info[dst_clid];
6173 src_nr_cpus = nr_cpus_in_cluster(src_clid, false);
6174
6175 #ifdef CONFIG_MT_SCHED_INFO
6176 mt_sched_printf("check rule0: pid=%d comm=%s load=%ld src:clid=%d tginfo->nr_running=%ld nr_cpus=%d load_avg_ratio=%ld",
6177 p->pid, p->comm, p->se.avg.load_avg_ratio,
6178 src_clid, src_tginfo->nr_running, src_nr_cpus,
6179 src_tginfo->load_avg_ratio);
6180 #endif
6181 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
6182 if ( (!thread_group_empty(p)) &&
6183 (src_tginfo->nr_running <= src_nr_cpus) &&
6184 (src_tginfo->nr_running > dst_tginfo->nr_running)){
6185 mt_sched_printf("hit ruleA: bypass pid=%d comm=%s src:nr_running=%lu nr_cpus=%d dst:nr_running=%lu",
6186 p->pid, p->comm, src_tginfo->nr_running, src_nr_cpus, dst_tginfo->nr_running);
6187 return 0;
6188 }
6189 #endif
6190 }
6191 return 1;
6192 }
6193
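/*
 * Decide whether @p should be moved right away instead of being deferred
 * by the thread-group rules in cmp_move_tasks(): on a big.LITTLE system,
 * a LITTLE->big pull of a task at or above the big-cluster threshold, or
 * a big->LITTLE pull of a task below the LITTLE threshold; on a
 * multi-cluster SD_BALANCE_TG domain, a heavy task (load >= 3/4 of
 * NICE_0_LOAD) whose thread group overloads the source cluster.
 */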
6194 static int need_migrate_task_immediately(struct task_struct *p,
6195 struct lb_env *env, struct clb_env *clbenv)
6196 {
6197 struct sched_domain *sd = env->sd;
6198
6199 BUG_ON(sd == NULL);
6200
6201 if (arch_is_big_little()) {
6202 mt_sched_printf("[%s] b.L arch", __func__);
6203 #ifdef CONFIG_MT_SCHED_INFO
6204 mt_sched_printf("check rule0: pid=%d comm=%s src=%d dst=%d p->prio=%d p->se.avg.load_avg_ratio=%ld",
6205 p->pid, p->comm, env->src_cpu, env->dst_cpu, p->prio, p->se.avg.load_avg_ratio);
6206 #endif
6207 /* from LITTLE to big */
6208 if (arch_cpu_is_little(env->src_cpu) && arch_cpu_is_big(env->dst_cpu)) {
6209 BUG_ON(env->src_cpu != clbenv->ltarget);
6210 if (p->se.avg.load_avg_ratio >= clbenv->bstats.threshold)
6211 return 1;
6212
6213 /* from big to LITTLE */
6214 } else if (arch_cpu_is_big(env->src_cpu) && arch_cpu_is_little(env->dst_cpu)) {
6215 BUG_ON(env->src_cpu != clbenv->btarget);
6216 if (p->se.avg.load_avg_ratio < clbenv->lstats.threshold)
6217 return 1;
6218 }
6219 return 0;
6220 }
6221
6222 if (arch_is_multi_cluster() && (sd->flags & SD_BALANCE_TG)) {
6223 int src_clid, dst_clid;
6224 int src_nr_cpus;
6225 struct thread_group_info_t *src_tginfo, *dst_tginfo;
6226
6227 src_clid = get_cluster_id(env->src_cpu);
6228 dst_clid = get_cluster_id(env->dst_cpu);
6229 BUG_ON(dst_clid == -1 || src_clid == -1);
6230 BUG_ON(p == NULL || p->group_leader == NULL);
6231 src_tginfo = &p->group_leader->thread_group_info[src_clid];
6232 dst_tginfo = &p->group_leader->thread_group_info[dst_clid];
6233 src_nr_cpus = nr_cpus_in_cluster(src_clid, false);
6234 mt_sched_printf("[%s] L.L arch", __func__);
6235
6236 if ((p->se.avg.load_avg_ratio*4 >= NICE_0_LOAD*3) &&
6237 src_tginfo->nr_running > src_nr_cpus &&
6238 src_tginfo->load_avg_ratio*10 > NICE_0_LOAD*src_nr_cpus*9) {
6239 //pr_warn("[%s] hit rule0, candidate_load_move/load_move (%ld/%ld)\n",
6240 // __func__, candidate_load_move, env->imbalance);
6241 return 1;
6242 }
6243 }
6244
6245 return 0;
6246 }
6247 #endif
6248
6249 /*
6250  * cmp_move_tasks tries to move up to imbalance weighted load from busiest to
6251 * this_rq, as part of a balancing operation within domain "sd".
6252 * Returns 1 if successful and 0 otherwise.
6253 *
6254 * Called with both runqueues locked.
6255 */
6256 static int cmp_move_tasks(struct sched_domain *sd, struct lb_env *env)
6257 {
6258 struct list_head *tasks = &env->src_rq->cfs_tasks;
6259 struct task_struct *p;
6260 unsigned long load = 0;
6261 int pulled = 0;
6262
6263 long tg_load_move, other_load_move;
6264 struct list_head tg_tasks, other_tasks;
6265 int src_clid, dst_clid;
6266 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
6267 struct cpumask tmp, *cpus = &tmp;
6268 #endif
6269 #ifdef MTK_QUICK
6270 int flag = 0;
6271 #endif
6272 struct clb_env clbenv;
6273 struct cpumask srcmask, dstmask;
6274
6275 if (env->imbalance <= 0)
6276 return 0;
6277
6278 other_load_move = env->imbalance;
6279 INIT_LIST_HEAD(&other_tasks);
6280
6281 // if (sd->flags & SD_BALANCE_TG) {
6282 tg_load_move = env->imbalance;
6283 INIT_LIST_HEAD(&tg_tasks);
6284 src_clid = get_cluster_id(env->src_cpu);
6285 dst_clid = get_cluster_id(env->dst_cpu);
6286 BUG_ON(dst_clid == -1 || src_clid == -1);
6287
6288 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
6289 get_cluster_cpus(cpus, src_clid, true);
6290 #endif
6291 mt_sched_printf("move_tasks_tg start: src:cpu=%d clid=%d runnable_load=%lu dst:cpu=%d clid=%d runnable_load=%lu imbalance=%ld curr->on_rq=%d",
6292 env->src_cpu, src_clid, cpu_rq(env->src_cpu)->cfs.runnable_load_avg,
6293 env->dst_cpu, dst_clid, cpu_rq(env->dst_cpu)->cfs.runnable_load_avg,
6294 env->imbalance, env->dst_rq->curr->on_rq);
6295 // }
6296
6297 mt_sched_printf("max=%d busiest->nr_running=%d",
6298 env->loop_max, cpu_rq(env->src_cpu)->nr_running);
6299
6300 if (arch_is_big_little()) {
6301 get_cluster_cpus(&srcmask, src_clid, true);
6302 get_cluster_cpus(&dstmask, dst_clid, true);
6303 memset(&clbenv, 0, sizeof(clbenv));
6304 clbenv.flags |= HMP_LB;
6305 clbenv.ltarget = arch_cpu_is_little(env->src_cpu) ? env->src_cpu : env->dst_cpu;
6306 clbenv.btarget = arch_cpu_is_big(env->src_cpu) ? env->src_cpu : env->dst_cpu;
6307 clbenv.lcpus = arch_cpu_is_little(env->src_cpu) ? &srcmask : &dstmask;
6308 clbenv.bcpus = arch_cpu_is_big(env->src_cpu) ? &srcmask : &dstmask;
6309 sched_update_clbstats(&clbenv);
6310 }
6311
6312 while (!list_empty(tasks)) {
6313 struct thread_group_info_t *src_tginfo, *dst_tginfo;
6314
6315 p = list_first_entry(tasks, struct task_struct, se.group_node);
6316
6317 #ifdef CONFIG_MT_SCHED_INFO
6318 mt_sched_printf("check: pid=%d comm=%s load_avg_contrib=%lu h_load=%lu runnable_load_avg=%lu loop=%d, env->imbalance=%ld tg_load_move=%ld",
6319 p->pid, p->comm, p->se.avg.load_avg_contrib,
6320 task_cfs_rq(p)->h_load, task_cfs_rq(p)->runnable_load_avg,
6321 env->loop, env->imbalance, tg_load_move);
6322 #endif
6323 env->loop++;
6324 /* We've more or less seen every task there is, call it quits */
6325 if (env->loop > env->loop_max)
6326 break;
6327
6328 #if 0 // TO check
6329 /* take a breather every nr_migrate tasks */
6330 if (env->loop > env->loop_break) {
6331 env->loop_break += sched_nr_migrate_break;
6332 env->flags |= LBF_NEED_BREAK;
6333 break;
6334 }
6335 #endif
6336 BUG_ON(p == NULL || p->group_leader == NULL);
6337 src_tginfo = &p->group_leader->thread_group_info[src_clid];
6338 dst_tginfo = &p->group_leader->thread_group_info[dst_clid];
6339
6340 /* rule0 */
6341 if (!can_migrate_task(p, env)) {
6342 mt_sched_printf("can not migrate: pid=%d comm=%s",
6343 p->pid, p->comm);
6344 goto next;
6345 }
6346
6347 load = task_h_load(p);
6348
6349 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) {
6350 mt_sched_printf("can not migrate: pid=%d comm=%s sched_feat",
6351 p->pid, p->comm );
6352 goto next;
6353 }
6354
6355 if (over_imbalance(load, env->imbalance)) {
6356 mt_sched_printf("can not migrate: pid=%d comm=%s load=%ld imbalance=%ld",
6357 p->pid, p->comm, load, env->imbalance );
6358 goto next;
6359 }
6360
6361 /* meet rule0 , migrate immediately */
6362 if (need_migrate_task_immediately(p, env, &clbenv)) {
6363 pulled++;
6364 env->imbalance -= load;
6365 tg_load_move -= load;
6366 other_load_move -= load;
6367 mt_sched_printf("hit rule0: pid=%d comm=%s load=%ld imbalance=%ld tg_imbalance=%ld other_load_move=%ld",
6368 p->pid, p->comm, load, env->imbalance, tg_load_move, other_load_move);
6369 move_task(p, env);
6370 if (env->imbalance <= 0)
6371 break;
6372 continue;
6373 }
6374
6375 /* for TGS */
6376 if (!cmp_can_migrate_task(p, env))
6377 goto next;
6378
6379 if (sd->flags & SD_BALANCE_TG){
6380 if (over_imbalance(load, tg_load_move)) {
6381 mt_sched_printf("can not migrate: pid=%d comm=%s load=%ld imbalance=%ld",
6382 p->pid, p->comm, load, tg_load_move );
6383 goto next;
6384 }
6385
6386 #ifdef MTK_QUICK
6387 if (candidate_load_move <= 0) {
6388 mt_sched_printf("check: pid=%d comm=%s candidate_load_move=%d",
6389 p->pid, p->comm, candidate_load_move);
6390 goto next;
6391 }
6392 #endif
6393
6394 /* rule1, single thread */
6395 #ifdef CONFIG_MT_SCHED_INFO
6396 mt_sched_printf("check rule1: pid=%d p->comm=%s thread_group_cnt=%lu thread_group_empty(p)=%d",
6397 p->pid, p->comm,
6398 p->group_leader->thread_group_info[0].nr_running +
6399 p->group_leader->thread_group_info[1].nr_running,
6400 thread_group_empty(p));
6401 #endif
6402
6403 if (thread_group_empty(p)) {
6404 list_move_tail(&p->se.group_node, &tg_tasks);
6405 tg_load_move -= load;
6406 other_load_move -= load;
6407 mt_sched_printf("hit rule1: pid=%d p->comm=%s load=%ld tg_imbalance=%ld",
6408 p->pid, p->comm, load, tg_load_move);
6409 continue;
6410 }
6411
6412 /* rule2 */
6413 #ifdef CONFIG_MT_SCHED_INFO
6414 mt_sched_printf("check rule2: pid=%d p->comm=%s %ld, %ld, %ld, %ld, %ld",
6415 p->pid, p->comm, src_tginfo->nr_running, src_tginfo->cfs_nr_running, dst_tginfo->nr_running,
6416 p->se.avg.load_avg_ratio, src_tginfo->load_avg_ratio);
6417 #endif
6418 if ((src_tginfo->nr_running < dst_tginfo->nr_running) &&
6419 ((p->se.avg.load_avg_ratio * src_tginfo->cfs_nr_running) <=
6420 src_tginfo->load_avg_ratio)) {
6421 list_move_tail(&p->se.group_node, &tg_tasks);
6422 tg_load_move -= load;
6423 other_load_move -= load;
6424 mt_sched_printf("hit rule2: pid=%d p->comm=%s load=%ld tg_imbalance=%ld",
6425 p->pid, p->comm, load, tg_load_move);
6426 continue;
6427 }
6428
6429 if (over_imbalance(load, other_load_move))
6430 goto next;
6431 /*
6432 if (other_load_move <= 0)
6433 goto next;
6434 */
6435
6436 list_move_tail(&p->se.group_node, &other_tasks);
6437 other_load_move -= load;
6438 continue;
6439 }else{
6440 list_move_tail(&p->se.group_node, &other_tasks);
6441 other_load_move -= load;
6442 continue;
6443 }
6444
6445 // ytchang
6446 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6447 if(need_lazy_balance(env->dst_cpu, env->src_cpu, p))
6448 goto next;
6449 #endif
6450
6451 next:
6452 /* original rule */
6453 list_move_tail(&p->se.group_node, tasks);
6454 } // end of while()
6455
6456 if ( sd->flags & SD_BALANCE_TG){
6457 while (!list_empty(&tg_tasks)) {
6458 p = list_first_entry(&tg_tasks, struct task_struct, se.group_node);
6459 list_move_tail(&p->se.group_node, tasks);
6460
6461 if (env->imbalance > 0) {
6462 load = task_h_load(p);
6463 if (over_imbalance(load, env->imbalance)){
6464 mt_sched_printf("overload rule1,2: pid=%d p->comm=%s load=%ld imbalance=%ld",
6465 p->pid, p->comm, load, env->imbalance);
6466 #ifdef MTK_QUICK
6467
6468 flag=1;
6469 #endif
6470 continue;
6471 }
6472
6473 move_task(p, env);
6474 env->imbalance -= load;
6475 pulled++;
6476
6477 mt_sched_printf("migrate hit rule1,2: pid=%d p->comm=%s load=%ld imbalance=%ld",
6478 p->pid, p->comm, load, env->imbalance);
6479 }
6480 }
6481 }
6482
6483 mt_sched_printf("move_tasks_tg finish rule migrate");
6484
6485 while (!list_empty(&other_tasks)) {
6486 p = list_first_entry(&other_tasks, struct task_struct, se.group_node);
6487 list_move_tail(&p->se.group_node, tasks);
6488
6489 #ifdef MTK_QUICK
6490 if (!flag && (env->imbalance > 0)) {
6491 #else
6492 if (env->imbalance > 0) {
6493 #endif
6494 load = task_h_load(p);
6495
6496 if (over_imbalance(load, env->imbalance)){
6497 mt_sched_printf("overload others: pid=%d p->comm=%s load=%ld imbalance=%ld",
6498 p->pid, p->comm, load, env->imbalance);
6499 continue;
6500 }
6501
6502 move_task(p, env);
6503 env->imbalance -= load;
6504 pulled++;
6505
6506 mt_sched_printf("migrate others: pid=%d p->comm=%s load=%ld imbalance=%ld",
6507 p->pid, p->comm, load, env->imbalance);
6508 }
6509 }
6510
6511 /*
6512 * Right now, this is one of only two places move_task() is called,
6513 * so we can safely collect move_task() stats here rather than
6514 * inside move_task().
6515 */
6516 schedstat_add(env->sd, lb_gained[env->idle], pulled);
6517
6518 mt_sched_printf("move_tasks_tg finish pulled=%d imbalance=%ld", pulled, env->imbalance);
6519
6520 return pulled;
6521 }
6522
6523 #endif /* CONFIG_MTK_SCHED_CMP */
6524
6525
6526 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6527 static int need_lazy_balance(int dst_cpu, int src_cpu, struct task_struct *p)
6528 {
6529 	/* Lazy balance for a small task when:
6530 	   1. the src cpu is a buddy cpu
6531 	   2. the src cpu is not busy
6532 	   3. p is a light task
6533 	 */
6534 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
6535 if ( PA_ENABLE && cpumask_test_cpu(src_cpu, &buddy_cpu_map) &&
6536 !is_buddy_busy(src_cpu) && is_light_task(p)) {
6537 #else
6538 if (cpumask_test_cpu(src_cpu, &buddy_cpu_map) &&
6539 !is_buddy_busy(src_cpu) && is_light_task(p)) {
6540 #endif
6541 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
6542 unsigned int i;
6543 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[src_cpu][dst_cpu]++;
6544 mt_sched_printf("[PA]pid=%d, Lazy balance from CPU%d to CPU%d\n)\n", p->pid, src_cpu, dst_cpu);
6545 for(i=0;i<4;i++) {
6546 if(PA_MON_ENABLE && (strcmp(p->comm, &PA_MON[i][0]) == 0)) {
6547 printk(KERN_EMERG "[PA] %s Lazy balance from CPU%d to CPU%d\n", p->comm, src_cpu, dst_cpu);
6548 // printk(KERN_EMERG "[PA] src_cpu RQ Usage = %u, Period = %u, NR = %u\n",
6549 // per_cpu(BUDDY_CPU_RQ_USAGE, src_cpu),
6550 // per_cpu(BUDDY_CPU_RQ_PERIOD, src_cpu),
6551 // per_cpu(BUDDY_CPU_RQ_NR, src_cpu));
6552 // printk(KERN_EMERG "[PA] Task Usage = %u, Period = %u\n",
6553 // p->se.avg.usage_avg_sum,
6554 // p->se.avg.runnable_avg_period);
6555 }
6556 }
6557 #endif
6558 return 1;
6559 }
6560 else
6561 return 0;
6562 }
6563 #endif
6564 #ifdef CONFIG_FAIR_GROUP_SCHED
6565 /*
6566 * update tg->load_weight by folding this cpu's load_avg
6567 */
6568 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
6569 {
6570 struct sched_entity *se = tg->se[cpu];
6571 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
6572
6573 /* throttled entities do not contribute to load */
6574 if (throttled_hierarchy(cfs_rq))
6575 return;
6576
6577 update_cfs_rq_blocked_load(cfs_rq, 1);
6578
6579 if (se) {
6580 update_entity_load_avg(se, 1);
6581 /*
6582 * We pivot on our runnable average having decayed to zero for
6583 * list removal. This generally implies that all our children
6584 * have also been removed (modulo rounding error or bandwidth
6585 * control); however, such cases are rare and we can fix these
6586 * at enqueue.
6587 *
6588 * TODO: fix up out-of-order children on enqueue.
6589 */
6590 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
6591 list_del_leaf_cfs_rq(cfs_rq);
6592 } else {
6593 struct rq *rq = rq_of(cfs_rq);
6594 update_rq_runnable_avg(rq, rq->nr_running);
6595 }
6596 }
6597
6598 static void update_blocked_averages(int cpu)
6599 {
6600 struct rq *rq = cpu_rq(cpu);
6601 struct cfs_rq *cfs_rq;
6602 unsigned long flags;
6603
6604 raw_spin_lock_irqsave(&rq->lock, flags);
6605 update_rq_clock(rq);
6606 /*
6607 * Iterates the task_group tree in a bottom up fashion, see
6608 * list_add_leaf_cfs_rq() for details.
6609 */
6610 for_each_leaf_cfs_rq(rq, cfs_rq) {
6611 /*
6612 * Note: We may want to consider periodically releasing
6613 * rq->lock about these updates so that creating many task
6614 * groups does not result in continually extending hold time.
6615 */
6616 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
6617 }
6618
6619 raw_spin_unlock_irqrestore(&rq->lock, flags);
6620 }
6621
6622 /*
6623 * Compute the cpu's hierarchical load factor for each task group.
6624 * This needs to be done in a top-down fashion because the load of a child
6625 * group is a fraction of its parents load.
6626 */
6627 static int tg_load_down(struct task_group *tg, void *data)
6628 {
6629 unsigned long load;
6630 long cpu = (long)data;
6631
6632 if (!tg->parent) {
6633 /*
6634 * rq's sched_avg is not updated accordingly. adopt rq's
6635 * corresponding cfs_rq runnable loading instead.
6636 *
6637 * a003a25b sched: Consider runnable load average...
6638 *
6639
6640 load = cpu_rq(cpu)->avg.load_avg_contrib;
6641
6642 */
6643 load = cpu_rq(cpu)->cfs.runnable_load_avg;
6644 } else {
6645 load = tg->parent->cfs_rq[cpu]->h_load;
6646 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
6647 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
6648 }
6649
6650 tg->cfs_rq[cpu]->h_load = load;
6651
6652 return 0;
6653 }
6654
6655 static void update_h_load(long cpu)
6656 {
6657 rcu_read_lock();
6658 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
6659 rcu_read_unlock();
6660 }
6661
6662 static unsigned long task_h_load(struct task_struct *p)
6663 {
6664 struct cfs_rq *cfs_rq = task_cfs_rq(p);
6665
6666 return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
6667 cfs_rq->runnable_load_avg + 1);
6668 }
6669 #else
6670 static inline void update_blocked_averages(int cpu)
6671 {
6672 }
6673
6674 static inline void update_h_load(long cpu)
6675 {
6676 }
6677
6678 static unsigned long task_h_load(struct task_struct *p)
6679 {
6680 return p->se.avg.load_avg_contrib;
6681 }
6682 #endif
6683
6684 /********** Helpers for find_busiest_group ************************/
6685 /*
6686 * sd_lb_stats - Structure to store the statistics of a sched_domain
6687 * during load balancing.
6688 */
6689 struct sd_lb_stats {
6690 struct sched_group *busiest; /* Busiest group in this sd */
6691 struct sched_group *this; /* Local group in this sd */
6692 unsigned long total_load; /* Total load of all groups in sd */
6693 unsigned long total_pwr; /* Total power of all groups in sd */
6694 unsigned long avg_load; /* Average load across all groups in sd */
6695
6696 /** Statistics of this group */
6697 unsigned long this_load;
6698 unsigned long this_load_per_task;
6699 unsigned long this_nr_running;
6700 unsigned long this_has_capacity;
6701 unsigned int this_idle_cpus;
6702
6703 /* Statistics of the busiest group */
6704 unsigned int busiest_idle_cpus;
6705 unsigned long max_load;
6706 unsigned long busiest_load_per_task;
6707 unsigned long busiest_nr_running;
6708 unsigned long busiest_group_capacity;
6709 unsigned long busiest_has_capacity;
6710 unsigned int busiest_group_weight;
6711
6712 int group_imb; /* Is there imbalance in this sd */
6713 };
6714
6715 /*
6716 * sg_lb_stats - stats of a sched_group required for load_balancing
6717 */
6718 struct sg_lb_stats {
6719 unsigned long avg_load; /*Avg load across the CPUs of the group */
6720 unsigned long group_load; /* Total load over the CPUs of the group */
6721 unsigned long sum_nr_running; /* Nr tasks running in the group */
6722 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
6723 unsigned long group_capacity;
6724 unsigned long idle_cpus;
6725 unsigned long group_weight;
6726 int group_imb; /* Is there an imbalance in the group ? */
6727 int group_has_capacity; /* Is there extra capacity in the group? */
6728 };
6729
6730 /**
6731 * get_sd_load_idx - Obtain the load index for a given sched domain.
6732 * @sd: The sched_domain whose load_idx is to be obtained.
6733  * @idle: The idle status of the CPU whose sd load_idx is obtained.
6734 */
6735 static inline int get_sd_load_idx(struct sched_domain *sd,
6736 enum cpu_idle_type idle)
6737 {
6738 int load_idx;
6739
6740 switch (idle) {
6741 case CPU_NOT_IDLE:
6742 load_idx = sd->busy_idx;
6743 break;
6744
6745 case CPU_NEWLY_IDLE:
6746 load_idx = sd->newidle_idx;
6747 break;
6748 default:
6749 load_idx = sd->idle_idx;
6750 break;
6751 }
6752
6753 return load_idx;
6754 }
6755
6756 static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
6757 {
6758 return SCHED_POWER_SCALE;
6759 }
6760
6761 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
6762 {
6763 return default_scale_freq_power(sd, cpu);
6764 }
6765
6766 static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
6767 {
6768 unsigned long weight = sd->span_weight;
6769 unsigned long smt_gain = sd->smt_gain;
6770
6771 smt_gain /= weight;
6772
6773 return smt_gain;
6774 }
6775
6776 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
6777 {
6778 return default_scale_smt_power(sd, cpu);
6779 }
6780
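/*
 * Fraction of a cpu's capacity that is left for CFS after recent RT/irq
 * activity, scaled to SCHED_POWER_SCALE: little rt_avg yields a value
 * close to SCHED_POWER_SCALE, a cpu monopolized by RT yields close to 0.
 */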
6781 static unsigned long scale_rt_power(int cpu)
6782 {
6783 struct rq *rq = cpu_rq(cpu);
6784 u64 total, available, age_stamp, avg;
6785
6786 /*
6787 * Since we're reading these variables without serialization make sure
6788 * we read them once before doing sanity checks on them.
6789 */
6790 age_stamp = ACCESS_ONCE(rq->age_stamp);
6791 avg = ACCESS_ONCE(rq->rt_avg);
6792
6793 total = sched_avg_period() + (rq->clock - age_stamp);
6794
6795 if (unlikely(total < avg)) {
6796 /* Ensures that power won't end up being negative */
6797 available = 0;
6798 } else {
6799 available = total - avg;
6800 }
6801
6802 if (unlikely((s64)total < SCHED_POWER_SCALE))
6803 total = SCHED_POWER_SCALE;
6804
6805 total >>= SCHED_POWER_SHIFT;
6806
6807 return div_u64(available, total);
6808 }
6809
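/*
 * Recompute cpu_power for @cpu: start from SCHED_POWER_SCALE, apply the
 * SMT and cpufreq scaling hooks (arch_scale_*_power() when ARCH_POWER is
 * enabled), then scale by the time left over from RT (scale_rt_power())
 * and publish the result in rq->cpu_power and the sched_group.
 */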
6810 static void update_cpu_power(struct sched_domain *sd, int cpu)
6811 {
6812 unsigned long weight = sd->span_weight;
6813 unsigned long power = SCHED_POWER_SCALE;
6814 struct sched_group *sdg = sd->groups;
6815
6816 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6817 if (sched_feat(ARCH_POWER))
6818 power *= arch_scale_smt_power(sd, cpu);
6819 else
6820 power *= default_scale_smt_power(sd, cpu);
6821
6822 power >>= SCHED_POWER_SHIFT;
6823 }
6824
6825 sdg->sgp->power_orig = power;
6826
6827 if (sched_feat(ARCH_POWER))
6828 power *= arch_scale_freq_power(sd, cpu);
6829 else
6830 power *= default_scale_freq_power(sd, cpu);
6831
6832 power >>= SCHED_POWER_SHIFT;
6833
6834 power *= scale_rt_power(cpu);
6835 power >>= SCHED_POWER_SHIFT;
6836
6837 if (!power)
6838 power = 1;
6839
6840 cpu_rq(cpu)->cpu_power = power;
6841 sdg->sgp->power = power;
6842 }
6843
6844 void update_group_power(struct sched_domain *sd, int cpu)
6845 {
6846 struct sched_domain *child = sd->child;
6847 struct sched_group *group, *sdg = sd->groups;
6848 unsigned long power;
6849 unsigned long interval;
6850
6851 interval = msecs_to_jiffies(sd->balance_interval);
6852 interval = clamp(interval, 1UL, max_load_balance_interval);
6853 sdg->sgp->next_update = jiffies + interval;
6854
6855 if (!child) {
6856 update_cpu_power(sd, cpu);
6857 return;
6858 }
6859
6860 power = 0;
6861
6862 if (child->flags & SD_OVERLAP) {
6863 /*
6864 * SD_OVERLAP domains cannot assume that child groups
6865 * span the current group.
6866 */
6867
6868 for_each_cpu(cpu, sched_group_cpus(sdg))
6869 power += power_of(cpu);
6870 } else {
6871 /*
6872 * !SD_OVERLAP domains can assume that child groups
6873 * span the current group.
6874 */
6875
6876 group = child->groups;
6877 do {
6878 power += group->sgp->power;
6879 group = group->next;
6880 } while (group != child->groups);
6881 }
6882
6883 sdg->sgp->power_orig = sdg->sgp->power = power;
6884 }
6885
6886 /*
6887 * Try and fix up capacity for tiny siblings, this is needed when
6888 * things like SD_ASYM_PACKING need f_b_g to select another sibling
6889 * which on its own isn't powerful enough.
6890 *
6891 * See update_sd_pick_busiest() and check_asym_packing().
6892 */
6893 static inline int
6894 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
6895 {
6896 /*
6897 * Only siblings can have significantly less than SCHED_POWER_SCALE
6898 */
6899 if (!(sd->flags & SD_SHARE_CPUPOWER))
6900 return 0;
6901
6902 /*
6903 * If ~90% of the cpu_power is still there, we're good.
6904 */
6905 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
6906 return 1;
6907
6908 return 0;
6909 }
6910
6911 /**
6912 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
6913 * @env: The load balancing environment.
6914 * @group: sched_group whose statistics are to be updated.
6915 * @load_idx: Load index of sched_domain of this_cpu for load calc.
6916 * @local_group: Does group contain this_cpu.
6917 * @balance: Should we balance.
6918 * @sgs: variable to hold the statistics for this group.
6919 */
6920 static inline void update_sg_lb_stats(struct lb_env *env,
6921 struct sched_group *group, int load_idx,
6922 int local_group, int *balance, struct sg_lb_stats *sgs)
6923 {
6924 unsigned long nr_running, max_nr_running, min_nr_running;
6925 unsigned long load, max_cpu_load, min_cpu_load;
6926 unsigned int balance_cpu = -1, first_idle_cpu = 0;
6927 unsigned long avg_load_per_task = 0;
6928 int i;
6929
6930 if (local_group)
6931 balance_cpu = group_balance_cpu(group);
6932
6933 /* Tally up the load of all CPUs in the group */
6934 max_cpu_load = 0;
6935 min_cpu_load = ~0UL;
6936 max_nr_running = 0;
6937 min_nr_running = ~0UL;
6938
6939 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6940 struct rq *rq = cpu_rq(i);
6941
6942 nr_running = rq->nr_running;
6943
6944 /* Bias balancing toward cpus of our domain */
6945 if (local_group) {
6946 if (idle_cpu(i) && !first_idle_cpu &&
6947 cpumask_test_cpu(i, sched_group_mask(group))) {
6948 first_idle_cpu = 1;
6949 balance_cpu = i;
6950 }
6951
6952 load = target_load(i, load_idx);
6953 } else {
6954 load = source_load(i, load_idx);
6955 if (load > max_cpu_load)
6956 max_cpu_load = load;
6957 if (min_cpu_load > load)
6958 min_cpu_load = load;
6959
6960 if (nr_running > max_nr_running)
6961 max_nr_running = nr_running;
6962 if (min_nr_running > nr_running)
6963 min_nr_running = nr_running;
6964
6965 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
6966 if((load_idx > 0) && (load == cpu_rq(i)->cpu_load[load_idx-1]))
6967 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_HISTORY);
6968 #endif
6969 }
6970
6971 sgs->group_load += load;
6972 sgs->sum_nr_running += nr_running;
6973 sgs->sum_weighted_load += weighted_cpuload(i);
6974 if (idle_cpu(i))
6975 sgs->idle_cpus++;
6976 }
6977
6978 /*
6979 	 * First idle cpu or the first cpu (busiest) in this sched group
6980 	 * is eligible for doing load balancing at this and above
6981 	 * domains. In the newly idle case, we will allow all the cpus
6982 	 * to do the newly idle load balance.
6983 */
6984 if (local_group) {
6985 if (env->idle != CPU_NEWLY_IDLE) {
6986 if (balance_cpu != env->dst_cpu) {
6987 *balance = 0;
6988 return;
6989 }
6990 update_group_power(env->sd, env->dst_cpu);
6991 } else if (time_after_eq(jiffies, group->sgp->next_update))
6992 update_group_power(env->sd, env->dst_cpu);
6993 }
6994
6995 /* Adjust by relative CPU power of the group */
6996 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
6997
6998 /*
6999 * Consider the group unbalanced when the imbalance is larger
7000 * than the average weight of a task.
7001 *
7002 * APZ: with cgroup the avg task weight can vary wildly and
7003 * might not be a suitable number - should we keep a
7004 * normalized nr_running number somewhere that negates
7005 * the hierarchy?
7006 */
7007 if (sgs->sum_nr_running)
7008 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
7009
7010 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
7011 (max_nr_running - min_nr_running) > 1)
7012 sgs->group_imb = 1;
7013
7014 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
7015 SCHED_POWER_SCALE);
7016 if (!sgs->group_capacity)
7017 sgs->group_capacity = fix_small_capacity(env->sd, group);
7018 sgs->group_weight = group->group_weight;
7019
7020 if (sgs->group_capacity > sgs->sum_nr_running)
7021 sgs->group_has_capacity = 1;
7022 }
7023
7024 /**
7025 * update_sd_pick_busiest - return 1 on busiest group
7026 * @env: The load balancing environment.
7027 * @sds: sched_domain statistics
7028 * @sg: sched_group candidate to be checked for being the busiest
7029 * @sgs: sched_group statistics
7030 *
7031 * Determine if @sg is a busier group than the previously selected
7032 * busiest group.
7033 */
7034 static bool update_sd_pick_busiest(struct lb_env *env,
7035 struct sd_lb_stats *sds,
7036 struct sched_group *sg,
7037 struct sg_lb_stats *sgs)
7038 {
7039 if (sgs->avg_load <= sds->max_load) {
7040 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_PICK_BUSIEST_FAIL_1);
7041 return false;
7042 }
7043
7044 if (sgs->sum_nr_running > sgs->group_capacity)
7045 return true;
7046
7047 if (sgs->group_imb)
7048 return true;
7049
7050 /*
7051 * ASYM_PACKING needs to move all the work to the lowest
7052 * numbered CPUs in the group, therefore mark all groups
7053 * higher than ourself as busy.
7054 */
7055 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
7056 env->dst_cpu < group_first_cpu(sg)) {
7057 if (!sds->busiest)
7058 return true;
7059
7060 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
7061 return true;
7062 }
7063
7064 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_PICK_BUSIEST_FAIL_2);
7065 return false;
7066 }
7067
7068 /**
7069 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
7070 * @env: The load balancing environment.
7071 * @balance: Should we balance.
7072 * @sds: variable to hold the statistics for this sched_domain.
7073 */
7074 static inline void update_sd_lb_stats(struct lb_env *env,
7075 int *balance, struct sd_lb_stats *sds)
7076 {
7077 struct sched_domain *child = env->sd->child;
7078 struct sched_group *sg = env->sd->groups;
7079 struct sg_lb_stats sgs;
7080 int load_idx, prefer_sibling = 0;
7081
7082 if (child && child->flags & SD_PREFER_SIBLING)
7083 prefer_sibling = 1;
7084
7085 load_idx = get_sd_load_idx(env->sd, env->idle);
7086
7087 do {
7088 int local_group;
7089
7090 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
7091 memset(&sgs, 0, sizeof(sgs));
7092 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
7093
7094 if (local_group && !(*balance))
7095 return;
7096
7097 sds->total_load += sgs.group_load;
7098 sds->total_pwr += sg->sgp->power;
7099
7100 /*
7101 * In case the child domain prefers tasks go to siblings
7102 * first, lower the sg capacity to one so that we'll try
7103 * and move all the excess tasks away. We lower the capacity
7104 * of a group only if the local group has the capacity to fit
7105 * these excess tasks, i.e. nr_running < group_capacity. The
7106 * extra check prevents the case where you always pull from the
7107 * heaviest group when it is already under-utilized (possible
7108 		 * when a large-weight task outweighs the other tasks on the system).
7109 */
7110 if (prefer_sibling && !local_group && sds->this_has_capacity)
7111 sgs.group_capacity = min(sgs.group_capacity, 1UL);
7112
7113 if (local_group) {
7114 sds->this_load = sgs.avg_load;
7115 sds->this = sg;
7116 sds->this_nr_running = sgs.sum_nr_running;
7117 sds->this_load_per_task = sgs.sum_weighted_load;
7118 sds->this_has_capacity = sgs.group_has_capacity;
7119 sds->this_idle_cpus = sgs.idle_cpus;
7120 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
7121 sds->max_load = sgs.avg_load;
7122 sds->busiest = sg;
7123 sds->busiest_nr_running = sgs.sum_nr_running;
7124 sds->busiest_idle_cpus = sgs.idle_cpus;
7125 sds->busiest_group_capacity = sgs.group_capacity;
7126 sds->busiest_load_per_task = sgs.sum_weighted_load;
7127 sds->busiest_has_capacity = sgs.group_has_capacity;
7128 sds->busiest_group_weight = sgs.group_weight;
7129 sds->group_imb = sgs.group_imb;
7130 }
7131
7132 sg = sg->next;
7133 } while (sg != env->sd->groups);
7134 }
7135
7136 /**
7137 * check_asym_packing - Check to see if the group is packed into the
7138  *			sched domain.
7139 *
7140  * This is primarily intended to be used at the sibling level.  Some
7141 * cores like POWER7 prefer to use lower numbered SMT threads. In the
7142 * case of POWER7, it can move to lower SMT modes only when higher
7143 * threads are idle. When in lower SMT modes, the threads will
7144 * perform better since they share less core resources. Hence when we
7145 * have idle threads, we want them to be the higher ones.
7146 *
7147 * This packing function is run on idle threads. It checks to see if
7148 * the busiest CPU in this domain (core in the P7 case) has a higher
7149 * CPU number than the packing function is being run on. Here we are
7150  * assuming a lower CPU number will be equivalent to a lower SMT thread
7151 * number.
7152 *
7153 * Returns 1 when packing is required and a task should be moved to
7154  * this CPU.  The amount of the imbalance is returned in env->imbalance.
7155 *
7156 * @env: The load balancing environment.
7157 * @sds: Statistics of the sched_domain which is to be packed
7158 */
7159 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
7160 {
7161 int busiest_cpu;
7162
7163 if (!(env->sd->flags & SD_ASYM_PACKING))
7164 return 0;
7165
7166 if (!sds->busiest)
7167 return 0;
7168
7169 busiest_cpu = group_first_cpu(sds->busiest);
7170 if (env->dst_cpu > busiest_cpu)
7171 return 0;
7172
7173 env->imbalance = DIV_ROUND_CLOSEST(
7174 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
7175
7176 return 1;
7177 }
7178
7179 /**
7180 * fix_small_imbalance - Calculate the minor imbalance that exists
7181 * amongst the groups of a sched_domain, during
7182 * load balancing.
7183 * @env: The load balancing environment.
7184 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
7185 */
7186 static inline
7187 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7188 {
7189 unsigned long tmp, pwr_now = 0, pwr_move = 0;
7190 unsigned int imbn = 2;
7191 unsigned long scaled_busy_load_per_task;
7192
7193 if (sds->this_nr_running) {
7194 sds->this_load_per_task /= sds->this_nr_running;
7195 if (sds->busiest_load_per_task >
7196 sds->this_load_per_task)
7197 imbn = 1;
7198 } else {
7199 sds->this_load_per_task =
7200 cpu_avg_load_per_task(env->dst_cpu);
7201 }
7202
7203 scaled_busy_load_per_task = sds->busiest_load_per_task
7204 * SCHED_POWER_SCALE;
7205 scaled_busy_load_per_task /= sds->busiest->sgp->power;
7206
7207 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
7208 (scaled_busy_load_per_task * imbn)) {
7209 env->imbalance = sds->busiest_load_per_task;
7210 return;
7211 }
7212
7213 /*
7214 * OK, we don't have enough imbalance to justify moving tasks,
7215 * however we may be able to increase total CPU power used by
7216 * moving them.
7217 */
7218
7219 pwr_now += sds->busiest->sgp->power *
7220 min(sds->busiest_load_per_task, sds->max_load);
7221 pwr_now += sds->this->sgp->power *
7222 min(sds->this_load_per_task, sds->this_load);
7223 pwr_now /= SCHED_POWER_SCALE;
7224
7225 /* Amount of load we'd subtract */
7226 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
7227 sds->busiest->sgp->power;
7228 if (sds->max_load > tmp)
7229 pwr_move += sds->busiest->sgp->power *
7230 min(sds->busiest_load_per_task, sds->max_load - tmp);
7231
7232 /* Amount of load we'd add */
7233 if (sds->max_load * sds->busiest->sgp->power <
7234 sds->busiest_load_per_task * SCHED_POWER_SCALE)
7235 tmp = (sds->max_load * sds->busiest->sgp->power) /
7236 sds->this->sgp->power;
7237 else
7238 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
7239 sds->this->sgp->power;
7240 pwr_move += sds->this->sgp->power *
7241 min(sds->this_load_per_task, sds->this_load + tmp);
7242 pwr_move /= SCHED_POWER_SCALE;
7243
7244 /* Move if we gain throughput */
7245 if (pwr_move > pwr_now)
7246 env->imbalance = sds->busiest_load_per_task;
7247 }
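/*
 * Worked example for fix_small_imbalance(), using illustrative numbers
 * that are assumptions rather than measured values: with
 * busiest_load_per_task == 200, a busiest-group power of 1024 (so
 * scaled_busy_load_per_task == 200), max_load == 400, this_load == 100
 * and imbn == 2, the test 400 - 100 + 200 >= 200 * 2 holds, so the
 * imbalance is simply set to busiest_load_per_task (200) and one task's
 * worth of load is moved.  If this_load were 300 instead, the test would
 * fail and the pwr_now / pwr_move estimate above would decide whether
 * moving a task still gains throughput.
 */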
7248
7249 /**
7250 * calculate_imbalance - Calculate the amount of imbalance present within the
7251 * groups of a given sched_domain during load balance.
7252 * @env: load balance environment
7253 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
7254 */
7255 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7256 {
7257 unsigned long max_pull, load_above_capacity = ~0UL;
7258
7259 sds->busiest_load_per_task /= sds->busiest_nr_running;
7260 if (sds->group_imb) {
7261 sds->busiest_load_per_task =
7262 min(sds->busiest_load_per_task, sds->avg_load);
7263 }
7264
7265 /*
7266 * In the presence of smp nice balancing, certain scenarios can have
7267 * max load less than avg load (as we skip the groups at or below
7268 * their cpu_power while calculating max_load).
7269 */
7270 if (sds->max_load < sds->avg_load) {
7271 env->imbalance = 0;
7272 return fix_small_imbalance(env, sds);
7273 }
7274
7275 if (!sds->group_imb) {
7276 /*
7277 * Don't want to pull so many tasks that a group would go idle.
7278 */
7279 load_above_capacity = (sds->busiest_nr_running -
7280 sds->busiest_group_capacity);
7281
7282 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
7283
7284 load_above_capacity /= sds->busiest->sgp->power;
7285 }
7286
7287 /*
7288 * We're trying to get all the cpus to the average_load, so we don't
7289 * want to push ourselves above the average load, nor do we wish to
7290 * reduce the max loaded cpu below the average load. At the same time,
7291 * we also don't want to reduce the group load below the group capacity
7292 * (so that we can implement power-savings policies etc). Thus we look
7293 * for the minimum possible imbalance.
7294 * Be careful of negative numbers as they'll appear as very large values
7295 * with unsigned longs.
7296 */
7297 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
7298
7299 /* How much load to actually move to equalise the imbalance */
7300 env->imbalance = min(max_pull * sds->busiest->sgp->power,
7301 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
7302 / SCHED_POWER_SCALE;
7303
7304 /*
7305 * if *imbalance is less than the average load per runnable task
7306 * there is no guarantee that any tasks will be moved so we'll have
7307 * a think about bumping its value to force at least one task to be
7308 * moved
7309 */
7310 if (env->imbalance < sds->busiest_load_per_task)
7311 return fix_small_imbalance(env, sds);
7312
7313 }
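/*
 * Worked example for calculate_imbalance(), assuming illustrative values
 * and SCHED_LOAD_SCALE == SCHED_POWER_SCALE == 1024: with avg_load == 600,
 * max_load == 800, this_load == 400, busiest_nr_running == 5,
 * busiest_group_capacity == 4 and both group powers equal to 1024,
 * load_above_capacity == (5 - 4) * 1024 * 1024 / 1024 == 1024 and
 * max_pull == min(800 - 600, 1024) == 200.  The resulting imbalance is
 * min(200 * 1024, (600 - 400) * 1024) / 1024 == 200, i.e. just enough
 * load to bring both groups to the domain average.
 */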
7314
7315 /******* find_busiest_group() helpers end here *********************/
7316
7317 /**
7318 * find_busiest_group - Returns the busiest group within the sched_domain
7319 * if there is an imbalance. If there isn't an imbalance, and
7320 * the user has opted for power-savings, it returns a group whose
7321 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
7322 * such a group exists.
7323 *
7324 * Also calculates the amount of weighted load which should be moved
7325 * to restore balance.
7326 *
7327 * @env: The load balancing environment.
7328 * @balance: Pointer to a variable indicating if this_cpu
7329 * is the appropriate cpu to perform load balancing at this_level.
7330 *
7331 * Returns: - the busiest group if imbalance exists.
7332 * - If no imbalance and user has opted for power-savings balance,
7333 * return the least loaded group whose CPUs can be
7334 * put to idle by rebalancing its tasks onto our group.
7335 */
7336 static struct sched_group *
7337 find_busiest_group(struct lb_env *env, int *balance)
7338 {
7339 struct sd_lb_stats sds;
7340
7341 memset(&sds, 0, sizeof(sds));
7342
7343 /*
7344 * Compute the various statistics relevant for load balancing at
7345 * this level.
7346 */
7347 update_sd_lb_stats(env, balance, &sds);
7348
7349 /*
7350 * this_cpu is not the appropriate cpu to perform load balancing at
7351 * this level.
7352 */
7353 if (!(*balance)){
7354 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_BALANCE);
7355 goto ret;
7356 }
7357
7358 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
7359 check_asym_packing(env, &sds))
7360 return sds.busiest;
7361
7362 /* There is no busy sibling group to pull tasks from */
7363 if (!sds.busiest || sds.busiest_nr_running == 0) {
7364 if (!sds.busiest) {
7365 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_NO_BUSIEST);
7366 } else {
7367 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_BUSIEST_NO_TASK);
7368 }
7369 goto out_balanced;
7370 }
7371
7372 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
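/*
 * Example with assumed numbers: a domain with total_load == 3000 and
 * total_pwr == 4096 (four CPUs of power 1024) yields avg_load ==
 * 1024 * 3000 / 4096 == 750, in the same scaled units as max_load and
 * this_load used below.
 */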
7373
7374 /*
7375 * If the busiest group is imbalanced the below checks don't
7376 * work because they assume all things are equal, which typically
7377 * isn't true due to cpus_allowed constraints and the like.
7378 */
7379 if (sds.group_imb)
7380 goto force_balance;
7381
7382 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
7383 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
7384 !sds.busiest_has_capacity)
7385 goto force_balance;
7386
7387 /*
7388 * If the local group is more busy than the selected busiest group
7389 * don't try and pull any tasks.
7390 */
7391 if (sds.this_load >= sds.max_load){
7392 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_NO_LARGER_THAN);
7393 goto out_balanced;
7394 }
7395
7396 /*
7397 * Don't pull any tasks if this group is already above the domain
7398 * average load.
7399 */
7400 if (sds.this_load >= sds.avg_load){
7401 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_NO_LARGER_THAN);
7402 goto out_balanced;
7403 }
7404
7405 if (env->idle == CPU_IDLE) {
7406 /*
7407 * This cpu is idle. If the busiest group doesn't have
7408 * more tasks than the number of available CPUs and
7409 * there is no imbalance between this and the busiest group
7410 * with respect to idle CPUs, it is balanced.
7411 */
7412 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
7413 sds.busiest_nr_running <= sds.busiest_group_weight)
7414 goto out_balanced;
7415 } else {
7416 /*
7417 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
7418 * imbalance_pct to be conservative.
7419 */
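/*
 * For instance, with an assumed imbalance_pct of 125 the domain is
 * treated as balanced while 100 * max_load <= 125 * this_load, i.e.
 * until the busiest group is more than 25% busier than the local one.
 */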
7420 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load){
7421 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_CHECK_FAIL);
7422 goto out_balanced;
7423 }
7424 }
7425
7426 force_balance:
7427 /* Looks like there is an imbalance. Compute it */
7428 calculate_imbalance(env, &sds);
7429 return sds.busiest;
7430
7431 out_balanced:
7432 ret:
7433 env->imbalance = 0;
7434 return NULL;
7435 }
7436
7437 /*
7438 * find_busiest_queue - find the busiest runqueue among the cpus in group.
7439 */
7440 static struct rq *find_busiest_queue(struct lb_env *env,
7441 struct sched_group *group)
7442 {
7443 struct rq *busiest = NULL, *rq;
7444 unsigned long max_load = 0;
7445 int i;
7446
7447 for_each_cpu(i, sched_group_cpus(group)) {
7448 unsigned long power = power_of(i);
7449 unsigned long capacity = DIV_ROUND_CLOSEST(power,
7450 SCHED_POWER_SCALE);
7451 unsigned long wl;
7452
7453 if (!capacity)
7454 capacity = fix_small_capacity(env->sd, group);
7455
7456 if (!cpumask_test_cpu(i, env->cpus))
7457 continue;
7458
7459 rq = cpu_rq(i);
7460 wl = weighted_cpuload(i);
7461
7462 /*
7463 * When comparing with imbalance, use weighted_cpuload()
7464 * which is not scaled with the cpu power.
7465 */
7466 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
7467 continue;
7468
7469 /*
7470 * For the load comparisons with the other cpu's, consider
7471 * the weighted_cpuload() scaled with the cpu power, so that
7472 * the load can be moved away from the cpu that is potentially
7473 * running at a lower capacity.
7474 */
7475 wl = (wl * SCHED_POWER_SCALE) / power;
7476
7477 if (wl > max_load) {
7478 max_load = wl;
7479 busiest = rq;
7480 }
7481 }
7482
7483 return busiest;
7484 }
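/*
 * Worked example for the scaling above, with assumed numbers: a CPU of
 * power 512 carrying a raw weighted load of 400 scales to
 * 400 * 1024 / 512 == 800, while a full-power (1024) CPU with a raw
 * load of 600 stays at 600.  The half-capacity CPU is therefore picked
 * as the busiest queue even though its raw load is lower.
 */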
7485
7486 /*
7487 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
7488 * so long as it is large enough.
7489 */
7490 #define MAX_PINNED_INTERVAL 512
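/*
 * Illustration of the backoff (interval values assumed): a domain that
 * starts with balance_interval == 8 and keeps hitting all-pinned tasks
 * doubles through 8, 16, 32, ... and stops doubling via the pinned
 * clause once the interval reaches MAX_PINNED_INTERVAL (512), although
 * the regular sd->max_interval clause may still allow further doubling;
 * see out_one_pinned in load_balance() below.
 */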
7491
7492 /* Working cpumask for load_balance and load_balance_newidle. */
7493 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7494
7495 static int need_active_balance(struct lb_env *env)
7496 {
7497 struct sched_domain *sd = env->sd;
7498
7499 if (env->idle == CPU_NEWLY_IDLE) {
7500
7501 /*
7502 * ASYM_PACKING needs to force migrate tasks from busy but
7503 * higher numbered CPUs in order to pack all tasks in the
7504 * lowest numbered CPUs.
7505 */
7506 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
7507 return 1;
7508 }
7509
7510 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
7511 }
7512
7513 static int active_load_balance_cpu_stop(void *data);
7514
7515 /*
7516 * Check this_cpu to ensure it is balanced within domain. Attempt to move
7517 * tasks if there is an imbalance.
7518 */
7519 static int load_balance(int this_cpu, struct rq *this_rq,
7520 struct sched_domain *sd, enum cpu_idle_type idle,
7521 int *balance)
7522 {
7523 int ld_moved, cur_ld_moved, active_balance = 0;
7524 struct sched_group *group;
7525 struct rq *busiest;
7526 unsigned long flags;
7527 struct cpumask *cpus = __get_cpu_var(load_balance_mask);
7528
7529 struct lb_env env = {
7530 .sd = sd,
7531 .dst_cpu = this_cpu,
7532 .dst_rq = this_rq,
7533 .dst_grpmask = sched_group_cpus(sd->groups),
7534 .idle = idle,
7535 .loop_break = sched_nr_migrate_break,
7536 .cpus = cpus,
7537 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
7538 .fail_reason= MT_LBPROF_NO_TRIGGER,
7539 #endif
7540 };
7541
7542 /*
7543 * For NEWLY_IDLE load_balancing, we don't need to consider
7544 * other cpus in our group
7545 */
7546 if (idle == CPU_NEWLY_IDLE)
7547 env.dst_grpmask = NULL;
7548
7549 cpumask_copy(cpus, cpu_active_mask);
7550
7551 schedstat_inc(sd, lb_count[idle]);
7552
7553 redo:
7554 group = find_busiest_group(&env, balance);
7555
7556 if (*balance == 0)
7557 goto out_balanced;
7558
7559 if (!group) {
7560 schedstat_inc(sd, lb_nobusyg[idle]);
7561 if (mt_lbprof_test(env.fail_reason, MT_LBPROF_HISTORY)) {
7562 int tmp_cpu;
7563 for_each_cpu(tmp_cpu, cpu_possible_mask) {
7564 if (tmp_cpu == this_rq->cpu)
7565 continue;
7566 mt_lbprof_update_state(tmp_cpu, MT_LBPROF_BALANCE_FAIL_STATE);
7567 }
7568 }
7569 goto out_balanced;
7570 }
7571
7572 busiest = find_busiest_queue(&env, group);
7573 if (!busiest) {
7574 schedstat_inc(sd, lb_nobusyq[idle]);
7575 mt_lbprof_stat_or(env.fail_reason, MT_LBPROF_NOBUSYQ);
7576 goto out_balanced;
7577 }
7578
7579 #ifdef CONFIG_HMP_LAZY_BALANCE
7580
7581 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
7582 if (PA_ENABLE && LB_ENABLE) {
7583 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
7584
7585 if (per_cpu(sd_pack_buddy, this_cpu) == busiest->cpu && !is_buddy_busy(per_cpu(sd_pack_buddy, this_cpu))) {
7586
7587 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
7588 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[this_cpu][busiest->cpu]++;
7589
7590 #ifdef CONFIG_HMP_TRACER
7591 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_BALANCE_FORM_CPUX_TO_CPUY, 0, this_cpu, busiest->cpu);
7592 #endif /* CONFIG_HMP_TRACER */
7593
7594 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
7595
7596 schedstat_inc(sd, lb_nobusyq[idle]);
7597 goto out_balanced;
7598 }
7599
7600 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
7601 }
7602 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
7603
7604 #endif /* CONFIG_HMP_LAZY_BALANCE */
7605
7606 BUG_ON(busiest == env.dst_rq);
7607
7608 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
7609
7610 ld_moved = 0;
7611 if (busiest->nr_running > 1) {
7612 /*
7613 * Attempt to move tasks. If find_busiest_group has found
7614 * an imbalance but busiest->nr_running <= 1, the group is
7615 * still unbalanced. ld_moved simply stays zero, so it is
7616 * correctly treated as an imbalance.
7617 */
7618 env.flags |= LBF_ALL_PINNED;
7619 env.src_cpu = busiest->cpu;
7620 env.src_rq = busiest;
7621 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
7622 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7623 env.mt_check_cache_in_idle = 1;
7624 #endif
7625
7626 update_h_load(env.src_cpu);
7627 more_balance:
7628 local_irq_save(flags);
7629 double_rq_lock(env.dst_rq, busiest);
7630 #ifdef CONFIG_MTK_SCHED_CMP
7631 env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
7632 mt_sched_printf("1 env.loop_max=%d, busiest->nr_running=%d src=%d, dst=%d, cpus_share_cache=%d",
7633 env.loop_max, busiest->nr_running, env.src_cpu, env.dst_cpu, cpus_share_cache(env.src_cpu, env.dst_cpu));
7634 #endif /* CONFIG_MTK_SCHED_CMP */
7635 /*
7636 * cur_ld_moved - load moved in current iteration
7637 * ld_moved - cumulative load moved across iterations
7638 */
7639 #ifdef CONFIG_MTK_SCHED_CMP
7640 if (!cpus_share_cache(env.src_cpu, env.dst_cpu))
7641 cur_ld_moved = cmp_move_tasks(sd, &env);
7642 else
7643 cur_ld_moved = move_tasks(&env);
7644 #else /* !CONFIG_MTK_SCHED_CMP */
7645 cur_ld_moved = move_tasks(&env);
7646 #endif /* CONFIG_MTK_SCHED_CMP */
7647 ld_moved += cur_ld_moved;
7648 double_rq_unlock(env.dst_rq, busiest);
7649 local_irq_restore(flags);
7650
7651 /*
7652 * some other cpu did the load balance for us.
7653 */
7654 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
7655 resched_cpu(env.dst_cpu);
7656
7657 if (env.flags & LBF_NEED_BREAK) {
7658 env.flags &= ~LBF_NEED_BREAK;
7659 goto more_balance;
7660 }
7661
7662 /*
7663 * Revisit (affine) tasks on src_cpu that couldn't be moved to
7664 * us and move them to an alternate dst_cpu in our sched_group
7665 * where they can run. The upper limit on how many times we
7666 * iterate on same src_cpu is dependent on number of cpus in our
7667 * sched_group.
7668 *
7669 * This changes load balance semantics a bit on who can move
7670 * load to a given_cpu. In addition to the given_cpu itself
7671 * (or an ilb_cpu acting on its behalf where given_cpu is
7672 * nohz-idle), we now have balance_cpu in a position to move
7673 * load to given_cpu. In rare situations, this may cause
7674 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
7675 * _independently_ and at _same_ time to move some load to
7676 * given_cpu) causing excess load to be moved to given_cpu.
7677 * This, however, should not happen often in practice and
7678 * moreover subsequent load balance cycles should correct the
7679 * excess load moved.
7680 */
7681 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
7682
7683 env.dst_rq = cpu_rq(env.new_dst_cpu);
7684 env.dst_cpu = env.new_dst_cpu;
7685 env.flags &= ~LBF_SOME_PINNED;
7686 env.loop = 0;
7687 env.loop_break = sched_nr_migrate_break;
7688
7689 /* Prevent dst_cpu from being re-selected via env's cpus */
7690 cpumask_clear_cpu(env.dst_cpu, env.cpus);
7691
7692 /*
7693 * Go back to "more_balance" rather than "redo" since we
7694 * need to continue with same src_cpu.
7695 */
7696 goto more_balance;
7697 }
7698
7699 /* All tasks on this runqueue were pinned by CPU affinity */
7700 if (unlikely(env.flags & LBF_ALL_PINNED)) {
7701 mt_lbprof_update_state(busiest->cpu, MT_LBPROF_ALLPINNED);
7702 cpumask_clear_cpu(cpu_of(busiest), cpus);
7703 if (!cpumask_empty(cpus)) {
7704 env.loop = 0;
7705 env.loop_break = sched_nr_migrate_break;
7706 goto redo;
7707 }
7708 goto out_balanced;
7709 }
7710
7711 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7712 /* When moving tasks fails in idle balance, force migration regardless of
7713 * cache hotness by clearing mt_check_cache_in_idle. */
7714 if (!ld_moved && ((CPU_NEWLY_IDLE == idle) || (CPU_IDLE == idle))) {
7715 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
7716 mt_lbprof_stat_set(env.fail_reason, MT_LBPROF_DO_LB);
7717 #endif
7718 env.mt_check_cache_in_idle = 0;
7719 env.loop = 0;
7720 local_irq_save(flags);
7721 double_rq_lock(env.dst_rq, busiest);
7722 #ifdef CONFIG_MTK_SCHED_CMP
7723 env.loop_max = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
7724 mt_sched_printf("2 env.loop_max=%d, busiest->nr_running=%d",
7725 env.loop_max, busiest->nr_running);
7726 #endif /* CONFIG_MTK_SCHED_CMP */
7727 if (!env.loop)
7728 update_h_load(env.src_cpu);
7729 #ifdef CONFIG_MTK_SCHED_CMP_TGS
7730 if (!cpus_share_cache(env.src_cpu, env.dst_cpu))
7731 ld_moved = cmp_move_tasks(sd, &env);
7732 else{
7733 ld_moved = move_tasks(&env);
7734 }
7735 #else /* !CONFIG_MTK_SCHED_CMP_TGS */
7736 ld_moved = move_tasks(&env);
7737 #endif /* CONFIG_MTK_SCHED_CMP_TGS */
7738 double_rq_unlock(env.dst_rq, busiest);
7739 local_irq_restore(flags);
7740
7741 /*
7742 * some other cpu did the load balance for us.
7743 */
7744 if (ld_moved && this_cpu != smp_processor_id())
7745 resched_cpu(this_cpu);
7746 }
7747 #endif
7748 }
7749
7750 if (!ld_moved) {
7751 schedstat_inc(sd, lb_failed[idle]);
7752 mt_lbprof_stat_or(env.fail_reason, MT_LBPROF_FAILED);
7753 if (mt_lbprof_test(env.fail_reason, MT_LBPROF_AFFINITY)) {
7754 mt_lbprof_update_state(busiest->cpu, MT_LBPROF_FAILURE_STATE);
7755 } else if (mt_lbprof_test(env.fail_reason, MT_LBPROF_CACHEHOT)) {
7756 mt_lbprof_update_state(busiest->cpu, MT_LBPROF_FAILURE_STATE);
7757 }
7758
7759 /*
7760 * Increment the failure counter only on periodic balance.
7761 * We do not want newidle balance, which can be very
7762 * frequent, pollute the failure counter causing
7763 * excessive cache_hot migrations and active balances.
7764 */
7765 if (idle != CPU_NEWLY_IDLE)
7766 sd->nr_balance_failed++;
7767 mt_lbprof_stat_inc(sd, mt_lbprof_nr_balance_failed);
7768
7769 if (need_active_balance(&env)) {
7770 raw_spin_lock_irqsave(&busiest->lock, flags);
7771
7772 /* don't kick the active_load_balance_cpu_stop,
7773 * if the curr task on busiest cpu can't be
7774 * moved to this_cpu
7775 */
7776 if (!cpumask_test_cpu(this_cpu,
7777 tsk_cpus_allowed(busiest->curr))) {
7778 raw_spin_unlock_irqrestore(&busiest->lock,
7779 flags);
7780 env.flags |= LBF_ALL_PINNED;
7781 goto out_one_pinned;
7782 }
7783
7784 /*
7785 * ->active_balance synchronizes accesses to
7786 * ->active_balance_work. Once set, it's cleared
7787 * only after active load balance is finished.
7788 */
7789 if (!busiest->active_balance) {
7790 busiest->active_balance = 1;
7791 busiest->push_cpu = this_cpu;
7792 active_balance = 1;
7793 }
7794 raw_spin_unlock_irqrestore(&busiest->lock, flags);
7795
7796 if (active_balance) {
7797 stop_one_cpu_nowait(cpu_of(busiest),
7798 active_load_balance_cpu_stop, busiest,
7799 &busiest->active_balance_work);
7800 }
7801
7802 /*
7803 * We've kicked active balancing, reset the failure
7804 * counter.
7805 */
7806 sd->nr_balance_failed = sd->cache_nice_tries+1;
7807 }
7808 } else
7809 sd->nr_balance_failed = 0;
7810
7811 if (likely(!active_balance)) {
7812 /* We were unbalanced, so reset the balancing interval */
7813 sd->balance_interval = sd->min_interval;
7814 } else {
7815 /*
7816 * If we've begun active balancing, start to back off. This
7817 * case may not be covered by the all_pinned logic if there
7818 * is only 1 task on the busy runqueue (because we don't call
7819 * move_tasks).
7820 */
7821 if (sd->balance_interval < sd->max_interval)
7822 sd->balance_interval *= 2;
7823 }
7824
7825 goto out;
7826
7827 out_balanced:
7828 schedstat_inc(sd, lb_balanced[idle]);
7829
7830 sd->nr_balance_failed = 0;
7831 mt_lbprof_stat_set(sd->mt_lbprof_nr_balance_failed, 0);
7832
7833 out_one_pinned:
7834 /* tune up the balancing interval */
7835 if (((env.flags & LBF_ALL_PINNED) &&
7836 sd->balance_interval < MAX_PINNED_INTERVAL) ||
7837 (sd->balance_interval < sd->max_interval))
7838 sd->balance_interval *= 2;
7839
7840 ld_moved = 0;
7841 out:
7842 if (ld_moved){
7843 mt_lbprof_stat_or(env.fail_reason, MT_LBPROF_SUCCESS);
7844 mt_lbprof_stat_set(sd->mt_lbprof_nr_balance_failed, 0);
7845 }
7846
7847 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
7848 if (CPU_NEWLY_IDLE == idle) {
7849 char strings[128]="";
7850 snprintf(strings, 128, "%d:idle balance:%d:0x%x ", this_cpu, ld_moved, env.fail_reason);
7851 mt_lbprof_rqinfo(strings);
7852 trace_sched_lbprof_log(strings);
7853 } else {
7854 char strings[128]="";
7855 snprintf(strings, 128, "%d:periodic balance:%d:0x%x ", this_cpu, ld_moved, env.fail_reason);
7856 mt_lbprof_rqinfo(strings);
7857 trace_sched_lbprof_log(strings);
7858 }
7859 #endif
7860
7861 return ld_moved;
7862 }
7863
7864 /*
7865 * idle_balance is called by schedule() if this_cpu is about to become
7866 * idle. Attempts to pull tasks from other CPUs.
7867 */
7868 void idle_balance(int this_cpu, struct rq *this_rq)
7869 {
7870 struct sched_domain *sd;
7871 int pulled_task = 0;
7872 unsigned long next_balance = jiffies + HZ;
7873 #if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT) || defined(CONFIG_MT_LOAD_BALANCE_PROFILER)
7874 unsigned long counter = 0;
7875 #endif
7876
7877 this_rq->idle_stamp = this_rq->clock;
7878
7879 mt_lbprof_update_state_has_lock(this_cpu, MT_LBPROF_UPDATE_STATE);
7880 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7881 #ifdef CONFIG_LOCAL_TIMERS
7882 counter = localtimer_get_counter();
7883 if ( counter >= 260000 ) // 20ms
7884 goto must_do;
7885 if ( time_before(jiffies + 2, this_rq->next_balance) ) // 20ms
7886 goto must_do;
7887 #endif
7888 #endif
7889
7890 if (this_rq->avg_idle < sysctl_sched_migration_cost){
7891 #if defined(CONFIG_MT_LOAD_BALANCE_PROFILER)
7892 char strings[128]="";
7893 mt_lbprof_update_state_has_lock(this_cpu, MT_LBPROF_ALLOW_UNBLANCE_STATE);
7894 snprintf(strings, 128, "%d:idle balance bypass: %llu %lu ", this_cpu, this_rq->avg_idle, counter);
7895 mt_lbprof_rqinfo(strings);
7896 trace_sched_lbprof_log(strings);
7897 #endif
7898 return;
7899 }
7900
7901 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7902 must_do:
7903 #endif
7904
7905 /*
7906 * Drop the rq->lock, but keep IRQ/preempt disabled.
7907 */
7908 raw_spin_unlock(&this_rq->lock);
7909
7910 mt_lbprof_update_status();
7911 update_blocked_averages(this_cpu);
7912 rcu_read_lock();
7913 for_each_domain(this_cpu, sd) {
7914 unsigned long interval;
7915 int balance = 1;
7916
7917 if (!(sd->flags & SD_LOAD_BALANCE))
7918 continue;
7919
7920 if (sd->flags & SD_BALANCE_NEWIDLE) {
7921 /* If we've pulled tasks over stop searching: */
7922 pulled_task = load_balance(this_cpu, this_rq,
7923 sd, CPU_NEWLY_IDLE, &balance);
7924 }
7925
7926 interval = msecs_to_jiffies(sd->balance_interval);
7927 if (time_after(next_balance, sd->last_balance + interval))
7928 next_balance = sd->last_balance + interval;
7929 if (pulled_task) {
7930 this_rq->idle_stamp = 0;
7931 break;
7932 }
7933 }
7934 rcu_read_unlock();
7935
7936 raw_spin_lock(&this_rq->lock);
7937
7938 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
7939 /*
7940 * We are going idle. next_balance may be set based on
7941 * a busy processor. So reset next_balance.
7942 */
7943 this_rq->next_balance = next_balance;
7944 }
7945 }
7946
7947 /*
7948 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
7949 * running tasks off the busiest CPU onto idle CPUs. It requires at
7950 * least 1 task to be running on each physical CPU where possible, and
7951 * avoids physical / logical imbalances.
7952 */
7953 static int active_load_balance_cpu_stop(void *data)
7954 {
7955 struct rq *busiest_rq = data;
7956 int busiest_cpu = cpu_of(busiest_rq);
7957 int target_cpu = busiest_rq->push_cpu;
7958 struct rq *target_rq = cpu_rq(target_cpu);
7959 struct sched_domain *sd;
7960
7961 raw_spin_lock_irq(&busiest_rq->lock);
7962
7963 /* make sure the requested cpu hasn't gone down in the meantime */
7964 if (unlikely(busiest_cpu != smp_processor_id() ||
7965 !busiest_rq->active_balance))
7966 goto out_unlock;
7967
7968 /* Is there any task to move? */
7969 if (busiest_rq->nr_running <= 1)
7970 goto out_unlock;
7971
7972 /*
7973 * This condition is "impossible", if it occurs
7974 * we need to fix it. Originally reported by
7975 * Bjorn Helgaas on a 128-cpu setup.
7976 */
7977 BUG_ON(busiest_rq == target_rq);
7978
7979 /* move a task from busiest_rq to target_rq */
7980 double_lock_balance(busiest_rq, target_rq);
7981
7982 /* Search for an sd spanning us and the target CPU. */
7983 rcu_read_lock();
7984 for_each_domain(target_cpu, sd) {
7985 if ((sd->flags & SD_LOAD_BALANCE) &&
7986 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
7987 break;
7988 }
7989
7990 if (likely(sd)) {
7991 struct lb_env env = {
7992 .sd = sd,
7993 .dst_cpu = target_cpu,
7994 .dst_rq = target_rq,
7995 .src_cpu = busiest_rq->cpu,
7996 .src_rq = busiest_rq,
7997 .idle = CPU_IDLE,
7998 };
7999
8000 schedstat_inc(sd, alb_count);
8001
8002 if (move_one_task(&env))
8003 schedstat_inc(sd, alb_pushed);
8004 else
8005 schedstat_inc(sd, alb_failed);
8006 }
8007 rcu_read_unlock();
8008 double_unlock_balance(busiest_rq, target_rq);
8009 out_unlock:
8010 busiest_rq->active_balance = 0;
8011 raw_spin_unlock_irq(&busiest_rq->lock);
8012 return 0;
8013 }
8014
8015 #ifdef CONFIG_NO_HZ_COMMON
8016 /*
8017 * idle load balancing details
8018 * - When one of the busy CPUs notices that there may be an idle rebalancing
8019 * needed, it will kick the idle load balancer, which then does idle
8020 * load balancing for all the idle CPUs.
8021 */
8022 static struct {
8023 cpumask_var_t idle_cpus_mask;
8024 atomic_t nr_cpus;
8025 unsigned long next_balance; /* in jiffy units */
8026 } nohz ____cacheline_aligned;
8027
8028
8029 static inline int find_new_ilb(int call_cpu)
8030 {
8031 #ifdef CONFIG_HMP_PACK_SMALL_TASK
8032
8033 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
8034
8035 struct sched_domain *sd;
8036
8037 int ilb_new = nr_cpu_ids;
8038
8039 int ilb_return = 0;
8040
8041 int ilb = cpumask_first(nohz.idle_cpus_mask);
8042
8043
8044 if(PA_ENABLE)
8045 {
8046 int buddy = per_cpu(sd_pack_buddy, call_cpu);
8047
8048 /*
8049 * If we have a pack buddy CPU, we try to run load balance on a CPU
8050 * that is close to the buddy.
8051 */
8052 if (buddy != -1)
8053 for_each_domain(buddy, sd) {
8054 if (sd->flags & SD_SHARE_CPUPOWER)
8055 continue;
8056
8057 ilb_new = cpumask_first_and(sched_domain_span(sd),
8058 nohz.idle_cpus_mask);
8059
8060 if (ilb_new < nr_cpu_ids)
8061 break;
8062
8063 }
8064 }
8065
8066 if (ilb < nr_cpu_ids && idle_cpu(ilb)) {
8067 ilb_return = 1;
8068 }
8069
8070 if (ilb_new < nr_cpu_ids) {
8071 if (idle_cpu(ilb_new)) {
8072 if(PA_ENABLE && ilb_return && ilb_new != ilb) {
8073 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[call_cpu][ilb]++;
8074
8075 #ifdef CONFIG_HMP_TRACER
8076 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_WAKE_UP_FORM_CPUX_TO_CPUY, 0, call_cpu, ilb);
8077 #endif /* CONFIG_HMP_TRACER */
8078
8079 }
8080 return ilb_new;
8081 }
8082 }
8083
8084 if(ilb_return) {
8085 return ilb;
8086 }
8087
8088 return nr_cpu_ids;
8089
8090 #else /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
8091
8092 struct sched_domain *sd;
8093 int ilb = cpumask_first(nohz.idle_cpus_mask);
8094 int buddy = per_cpu(sd_pack_buddy, call_cpu);
8095
8096 /*
8097 * If we have a pack buddy CPU, we try to run load balance on a CPU
8098 * that is close to the buddy.
8099 */
8100 if (buddy != -1)
8101 for_each_domain(buddy, sd) {
8102 if (sd->flags & SD_SHARE_CPUPOWER)
8103 continue;
8104
8105 ilb = cpumask_first_and(sched_domain_span(sd),
8106 nohz.idle_cpus_mask);
8107
8108 if (ilb < nr_cpu_ids)
8109 break;
8110 }
8111
8112 if (ilb < nr_cpu_ids && idle_cpu(ilb))
8113 return ilb;
8114
8115 return nr_cpu_ids;
8116
8117 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
8118
8119 #else /* CONFIG_HMP_PACK_SMALL_TASK */
8120
8121 int ilb = cpumask_first(nohz.idle_cpus_mask);
8122 #ifdef CONFIG_MTK_SCHED_CMP_TGS
8123 /* Prefer to let nohz balancing occur within the same cluster first */
8124 int new_ilb;
8125 struct cpumask tmp;
8126 /* Find an idle CPU among the cluster's online CPUs */
8127 get_cluster_cpus(&tmp, get_cluster_id(call_cpu), true);
8128 new_ilb = cpumask_first_and(nohz.idle_cpus_mask, &tmp);
8129 if (new_ilb < nr_cpu_ids && idle_cpu(new_ilb))
8130 {
8131 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
8132 if(new_ilb != ilb)
8133 {
8134 mt_sched_printf("[PA]find_new_ilb(cpu%x), new_ilb = %d, ilb = %d\n", call_cpu, new_ilb, ilb);
8135 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[call_cpu][ilb]++;
8136 }
8137 #endif
8138 return new_ilb;
8139 }
8140 #endif /* CONFIG_MTK_SCHED_CMP_TGS */
8141
8142 if (ilb < nr_cpu_ids && idle_cpu(ilb))
8143 return ilb;
8144
8145 return nr_cpu_ids;
8146
8147 #endif /* CONFIG_HMP_PACK_SMALL_TASK */
8148
8149 }
8150
8151
8152 /*
8153 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
8154 * nohz_load_balancer CPU (if there is one), otherwise fall back to any idle
8155 * CPU (if there is one).
8156 */
8157 static void nohz_balancer_kick(int cpu)
8158 {
8159 int ilb_cpu;
8160
8161 nohz.next_balance++;
8162
8163 ilb_cpu = find_new_ilb(cpu);
8164
8165 if (ilb_cpu >= nr_cpu_ids)
8166 return;
8167
8168 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
8169 return;
8170 /*
8171 * Use smp_send_reschedule() instead of resched_cpu().
8172 * This way we generate a sched IPI on the target cpu which
8173 * is idle. And the softirq performing nohz idle load balance
8174 * will be run before returning from the IPI.
8175 */
8176 smp_send_reschedule(ilb_cpu);
8177 return;
8178 }
8179
8180 static inline void nohz_balance_exit_idle(int cpu)
8181 {
8182 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
8183 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
8184 atomic_dec(&nohz.nr_cpus);
8185 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
8186 }
8187 }
8188
8189 static inline void set_cpu_sd_state_busy(void)
8190 {
8191 struct sched_domain *sd;
8192 int cpu = smp_processor_id();
8193
8194 rcu_read_lock();
8195 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
8196
8197 if (!sd || !sd->nohz_idle)
8198 goto unlock;
8199 sd->nohz_idle = 0;
8200
8201 for (; sd; sd = sd->parent)
8202 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
8203 unlock:
8204 rcu_read_unlock();
8205 }
8206
8207 void set_cpu_sd_state_idle(void)
8208 {
8209 struct sched_domain *sd;
8210 int cpu = smp_processor_id();
8211
8212 rcu_read_lock();
8213 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
8214
8215 if (!sd || sd->nohz_idle)
8216 goto unlock;
8217 sd->nohz_idle = 1;
8218
8219 for (; sd; sd = sd->parent)
8220 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
8221 unlock:
8222 rcu_read_unlock();
8223 }
8224
8225 /*
8226 * This routine will record that the cpu is going idle with tick stopped.
8227 * This info will be used in performing idle load balancing in the future.
8228 */
8229 void nohz_balance_enter_idle(int cpu)
8230 {
8231 /*
8232 * If this cpu is going down, then nothing needs to be done.
8233 */
8234 if (!cpu_active(cpu))
8235 return;
8236
8237 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
8238 return;
8239
8240 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
8241 atomic_inc(&nohz.nr_cpus);
8242 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
8243 }
8244
8245 static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
8246 unsigned long action, void *hcpu)
8247 {
8248 switch (action & ~CPU_TASKS_FROZEN) {
8249 case CPU_DYING:
8250 nohz_balance_exit_idle(smp_processor_id());
8251 return NOTIFY_OK;
8252 default:
8253 return NOTIFY_DONE;
8254 }
8255 }
8256 #endif
8257
8258 static DEFINE_SPINLOCK(balancing);
8259
8260 /*
8261 * Scale the max load_balance interval with the number of CPUs in the system.
8262 * This trades load-balance latency on larger machines for less cross talk.
8263 */
8264 void update_max_interval(void)
8265 {
8266 max_load_balance_interval = HZ*num_online_cpus()/10;
8267 }
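/*
 * Example with assumed values: with HZ == 250 and 8 CPUs online,
 * max_load_balance_interval == 250 * 8 / 10 == 200 jiffies, i.e. about
 * 800 ms; rebalance_domains() clamps each domain's interval to this.
 */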
8268
8269 /*
8270 * It checks each scheduling domain to see if it is due to be balanced,
8271 * and initiates a balancing operation if so.
8272 *
8273 * Balancing parameters are set up in init_sched_domains.
8274 */
8275 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
8276 {
8277 int balance = 1;
8278 struct rq *rq = cpu_rq(cpu);
8279 unsigned long interval;
8280 struct sched_domain *sd;
8281 /* Earliest time when we have to do rebalance again */
8282 unsigned long next_balance = jiffies + 60*HZ;
8283 int update_next_balance = 0;
8284 int need_serialize;
8285
8286 update_blocked_averages(cpu);
8287
8288 rcu_read_lock();
8289 for_each_domain(cpu, sd) {
8290 if (!(sd->flags & SD_LOAD_BALANCE))
8291 continue;
8292
8293 interval = sd->balance_interval;
8294 if (idle != CPU_IDLE)
8295 interval *= sd->busy_factor;
8296
8297 /* scale ms to jiffies */
8298 interval = msecs_to_jiffies(interval);
8299 interval = clamp(interval, 1UL, max_load_balance_interval);
8300
8301 need_serialize = sd->flags & SD_SERIALIZE;
8302
8303 if (need_serialize) {
8304 if (!spin_trylock(&balancing))
8305 goto out;
8306 }
8307
8308 if (time_after_eq(jiffies, sd->last_balance + interval)) {
8309 if (load_balance(cpu, rq, sd, idle, &balance)) {
8310 /*
8311 * The LBF_SOME_PINNED logic could have changed
8312 * env->dst_cpu, so we can't know our idle
8313 * state even if we migrated tasks. Update it.
8314 */
8315 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
8316 }
8317 sd->last_balance = jiffies;
8318 }
8319 if (need_serialize)
8320 spin_unlock(&balancing);
8321 out:
8322 if (time_after(next_balance, sd->last_balance + interval)) {
8323 next_balance = sd->last_balance + interval;
8324 update_next_balance = 1;
8325 }
8326
8327 /*
8328 * Stop the load balance at this level. There is another
8329 * CPU in our sched group which is doing load balancing more
8330 * actively.
8331 */
8332 if (!balance)
8333 break;
8334 }
8335 rcu_read_unlock();
8336
8337 /*
8338 * next_balance will be updated only when there is a need.
8339 * When the cpu is attached to null domain for ex, it will not be
8340 * updated.
8341 */
8342 if (likely(update_next_balance))
8343 rq->next_balance = next_balance;
8344 }
8345
8346 #ifdef CONFIG_NO_HZ_COMMON
8347 /*
8348 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
8349 * rebalancing for all the cpus for whom scheduler ticks are stopped.
8350 */
8351 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
8352 {
8353 struct rq *this_rq = cpu_rq(this_cpu);
8354 struct rq *rq;
8355 int balance_cpu;
8356
8357 if (idle != CPU_IDLE ||
8358 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
8359 goto end;
8360
8361 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
8362 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
8363 continue;
8364
8365 /*
8366 * If this cpu gets work to do, stop the load balancing
8367 * work being done for other cpus. Next load
8368 * balancing owner will pick it up.
8369 */
8370 if (need_resched())
8371 break;
8372
8373 rq = cpu_rq(balance_cpu);
8374
8375 raw_spin_lock_irq(&rq->lock);
8376 update_rq_clock(rq);
8377 update_idle_cpu_load(rq);
8378 raw_spin_unlock_irq(&rq->lock);
8379
8380 rebalance_domains(balance_cpu, CPU_IDLE);
8381
8382 if (time_after(this_rq->next_balance, rq->next_balance))
8383 this_rq->next_balance = rq->next_balance;
8384 }
8385 nohz.next_balance = this_rq->next_balance;
8386 end:
8387 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
8388 }
8389
8390 /*
8391 * Current heuristic for kicking the idle load balancer in the presence
8392 * of an idle cpu in the system.
8393 * - This rq has more than one task.
8394 * - At any scheduler domain level, this cpu's scheduler group has multiple
8395 * busy CPUs exceeding the group's power.
8396 * - For SD_ASYM_PACKING, if the lower numbered CPUs in the scheduler
8397 * domain span are idle.
8398 */
8399 static inline int nohz_kick_needed(struct rq *rq, int cpu)
8400 {
8401 unsigned long now = jiffies;
8402 struct sched_domain *sd;
8403
8404 if (unlikely(idle_cpu(cpu)))
8405 return 0;
8406
8407 /*
8408 * We may have been in ticked or tickless idle mode recently. At the first
8409 * busy tick after returning from idle, we will update the busy stats.
8410 */
8411 set_cpu_sd_state_busy();
8412 nohz_balance_exit_idle(cpu);
8413
8414 /*
8415 * None are in tickless mode and hence no need for NOHZ idle load
8416 * balancing.
8417 */
8418 if (likely(!atomic_read(&nohz.nr_cpus)))
8419 return 0;
8420
8421 if (time_before(now, nohz.next_balance))
8422 return 0;
8423
8424 #ifdef CONFIG_SCHED_HMP
8425 /*
8426 * Bail out if there are no nohz CPUs in our
8427 * HMP domain, since we will move tasks between
8428 * domains through wakeup and force balancing
8429 * as necessary based upon task load.
8430 */
8431 if (cpumask_first_and(nohz.idle_cpus_mask,
8432 &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)
8433 return 0;
8434 #endif
8435
8436 if (rq->nr_running >= 2)
8437 goto need_kick;
8438
8439 rcu_read_lock();
8440 for_each_domain(cpu, sd) {
8441 struct sched_group *sg = sd->groups;
8442 struct sched_group_power *sgp = sg->sgp;
8443 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
8444
8445 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
8446 goto need_kick_unlock;
8447
8448 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
8449 && (cpumask_first_and(nohz.idle_cpus_mask,
8450 sched_domain_span(sd)) < cpu))
8451 goto need_kick_unlock;
8452
8453 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
8454 break;
8455 }
8456 rcu_read_unlock();
8457 return 0;
8458
8459 need_kick_unlock:
8460 rcu_read_unlock();
8461 need_kick:
8462 return 1;
8463 }
8464 #else
8465 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
8466 #endif
8467
8468 #ifdef CONFIG_SCHED_HMP
8469 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
8470
8471 /*
8472 * Heterogeneous Multi-Processor (HMP) - Declarations and Useful Macros
8473 */
8474
8475 /* Function Declarations */
8476 static int hmp_up_stable(int cpu);
8477 static int hmp_down_stable(int cpu);
8478 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se,
8479 struct clb_env *clbenv);
8480 static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se,
8481 struct clb_env *clbenv);
8482
8483 #define hmp_caller_is_gb(caller) ((HMP_GB == caller)?1:0)
8484
8485 #define hmp_cpu_is_fast(cpu) cpumask_test_cpu(cpu,&hmp_fast_cpu_mask)
8486 #define hmp_cpu_is_slow(cpu) cpumask_test_cpu(cpu,&hmp_slow_cpu_mask)
8487 #define hmp_cpu_stable(cpu) (hmp_cpu_is_fast(cpu)? \
8488 hmp_up_stable(cpu):hmp_down_stable(cpu))
8489
8490 #define hmp_inc(v) ((v) + 1)
8491 #define hmp_dec(v) ((v) - 1)
8492 #define hmp_pos(v) ((v) < (0) ? (0) : (v))
8493
8494 #define task_created(f) ((SD_BALANCE_EXEC == f || SD_BALANCE_FORK == f)?1:0)
8495 #define task_cpus_allowed(mask,p) cpumask_intersects(mask,tsk_cpus_allowed(p))
8496 #define task_slow_cpu_allowed(p) task_cpus_allowed(&hmp_slow_cpu_mask,p)
8497 #define task_fast_cpu_allowed(p) task_cpus_allowed(&hmp_fast_cpu_mask,p)
8498
8499 /*
8500 * Heterogeneous Multi-Processor (HMP) - Utility Functions
8501 */
8502
8503 /*
8504 * These functions enforce the up/down migration delay that prevents a task from
8505 * doing another migration in the same direction until the delay has expired.
8506 */
8507 static int hmp_up_stable(int cpu)
8508 {
8509 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
8510 u64 now = cfs_rq_clock_task(cfs_rq);
8511 if (((now - hmp_last_up_migration(cpu)) >> 10) < hmp_next_up_threshold)
8512 return 0;
8513 return 1;
8514 }
8515
8516 static int hmp_down_stable(int cpu)
8517 {
8518 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
8519 u64 now = cfs_rq_clock_task(cfs_rq);
8520 if (((now - hmp_last_down_migration(cpu)) >> 10) < hmp_next_down_threshold)
8521 return 0;
8522 return 1;
8523 }
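/*
 * Example of the stabilizing check with assumed numbers: the ">> 10"
 * above converts a nanosecond delta into roughly microseconds, so if
 * hmp_next_down_threshold were 4096 (~4 ms) and the last down migration
 * on this CPU happened 3 ms ago, (3000000 >> 10) == 2929 < 4096 and the
 * CPU is still considered unstable for another down migration.
 */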
8524
8525 /* Select the most appropriate CPU from the given HMP cluster */
8526 static unsigned int hmp_select_cpu(unsigned int caller, struct task_struct *p,
8527 struct cpumask *mask, int prev)
8528 {
8529 int curr = 0;
8530 int target = NR_CPUS;
8531 unsigned long curr_wload = 0;
8532 unsigned long target_wload = 0;
8533 struct cpumask srcp;
8534 cpumask_and(&srcp, cpu_online_mask, mask);
8535 target = cpumask_any_and(&srcp, tsk_cpus_allowed(p));
8536 if (NR_CPUS == target)
8537 goto out;
8538
8539 /*
8540 * RT class is taken into account because CPU load is multiplied
8541 * by the total number of runnable tasks on the CPU, which includes RT tasks.
8542 */
8543 target_wload = hmp_inc(cfs_load(target));
8544 target_wload += cfs_pending_load(target);
8545 target_wload *= rq_length(target);
8546 for_each_cpu(curr, mask) {
8547 /* Check CPU status and task affinity */
8548 if(!cpu_online(curr) || !cpumask_test_cpu(curr, tsk_cpus_allowed(p)))
8549 continue;
8550
8551 /* For global load balancing, unstable CPUs are bypassed */
8552 if(hmp_caller_is_gb(caller) && !hmp_cpu_stable(curr))
8553 continue;
8554
8555 curr_wload = hmp_inc(cfs_load(curr));
8556 curr_wload += cfs_pending_load(curr);
8557 curr_wload *= rq_length(curr);
8558 if(curr_wload < target_wload) {
8559 target_wload = curr_wload;
8560 target = curr;
8561 } else if(curr_wload == target_wload && curr == prev) {
8562 target = curr;
8563 }
8564 }
8565
8566 out:
8567 return target;
8568 }
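/*
 * Worked example for the weighted-load comparison above (numbers are
 * assumptions): a CPU with cfs_load == 300, no pending load and
 * rq_length == 2 gets a weight of (300 + 1) * 2 == 602, while a CPU
 * with cfs_load == 500, no pending load and rq_length == 1 gets
 * (500 + 1) * 1 == 501.  The second CPU wins despite its higher CFS
 * load because fewer runnable tasks (including RT tasks) compete on it.
 */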
8569
8570 /*
8571 * Heterogeneous Multi-Processor (HMP) - Task Runqueue Selection
8572 */
8573
8574 /* This function enhances the original task selection function */
8575 static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p,
8576 int prev_cpu, int new_cpu)
8577 {
8578 #ifdef CONFIG_HMP_TASK_ASSIGNMENT
8579 int step = 0;
8580 struct sched_entity *se = &p->se;
8581 int B_target = NR_CPUS;
8582 int L_target = NR_CPUS;
8583 struct clb_env clbenv;
8584
8585 #ifdef CONFIG_HMP_TRACER
8586 int cpu = 0;
8587 for_each_online_cpu(cpu)
8588 trace_sched_cfs_runnable_load(cpu,cfs_load(cpu),cfs_length(cpu));
8589 #endif
8590
8591 /* Error handling: bail out with the original CFS selection */
8592 if (prev_cpu >= NR_CPUS)
8593 return new_cpu;
8594
8595 /*
8596 * Skip all the checks if only one CPU is online.
8597 * Otherwise, select the most appropriate CPU from each cluster.
8598 */
8599 if (num_online_cpus() == 1)
8600 goto out;
8601 B_target = hmp_select_cpu(HMP_SELECT_RQ,p,&hmp_fast_cpu_mask,prev_cpu);
8602 L_target = hmp_select_cpu(HMP_SELECT_RQ,p,&hmp_slow_cpu_mask,prev_cpu);
8603
8604 /*
8605 * Only one cluster exists or only one cluster is allowed for this task
8606 * Case 1: return the runqueue whose load is minimum
8607 * Case 2: return original CFS runqueue selection result
8608 */
8609 #ifdef CONFIG_HMP_DISCARD_CFS_SELECTION_RESULT
8610 if(NR_CPUS == B_target && NR_CPUS == L_target)
8611 goto out;
8612 if(NR_CPUS == B_target)
8613 goto select_slow;
8614 if(NR_CPUS == L_target)
8615 goto select_fast;
8616 #else
8617 if(NR_CPUS == B_target || NR_CPUS == L_target)
8618 goto out;
8619 #endif
8620
8621 /*
8622 * Two clusters exist and both clusters are allowed for this task
8623 * Step 1: Move a newly created task to a cpu where no tasks are running
8624 * Step 2: Migrate heavy-load task to big
8625 * Step 3: Migrate light-load task to LITTLE
8626 * Step 4: Make sure the task stays in its previous hmp domain
8627 */
8628 step = 1;
8629 if (task_created(sd_flag) && !task_low_priority(p->prio)) {
8630 if (!rq_length(B_target))
8631 goto select_fast;
8632 if (!rq_length(L_target))
8633 goto select_slow;
8634 }
8635 memset(&clbenv, 0, sizeof(clbenv));
8636 clbenv.flags |= HMP_SELECT_RQ;
8637 clbenv.lcpus = &hmp_slow_cpu_mask;
8638 clbenv.bcpus = &hmp_fast_cpu_mask;
8639 clbenv.ltarget = L_target;
8640 clbenv.btarget = B_target;
8641 sched_update_clbstats(&clbenv);
8642 step = 2;
8643 if (hmp_up_migration(L_target, &B_target, se, &clbenv))
8644 goto select_fast;
8645 step = 3;
8646 if (hmp_down_migration(B_target, &L_target, se, &clbenv))
8647 goto select_slow;
8648 step = 4;
8649 if (hmp_cpu_is_slow(prev_cpu))
8650 goto select_slow;
8651 goto select_fast;
8652
8653 select_fast:
8654 new_cpu = B_target;
8655 goto out;
8656 select_slow:
8657 new_cpu = L_target;
8658 goto out;
8659
8660 out:
8661
8662 /* This happens when num_online_cpus() == 1 */
8663 if (new_cpu >= nr_cpu_ids)
8664 {
8665 //BUG_ON(1);
8666 new_cpu = prev_cpu;
8667 }
8668
8669 cfs_nr_pending(new_cpu)++;
8670 cfs_pending_load(new_cpu) += se_load(se);
8671 #ifdef CONFIG_HMP_TRACER
8672 trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
8673 trace_sched_hmp_select_task_rq(p,step,sd_flag,prev_cpu,new_cpu,
8674 se_load(se),&clbenv.bstats,&clbenv.lstats);
8675 #endif
8676 #ifdef CONFIG_MET_SCHED_HMP
8677 HmpLoad(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
8678 #endif
8679 #endif /* CONFIG_HMP_TASK_ASSIGNMENT */
8680 return new_cpu;
8681 }
8682
8683 /*
8684 * Heterogeneous Multi-Processor (HMP) - Task Dynamic Migration Threshold
8685 */
8686
8687 /*
8688 * If the workload between clusters is not balanced, adjust the migration
8689 * thresholds in an attempt to move tasks to the cluster whose workload
8690 * is lighter.
8691 */
8692
8693 /*
8694 * According to ARM's cpu_efficiency table, the computing powers of CA15 and
8695 * CA7 are 3891 and 2048 respectively. Thus, we assume big has roughly twice
8696 * the computing power of LITTLE.
8697 */
8698
8699 #define HMP_RATIO(v) ((v)*17/10)
8700
8701 #define hmp_fast_cpu_has_spare_cycles(B,cpu_load) (cpu_load < \
8702 (HMP_RATIO(B->cpu_capacity) - (B->cpu_capacity >> 2)))
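/*
 * Example with an assumed big-CPU capacity of 1024: HMP_RATIO(1024) ==
 * 1740, so hmp_fast_cpu_has_spare_cycles() reports spare cycles while
 * the considered load is below 1740 - (1024 >> 2) == 1484.
 */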
8703
8704 #define hmp_task_fast_cpu_afford(B,se,cpu) (B->acap > 0 \
8705 && hmp_fast_cpu_has_spare_cycles(B,se_load(se) + cfs_load(cpu)))
8706
8707 #define hmp_fast_cpu_oversubscribed(caller,B,se,cpu) \
8708 (hmp_caller_is_gb(caller)? \
8709 !hmp_fast_cpu_has_spare_cycles(B,cfs_load(cpu)): \
8710 !hmp_task_fast_cpu_afford(B,se,cpu))
8711
8712 #define hmp_task_slow_cpu_afford(L,se) \
8713 (L->acap > 0 && L->acap >= se_load(se))
8714
8715 /* Macros used by the low-priority task filter */
8716 #define hmp_low_prio_task_up_rejected(p,B,L) \
8717 (task_low_priority(p->prio) && \
8718 (B->ntask >= B->ncpu || 0 != L->nr_normal_prio_task) && \
8719 (p->se.avg.load_avg_ratio < 800))
8720
8721 #define hmp_low_prio_task_down_allowed(p,B,L) \
8722 (task_low_priority(p->prio) && !B->nr_dequeuing_low_prio && \
8723 B->ntask >= B->ncpu && 0 != L->nr_normal_prio_task && \
8724 (p->se.avg.load_avg_ratio < 800))
8725
8726 /* Migration check result */
8727 #define HMP_BIG_NOT_OVERSUBSCRIBED (0x01)
8728 #define HMP_BIG_CAPACITY_INSUFFICIENT (0x02)
8729 #define HMP_LITTLE_CAPACITY_INSUFFICIENT (0x04)
8730 #define HMP_LOW_PRIORITY_FILTER (0x08)
8731 #define HMP_BIG_BUSY_LITTLE_IDLE (0x10)
8732 #define HMP_BIG_IDLE (0x20)
8733 #define HMP_MIGRATION_APPROVED (0x100)
8734 #define HMP_TASK_UP_MIGRATION (0x200)
8735 #define HMP_TASK_DOWN_MIGRATION (0x400)
8736
8737 /* Migration statistics */
8738 #ifdef CONFIG_HMP_TRACER
8739 struct hmp_statisic hmp_stats;
8740 #endif
8741
8742 static inline void hmp_dynamic_threshold(struct clb_env *clbenv)
8743 {
8744 struct clb_stats *L = &clbenv->lstats;
8745 struct clb_stats *B = &clbenv->bstats;
8746 unsigned int hmp_threshold_diff = hmp_up_threshold - hmp_down_threshold;
8747 unsigned int B_normalized_acap = hmp_pos(HMP_RATIO(B->scaled_acap));
8748 unsigned int B_normalized_atask = hmp_pos(HMP_RATIO(B->scaled_atask));
8749 unsigned int L_normalized_acap = hmp_pos(L->scaled_acap);
8750 unsigned int L_normalized_atask = hmp_pos(L->scaled_atask);
8751
8752 #ifdef CONFIG_HMP_DYNAMIC_THRESHOLD
8753 L->threshold = hmp_threshold_diff;
8754 L->threshold *= hmp_inc(L_normalized_acap) * hmp_inc(L_normalized_atask);
8755 L->threshold /= hmp_inc(B_normalized_acap + L_normalized_acap);
8756 L->threshold /= hmp_inc(B_normalized_atask + L_normalized_atask);
8757 L->threshold = hmp_down_threshold + L->threshold;
8758
8759 B->threshold = hmp_threshold_diff;
8760 B->threshold *= hmp_inc(B_normalized_acap) * hmp_inc(B_normalized_atask);
8761 B->threshold /= hmp_inc(B_normalized_acap + L_normalized_acap);
8762 B->threshold /= hmp_inc(B_normalized_atask + L_normalized_atask);
8763 B->threshold = hmp_up_threshold - B->threshold;
8764 #else /* !CONFIG_HMP_DYNAMIC_THRESHOLD */
8765 clbenv->lstats.threshold = hmp_down_threshold; // down threshold
8766 clbenv->bstats.threshold = hmp_up_threshold; // up threshold
8767 #endif /* CONFIG_HMP_DYNAMIC_THRESHOLD */
8768
8769 mt_sched_printf("[%s]\tup/dl:%4d/%4d bcpu(%d):%d/%d, lcpu(%d):%d/%d\n", __func__,
8770 B->threshold, L->threshold,
8771 clbenv->btarget, clbenv->bstats.cpu_capacity, clbenv->bstats.cpu_power,
8772 clbenv->ltarget, clbenv->lstats.cpu_capacity, clbenv->lstats.cpu_power);
8773 }
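/*
 * Worked example for the dynamic thresholds above, using assumed values:
 * suppose hmp_up_threshold == 700 and hmp_down_threshold == 500 (so the
 * diff is 200), and the normalized values computed above come out as
 * B_normalized_acap == 300, L_normalized_acap == 100,
 * B_normalized_atask == 2 and L_normalized_atask == 1.  Then
 * L->threshold == 500 + 200 * 101 * 2 / 401 / 4 == 525 and
 * B->threshold == 700 - 200 * 301 * 3 / 401 / 4 == 588.  With the big
 * cluster having more headroom, the up threshold drops well below its
 * static value, making up migration easier, while the down threshold
 * rises only slightly.
 */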
8774
8775 /*
8776 * Check whether this task should be migrated to big
8777 * Briefly summarize the flow as below;
8778 * 1) Migration stabilizing
8779 * 1.5) Keep all cpu busy
8780 * 2) Filter low-priorty task
8781 * 3) Check CPU capacity
8782 * 4) Check dynamic migration threshold
8783 */
8784 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se,
8785 struct clb_env *clbenv)
8786 {
8787 struct task_struct *p = task_of(se);
8788 struct clb_stats *L, *B;
8789 struct mcheck *check;
8790 int curr_cpu = cpu;
8791 unsigned int caller = clbenv->flags;
8792
8793 L = &clbenv->lstats;
8794 B = &clbenv->bstats;
8795 check = &clbenv->mcheck;
8796
8797 check->status = clbenv->flags;
8798 check->status |= HMP_TASK_UP_MIGRATION;
8799 check->result = 0;
8800
8801 /*
8802 * No migration is needed if
8803 * 1) There is only one cluster
8804 * 2) Task is already in big cluster
8805 * 3) It violates task affinity
8806 */
8807 if (!L->ncpu || !B->ncpu
8808 || cpumask_test_cpu(curr_cpu, clbenv->bcpus)
8809 || !cpumask_intersects(clbenv->bcpus, tsk_cpus_allowed(p)))
8810 goto out;
8811
8812 /*
8813 * [1] Migration stabilizing
8814 * Let the task load settle before doing another up migration.
8815 * This prevents a bunch of tasks from migrating to an unstable CPU.
8816 */
8817 if (!hmp_up_stable(*target_cpu))
8818 goto out;
8819
8820 /* [2] Filter out low-priority tasks */
8821 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
8822 if (hmp_low_prio_task_up_rejected(p,B,L)) {
8823 check->status |= HMP_LOW_PRIORITY_FILTER;
8824 goto trace;
8825 }
8826 #endif
8827
8828 /* [2.5] If big is idle, just go to big */
8829 if (rq_length(*target_cpu)==0)
8830 {
8831 check->status |= HMP_BIG_IDLE;
8832 check->status |= HMP_MIGRATION_APPROVED;
8833 check->result = 1;
8834 goto trace;
8835 }
8836
8837 /*
8838 * [3] Check CPU capacity
8839 * Forbid up-migration if big CPU can't handle this task
8840 */
8841 if (!hmp_task_fast_cpu_afford(B,se,*target_cpu)) {
8842 check->status |= HMP_BIG_CAPACITY_INSUFFICIENT;
8843 goto trace;
8844 }
8845
8846 /*
8847 * [4] Check dynamic migration threshold
8848 * Migrate task from LITTLE to big if load is greater than up-threshold
8849 */
8850 if (se_load(se) > B->threshold) {
8851 check->status |= HMP_MIGRATION_APPROVED;
8852 check->result = 1;
8853 }
8854
8855 trace:
8856 #ifdef CONFIG_HMP_TRACER
8857 if(check->result && hmp_caller_is_gb(caller))
8858 hmp_stats.nr_force_up++;
8859 trace_sched_hmp_stats(&hmp_stats);
8860 trace_sched_dynamic_threshold(task_of(se),B->threshold,check->status,
8861 curr_cpu,*target_cpu,se_load(se),B,L);
8862 #endif
8863 #ifdef CONFIG_MET_SCHED_HMP
8864 TaskTh(B->threshold,L->threshold);
8865 HmpStat(&hmp_stats);
8866 #endif
8867 out:
8868 return check->result;
8869 }
8870
8871 /*
8872 * Check whether this task should be migrated to LITTLE
8873 * The flow is briefly summarized below:
8874 * 1) Migration stabilizing
8875 * 1.5) Keep all CPUs busy
8876 * 2) Filter out low-priority tasks
8877 * 3) Check CPU capacity
8878 * 4) Check dynamic migration threshold
8879 */
8880 static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se,
8881 struct clb_env *clbenv)
8882 {
8883 struct task_struct *p = task_of(se);
8884 struct clb_stats *L, *B;
8885 struct mcheck *check;
8886 int curr_cpu = cpu;
8887 unsigned int caller = clbenv->flags;
8888
8889 L = &clbenv->lstats;
8890 B = &clbenv->bstats;
8891 check = &clbenv->mcheck;
8892
8893 check->status = caller;
8894 check->status |= HMP_TASK_DOWN_MIGRATION;
8895 check->result = 0;
8896
8897 /*
8898 * No migration is needed if
8899 * 1) There is only one cluster
8900 * 2) Task is already in LITTLE cluster
8901 * 3) It violates task affinity
8902 */
8903 if (!L->ncpu || !B->ncpu
8904 || cpumask_test_cpu(curr_cpu, clbenv->lcpus)
8905 || !cpumask_intersects(clbenv->lcpus, tsk_cpus_allowed(p)))
8906 goto out;
8907
8908 /*
8909 * [1] Migration stabilizing
8910 * Let the task load settle before doing another down migration.
8911 * This prevents a bunch of tasks from migrating to an unstable CPU.
8912 */
8913 if (!hmp_down_stable(*target_cpu))
8914 goto out;
8915
8916 /* [1.5] If big is busy and LITTLE is idle, just go to LITTLE */
8917 if (rq_length(*target_cpu)==0 && caller == HMP_SELECT_RQ && rq_length(curr_cpu)>0)
8918 {
8919 check->status |= HMP_BIG_BUSY_LITTLE_IDLE;
8920 check->status |= HMP_MIGRATION_APPROVED;
8921 check->result = 1;
8922 goto trace;
8923 }
8924
8925 /* [2] Filter out low-priority tasks */
8926 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
8927 if (hmp_low_prio_task_down_allowed(p,B,L)) {
8928 cfs_nr_dequeuing_low_prio(curr_cpu)++;
8929 check->status |= HMP_LOW_PRIORITY_FILTER;
8930 check->status |= HMP_MIGRATION_APPROVED;
8931 check->result = 1;
8932 goto trace;
8933 }
8934 #endif
8935
8936 /*
8937 * [3] Check CPU capacity
8938 * Forbid down-migration if either of the following conditions is true
8939 * 1) big cpu is not oversubscribed (if big CPU seems to have spare
8940 * cycles, do not force this task to run on LITTLE CPU, but
8941 * keep it staying in its previous cluster instead)
8942 * 2) LITTLE cpu doesn't have available capacity for this new task
8943 */
8944 if (!hmp_fast_cpu_oversubscribed(caller,B,se,curr_cpu)) {
8945 check->status |= HMP_BIG_NOT_OVERSUBSCRIBED;
8946 goto trace;
8947 }
8948
8949 if (!hmp_task_slow_cpu_afford(L,se)) {
8950 check->status |= HMP_LITTLE_CAPACITY_INSUFFICIENT;
8951 goto trace;
8952 }
8953
8954 /*
8955 * [4] Check dynamic migration threshold
8956 * Migrate task from big to LITTLE if load ratio is less than
8957 * or equal to down-threshold
8958 */
8959 if (L->threshold >= se_load(se)) {
8960 check->status |= HMP_MIGRATION_APPROVED;
8961 check->result = 1;
8962 }
8963
8964 trace:
8965 #ifdef CONFIG_HMP_TRACER
8966 if (check->result && hmp_caller_is_gb(caller))
8967 hmp_stats.nr_force_down++;
8968 trace_sched_hmp_stats(&hmp_stats);
8969 trace_sched_dynamic_threshold(task_of(se),L->threshold,check->status,
8970 curr_cpu,*target_cpu,se_load(se),B,L);
8971 #endif
8972 #ifdef CONFIG_MET_SCHED_HMP
8973 TaskTh(B->threshold,L->threshold);
8974 HmpStat(&hmp_stats);
8975 #endif
8976 out:
8977 return check->result;
8978 }
8979 #else /* CONFIG_SCHED_HMP_ENHANCEMENT */
8980 /* Check if task should migrate to a faster cpu */
8981 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se)
8982 {
8983 struct task_struct *p = task_of(se);
8984 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
8985 u64 now;
8986
8987 if (target_cpu)
8988 *target_cpu = NR_CPUS;
8989
8990 if (hmp_cpu_is_fastest(cpu))
8991 return 0;
8992
8993 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
8994 /* Filter by task priority */
8995 if (p->prio >= hmp_up_prio)
8996 return 0;
8997 #endif
8998 if (se->avg.load_avg_ratio < hmp_up_threshold)
8999 return 0;
9000
9001 /* Let the task load settle before doing another up migration */
9002 now = cfs_rq_clock_task(cfs_rq);
9003 if (((now - se->avg.hmp_last_up_migration) >> 10)
9004 < hmp_next_up_threshold)
9005 return 0;
9006
9007 /* Target domain load < 94% */
9008 if (hmp_domain_min_load(hmp_faster_domain(cpu), target_cpu)
9009 > NICE_0_LOAD-64)
9010 return 0;
9011
9012 if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus,
9013 tsk_cpus_allowed(p)))
9014 return 1;
9015
9016 return 0;
9017 }
9018
9019 /* Check if task should migrate to a slower cpu */
9020 static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
9021 {
9022 struct task_struct *p = task_of(se);
9023 struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
9024 u64 now;
9025
9026 if (hmp_cpu_is_slowest(cpu))
9027 return 0;
9028
9029 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
9030 /* Filter by task priority */
9031 if ((p->prio >= hmp_up_prio) &&
9032 cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
9033 tsk_cpus_allowed(p))) {
9034 return 1;
9035 }
9036 #endif
9037
9038 /* Let the task load settle before doing another down migration */
9039 now = cfs_rq_clock_task(cfs_rq);
9040 if (((now - se->avg.hmp_last_down_migration) >> 10)
9041 < hmp_next_down_threshold)
9042 return 0;
9043
9044 if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
9045 tsk_cpus_allowed(p))
9046 && se->avg.load_avg_ratio < hmp_down_threshold) {
9047 return 1;
9048 }
9049 return 0;
9050 }
9051 #endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
9052
9053 /*
9054 * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
9055 * Ideally this function should be merged with can_migrate_task() to avoid
9056 * redundant code.
9057 */
9058 static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
9059 {
9060 int tsk_cache_hot = 0;
9061
9062 /*
9063 * We do not migrate tasks that are:
9064 * 1) running (obviously), or
9065 * 2) cannot be migrated to this CPU due to cpus_allowed
9066 */
9067 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
9068 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
9069 return 0;
9070 }
9071 env->flags &= ~LBF_ALL_PINNED;
9072
9073 if (task_running(env->src_rq, p)) {
9074 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
9075 return 0;
9076 }
9077
9078 /*
9079 * Aggressive migration if:
9080 * 1) task is cache cold, or
9081 * 2) too many balance attempts have failed.
9082 */
9083
9084 #if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT)
9085 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd, env->mt_check_cache_in_idle);
9086 #else
9087 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
9088 #endif
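/*
 * Note: unlike can_migrate_task(), cache hotness never vetoes the move
 * here; both branches below return 1, and task_hot() only decides
 * whether the forced-migration schedstats are bumped.
 */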
9089 if (!tsk_cache_hot ||
9090 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
9091 #ifdef CONFIG_SCHEDSTATS
9092 if (tsk_cache_hot) {
9093 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
9094 schedstat_inc(p, se.statistics.nr_forced_migrations);
9095 }
9096 #endif
9097 return 1;
9098 }
9099
9100 return 1;
9101 }
9102
9103 /*
9104 * move_specific_task tries to move a specific task.
9105 * Returns 1 if successful and 0 otherwise.
9106 * Called with both runqueues locked.
9107 */
9108 static int move_specific_task(struct lb_env *env, struct task_struct *pm)
9109 {
9110 struct task_struct *p, *n;
9111
9112 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
9113 if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
9114 env->dst_cpu))
9115 continue;
9116
9117 if (!hmp_can_migrate_task(p, env))
9118 continue;
9119 /* Check if we found the right task */
9120 if (p != pm)
9121 continue;
9122
9123 move_task(p, env);
9124 /*
9125 * Right now, this is only the third place move_task()
9126 * is called, so we can safely collect move_task()
9127 * stats here rather than inside move_task().
9128 */
9129 schedstat_inc(env->sd, lb_gained[env->idle]);
9130 return 1;
9131 }
9132 return 0;
9133 }
9134
9135 /*
9136 * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
9137 * migrate a specific task from one runqueue to another.
9138 * hmp_force_up_migration uses this to push a currently running task
9139 * off a runqueue.
9140 * Based on active_load_balance_cpu_stop() and can potentially be merged.
9141 */
9142 static int hmp_active_task_migration_cpu_stop(void *data)
9143 {
9144 struct rq *busiest_rq = data;
9145 struct task_struct *p = busiest_rq->migrate_task;
9146 int busiest_cpu = cpu_of(busiest_rq);
9147 int target_cpu = busiest_rq->push_cpu;
9148 struct rq *target_rq = cpu_rq(target_cpu);
9149 struct sched_domain *sd;
9150
9151 raw_spin_lock_irq(&busiest_rq->lock);
9152 /* make sure the requested cpu hasn't gone down in the meantime */
9153 if (unlikely(busiest_cpu != smp_processor_id() ||
9154 !busiest_rq->active_balance)) {
9155 goto out_unlock;
9156 }
9157 /* Is there any task to move? */
9158 if (busiest_rq->nr_running <= 1)
9159 goto out_unlock;
9160 /* Task has migrated meanwhile, abort forced migration */
9161 if (task_rq(p) != busiest_rq)
9162 goto out_unlock;
9163 /*
9164 * This condition is "impossible", if it occurs
9165 * we need to fix it. Originally reported by
9166 * Bjorn Helgaas on a 128-cpu setup.
9167 */
9168 BUG_ON(busiest_rq == target_rq);
9169
9170 /* move a task from busiest_rq to target_rq */
9171 double_lock_balance(busiest_rq, target_rq);
9172
9173 /* Search for an sd spanning us and the target CPU. */
9174 rcu_read_lock();
9175 for_each_domain(target_cpu, sd) {
9176 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9177 break;
9178 }
9179
9180 if (likely(sd)) {
9181 struct lb_env env = {
9182 .sd = sd,
9183 .dst_cpu = target_cpu,
9184 .dst_rq = target_rq,
9185 .src_cpu = busiest_rq->cpu,
9186 .src_rq = busiest_rq,
9187 .idle = CPU_IDLE,
9188 };
9189
9190 schedstat_inc(sd, alb_count);
9191
9192 if (move_specific_task(&env, p))
9193 schedstat_inc(sd, alb_pushed);
9194 else
9195 schedstat_inc(sd, alb_failed);
9196 }
9197 rcu_read_unlock();
9198 double_unlock_balance(busiest_rq, target_rq);
9199 out_unlock:
9200 busiest_rq->active_balance = 0;
9201 raw_spin_unlock_irq(&busiest_rq->lock);
9202 return 0;
9203 }
9204
9205 static DEFINE_SPINLOCK(hmp_force_migration);
9206 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
9207 /*
9208 * Heterogenous Multi-Processor (HMP) Global Load Balance
9209 */
9210
9211 /*
9212 * As noted in Linaro's original HMP code, only the currently running
9213 * task on each CPU is considered, because selecting among other tasks
9214 * for migration would require extensive bookkeeping.
9215 */
9216 #ifdef CONFIG_HMP_GLOBAL_BALANCE
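/*
 * hmp_force_down_migration() walks every online big CPU, looks only at
 * the task currently running there, picks a LITTLE target CPU via
 * hmp_select_cpu(), refreshes the per-cluster load statistics and, if
 * hmp_down_migration() approves, pushes the task away with the cpu
 * stopper (hmp_active_task_migration_cpu_stop).
 */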
9217 static void hmp_force_down_migration(int this_cpu)
9218 {
9219 int curr_cpu, target_cpu;
9220 struct sched_entity *se;
9221 struct rq *target;
9222 unsigned long flags;
9223 unsigned int force;
9224 struct task_struct *p;
9225 struct clb_env clbenv;
9226
9227 /* Migrate light task from big to LITTLE */
9228 for_each_cpu(curr_cpu, &hmp_fast_cpu_mask) {
9229 /* Check whether CPU is online */
9230 if (!cpu_online(curr_cpu))
9231 continue;
9232
9233 force = 0;
9234 target = cpu_rq(curr_cpu);
9235 raw_spin_lock_irqsave(&target->lock, flags);
9236 se = target->cfs.curr;
9237 if (!se) {
9238 raw_spin_unlock_irqrestore(&target->lock, flags);
9239 continue;
9240 }
9241
9242 /* Find task entity */
9243 if (!entity_is_task(se)) {
9244 struct cfs_rq *cfs_rq;
9245 cfs_rq = group_cfs_rq(se);
9246 while (cfs_rq) {
9247 se = cfs_rq->curr;
9248 cfs_rq = group_cfs_rq(se);
9249 }
9250 }
9251
9252 p = task_of(se);
9253 target_cpu = hmp_select_cpu(HMP_GB,p,&hmp_slow_cpu_mask,-1);
9254 if (NR_CPUS == target_cpu) {
9255 raw_spin_unlock_irqrestore(&target->lock, flags);
9256 continue;
9257 }
9258
9259 /* Collect cluster information */
9260 memset(&clbenv, 0, sizeof(clbenv));
9261 clbenv.flags |= HMP_GB;
9262 clbenv.btarget = curr_cpu;
9263 clbenv.ltarget = target_cpu;
9264 clbenv.lcpus = &hmp_slow_cpu_mask;
9265 clbenv.bcpus = &hmp_fast_cpu_mask;
9266 sched_update_clbstats(&clbenv);
9267
9268 /* Check migration threshold */
9269 if (!target->active_balance &&
9270 hmp_down_migration(curr_cpu, &target_cpu, se, &clbenv)) {
9271 target->active_balance = 1;
9272 target->push_cpu = target_cpu;
9273 target->migrate_task = p;
9274 force = 1;
9275 trace_sched_hmp_migrate(p, target->push_cpu, 1);
9276 hmp_next_down_delay(&p->se, target->push_cpu);
9277 }
9278 raw_spin_unlock_irqrestore(&target->lock, flags);
9279 if (force) {
9280 stop_one_cpu_nowait(cpu_of(target),
9281 hmp_active_task_migration_cpu_stop,
9282 target, &target->active_balance_work);
9283 }
9284 }
9285 }
9286 #endif /* CONFIG_HMP_GLOBAL_BALANCE */
9287 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9288 u32 AVOID_FORCE_UP_MIGRATION_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
9289 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9290
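/*
 * hmp_force_up_migration() is the mirror image of the above: it scans
 * every online LITTLE CPU, considers only the currently running task,
 * selects a big target CPU, updates the cluster statistics and, when
 * hmp_up_migration() approves, hands the push over to the cpu stopper.
 * The whole pass is serialized by the hmp_force_migration spinlock
 * (trylock, so concurrent attempts simply skip). The optional
 * lazy-balance/power-aware path may skip the up-migration for light
 * tasks whose pack buddy is not busy.
 */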
9291 static void hmp_force_up_migration(int this_cpu)
9292 {
9293 int curr_cpu, target_cpu;
9294 struct sched_entity *se;
9295 struct rq *target;
9296 unsigned long flags;
9297 unsigned int force;
9298 struct task_struct *p;
9299 struct clb_env clbenv;
9300 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9301 int push_cpu;
9302 #endif
9303
9304 if (!spin_trylock(&hmp_force_migration))
9305 return;
9306
9307 #ifdef CONFIG_HMP_TRACER
9308 for_each_online_cpu(curr_cpu)
9309 trace_sched_cfs_runnable_load(curr_cpu,cfs_load(curr_cpu),
9310 cfs_length(curr_cpu));
9311 #endif
9312
9313 /* Migrate heavy task from LITTLE to big */
9314 for_each_cpu(curr_cpu, &hmp_slow_cpu_mask) {
9315 /* Check whether CPU is online */
9316 if (!cpu_online(curr_cpu))
9317 continue;
9318
9319 force = 0;
9320 target = cpu_rq(curr_cpu);
9321 raw_spin_lock_irqsave(&target->lock, flags);
9322 se = target->cfs.curr;
9323 if (!se) {
9324 raw_spin_unlock_irqrestore(&target->lock, flags);
9325 continue;
9326 }
9327
9328 /* Find task entity */
9329 if (!entity_is_task(se)) {
9330 struct cfs_rq *cfs_rq;
9331 cfs_rq = group_cfs_rq(se);
9332 while (cfs_rq) {
9333 se = cfs_rq->curr;
9334 cfs_rq = group_cfs_rq(se);
9335 }
9336 }
9337
9338 p = task_of(se);
9339 target_cpu = hmp_select_cpu(HMP_GB,p,&hmp_fast_cpu_mask,-1);
9340 if (NR_CPUS == target_cpu) {
9341 raw_spin_unlock_irqrestore(&target->lock, flags);
9342 continue;
9343 }
9344
9345 /* Collect cluster information */
9346 memset(&clbenv, 0, sizeof(clbenv));
9347 clbenv.flags |= HMP_GB;
9348 clbenv.ltarget = curr_cpu;
9349 clbenv.btarget = target_cpu;
9350 clbenv.lcpus = &hmp_slow_cpu_mask;
9351 clbenv.bcpus = &hmp_fast_cpu_mask;
9352 sched_update_clbstats(&clbenv);
9353
9354 #ifdef CONFIG_HMP_LAZY_BALANCE
9355 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9356 if (PA_ENABLE && LB_ENABLE) {
9357 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9358 if (is_light_task(p) && !is_buddy_busy(per_cpu(sd_pack_buddy, curr_cpu))) {
9359 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9360 push_cpu = hmp_select_cpu(HMP_GB,p,&hmp_fast_cpu_mask,-1);
9361 if (hmp_cpu_is_fast(push_cpu)) {
9362 AVOID_FORCE_UP_MIGRATION_FROM_CPUX_TO_CPUY_COUNT[curr_cpu][push_cpu]++;
9363 #ifdef CONFIG_HMP_TRACER
9364 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_FORCE_UP_FORM_CPUX_TO_CPUY, p->pid, curr_cpu, push_cpu);
9365 #endif /* CONFIG_HMP_TRACER */
9366 }
9367 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9368 goto out_force_up;
9369 }
9370 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9371 }
9372 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9373 #endif /* CONFIG_HMP_LAZY_BALANCE */
9374
9375 /* Check migration threshold */
9376 if (!target->active_balance &&
9377 hmp_up_migration(curr_cpu, &target_cpu, se, &clbenv)) {
9378 target->active_balance = 1;
9379 target->push_cpu = target_cpu;
9380 target->migrate_task = p;
9381 force = 1;
9382 trace_sched_hmp_migrate(p, target->push_cpu, 1);
9383 hmp_next_up_delay(&p->se, target->push_cpu);
9384 }
9385
9386 #ifdef CONFIG_HMP_LAZY_BALANCE
9387 out_force_up:
9388 #endif /* CONFIG_HMP_LAZY_BALANCE */
9389
9390 raw_spin_unlock_irqrestore(&target->lock, flags);
9391 if (force) {
9392 stop_one_cpu_nowait(cpu_of(target),
9393 hmp_active_task_migration_cpu_stop,
9394 target, &target->active_balance_work);
9395 }
9396 }
9397
9398 #ifdef CONFIG_HMP_GLOBAL_BALANCE
9399 hmp_force_down_migration(this_cpu);
9400 #endif
9401 #ifdef CONFIG_HMP_TRACER
9402 trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
9403 #endif
9404 spin_unlock(&hmp_force_migration);
9405 }
9406 #else /* CONFIG_SCHED_HMP_ENHANCEMENT */
9407 /*
9408 * hmp_force_up_migration checks runqueues for tasks that need to
9409 * be actively migrated to a faster cpu.
9410 */
9411 static void hmp_force_up_migration(int this_cpu)
9412 {
9413 int cpu, target_cpu;
9414 struct sched_entity *curr;
9415 struct rq *target;
9416 unsigned long flags;
9417 unsigned int force;
9418 struct task_struct *p;
9419
9420 if (!spin_trylock(&hmp_force_migration))
9421 return;
9422 for_each_online_cpu(cpu) {
9423 force = 0;
9424 target = cpu_rq(cpu);
9425 raw_spin_lock_irqsave(&target->lock, flags);
9426 curr = target->cfs.curr;
9427 if (!curr) {
9428 raw_spin_unlock_irqrestore(&target->lock, flags);
9429 continue;
9430 }
9431 if (!entity_is_task(curr)) {
9432 struct cfs_rq *cfs_rq;
9433
9434 cfs_rq = group_cfs_rq(curr);
9435 while (cfs_rq) {
9436 curr = cfs_rq->curr;
9437 cfs_rq = group_cfs_rq(curr);
9438 }
9439 }
9440 p = task_of(curr);
9441 if (hmp_up_migration(cpu, &target_cpu, curr)) {
9442 if (!target->active_balance) {
9443 target->active_balance = 1;
9444 target->push_cpu = target_cpu;
9445 target->migrate_task = p;
9446 force = 1;
9447 trace_sched_hmp_migrate(p, target->push_cpu, 1);
9448 hmp_next_up_delay(&p->se, target->push_cpu);
9449 }
9450 }
9451 if (!force && !target->active_balance) {
9452 /*
9453 * For now we just check the currently running task.
9454 * Selecting the lightest task for offloading will
9455 * require extensive bookkeeping.
9456 */
9457 target->push_cpu = hmp_offload_down(cpu, curr);
9458 if (target->push_cpu < NR_CPUS) {
9459 target->active_balance = 1;
9460 target->migrate_task = p;
9461 force = 1;
9462 trace_sched_hmp_migrate(p, target->push_cpu, 2);
9463 hmp_next_down_delay(&p->se, target->push_cpu);
9464 }
9465 }
9466 raw_spin_unlock_irqrestore(&target->lock, flags);
9467 if (force)
9468 stop_one_cpu_nowait(cpu_of(target),
9469 hmp_active_task_migration_cpu_stop,
9470 target, &target->active_balance_work);
9471 }
9472 spin_unlock(&hmp_force_migration);
9473 }
9474 #endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
9475 #else
9476 static void hmp_force_up_migration(int this_cpu) { }
9477 #endif /* CONFIG_SCHED_HMP */
9478
9479 /*
9480 * run_rebalance_domains is triggered when needed from the scheduler tick.
9481 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
9482 */
9483 static void run_rebalance_domains(struct softirq_action *h)
9484 {
9485 int this_cpu = smp_processor_id();
9486 struct rq *this_rq = cpu_rq(this_cpu);
9487 enum cpu_idle_type idle = this_rq->idle_balance ?
9488 CPU_IDLE : CPU_NOT_IDLE;
9489
9490 hmp_force_up_migration(this_cpu);
9491
9492 rebalance_domains(this_cpu, idle);
9493
9494 /*
9495 * If this cpu has a pending nohz_balance_kick, then do the
9496 * balancing on behalf of the other idle cpus whose ticks are
9497 * stopped.
9498 */
9499 nohz_idle_balance(this_cpu, idle);
9500 }
9501
9502 static inline int on_null_domain(int cpu)
9503 {
9504 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
9505 }
9506
9507 /*
9508 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
9509 */
9510 void trigger_load_balance(struct rq *rq, int cpu)
9511 {
9512 /* Don't need to rebalance while attached to NULL domain */
9513 if (time_after_eq(jiffies, rq->next_balance) &&
9514 likely(!on_null_domain(cpu)))
9515 raise_softirq(SCHED_SOFTIRQ);
9516 #ifdef CONFIG_NO_HZ_COMMON
9517 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
9518 nohz_balancer_kick(cpu);
9519 #endif
9520 }
9521
9522 static void rq_online_fair(struct rq *rq)
9523 {
9524 #ifdef CONFIG_SCHED_HMP
9525 hmp_online_cpu(rq->cpu);
9526 #endif
9527 update_sysctl();
9528 }
9529
9530 static void rq_offline_fair(struct rq *rq)
9531 {
9532 #ifdef CONFIG_SCHED_HMP
9533 hmp_offline_cpu(rq->cpu);
9534 #endif
9535 update_sysctl();
9536
9537 /* Ensure any throttled groups are reachable by pick_next_task */
9538 unthrottle_offline_cfs_rqs(rq);
9539 }
9540
9541 #endif /* CONFIG_SMP */
9542
9543 /*
9544 * scheduler tick hitting a task of our scheduling class:
9545 */
9546 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
9547 {
9548 struct cfs_rq *cfs_rq;
9549 struct sched_entity *se = &curr->se;
9550
9551 for_each_sched_entity(se) {
9552 cfs_rq = cfs_rq_of(se);
9553 entity_tick(cfs_rq, se, queued);
9554 }
9555
9556 if (sched_feat_numa(NUMA))
9557 task_tick_numa(rq, curr);
9558
9559 update_rq_runnable_avg(rq, 1);
9560 }
9561
9562 /*
9563 * called on fork with the child task as argument from the parent's context
9564 * - child not yet on the tasklist
9565 * - preemption disabled
9566 */
9567 static void task_fork_fair(struct task_struct *p)
9568 {
9569 struct cfs_rq *cfs_rq;
9570 struct sched_entity *se = &p->se, *curr;
9571 int this_cpu = smp_processor_id();
9572 struct rq *rq = this_rq();
9573 unsigned long flags;
9574
9575 raw_spin_lock_irqsave(&rq->lock, flags);
9576
9577 update_rq_clock(rq);
9578
9579 cfs_rq = task_cfs_rq(current);
9580 curr = cfs_rq->curr;
9581
9582 /*
9583 * Not only the cpu but also the task_group of the parent might have
9584 * been changed after parent->se.parent,cfs_rq were copied to
9585 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
9586 * of child point to valid ones.
9587 */
9588 rcu_read_lock();
9589 __set_task_cpu(p, this_cpu);
9590 rcu_read_unlock();
9591
9592 update_curr(cfs_rq);
9593
9594 if (curr)
9595 se->vruntime = curr->vruntime;
9596 place_entity(cfs_rq, se, 1);
9597
9598 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
9599 /*
9600 * Upon rescheduling, sched_class::put_prev_task() will place
9601 * 'current' within the tree based on its new key value.
9602 */
9603 swap(curr->vruntime, se->vruntime);
9604 resched_task(rq->curr);
9605 }
9606
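/*
 * Make the child's vruntime relative to this cfs_rq's min_vruntime;
 * the destination rq's min_vruntime is added back when the child is
 * enqueued from wake_up_new_task(), which may happen on a different CPU.
 */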
9607 se->vruntime -= cfs_rq->min_vruntime;
9608
9609 raw_spin_unlock_irqrestore(&rq->lock, flags);
9610 }
9611
9612 /*
9613 * Priority of the task has changed. Check to see if we preempt
9614 * the current task.
9615 */
9616 static void
9617 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
9618 {
9619 if (!p->se.on_rq)
9620 return;
9621
9622 /*
9623 * Reschedule if we are currently running on this runqueue and
9624 * our priority decreased, or if we are not currently running on
9625 * this runqueue and our priority is higher than the current's
9626 */
9627 if (rq->curr == p) {
9628 if (p->prio > oldprio)
9629 resched_task(rq->curr);
9630 } else
9631 check_preempt_curr(rq, p, 0);
9632 }
9633
9634 static void switched_from_fair(struct rq *rq, struct task_struct *p)
9635 {
9636 struct sched_entity *se = &p->se;
9637 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9638
9639 /*
9640 * Ensure the task's vruntime is normalized, so that when it's
9641 * switched back to the fair class the enqueue_entity(.flags=0) will
9642 * do the right thing.
9643 *
9644 * If it's on_rq, then the dequeue_entity(.flags=0) will already
9645 * have normalized the vruntime, if it's !on_rq, then only when
9646 * the task is sleeping will it still have non-normalized vruntime.
9647 */
9648 if (!p->on_rq && p->state != TASK_RUNNING) {
9649 /*
9650 * Fix up our vruntime so that the current sleep doesn't
9651 * cause 'unlimited' sleep bonus.
9652 */
9653 place_entity(cfs_rq, se, 0);
9654 se->vruntime -= cfs_rq->min_vruntime;
9655 }
9656
9657 #ifdef CONFIG_SMP
9658 /*
9659 * Remove our load from contribution when we leave sched_fair
9660 * and ensure we don't carry in an old decay_count if we
9661 * switch back.
9662 */
9663 if (p->se.avg.decay_count) {
9664 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
9665 __synchronize_entity_decay(&p->se);
9666 subtract_blocked_load_contrib(cfs_rq,
9667 p->se.avg.load_avg_contrib);
9668 }
9669 #endif
9670 }
9671
9672 /*
9673 * We switched to the sched_fair class.
9674 */
9675 static void switched_to_fair(struct rq *rq, struct task_struct *p)
9676 {
9677 if (!p->se.on_rq)
9678 return;
9679
9680 /*
9681 * We were most likely switched from sched_rt, so
9682 * kick off the schedule if running, otherwise just see
9683 * if we can still preempt the current task.
9684 */
9685 if (rq->curr == p)
9686 resched_task(rq->curr);
9687 else {
9688 /*
9689 * When task p changes priority from RT to a normal priority in
9690 * switched_from_rt(), pull_rt_task() may be called, and
9691 * double_lock_balance() can temporarily drop this rq's lock.
9692 * Task p may then migrate to another CPU and no longer be on
9693 * this rq. In that case there is no need to check preemption
9694 * here (task p is not on this rq any more); the enqueue path on
9695 * the destination rq performs the preemption check instead, so
9696 * bypass check_preempt_curr().
9697 */
9698 if (rq == task_rq(p)) {
9699 check_preempt_curr(rq, p, 0);
9700 }
9701 }
9702 }
9703
9704 /* Account for a task changing its policy or group.
9705 *
9706 * This routine is mostly called to set cfs_rq->curr field when a task
9707 * migrates between groups/classes.
9708 */
9709 static void set_curr_task_fair(struct rq *rq)
9710 {
9711 struct sched_entity *se = &rq->curr->se;
9712
9713 for_each_sched_entity(se) {
9714 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9715
9716 set_next_entity(cfs_rq, se);
9717 /* ensure bandwidth has been allocated on our new cfs_rq */
9718 account_cfs_rq_runtime(cfs_rq, 0);
9719 }
9720 }
9721
9722 void init_cfs_rq(struct cfs_rq *cfs_rq)
9723 {
9724 cfs_rq->tasks_timeline = RB_ROOT;
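/*
 * Start min_vruntime just below the u64 wrap point, presumably so that
 * any wraparound/comparison bugs in the vruntime handling show up soon
 * after boot instead of only after very long uptimes.
 */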
9725 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
9726 #ifndef CONFIG_64BIT
9727 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
9728 #endif
9729 #ifdef CONFIG_SMP
9730 atomic64_set(&cfs_rq->decay_counter, 1);
9731 atomic_long_set(&cfs_rq->removed_load, 0);
9732 #endif
9733 }
9734
9735 #ifdef CONFIG_FAIR_GROUP_SCHED
9736 static void task_move_group_fair(struct task_struct *p, int on_rq)
9737 {
9738 struct cfs_rq *cfs_rq;
9739 /*
9740 * If the task was not on the rq at the time of this cgroup movement
9741 * it must have been asleep, sleeping tasks keep their ->vruntime
9742 * absolute on their old rq until wakeup (needed for the fair sleeper
9743 * bonus in place_entity()).
9744 *
9745 * If it was on the rq, we've just 'preempted' it, which does convert
9746 * ->vruntime to a relative base.
9747 *
9748 * Make sure both cases convert their relative position when migrating
9749 * to another cgroup's rq. This does somewhat interfere with the
9750 * fair sleeper stuff for the first placement, but who cares.
9751 */
9752 /*
9753 * When !on_rq, vruntime of the task has usually NOT been normalized.
9754 * But there are some cases where it has already been normalized:
9755 *
9756 * - Moving a forked child which is waiting for being woken up by
9757 * wake_up_new_task().
9758 * - Moving a task which has been woken up by try_to_wake_up() and
9759 * waiting for actually being woken up by sched_ttwu_pending().
9760 *
9761 * To prevent boost or penalty in the new cfs_rq caused by delta
9762 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
9763 */
9764 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
9765 on_rq = 1;
9766
9767 if (!on_rq)
9768 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
9769 set_task_rq(p, task_cpu(p));
9770 if (!on_rq) {
9771 cfs_rq = cfs_rq_of(&p->se);
9772 p->se.vruntime += cfs_rq->min_vruntime;
9773 #ifdef CONFIG_SMP
9774 /*
9775 * migrate_task_rq_fair() will have removed our previous
9776 * contribution, but we must synchronize for ongoing future
9777 * decay.
9778 */
9779 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
9780 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
9781 #endif
9782 }
9783 }
9784
9785 void free_fair_sched_group(struct task_group *tg)
9786 {
9787 int i;
9788
9789 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
9790
9791 for_each_possible_cpu(i) {
9792 if (tg->cfs_rq)
9793 kfree(tg->cfs_rq[i]);
9794 if (tg->se)
9795 kfree(tg->se[i]);
9796 }
9797
9798 kfree(tg->cfs_rq);
9799 kfree(tg->se);
9800 }
9801
9802 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9803 {
9804 struct cfs_rq *cfs_rq;
9805 struct sched_entity *se;
9806 int i;
9807
9808 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
9809 if (!tg->cfs_rq)
9810 goto err;
9811 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
9812 if (!tg->se)
9813 goto err;
9814
9815 tg->shares = NICE_0_LOAD;
9816
9817 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
9818
9819 for_each_possible_cpu(i) {
9820 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
9821 GFP_KERNEL, cpu_to_node(i));
9822 if (!cfs_rq)
9823 goto err;
9824
9825 se = kzalloc_node(sizeof(struct sched_entity),
9826 GFP_KERNEL, cpu_to_node(i));
9827 if (!se)
9828 goto err_free_rq;
9829
9830 init_cfs_rq(cfs_rq);
9831 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
9832 }
9833
9834 return 1;
9835
9836 err_free_rq:
9837 kfree(cfs_rq);
9838 err:
9839 return 0;
9840 }
9841
9842 void unregister_fair_sched_group(struct task_group *tg, int cpu)
9843 {
9844 struct rq *rq = cpu_rq(cpu);
9845 unsigned long flags;
9846
9847 /*
9848 * Only empty task groups can be destroyed; so we can speculatively
9849 * check on_list without danger of it being re-added.
9850 */
9851 if (!tg->cfs_rq[cpu]->on_list)
9852 return;
9853
9854 raw_spin_lock_irqsave(&rq->lock, flags);
9855 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
9856 raw_spin_unlock_irqrestore(&rq->lock, flags);
9857 }
9858
9859 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
9860 struct sched_entity *se, int cpu,
9861 struct sched_entity *parent)
9862 {
9863 struct rq *rq = cpu_rq(cpu);
9864
9865 cfs_rq->tg = tg;
9866 cfs_rq->rq = rq;
9867 init_cfs_rq_runtime(cfs_rq);
9868
9869 tg->cfs_rq[cpu] = cfs_rq;
9870 tg->se[cpu] = se;
9871
9872 /* se could be NULL for root_task_group */
9873 if (!se)
9874 return;
9875
9876 if (!parent)
9877 se->cfs_rq = &rq->cfs;
9878 else
9879 se->cfs_rq = parent->my_q;
9880
9881 se->my_q = cfs_rq;
9882 /* guarantee group entities always have weight */
9883 update_load_set(&se->load, NICE_0_LOAD);
9884 se->parent = parent;
9885 }
9886
9887 static DEFINE_MUTEX(shares_mutex);
9888
9889 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9890 {
9891 int i;
9892 unsigned long flags;
9893
9894 /*
9895 * We can't change the weight of the root cgroup.
9896 */
9897 if (!tg->se[0])
9898 return -EINVAL;
9899
9900 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
9901
9902 mutex_lock(&shares_mutex);
9903 if (tg->shares == shares)
9904 goto done;
9905
9906 tg->shares = shares;
9907 for_each_possible_cpu(i) {
9908 struct rq *rq = cpu_rq(i);
9909 struct sched_entity *se;
9910
9911 se = tg->se[i];
9912 /* Propagate contribution to hierarchy */
9913 raw_spin_lock_irqsave(&rq->lock, flags);
9914 for_each_sched_entity(se)
9915 update_cfs_shares(group_cfs_rq(se));
9916 raw_spin_unlock_irqrestore(&rq->lock, flags);
9917 }
9918
9919 done:
9920 mutex_unlock(&shares_mutex);
9921 return 0;
9922 }
9923 #else /* CONFIG_FAIR_GROUP_SCHED */
9924
9925 void free_fair_sched_group(struct task_group *tg) { }
9926
9927 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9928 {
9929 return 1;
9930 }
9931
9932 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
9933
9934 #endif /* CONFIG_FAIR_GROUP_SCHED */
9935
9936
9937 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
9938 {
9939 struct sched_entity *se = &task->se;
9940 unsigned int rr_interval = 0;
9941
9942 /*
9943 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
9944 * idle runqueue:
9945 */
9946 if (rq->cfs.load.weight)
9947 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
9948
9949 return rr_interval;
9950 }
9951
9952 /*
9953 * All the scheduling class methods:
9954 */
9955 const struct sched_class fair_sched_class = {
9956 .next = &idle_sched_class,
9957 .enqueue_task = enqueue_task_fair,
9958 .dequeue_task = dequeue_task_fair,
9959 .yield_task = yield_task_fair,
9960 .yield_to_task = yield_to_task_fair,
9961
9962 .check_preempt_curr = check_preempt_wakeup,
9963
9964 .pick_next_task = pick_next_task_fair,
9965 .put_prev_task = put_prev_task_fair,
9966
9967 #ifdef CONFIG_SMP
9968 .select_task_rq = select_task_rq_fair,
9969 .migrate_task_rq = migrate_task_rq_fair,
9970
9971 .rq_online = rq_online_fair,
9972 .rq_offline = rq_offline_fair,
9973
9974 .task_waking = task_waking_fair,
9975 #endif
9976
9977 .set_curr_task = set_curr_task_fair,
9978 .task_tick = task_tick_fair,
9979 .task_fork = task_fork_fair,
9980
9981 .prio_changed = prio_changed_fair,
9982 .switched_from = switched_from_fair,
9983 .switched_to = switched_to_fair,
9984
9985 .get_rr_interval = get_rr_interval_fair,
9986
9987 #ifdef CONFIG_FAIR_GROUP_SCHED
9988 .task_move_group = task_move_group_fair,
9989 #endif
9990 };
9991
9992 #ifdef CONFIG_SCHED_DEBUG
9993 void print_cfs_stats(struct seq_file *m, int cpu)
9994 {
9995 struct cfs_rq *cfs_rq;
9996
9997 rcu_read_lock();
9998 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
9999 print_cfs_rq(m, cpu, cfs_rq);
10000 rcu_read_unlock();
10001 }
10002 #endif
10003
10004 __init void init_sched_fair_class(void)
10005 {
10006 #ifdef CONFIG_SMP
10007 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10008
10009 #ifdef CONFIG_NO_HZ_COMMON
10010 nohz.next_balance = jiffies;
10011 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
10012 cpu_notifier(sched_ilb_notifier, 0);
10013 #endif
10014
10015 cmp_cputopo_domain_setup();
10016 #ifdef CONFIG_SCHED_HMP
10017 hmp_cpu_mask_setup();
10018 #endif
10019 #endif /* SMP */
10020 }
10021
10022 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
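/*
 * cpufreq_calc_scale() works in fixed point: 'max' has already been
 * shifted down by SCHED_FREQSCALE_SHIFT when the policy was recorded,
 * so curr / max is roughly (curr << SCHED_FREQSCALE_SHIFT) / max_freq,
 * i.e. 1024 ("1.0") when running at the maximum frequency.
 * Worked example (assuming a shift of 10): policy->max = 1600000 kHz and
 * freq->new = 800000 kHz gives 800000 / (1600000 >> 10) ~= 512 ("0.5").
 */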
10023 static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
10024 {
10025 u32 result = curr / max;
10026 return result;
10027 }
10028
10029 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
10030 DEFINE_PER_CPU(u32, FREQ_CPU);
10031 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
10032
10033 /* Called when the CPU Frequency is changed.
10034 * Once for each CPU.
10035 */
10036 static int cpufreq_callback(struct notifier_block *nb,
10037 unsigned long val, void *data)
10038 {
10039 struct cpufreq_freqs *freq = data;
10040 int cpu = freq->cpu;
10041 struct cpufreq_extents *extents;
10042 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10043 struct cpumask* mask;
10044 int id;
10045 #endif
10046
10047 if (freq->flags & CPUFREQ_CONST_LOOPS)
10048 return NOTIFY_OK;
10049
10050 if (val != CPUFREQ_POSTCHANGE)
10051 return NOTIFY_OK;
10052
10053 /* if dynamic load scale is disabled, set the load scale to 1.0 (1024 in fixed point) */
10054 if (!hmp_data.freqinvar_load_scale_enabled) {
10055 freq_scale[cpu].curr_scale = 1024;
10056 return NOTIFY_OK;
10057 }
10058
10059 extents = &freq_scale[cpu];
10060 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10061 if (extents->max < extents->const_max)
10062 extents->throttling = 1;
10063 else
10064 extents->throttling = 0;
10067 #endif
10068 if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
10069 /* If our governor was recognised as a single-freq governor,
10070 * use 1.0
10071 */
10072 extents->curr_scale = 1024;
10073 } else {
10074 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10075 extents->curr_scale = cpufreq_calc_scale(extents->min,
10076 extents->const_max, freq->new);
10077 #else
10078 extents->curr_scale = cpufreq_calc_scale(extents->min,
10079 extents->max, freq->new);
10080 #endif
10081 }
10082
10083 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10084 mask = arch_cpu_is_big(cpu)?&hmp_fast_cpu_mask:&hmp_slow_cpu_mask;
10085 for_each_cpu(id, mask)
10086 freq_scale[id].curr_scale = extents->curr_scale;
10087 #endif
10088
10089 #if NR_CPUS == 4
10090 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
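/*
 * On a 4-core part, mirror the new scale factor to the paired CPU,
 * apparently assuming CPUs {0,1} and {2,3} each share a frequency domain.
 */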
10091 switch (cpu) {
10092 case 0:
10093 case 2:
10094 (extents + 1)->curr_scale = extents->curr_scale;
10095 break;
10096
10097 case 1:
10098 case 3:
10099 (extents - 1)->curr_scale = extents->curr_scale;
10100 break;
10101
10102 default:
10103
10104 break;
10105 }
10106 #endif
10107 #endif
10108
10109 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
10110 per_cpu(FREQ_CPU, cpu) = freq->new;
10111 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
10112 return NOTIFY_OK;
10113 }
10114
10115 /* Called when the CPUFreq governor is changed.
10116 * Only called for the CPUs whose policy is actually changed by
10117 * userspace.
10118 */
10119 static int cpufreq_policy_callback(struct notifier_block *nb,
10120 unsigned long event, void *data)
10121 {
10122 struct cpufreq_policy *policy = data;
10123 struct cpufreq_extents *extents;
10124 int cpu, singleFreq = 0;
10125 static const char performance_governor[] = "performance";
10126 static const char powersave_governor[] = "powersave";
10127
10128 if (event == CPUFREQ_START)
10129 return 0;
10130
10131 if (event != CPUFREQ_INCOMPATIBLE)
10132 return 0;
10133
10134 /* CPUFreq governors do not accurately report the range of
10135 * CPU Frequencies they will choose from.
10136 * We recognise performance and powersave governors as
10137 * single-frequency only.
10138 */
10139 if (!strncmp(policy->governor->name, performance_governor,
10140 strlen(performance_governor)) ||
10141 !strncmp(policy->governor->name, powersave_governor,
10142 strlen(powersave_governor)))
10143 singleFreq = 1;
10144
10145 /* Make sure that all CPUs impacted by this policy are
10146 * updated since we will only get a notification when the
10147 * user explicitly changes the policy on a CPU.
10148 */
10149 for_each_cpu(cpu, policy->cpus) {
10150 extents = &freq_scale[cpu];
10151 extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
10152 extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
10153 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10154 extents->const_max = policy->cpuinfo.max_freq >> SCHED_FREQSCALE_SHIFT;
10155 #endif
10156 if (!hmp_data.freqinvar_load_scale_enabled) {
10157 extents->curr_scale = 1024;
10158 } else if (singleFreq) {
10159 extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
10160 extents->curr_scale = 1024;
10161 } else {
10162 extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
10163 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10164 extents->curr_scale = cpufreq_calc_scale(extents->min,
10165 extents->const_max, policy->cur);
10166 #else
10167 extents->curr_scale = cpufreq_calc_scale(extents->min,
10168 extents->max, policy->cur);
10169 #endif
10170 }
10171 }
10172
10173 return 0;
10174 }
10175
10176 static struct notifier_block cpufreq_notifier = {
10177 .notifier_call = cpufreq_callback,
10178 };
10179 static struct notifier_block cpufreq_policy_notifier = {
10180 .notifier_call = cpufreq_policy_callback,
10181 };
10182
10183 static int __init register_sched_cpufreq_notifier(void)
10184 {
10185 int ret = 0;
10186
10187 /* init safe defaults since there are no policies at registration */
10188 for (ret = 0; ret < CONFIG_NR_CPUS; ret++) {
10189 /* safe defaults */
10190 freq_scale[ret].max = 1024;
10191 freq_scale[ret].min = 1024;
10192 freq_scale[ret].curr_scale = 1024;
10193 }
10194
10195 pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
10196 ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
10197 CPUFREQ_POLICY_NOTIFIER);
10198
10199 if (ret != -EINVAL)
10200 ret = cpufreq_register_notifier(&cpufreq_notifier,
10201 CPUFREQ_TRANSITION_NOTIFIER);
10202
10203 return ret;
10204 }
10205
10206 core_initcall(register_sched_cpufreq_notifier);
10207 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
10208
10209 #ifdef CONFIG_HEVTASK_INTERFACE
10210 /*
10211 * This allows printing both to /proc/task_detect and
10212 * to the console.
10213 */
10214 #ifndef CONFIG_KGDB_KDB
10215 #define SEQ_printf(m, x...) \
10216 do { \
10217 if (m) \
10218 seq_printf(m, x); \
10219 else \
10220 printk(x); \
10221 } while (0)
10222 #else
10223 #define SEQ_printf(m, x...) \
10224 do { \
10225 if (m) \
10226 seq_printf(m, x); \
10227 else if (__get_cpu_var(kdb_in_use) == 1) \
10228 kdb_printf(x); \
10229 else \
10230 printk(x); \
10231 } while (0)
10232 #endif
10233
10234 static int task_detect_show(struct seq_file *m, void *v)
10235 {
10236 struct task_struct *p;
10237 unsigned long flags;
10238 unsigned int i;
10239
10240 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
10241 for (i = 0; i < NR_CPUS; i++) {
10242 SEQ_printf(m, "%5d ", freq_scale[i].curr_scale);
10243 }
10244 #endif
10245
10246 SEQ_printf(m, "\n%lu\n ", jiffies_to_cputime(jiffies));
10247
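/*
 * For every online CPU, print one line per task on its cfs_tasks list:
 * load_avg_ratio, pid, cpu, utime+stime and comm.
 */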
10248 for (i = 0; i < NR_CPUS; i++) {
10249 raw_spin_lock_irqsave(&cpu_rq(i)->lock, flags);
10250 if (cpu_online(i)) {
10251 list_for_each_entry(p, &cpu_rq(i)->cfs_tasks, se.group_node) {
10252 SEQ_printf(m, "%lu %5d %5d %lu (%15s)\n ",
10253 p->se.avg.load_avg_ratio, p->pid, task_cpu(p),
10254 (p->utime + p->stime), p->comm);
10255 }
10256 }
10257 raw_spin_unlock_irqrestore(&cpu_rq(i)->lock, flags);
10258
10259 }
10260
10261 return 0;
10262 }
10263
10264 static int task_detect_open(struct inode *inode, struct file *filp)
10265 {
10266 return single_open(filp, task_detect_show, NULL);
10267 }
10268
10269 static const struct file_operations task_detect_fops = {
10270 .open = task_detect_open,
10271 .read = seq_read,
10272 .llseek = seq_lseek,
10273 .release = single_release,
10274 };
10275
10276 static int __init init_task_detect_procfs(void)
10277 {
10278 struct proc_dir_entry *pe;
10279
10280 pe = proc_create("task_detect", 0444, NULL, &task_detect_fops);
10281 if (!pe)
10282 return -ENOMEM;
10283 return 0;
10284 }
10285
10286 __initcall(init_task_detect_procfs);
10287 #endif /* CONFIG_HEVTASK_INTERFACE */