2 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
4 * Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6 * Interactivity improvements by Mike Galbraith
7 * (C) 2007 Mike Galbraith <efault@gmx.de>
9 * Various enhancements by Dmitry Adamushko.
10 * (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12 * Group scheduling enhancements by Srivatsa Vaddagiri
13 * Copyright IBM Corporation, 2007
14 * Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16 * Scaled math optimizations by Thomas Gleixner
17 * Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19 * Adaptive scheduling granularity, math enhancements by Peter Zijlstra
20 * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
23 #include <linux/latencytop.h>
24 #include <linux/sched.h>
25 #include <linux/cpumask.h>
26 #include <linux/slab.h>
27 #include <linux/profile.h>
28 #include <linux/interrupt.h>
29 #include <linux/mempolicy.h>
30 #include <linux/migrate.h>
31 #include <linux/task_work.h>
33 #include <trace/events/sched.h>
34 #ifdef CONFIG_HMP_VARIABLE_SCALE
35 #include <linux/sysfs.h>
36 #include <linux/vmalloc.h>
37 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
38 /* Include cpufreq header to add a notifier so that cpu frequency
39 * scaling can track the current CPU frequency
41 #include <linux/cpufreq.h>
42 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
43 #endif /* CONFIG_HMP_VARIABLE_SCALE */
47 #include <mtlbprof/mtlbprof.h>
50 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
51 #ifdef CONFIG_LOCAL_TIMERS
52 unsigned long localtimer_get_counter(void);
56 #ifdef CONFIG_HEVTASK_INTERFACE
57 #include <linux/proc_fs.h>
58 #include <linux/seq_file.h>
59 #ifdef CONFIG_KGDB_KDB
60 #include <linux/kdb.h>
65 * Targeted preemption latency for CPU-bound tasks:
66 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
68 * NOTE: this latency value is not the same as the concept of
69 * 'timeslice length' - timeslices in CFS are of variable length
70 * and have no persistent notion like in traditional, time-slice
71 * based scheduling concepts.
73 * (to see the precise effective timeslice length of your workload,
74 * run vmstat and monitor the context-switches (cs) field)
76 unsigned int sysctl_sched_latency
= 6000000ULL;
77 unsigned int normalized_sysctl_sched_latency
= 6000000ULL;
80 * The initial- and re-scaling of tunables is configurable
81 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
84 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
85 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
86 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
88 enum sched_tunable_scaling sysctl_sched_tunable_scaling
89 = SCHED_TUNABLESCALING_LOG
;
92 * Minimal preemption granularity for CPU-bound tasks:
93 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
95 unsigned int sysctl_sched_min_granularity
= 750000ULL;
96 unsigned int normalized_sysctl_sched_min_granularity
= 750000ULL;
99 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
101 static unsigned int sched_nr_latency
= 8;
104 * After fork, child runs first. If set to 0 (default) then
105 * parent will (try to) run first.
107 unsigned int sysctl_sched_child_runs_first __read_mostly
;
110 * SCHED_OTHER wake-up granularity.
111 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
113 * This option delays the preemption effects of decoupled workloads
114 * and reduces their over-scheduling. Synchronous workloads will still
115 * have immediate wakeup/sleep latencies.
117 unsigned int sysctl_sched_wakeup_granularity
= 1000000UL;
118 unsigned int normalized_sysctl_sched_wakeup_granularity
= 1000000UL;
120 const_debug
unsigned int sysctl_sched_migration_cost
= 100000UL;
123 * The exponential sliding window over which load is averaged for shares
127 unsigned int __read_mostly sysctl_sched_shares_window
= 10000000UL;
#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif
#if defined(CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
/* Forward declaration; defined later in this file. */
static int need_lazy_balance(int dst_cpu, int src_cpu, struct task_struct *p);
#endif
147 * Increase the granularity value when there are more CPUs,
148 * because with more CPUs the 'effective latency' as visible
149 * to users decreases. But the relationship is not linear,
150 * so pick a second-best guess by going with the log2 of the
153 * This idea comes from the SD scheduler of Con Kolivas:
155 static int get_update_sysctl_factor(void)
157 unsigned int cpus
= min_t(int, num_online_cpus(), 8);
160 switch (sysctl_sched_tunable_scaling
) {
161 case SCHED_TUNABLESCALING_NONE
:
164 case SCHED_TUNABLESCALING_LINEAR
:
167 case SCHED_TUNABLESCALING_LOG
:
169 factor
= 1 + ilog2(cpus
);
176 static void update_sysctl(void)
178 unsigned int factor
= get_update_sysctl_factor();
180 #define SET_SYSCTL(name) \
181 (sysctl_##name = (factor) * normalized_sysctl_##name)
182 SET_SYSCTL(sched_min_granularity
);
183 SET_SYSCTL(sched_latency
);
184 SET_SYSCTL(sched_wakeup_granularity
);
void sched_init_granularity(void)
{
	update_sysctl();
}
192 #if defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK) || defined (CONFIG_HMP_PACK_SMALL_TASK)
194 * Save the id of the optimal CPU that should be used to pack small tasks
195 * The value -1 is used when no buddy has been found
197 DEFINE_PER_CPU(int, sd_pack_buddy
) = {-1};
199 #ifdef CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK
200 struct cpumask buddy_cpu_map
= {{0}};
203 /* Look for the best buddy CPU that can be used to pack small tasks
204 * We make the assumption that it doesn't wort to pack on CPU that share the
205 * same powerline. We looks for the 1st sched_domain without the
206 * SD_SHARE_POWERLINE flag. Then We look for the sched_group witht the lowest
207 * power per core based on the assumption that their power efficiency is
209 void update_packing_domain(int cpu
)
211 struct sched_domain
*sd
;
214 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
215 pr_info("[PACK] update_packing_domain() CPU%d\n", cpu
);
216 #endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */
217 mt_sched_printf("[PACK] update_packing_domain() CPU%d", cpu
);
219 sd
= highest_flag_domain(cpu
, SD_SHARE_POWERLINE
);
222 sd
= rcu_dereference_check_sched_domain(cpu_rq(cpu
)->sd
);
225 if (cpumask_first(sched_domain_span(sd
)) == cpu
|| !sd
->parent
)
229 struct sched_group
*sg
= sd
->groups
;
230 struct sched_group
*pack
= sg
;
231 struct sched_group
*tmp
= sg
->next
;
233 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
234 pr_info("[PACK] sd = 0x%08x, flags = %d\n", (unsigned int)sd
, sd
->flags
);
235 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
237 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
238 pr_info("[PACK] sg = 0x%08x\n", (unsigned int)sg
);
239 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
241 /* 1st CPU of the sched domain is a good candidate */
243 id
= cpumask_first(sched_domain_span(sd
));
245 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
246 pr_info("[PACK] First cpu in this sd id = %d\n", id
);
247 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
249 /* Find sched group of candidate */
252 if (cpumask_test_cpu(id
, sched_group_cpus(tmp
))) {
256 } while (tmp
= tmp
->next
, tmp
!= sd
->groups
);
258 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
259 pr_info("[PACK] pack = 0x%08x\n", (unsigned int)sg
);
260 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
265 /* loop the sched groups to find the best one */
266 //Stop find the best one in the same Load Balance Domain
267 //while (tmp != sg) {
268 while (tmp
!= sg
&& !(sd
->flags
& SD_LOAD_BALANCE
)) {
269 if (tmp
->sgp
->power
* sg
->group_weight
<
270 sg
->sgp
->power
* tmp
->group_weight
) {
272 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
273 pr_info("[PACK] Now sg power = %u, weight = %u, mask = %lu\n", sg
->sgp
->power
, sg
->group_weight
, sg
->cpumask
[0]);
274 pr_info("[PACK] Better sg power = %u, weight = %u, mask = %lu\n", tmp
->sgp
->power
, tmp
->group_weight
, tmp
->cpumask
[0]);
275 #endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */
282 /* we have found a better group */
284 id
= cpumask_first(sched_group_cpus(pack
));
286 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
287 pr_info("[PACK] Better sg, first cpu id = %d\n", id
);
288 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
292 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
294 pr_info("[PACK] cpu = %d, id = %d, sd->parent = 0x%08x, flags = %d, SD_LOAD_BALANCE = %d\n", cpu
, id
, (unsigned int)sd
->parent
, sd
->parent
->flags
, SD_LOAD_BALANCE
);
295 pr_info("[PACK] %d\n", (id
!= cpu
));
296 pr_info("[PACK] 0x%08x\n", (unsigned int)(sd
->parent
));
297 pr_info("[PACK] %d\n", (sd
->parent
->flags
& SD_LOAD_BALANCE
));
300 pr_info("[PACK] cpu = %d, id = %d, sd->parent = 0x%08x\n", cpu
, id
, (unsigned int)sd
->parent
);
302 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
305 /* Look for another CPU than itself */
307 ((sd
->parent
) && (sd
->parent
->flags
& SD_LOAD_BALANCE
))) {
309 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
310 pr_info("[PACK] Break\n");
311 #endif /*CONFIG_HMP_PACK_BUDDY_INFO */
318 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
319 pr_info("[PACK] CPU%d packing on CPU%d\n", cpu
, id
);
320 #endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */
321 mt_sched_printf("[PACK] CPU%d packing on CPU%d", cpu
, id
);
323 #ifdef CONFIG_HMP_PACK_SMALL_TASK
324 per_cpu(sd_pack_buddy
, cpu
) = id
;
325 #else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */
326 if(per_cpu(sd_pack_buddy
, cpu
) != -1)
327 cpu_clear(per_cpu(sd_pack_buddy
, cpu
), buddy_cpu_map
);
328 per_cpu(sd_pack_buddy
, cpu
) = id
;
330 cpumask_set_cpu(id
, &buddy_cpu_map
);
334 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
335 DEFINE_PER_CPU(u32
, BUDDY_CPU_RQ_USAGE
);
336 DEFINE_PER_CPU(u32
, BUDDY_CPU_RQ_PERIOD
);
337 DEFINE_PER_CPU(u32
, BUDDY_CPU_RQ_NR
);
338 DEFINE_PER_CPU(u32
, TASK_USGAE
);
339 DEFINE_PER_CPU(u32
, TASK_PERIOD
);
340 u32 PACK_FROM_CPUX_TO_CPUY_COUNT
[NR_CPUS
][NR_CPUS
];
341 u32 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT
[NR_CPUS
][NR_CPUS
];
342 u32 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT
[NR_CPUS
][NR_CPUS
];
343 u32 TASK_PACK_CPU_COUNT
[4][NR_CPUS
] = {{0}};
345 u32 PA_MON_ENABLE
= 0;
346 char PA_MON
[4][TASK_COMM_LEN
]={{0}};
347 #endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER */
349 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
350 DEFINE_PER_CPU(u32
, BUDDY_CPU_RQ_USAGE
);
351 DEFINE_PER_CPU(u32
, BUDDY_CPU_RQ_PERIOD
);
352 DEFINE_PER_CPU(u32
, BUDDY_CPU_RQ_NR
);
353 DEFINE_PER_CPU(u32
, TASK_USGAE
);
354 DEFINE_PER_CPU(u32
, TASK_PERIOD
);
355 u32 PACK_FROM_CPUX_TO_CPUY_COUNT
[NR_CPUS
][NR_CPUS
];
356 u32 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT
[NR_CPUS
][NR_CPUS
];
357 u32 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT
[NR_CPUS
][NR_CPUS
];
358 u32 HMP_FROM_CPUX_TO_CPUY_COUNT
[NR_CPUS
][NR_CPUS
];
361 u32 PA_MON_ENABLE
= 0;
362 char PA_MON
[TASK_COMM_LEN
];
364 #ifdef CONFIG_HMP_TRACER
365 #define POWER_AWARE_ACTIVE_MODULE_PACK_FORM_CPUX_TO_CPUY (0)
366 #define POWER_AWARE_ACTIVE_MODULE_AVOID_WAKE_UP_FORM_CPUX_TO_CPUY (1)
367 #define POWER_AWARE_ACTIVE_MODULE_AVOID_BALANCE_FORM_CPUX_TO_CPUY (2)
368 #define POWER_AWARE_ACTIVE_MODULE_AVOID_FORCE_UP_FORM_CPUX_TO_CPUY (3)
369 #endif /* CONFIG_HMP_TRACER */
371 #endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER */
374 static inline bool is_buddy_busy(int cpu
)
376 #ifdef CONFIG_HMP_PACK_SMALL_TASK
383 #else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */
384 struct rq
*rq
= cpu_rq(cpu
);
387 * A busy buddy is a CPU with a high load or a small load with a lot of
391 #if defined (CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER)
392 per_cpu(BUDDY_CPU_RQ_USAGE
, cpu
) = rq
->avg
.usage_avg_sum
;
393 per_cpu(BUDDY_CPU_RQ_PERIOD
, cpu
) = rq
->avg
.runnable_avg_period
;
394 per_cpu(BUDDY_CPU_RQ_NR
, cpu
) = rq
->nr_running
;
395 #endif /*(CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER) */
397 return ((rq
->avg
.usage_avg_sum
<< rq
->nr_running
) >
398 rq
->avg
.runnable_avg_period
);
402 static inline bool is_light_task(struct task_struct
*p
)
404 #if defined (CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER)
405 per_cpu(TASK_USGAE
, task_cpu(p
)) = p
->se
.avg
.usage_avg_sum
;
406 per_cpu(TASK_PERIOD
, task_cpu(p
)) = p
->se
.avg
.runnable_avg_period
;
407 #endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER || CONFIG_HMP_POWER_AWARE_CONTROLLER*/
409 /* A light task runs less than 25% in average */
410 return ((p
->se
.avg
.usage_avg_sum
<< 2) < p
->se
.avg
.runnable_avg_period
);
414 static int check_pack_buddy(int cpu
, struct task_struct
*p
)
416 #ifdef CONFIG_HMP_PACK_SMALL_TASK
419 if(cpu
>= NR_CPUS
|| cpu
< 0)
421 buddy
= per_cpu(sd_pack_buddy
, cpu
);
422 #else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */
426 /* No pack buddy for this CPU */
431 * If a task is waiting for running on the CPU which is its own buddy,
432 * let the default behavior to look for a better CPU if available
433 * The threshold has been set to 37.5%
435 #ifdef CONFIG_HMP_PACK_SMALL_TASK
437 && ((p
->se
.avg
.usage_avg_sum
<< 3) < (p
->se
.avg
.runnable_avg_sum
* 5)))
441 /* buddy is not an allowed CPU */
442 if (!cpumask_test_cpu(buddy
, tsk_cpus_allowed(p
)))
446 * If the task is a small one and the buddy is not overloaded,
449 if (!is_light_task(p
) || is_buddy_busy(buddy
))
454 #endif /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK || CONFIG_HMP_PACK_SMALL_TASK*/
#if BITS_PER_LONG == 32
# define WMULT_CONST	(~0UL)
#else
# define WMULT_CONST	(1UL << 32)
#endif

#define WMULT_SHIFT	32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
470 * delta *= weight / lw
473 calc_delta_mine(unsigned long delta_exec
, unsigned long weight
,
474 struct load_weight
*lw
)
479 * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
480 * entities since MIN_SHARES = 2. Treat weight as 1 if less than
481 * 2^SCHED_LOAD_RESOLUTION.
483 if (likely(weight
> (1UL << SCHED_LOAD_RESOLUTION
)))
484 tmp
= (u64
)delta_exec
* scale_load_down(weight
);
486 tmp
= (u64
)delta_exec
;
488 if (!lw
->inv_weight
) {
489 unsigned long w
= scale_load_down(lw
->weight
);
491 if (BITS_PER_LONG
> 32 && unlikely(w
>= WMULT_CONST
))
493 else if (unlikely(!w
))
494 lw
->inv_weight
= WMULT_CONST
;
496 lw
->inv_weight
= WMULT_CONST
/ w
;
500 * Check whether we'd overflow the 64-bit multiplication:
502 if (unlikely(tmp
> WMULT_CONST
))
503 tmp
= SRR(SRR(tmp
, WMULT_SHIFT
/2) * lw
->inv_weight
,
506 tmp
= SRR(tmp
* lw
->inv_weight
, WMULT_SHIFT
);
508 return (unsigned long)min(tmp
, (u64
)(unsigned long)LONG_MAX
);
512 const struct sched_class fair_sched_class
;
514 /**************************************************************
515 * CFS operations on generic schedulable entities:
518 #ifdef CONFIG_FAIR_GROUP_SCHED
520 /* cpu runqueue to which this cfs_rq is attached */
521 static inline struct rq
*rq_of(struct cfs_rq
*cfs_rq
)
526 /* An entity is a task if it doesn't "own" a runqueue */
527 #define entity_is_task(se) (!se->my_q)
529 static inline struct task_struct
*task_of(struct sched_entity
*se
)
531 #ifdef CONFIG_SCHED_DEBUG
532 WARN_ON_ONCE(!entity_is_task(se
));
534 return container_of(se
, struct task_struct
, se
);
537 /* Walk up scheduling entities hierarchy */
538 #define for_each_sched_entity(se) \
539 for (; se; se = se->parent)
541 static inline struct cfs_rq
*task_cfs_rq(struct task_struct
*p
)
546 /* runqueue on which this entity is (to be) queued */
547 static inline struct cfs_rq
*cfs_rq_of(struct sched_entity
*se
)
552 /* runqueue "owned" by this group */
553 static inline struct cfs_rq
*group_cfs_rq(struct sched_entity
*grp
)
558 static void update_cfs_rq_blocked_load(struct cfs_rq
*cfs_rq
,
561 static inline void list_add_leaf_cfs_rq(struct cfs_rq
*cfs_rq
)
563 if (!cfs_rq
->on_list
) {
565 * Ensure we either appear before our parent (if already
566 * enqueued) or force our parent to appear after us when it is
567 * enqueued. The fact that we always enqueue bottom-up
568 * reduces this to two cases.
570 if (cfs_rq
->tg
->parent
&&
571 cfs_rq
->tg
->parent
->cfs_rq
[cpu_of(rq_of(cfs_rq
))]->on_list
) {
572 list_add_rcu(&cfs_rq
->leaf_cfs_rq_list
,
573 &rq_of(cfs_rq
)->leaf_cfs_rq_list
);
575 list_add_tail_rcu(&cfs_rq
->leaf_cfs_rq_list
,
576 &rq_of(cfs_rq
)->leaf_cfs_rq_list
);
580 /* We should have no load, but we need to update last_decay. */
581 update_cfs_rq_blocked_load(cfs_rq
, 0);
585 static inline void list_del_leaf_cfs_rq(struct cfs_rq
*cfs_rq
)
587 if (cfs_rq
->on_list
) {
588 list_del_rcu(&cfs_rq
->leaf_cfs_rq_list
);
593 /* Iterate thr' all leaf cfs_rq's on a runqueue */
594 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
595 list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
597 /* Do the two (enqueued) entities belong to the same group ? */
599 is_same_group(struct sched_entity
*se
, struct sched_entity
*pse
)
603 if (se
->cfs_rq
== pse
->cfs_rq
)
610 static inline struct sched_entity
*parent_entity(struct sched_entity
*se
)
615 /* return depth at which a sched entity is present in the hierarchy */
616 static inline int depth_se(struct sched_entity
*se
)
620 for_each_sched_entity(se
)
static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
	int se_depth, pse_depth;

	/*
	 * preemption test can be made between sibling entities who are in the
	 * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
	 * both tasks until we find their ancestors who are siblings of common
	 * parent.
	 */

	/* First walk up until both entities are at same depth */
	se_depth = depth_se(*se);
	pse_depth = depth_se(*pse);

	while (se_depth > pse_depth) {
		se_depth--;
		*se = parent_entity(*se);
	}

	while (pse_depth > se_depth) {
		pse_depth--;
		*pse = parent_entity(*pse);
	}

	while (!is_same_group(*se, *pse)) {
		*se = parent_entity(*se);
		*pse = parent_entity(*pse);
	}
}
658 #else /* !CONFIG_FAIR_GROUP_SCHED */
660 static inline struct task_struct
*task_of(struct sched_entity
*se
)
662 return container_of(se
, struct task_struct
, se
);
665 static inline struct rq
*rq_of(struct cfs_rq
*cfs_rq
)
667 return container_of(cfs_rq
, struct rq
, cfs
);
670 #define entity_is_task(se) 1
672 #define for_each_sched_entity(se) \
673 for (; se; se = NULL)
675 static inline struct cfs_rq
*task_cfs_rq(struct task_struct
*p
)
677 return &task_rq(p
)->cfs
;
680 static inline struct cfs_rq
*cfs_rq_of(struct sched_entity
*se
)
682 struct task_struct
*p
= task_of(se
);
683 struct rq
*rq
= task_rq(p
);
688 /* runqueue "owned" by this group */
689 static inline struct cfs_rq
*group_cfs_rq(struct sched_entity
*grp
)
694 static inline void list_add_leaf_cfs_rq(struct cfs_rq
*cfs_rq
)
698 static inline void list_del_leaf_cfs_rq(struct cfs_rq
*cfs_rq
)
702 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
703 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
706 is_same_group(struct sched_entity
*se
, struct sched_entity
*pse
)
711 static inline struct sched_entity
*parent_entity(struct sched_entity
*se
)
717 find_matching_se(struct sched_entity
**se
, struct sched_entity
**pse
)
721 #endif /* CONFIG_FAIR_GROUP_SCHED */
723 static __always_inline
724 void account_cfs_rq_runtime(struct cfs_rq
*cfs_rq
, unsigned long delta_exec
);
726 /**************************************************************
727 * Scheduling class tree data structure manipulation methods:
730 static inline u64
max_vruntime(u64 max_vruntime
, u64 vruntime
)
732 s64 delta
= (s64
)(vruntime
- max_vruntime
);
734 max_vruntime
= vruntime
;
739 static inline u64
min_vruntime(u64 min_vruntime
, u64 vruntime
)
741 s64 delta
= (s64
)(vruntime
- min_vruntime
);
743 min_vruntime
= vruntime
;
748 static inline int entity_before(struct sched_entity
*a
,
749 struct sched_entity
*b
)
751 return (s64
)(a
->vruntime
- b
->vruntime
) < 0;
754 static void update_min_vruntime(struct cfs_rq
*cfs_rq
)
756 u64 vruntime
= cfs_rq
->min_vruntime
;
759 vruntime
= cfs_rq
->curr
->vruntime
;
761 if (cfs_rq
->rb_leftmost
) {
762 struct sched_entity
*se
= rb_entry(cfs_rq
->rb_leftmost
,
767 vruntime
= se
->vruntime
;
769 vruntime
= min_vruntime(vruntime
, se
->vruntime
);
772 /* ensure we never gain time by being placed backwards. */
773 cfs_rq
->min_vruntime
= max_vruntime(cfs_rq
->min_vruntime
, vruntime
);
776 cfs_rq
->min_vruntime_copy
= cfs_rq
->min_vruntime
;
781 * Enqueue an entity into the rb-tree:
783 static void __enqueue_entity(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
785 struct rb_node
**link
= &cfs_rq
->tasks_timeline
.rb_node
;
786 struct rb_node
*parent
= NULL
;
787 struct sched_entity
*entry
;
791 * Find the right place in the rbtree:
795 entry
= rb_entry(parent
, struct sched_entity
, run_node
);
797 * We dont care about collisions. Nodes with
798 * the same key stay together.
800 if (entity_before(se
, entry
)) {
801 link
= &parent
->rb_left
;
803 link
= &parent
->rb_right
;
809 * Maintain a cache of leftmost tree entries (it is frequently
813 cfs_rq
->rb_leftmost
= &se
->run_node
;
815 rb_link_node(&se
->run_node
, parent
, link
);
816 rb_insert_color(&se
->run_node
, &cfs_rq
->tasks_timeline
);
819 static void __dequeue_entity(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
821 if (cfs_rq
->rb_leftmost
== &se
->run_node
) {
822 struct rb_node
*next_node
;
824 next_node
= rb_next(&se
->run_node
);
825 cfs_rq
->rb_leftmost
= next_node
;
828 rb_erase(&se
->run_node
, &cfs_rq
->tasks_timeline
);
831 struct sched_entity
*__pick_first_entity(struct cfs_rq
*cfs_rq
)
833 struct rb_node
*left
= cfs_rq
->rb_leftmost
;
838 return rb_entry(left
, struct sched_entity
, run_node
);
841 static struct sched_entity
*__pick_next_entity(struct sched_entity
*se
)
843 struct rb_node
*next
= rb_next(&se
->run_node
);
848 return rb_entry(next
, struct sched_entity
, run_node
);
851 #ifdef CONFIG_SCHED_DEBUG
852 struct sched_entity
*__pick_last_entity(struct cfs_rq
*cfs_rq
)
854 struct rb_node
*last
= rb_last(&cfs_rq
->tasks_timeline
);
859 return rb_entry(last
, struct sched_entity
, run_node
);
862 /**************************************************************
863 * Scheduling class statistics methods:
866 int sched_proc_update_handler(struct ctl_table
*table
, int write
,
867 void __user
*buffer
, size_t *lenp
,
870 int ret
= proc_dointvec_minmax(table
, write
, buffer
, lenp
, ppos
);
871 int factor
= get_update_sysctl_factor();
876 sched_nr_latency
= DIV_ROUND_UP(sysctl_sched_latency
,
877 sysctl_sched_min_granularity
);
879 #define WRT_SYSCTL(name) \
880 (normalized_sysctl_##name = sysctl_##name / (factor))
881 WRT_SYSCTL(sched_min_granularity
);
882 WRT_SYSCTL(sched_latency
);
883 WRT_SYSCTL(sched_wakeup_granularity
);
893 static inline unsigned long
894 calc_delta_fair(unsigned long delta
, struct sched_entity
*se
)
896 if (unlikely(se
->load
.weight
!= NICE_0_LOAD
))
897 delta
= calc_delta_mine(delta
, NICE_0_LOAD
, &se
->load
);
903 * The idea is to set a period in which each task runs once.
905 * When there are too many tasks (sched_nr_latency) we have to stretch
906 * this period because otherwise the slices get too small.
908 * p = (nr <= nl) ? l : l*nr/nl
910 static u64
__sched_period(unsigned long nr_running
)
912 u64 period
= sysctl_sched_latency
;
913 unsigned long nr_latency
= sched_nr_latency
;
915 if (unlikely(nr_running
> nr_latency
)) {
916 period
= sysctl_sched_min_granularity
;
917 period
*= nr_running
;
924 * We calculate the wall-time slice from the period by taking a part
925 * proportional to the weight.
929 static u64
sched_slice(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
931 u64 slice
= __sched_period(cfs_rq
->nr_running
+ !se
->on_rq
);
933 for_each_sched_entity(se
) {
934 struct load_weight
*load
;
935 struct load_weight lw
;
937 cfs_rq
= cfs_rq_of(se
);
938 load
= &cfs_rq
->load
;
940 if (unlikely(!se
->on_rq
)) {
943 update_load_add(&lw
, se
->load
.weight
);
946 slice
= calc_delta_mine(slice
, se
->load
.weight
, load
);
952 * We calculate the vruntime slice of a to-be-inserted task.
956 static u64
sched_vslice(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
958 return calc_delta_fair(sched_slice(cfs_rq
, se
), se
);
static inline void __update_task_entity_contrib(struct sched_entity *se);

static long __update_task_entity_ratio(struct sched_entity *se);

#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742	/* maximum possible load avg */
#define LOAD_AVG_MAX_N 345	/* number of full periods to produce LOAD_MAX_AVG */
#define LOAD_AVG_VARIABLE_PERIOD 512
static unsigned int init_task_load_period = 4000;
973 /* Give new task start runnable values to heavy its load in infant time */
974 void init_task_runnable_average(struct task_struct
*p
)
978 p
->se
.avg
.decay_count
= 0;
979 slice
= sched_slice(task_cfs_rq(p
), &p
->se
) >> 10;
980 p
->se
.avg
.runnable_avg_sum
= (init_task_load_period
) ? 0 : slice
;
981 p
->se
.avg
.runnable_avg_period
= (init_task_load_period
)?(init_task_load_period
):slice
;
982 __update_task_entity_contrib(&p
->se
);
984 #ifdef CONFIG_MTK_SCHED_CMP
985 /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */
986 p
->se
.avg
.usage_avg_sum
= (init_task_load_period
) ? 0 : slice
;
988 __update_task_entity_ratio(&p
->se
);
989 trace_sched_task_entity_avg(0, p
, &p
->se
.avg
);
/* !SMP variant: per-entity load tracking is compiled out, nothing to seed. */
void init_task_runnable_average(struct task_struct *p)
{
}
998 * Update the current task's runtime statistics. Skip current tasks that
999 * are not in our scheduling class.
1002 __update_curr(struct cfs_rq
*cfs_rq
, struct sched_entity
*curr
,
1003 unsigned long delta_exec
)
1005 unsigned long delta_exec_weighted
;
1007 schedstat_set(curr
->statistics
.exec_max
,
1008 max((u64
)delta_exec
, curr
->statistics
.exec_max
));
1010 curr
->sum_exec_runtime
+= delta_exec
;
1011 schedstat_add(cfs_rq
, exec_clock
, delta_exec
);
1012 delta_exec_weighted
= calc_delta_fair(delta_exec
, curr
);
1014 curr
->vruntime
+= delta_exec_weighted
;
1015 update_min_vruntime(cfs_rq
);
1018 static void update_curr(struct cfs_rq
*cfs_rq
)
1020 struct sched_entity
*curr
= cfs_rq
->curr
;
1021 u64 now
= rq_of(cfs_rq
)->clock_task
;
1022 unsigned long delta_exec
;
1024 if (unlikely(!curr
))
1028 * Get the amount of time the current task was running
1029 * since the last time we changed load (this cannot
1030 * overflow on 32 bits):
1032 delta_exec
= (unsigned long)(now
- curr
->exec_start
);
1036 __update_curr(cfs_rq
, curr
, delta_exec
);
1037 curr
->exec_start
= now
;
1039 if (entity_is_task(curr
)) {
1040 struct task_struct
*curtask
= task_of(curr
);
1042 trace_sched_stat_runtime(curtask
, delta_exec
, curr
->vruntime
);
1043 cpuacct_charge(curtask
, delta_exec
);
1044 account_group_exec_runtime(curtask
, delta_exec
);
1047 account_cfs_rq_runtime(cfs_rq
, delta_exec
);
1051 update_stats_wait_start(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
1053 schedstat_set(se
->statistics
.wait_start
, rq_of(cfs_rq
)->clock
);
1057 * Task is being enqueued - update stats:
1059 static void update_stats_enqueue(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
1062 * Are we enqueueing a waiting task? (for current tasks
1063 * a dequeue/enqueue event is a NOP)
1065 if (se
!= cfs_rq
->curr
)
1066 update_stats_wait_start(cfs_rq
, se
);
1070 update_stats_wait_end(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
1072 schedstat_set(se
->statistics
.wait_max
, max(se
->statistics
.wait_max
,
1073 rq_of(cfs_rq
)->clock
- se
->statistics
.wait_start
));
1074 schedstat_set(se
->statistics
.wait_count
, se
->statistics
.wait_count
+ 1);
1075 schedstat_set(se
->statistics
.wait_sum
, se
->statistics
.wait_sum
+
1076 rq_of(cfs_rq
)->clock
- se
->statistics
.wait_start
);
1077 #ifdef CONFIG_SCHEDSTATS
1078 if (entity_is_task(se
)) {
1079 trace_sched_stat_wait(task_of(se
),
1080 rq_of(cfs_rq
)->clock
- se
->statistics
.wait_start
);
1083 schedstat_set(se
->statistics
.wait_start
, 0);
1087 update_stats_dequeue(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
1090 * Mark the end of the wait period if dequeueing a
1093 if (se
!= cfs_rq
->curr
)
1094 update_stats_wait_end(cfs_rq
, se
);
1098 * We are picking a new current task - update its stats:
1101 update_stats_curr_start(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
1104 * We are starting a new run period:
1106 se
->exec_start
= rq_of(cfs_rq
)->clock_task
;
1109 /**************************************************
1110 * Scheduling class queueing methods:
1113 #ifdef CONFIG_NUMA_BALANCING
1115 * numa task sample period in ms
1117 unsigned int sysctl_numa_balancing_scan_period_min
= 100;
1118 unsigned int sysctl_numa_balancing_scan_period_max
= 100*50;
1119 unsigned int sysctl_numa_balancing_scan_period_reset
= 100*600;
1121 /* Portion of address space to scan in MB */
1122 unsigned int sysctl_numa_balancing_scan_size
= 256;
1124 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1125 unsigned int sysctl_numa_balancing_scan_delay
= 1000;
1127 static void task_numa_placement(struct task_struct
*p
)
1131 if (!p
->mm
) /* for example, ksmd faulting in a user's mm */
1133 seq
= ACCESS_ONCE(p
->mm
->numa_scan_seq
);
1134 if (p
->numa_scan_seq
== seq
)
1136 p
->numa_scan_seq
= seq
;
1138 /* FIXME: Scheduling placement policy hints go here */
1142 * Got a PROT_NONE fault for a page on @node.
1144 void task_numa_fault(int node
, int pages
, bool migrated
)
1146 struct task_struct
*p
= current
;
1148 if (!sched_feat_numa(NUMA
))
1151 /* FIXME: Allocate task-specific structure for placement policy here */
1154 * If pages are properly placed (did not migrate) then scan slower.
1155 * This is reset periodically in case of phase changes
1158 p
->numa_scan_period
= min(sysctl_numa_balancing_scan_period_max
,
1159 p
->numa_scan_period
+ jiffies_to_msecs(10));
1161 task_numa_placement(p
);
1164 static void reset_ptenuma_scan(struct task_struct
*p
)
1166 ACCESS_ONCE(p
->mm
->numa_scan_seq
)++;
1167 p
->mm
->numa_scan_offset
= 0;
1171 * The expensive part of numa migration is done from task_work context.
1172 * Triggered from task_tick_numa().
1174 void task_numa_work(struct callback_head
*work
)
1176 unsigned long migrate
, next_scan
, now
= jiffies
;
1177 struct task_struct
*p
= current
;
1178 struct mm_struct
*mm
= p
->mm
;
1179 struct vm_area_struct
*vma
;
1180 unsigned long start
, end
;
1183 WARN_ON_ONCE(p
!= container_of(work
, struct task_struct
, numa_work
));
1185 work
->next
= work
; /* protect against double add */
1187 * Who cares about NUMA placement when they're dying.
1189 * NOTE: make sure not to dereference p->mm before this check,
1190 * exit_task_work() happens _after_ exit_mm() so we could be called
1191 * without p->mm even though we still had it when we enqueued this
1194 if (p
->flags
& PF_EXITING
)
1198 * We do not care about task placement until a task runs on a node
1199 * other than the first one used by the address space. This is
1200 * largely because migrations are driven by what CPU the task
1201 * is running on. If it's never scheduled on another node, it'll
1202 * not migrate so why bother trapping the fault.
1204 if (mm
->first_nid
== NUMA_PTE_SCAN_INIT
)
1205 mm
->first_nid
= numa_node_id();
1206 if (mm
->first_nid
!= NUMA_PTE_SCAN_ACTIVE
) {
1207 /* Are we running on a new node yet? */
1208 if (numa_node_id() == mm
->first_nid
&&
1209 !sched_feat_numa(NUMA_FORCE
))
1212 mm
->first_nid
= NUMA_PTE_SCAN_ACTIVE
;
1216 * Reset the scan period if enough time has gone by. Objective is that
1217 * scanning will be reduced if pages are properly placed. As tasks
1218 * can enter different phases this needs to be re-examined. Lacking
1219 * proper tracking of reference behaviour, this blunt hammer is used.
1221 migrate
= mm
->numa_next_reset
;
1222 if (time_after(now
, migrate
)) {
1223 p
->numa_scan_period
= sysctl_numa_balancing_scan_period_min
;
1224 next_scan
= now
+ msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset
);
1225 xchg(&mm
->numa_next_reset
, next_scan
);
1229 * Enforce maximal scan/migration frequency..
1231 migrate
= mm
->numa_next_scan
;
1232 if (time_before(now
, migrate
))
1235 if (p
->numa_scan_period
== 0)
1236 p
->numa_scan_period
= sysctl_numa_balancing_scan_period_min
;
1238 next_scan
= now
+ msecs_to_jiffies(p
->numa_scan_period
);
1239 if (cmpxchg(&mm
->numa_next_scan
, migrate
, next_scan
) != migrate
)
1243 * Do not set pte_numa if the current running node is rate-limited.
1244 * This loses statistics on the fault but if we are unwilling to
1245 * migrate to this node, it is less likely we can do useful work
1247 if (migrate_ratelimited(numa_node_id()))
1250 start
= mm
->numa_scan_offset
;
1251 pages
= sysctl_numa_balancing_scan_size
;
1252 pages
<<= 20 - PAGE_SHIFT
; /* MB in pages */
1256 down_read(&mm
->mmap_sem
);
1257 vma
= find_vma(mm
, start
);
1259 reset_ptenuma_scan(p
);
1263 for (; vma
; vma
= vma
->vm_next
) {
1264 if (!vma_migratable(vma
))
1267 /* Skip small VMAs. They are not likely to be of relevance */
1268 if (vma
->vm_end
- vma
->vm_start
< HPAGE_SIZE
)
1272 * Skip inaccessible VMAs to avoid any confusion between
1273 * PROT_NONE and NUMA hinting ptes
1275 if (!(vma
->vm_flags
& (VM_READ
| VM_EXEC
| VM_WRITE
)))
1279 start
= max(start
, vma
->vm_start
);
1280 end
= ALIGN(start
+ (pages
<< PAGE_SHIFT
), HPAGE_SIZE
);
1281 end
= min(end
, vma
->vm_end
);
1282 pages
-= change_prot_numa(vma
, start
, end
);
1287 } while (end
!= vma
->vm_end
);
1292 * It is possible to reach the end of the VMA list but the last few VMAs are
1293 * not guaranteed to the vma_migratable. If they are not, we would find the
1294 * !migratable VMA on the next scan but not reset the scanner to the start
1298 mm
->numa_scan_offset
= start
;
1300 reset_ptenuma_scan(p
);
1301 up_read(&mm
->mmap_sem
);
1305 * Drive the periodic memory faults..
1307 void task_tick_numa(struct rq
*rq
, struct task_struct
*curr
)
1309 struct callback_head
*work
= &curr
->numa_work
;
1313 * We don't care about NUMA placement if we don't have memory.
1315 if (!curr
->mm
|| (curr
->flags
& PF_EXITING
) || work
->next
!= work
)
1319 * Using runtime rather than walltime has the dual advantage that
1320 * we (mostly) drive the selection from busy threads and that the
1321 * task needs to have done some actual work before we bother with
1324 now
= curr
->se
.sum_exec_runtime
;
1325 period
= (u64
)curr
->numa_scan_period
* NSEC_PER_MSEC
;
1327 if (now
- curr
->node_stamp
> period
) {
1328 if (!curr
->node_stamp
)
1329 curr
->numa_scan_period
= sysctl_numa_balancing_scan_period_min
;
1330 curr
->node_stamp
= now
;
1332 if (!time_before(jiffies
, curr
->mm
->numa_next_scan
)) {
1333 init_task_work(work
, task_numa_work
); /* TODO: move this into sched_fork() */
1334 task_work_add(curr
, work
, true);
/* NUMA balancing disabled: the tick hook compiles away to nothing. */
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
1342 #endif /* CONFIG_NUMA_BALANCING */
1345 account_entity_enqueue(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
1347 update_load_add(&cfs_rq
->load
, se
->load
.weight
);
1348 if (!parent_entity(se
))
1349 update_load_add(&rq_of(cfs_rq
)->load
, se
->load
.weight
);
1351 if (entity_is_task(se
))
1352 list_add(&se
->group_node
, &rq_of(cfs_rq
)->cfs_tasks
);
1354 cfs_rq
->nr_running
++;
1358 account_entity_dequeue(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
1360 update_load_sub(&cfs_rq
->load
, se
->load
.weight
);
1361 if (!parent_entity(se
))
1362 update_load_sub(&rq_of(cfs_rq
)->load
, se
->load
.weight
);
1363 if (entity_is_task(se
))
1364 list_del_init(&se
->group_node
);
1365 cfs_rq
->nr_running
--;
1368 #ifdef CONFIG_FAIR_GROUP_SCHED
1370 static inline long calc_tg_weight(struct task_group
*tg
, struct cfs_rq
*cfs_rq
)
1375 * Use this CPU's actual weight instead of the last load_contribution
1376 * to gain a more accurate current total weight. See
1377 * update_cfs_rq_load_contribution().
1379 tg_weight
= atomic_long_read(&tg
->load_avg
);
1380 tg_weight
-= cfs_rq
->tg_load_contrib
;
1381 tg_weight
+= cfs_rq
->load
.weight
;
1386 static long calc_cfs_shares(struct cfs_rq
*cfs_rq
, struct task_group
*tg
)
1388 long tg_weight
, load
, shares
;
1390 tg_weight
= calc_tg_weight(tg
, cfs_rq
);
1391 load
= cfs_rq
->load
.weight
;
1393 shares
= (tg
->shares
* load
);
1395 shares
/= tg_weight
;
1397 if (shares
< MIN_SHARES
)
1398 shares
= MIN_SHARES
;
1399 if (shares
> tg
->shares
)
1400 shares
= tg
->shares
;
1404 # else /* CONFIG_SMP */
1405 static inline long calc_cfs_shares(struct cfs_rq
*cfs_rq
, struct task_group
*tg
)
1409 # endif /* CONFIG_SMP */
1410 static void reweight_entity(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
,
1411 unsigned long weight
)
1414 /* commit outstanding execution time */
1415 if (cfs_rq
->curr
== se
)
1416 update_curr(cfs_rq
);
1417 account_entity_dequeue(cfs_rq
, se
);
1420 update_load_set(&se
->load
, weight
);
1423 account_entity_enqueue(cfs_rq
, se
);
1426 static inline int throttled_hierarchy(struct cfs_rq
*cfs_rq
);
1428 static void update_cfs_shares(struct cfs_rq
*cfs_rq
)
1430 struct task_group
*tg
;
1431 struct sched_entity
*se
;
1435 se
= tg
->se
[cpu_of(rq_of(cfs_rq
))];
1436 if (!se
|| throttled_hierarchy(cfs_rq
))
1439 if (likely(se
->load
.weight
== tg
->shares
))
1442 shares
= calc_cfs_shares(cfs_rq
, tg
);
1444 reweight_entity(cfs_rq_of(se
), se
, shares
);
1446 #else /* CONFIG_FAIR_GROUP_SCHED */
1447 static inline void update_cfs_shares(struct cfs_rq
*cfs_rq
)
1450 #endif /* CONFIG_FAIR_GROUP_SCHED */
1454 * We choose a half-life close to 1 scheduling period.
1455 * Note: The tables below are dependent on this value.
1457 //#define LOAD_AVG_PERIOD 32
1458 //#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1459 //#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1461 /* Precomputed fixed inverse multiplies for multiplication by y^n */
/*
 * runnable_avg_yN_inv[n] = floor(2^32 * y^n) with y^32 = 0.5, i.e. the
 * fixed-point inverse multipliers decay_load() uses to compute val * y^n
 * as one 32-bit multiply plus a shift.
 * NOTE(review): this region is extraction-mangled — stale source line
 * numbers are fused into the text and the closing "};" appears to have
 * been dropped; restore from the original tree.
 */
1462 static const u32 runnable_avg_yN_inv
[] = {
1463 0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1464 0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1465 0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1466 0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1467 0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1468 0x85aac367, 0x82cd8698,
/*
 * Partial geometric sums \Sum 1024*y^k for 1<=k<=n, consumed by
 * __compute_runnable_contrib(); floored so recombination never
 * over-estimates.
 * NOTE(review): extraction-mangled — the table looks truncated (the last
 * row and closing "};" are missing); restore from the original tree.
 */
1472 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
1473 * over-estimates when re-combining.
1475 static const u32 runnable_avg_yN_sum
[] = {
1476 0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1477 9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1478 17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1483 * val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
1485 static __always_inline u64
decay_load(u64 val
, u64 n
)
1487 unsigned int local_n
;
1491 else if (unlikely(n
> LOAD_AVG_PERIOD
* 63))
1494 /* after bounds checking we can collapse to 32-bit */
1498 * As y^PERIOD = 1/2, we can combine
1499 * y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1500 * With a look-up table which covers k^n (n<PERIOD)
1502 * To achieve constant time decay_load.
1504 if (unlikely(local_n
>= LOAD_AVG_PERIOD
)) {
1505 val
>>= local_n
/ LOAD_AVG_PERIOD
;
1506 local_n
%= LOAD_AVG_PERIOD
;
1509 val
*= runnable_avg_yN_inv
[local_n
];
1510 /* We don't use SRR here since we always want to round down. */
1515 * For updates fully spanning n periods, the contribution to runnable
1516 * average will be: \Sum 1024*y^n
1518 * We can compute this reasonably efficiently by combining:
1519 * y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
1521 static u32
__compute_runnable_contrib(u64 n
)
1525 if (likely(n
<= LOAD_AVG_PERIOD
))
1526 return runnable_avg_yN_sum
[n
];
1527 else if (unlikely(n
>= LOAD_AVG_MAX_N
))
1528 return LOAD_AVG_MAX
;
1530 /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1532 contrib
/= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1533 contrib
+= runnable_avg_yN_sum
[LOAD_AVG_PERIOD
];
1535 n
-= LOAD_AVG_PERIOD
;
1536 } while (n
> LOAD_AVG_PERIOD
);
1538 contrib
= decay_load(contrib
, n
);
1539 return contrib
+ runnable_avg_yN_sum
[n
];
1542 #ifdef CONFIG_HMP_VARIABLE_SCALE
1544 #define HMP_VARIABLE_SCALE_SHIFT 16ULL
1545 struct hmp_global_attr
{
1546 struct attribute attr
;
1547 ssize_t (*show
)(struct kobject
*kobj
,
1548 struct attribute
*attr
, char *buf
);
1549 ssize_t (*store
)(struct kobject
*a
, struct attribute
*b
,
1550 const char *c
, size_t count
);
1552 int (*to_sysfs
)(int);
1553 int (*from_sysfs
)(int);
1556 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1557 #define HMP_DATA_SYSFS_MAX 5
1559 #define HMP_DATA_SYSFS_MAX 4
1562 struct hmp_data_struct
{
1563 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1564 int freqinvar_load_scale_enabled
;
1566 int multiplier
; /* used to scale the time delta */
1567 struct attribute_group attr_group
;
1568 struct attribute
*attributes
[HMP_DATA_SYSFS_MAX
+ 1];
1569 struct hmp_global_attr attr
[HMP_DATA_SYSFS_MAX
];
1572 static u64
hmp_variable_scale_convert(u64 delta
);
1573 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1574 /* Frequency-Invariant Load Modification:
1575 * Loads are calculated as in PJT's patch however we also scale the current
1576 * contribution in line with the frequency of the CPU that the task was
1578 * In this version, we use a simple linear scale derived from the maximum
1579 * frequency reported by CPUFreq. As an example:
1581 * Consider that we ran a task for 100% of the previous interval.
1583 * Our CPU was under asynchronous frequency control through one of the
1584 * CPUFreq governors.
1586 * The CPUFreq governor reports that it is able to scale the CPU between
1589 * During the period, the CPU was running at 1GHz.
1591 * In this case, our load contribution for that period is calculated as
1592 * 1 * (number_of_active_microseconds)
1594 * This results in our task being able to accumulate maximum load as normal.
1597 * Consider now that our CPU was executing at 500MHz.
1599 * We now scale the load contribution such that it is calculated as
1600 * 0.5 * (number_of_active_microseconds)
1602 * Our task can only record 50% maximum load during this period.
1604 * This represents the task consuming 50% of the CPU's *possible* compute
1605 * capacity. However the task did consume 100% of the CPU's *available*
1606 * compute capacity which is the value seen by the CPUFreq governor and
1607 * user-side CPU Utilization tools.
1609 * Restricting tracked load to be scaled by the CPU's frequency accurately
1610 * represents the consumption of possible compute capacity and allows the
1611 * HMP migration's simple threshold migration strategy to interact more
1612 * predictably with CPUFreq's asynchronous compute capacity changes.
1614 #define SCHED_FREQSCALE_SHIFT 10
1615 struct cpufreq_extents
{
1620 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
1625 /* Flag set when the governor in use only allows one frequency.
1628 #define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
1630 static struct cpufreq_extents freq_scale
[CONFIG_NR_CPUS
];
1631 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1632 #endif /* CONFIG_HMP_VARIABLE_SCALE */
1634 #ifdef CONFIG_MTK_SCHED_CMP
/* Thin wrapper: resolve a CPU to its architecture-defined cluster id. */
int get_cluster_id(unsigned int cpu)
{
	return arch_get_cluster_id(cpu);
}
1640 void get_cluster_cpus(struct cpumask
*cpus
, int cluster_id
,
1641 bool exclusive_offline
)
1643 struct cpumask cls_cpus
;
1645 arch_get_cluster_cpus(&cls_cpus
, cluster_id
);
1646 if (exclusive_offline
) {
1647 cpumask_and(cpus
, cpu_online_mask
, &cls_cpus
);
1649 cpumask_copy(cpus
, &cls_cpus
);
1652 static int nr_cpus_in_cluster(int cluster_id
, bool exclusive_offline
)
1654 struct cpumask cls_cpus
;
1657 arch_get_cluster_cpus(&cls_cpus
, cluster_id
);
1658 if (exclusive_offline
) {
1659 struct cpumask online_cpus
;
1660 cpumask_and(&online_cpus
, cpu_online_mask
, &cls_cpus
);
1661 nr_cpus
= cpumask_weight(&online_cpus
);
1663 nr_cpus
= cpumask_weight(&cls_cpus
);
1667 #endif /* CONFIG_MTK_SCHED_CMP */
/* Exported helper: hand callers the architecture's big/little CPU masks. */
void sched_get_big_little_cpus(struct cpumask *big, struct cpumask *little)
{
	arch_get_big_little_cpus(big, little);
}
EXPORT_SYMBOL(sched_get_big_little_cpus);
1676 * generic entry point for cpu mask construction, dedicated for
1677 * mediatek scheduler.
1679 static __init __inline
void cmp_cputopo_domain_setup(void)
1681 WARN(smp_processor_id() != 0, "%s is supposed runs on CPU0 "
1682 "while kernel init", __func__
);
1683 #ifdef CONFIG_MTK_CPU_TOPOLOGY
1686 * |-> cmp_cputopo_domain_seutp()
1689 * ^ fork kernel_init
1690 * |-> kernel_init_freeable
1692 * |-> arch_build_cpu_topology_domain
1694 * here, we focus to build up cpu topology and domain before scheduler runs.
1696 pr_debug("[CPUTOPO][%s] build CPU topology and cluster.\n", __func__
);
1697 arch_build_cpu_topology_domain();
1701 #ifdef CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY
1702 static u64 __inline
variable_scale_convert(u64 delta
)
1704 u64 high
= delta
>> 32ULL;
1705 u64 low
= delta
& 0xffffffffULL
;
1706 low
*= LOAD_AVG_VARIABLE_PERIOD
;
1707 high
*= LOAD_AVG_VARIABLE_PERIOD
;
1708 return (low
>> 16ULL) + (high
<< (32ULL - 16ULL));
1712 /* We can represent the historical contribution to runnable average as the
1713 * coefficients of a geometric series. To do this we sub-divide our runnable
1714 * history into segments of approximately 1ms (1024us); label the segment that
1715 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1717 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1719 * (now) (~1ms ago) (~2ms ago)
1721 * Let u_i denote the fraction of p_i that the entity was runnable.
1723 * We then designate the fractions u_i as our co-efficients, yielding the
1724 * following representation of historical load:
1725 * u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1727 * We choose y based on the with of a reasonably scheduling period, fixing:
1730 * This means that the contribution to load ~32ms ago (u_32) will be weighted
1731 * approximately half as much as the contribution to load within the last ms
1734 * When a period "rolls over" and we have new u_0`, multiplying the previous
1735 * sum again by y is sufficient to update:
1736 * load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1737 * = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1739 static __always_inline
int __update_entity_runnable_avg(u64 now
,
1740 struct sched_avg
*sa
,
1745 u64 delta
, periods
, lru
;
1746 u32 runnable_contrib
;
1747 int delta_w
, decayed
= 0;
1748 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1750 u32 scaled_runnable_contrib
;
1752 u32 curr_scale
= 1024;
1753 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1755 u32 scaled_runnable_contrib
;
1757 u32 curr_scale
= CPUPOWER_FREQSCALE_DEFAULT
;
1758 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1760 delta
= now
- sa
->last_runnable_update
;
1761 lru
= sa
->last_runnable_update
;
1763 * This should only happen when time goes backwards, which it
1764 * unfortunately does during sched clock init when we swap over to TSC.
1766 if ((s64
)delta
< 0) {
1767 sa
->last_runnable_update
= now
;
1771 #ifdef CONFIG_HMP_VARIABLE_SCALE
1772 delta
= hmp_variable_scale_convert(delta
);
1773 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1774 delta
= variable_scale_convert(delta
);
1777 * Use 1024ns as the unit of measurement since it's a reasonable
1778 * approximation of 1us and fast to compute.
1783 sa
->last_runnable_update
= now
;
1785 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1786 WARN(cpu
< 0, "[%s] CPU %d < 0 !!!\n", __func__
, cpu
);
1787 /* retrieve scale factor for load */
1788 if (cpu
>= 0 && cpu
< nr_cpu_ids
&& hmp_data
.freqinvar_load_scale_enabled
)
1789 curr_scale
= freq_scale
[cpu
].curr_scale
;
1790 mt_sched_printf("[%s] cpu=%d delta=%llu now=%llu last=%llu curr_scale=%u",
1791 __func__
, cpu
, delta
, now
, lru
, curr_scale
);
1792 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1793 WARN(cpu
< 0, "[%s] CPU %d < 0 !!!\n", __func__
, cpu
);
1794 /* retrieve scale factor for load */
1795 if (cpu
>= 0 && cpu
< nr_cpu_ids
)
1796 curr_scale
= (topology_cpu_capacity(cpu
) << CPUPOWER_FREQSCALE_SHIFT
)
1797 / (topology_max_cpu_capacity(cpu
)+1);
1798 mt_sched_printf("[%s] cpu=%d delta=%llu now=%llu last=%llu curr_scale=%u",
1799 __func__
, cpu
, delta
, now
, lru
, curr_scale
);
1800 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1802 /* delta_w is the amount already accumulated against our next period */
1803 delta_w
= sa
->runnable_avg_period
% 1024;
1804 if (delta
+ delta_w
>= 1024) {
1805 /* period roll-over */
1809 * Now that we know we're crossing a period boundary, figure
1810 * out how much from delta we need to complete the current
1811 * period and accrue it.
1813 delta_w
= 1024 - delta_w
;
1814 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1815 /* scale runnable time if necessary */
1816 scaled_delta_w
= (delta_w
* curr_scale
)
1817 >> SCHED_FREQSCALE_SHIFT
;
1819 sa
->runnable_avg_sum
+= scaled_delta_w
;
1821 sa
->usage_avg_sum
+= scaled_delta_w
;
1822 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1823 /* scale runnable time if necessary */
1824 scaled_delta_w
= (delta_w
* curr_scale
)
1825 >> CPUPOWER_FREQSCALE_SHIFT
;
1827 sa
->runnable_avg_sum
+= scaled_delta_w
;
1829 sa
->usage_avg_sum
+= scaled_delta_w
;
1832 sa
->runnable_avg_sum
+= delta_w
;
1834 sa
->usage_avg_sum
+= delta_w
;
1835 #endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1836 sa
->runnable_avg_period
+= delta_w
;
1840 /* Figure out how many additional periods this update spans */
1841 periods
= delta
/ 1024;
1843 /* decay the load we have accumulated so far */
1844 sa
->runnable_avg_sum
= decay_load(sa
->runnable_avg_sum
,
1846 sa
->runnable_avg_period
= decay_load(sa
->runnable_avg_period
,
1848 sa
->usage_avg_sum
= decay_load(sa
->usage_avg_sum
, periods
+ 1);
1849 /* add the contribution from this period */
1850 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1851 runnable_contrib
= __compute_runnable_contrib(periods
);
1852 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1853 /* Apply load scaling if necessary.
1854 * Note that multiplying the whole series is same as
1855 * multiplying all terms
1857 scaled_runnable_contrib
= (runnable_contrib
* curr_scale
)
1858 >> SCHED_FREQSCALE_SHIFT
;
1860 sa
->runnable_avg_sum
+= scaled_runnable_contrib
;
1862 sa
->usage_avg_sum
+= scaled_runnable_contrib
;
1863 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1864 /* Apply load scaling if necessary.
1865 * Note that multiplying the whole series is same as
1866 * multiplying all terms
1868 scaled_runnable_contrib
= (runnable_contrib
* curr_scale
)
1869 >> CPUPOWER_FREQSCALE_SHIFT
;
1871 sa
->runnable_avg_sum
+= scaled_runnable_contrib
;
1873 sa
->usage_avg_sum
+= scaled_runnable_contrib
;
1876 sa
->runnable_avg_sum
+= runnable_contrib
;
1878 sa
->usage_avg_sum
+= runnable_contrib
;
1879 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1880 sa
->runnable_avg_period
+= runnable_contrib
;
1883 /* Remainder of delta accrued against u_0` */
1884 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1885 /* scale if necessary */
1886 scaled_delta
= ((delta
* curr_scale
) >> SCHED_FREQSCALE_SHIFT
);
1888 sa
->runnable_avg_sum
+= scaled_delta
;
1890 sa
->usage_avg_sum
+= scaled_delta
;
1891 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1892 /* scale if necessary */
1893 scaled_delta
= ((delta
* curr_scale
) >> CPUPOWER_FREQSCALE_SHIFT
);
1895 sa
->runnable_avg_sum
+= scaled_delta
;
1897 sa
->usage_avg_sum
+= scaled_delta
;
1900 sa
->runnable_avg_sum
+= delta
;
1902 sa
->usage_avg_sum
+= delta
;
1903 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1904 sa
->runnable_avg_period
+= delta
;
1909 /* Synchronize an entity's decay with its parenting cfs_rq.*/
1910 static inline u64
__synchronize_entity_decay(struct sched_entity
*se
)
1912 struct cfs_rq
*cfs_rq
= cfs_rq_of(se
);
1913 u64 decays
= atomic64_read(&cfs_rq
->decay_counter
);
1915 decays
-= se
->avg
.decay_count
;
1919 se
->avg
.load_avg_contrib
= decay_load(se
->avg
.load_avg_contrib
, decays
);
1920 se
->avg
.decay_count
= 0;
1925 #ifdef CONFIG_FAIR_GROUP_SCHED
1926 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq
*cfs_rq
,
1929 struct task_group
*tg
= cfs_rq
->tg
;
1932 tg_contrib
= cfs_rq
->runnable_load_avg
+ cfs_rq
->blocked_load_avg
;
1933 tg_contrib
-= cfs_rq
->tg_load_contrib
;
1935 if (force_update
|| abs(tg_contrib
) > cfs_rq
->tg_load_contrib
/ 8) {
1936 atomic_long_add(tg_contrib
, &tg
->load_avg
);
1937 cfs_rq
->tg_load_contrib
+= tg_contrib
;
1942 * Aggregate cfs_rq runnable averages into an equivalent task_group
1943 * representation for computing load contributions.
1945 static inline void __update_tg_runnable_avg(struct sched_avg
*sa
,
1946 struct cfs_rq
*cfs_rq
)
1948 struct task_group
*tg
= cfs_rq
->tg
;
1949 long contrib
, usage_contrib
;
1951 /* The fraction of a cpu used by this cfs_rq */
1952 contrib
= div_u64(sa
->runnable_avg_sum
<< NICE_0_SHIFT
,
1953 sa
->runnable_avg_period
+ 1);
1954 contrib
-= cfs_rq
->tg_runnable_contrib
;
1956 usage_contrib
= div_u64(sa
->usage_avg_sum
<< NICE_0_SHIFT
,
1957 sa
->runnable_avg_period
+ 1);
1958 usage_contrib
-= cfs_rq
->tg_usage_contrib
;
1961 * contrib/usage at this point represent deltas, only update if they
1964 if ((abs(contrib
) > cfs_rq
->tg_runnable_contrib
/ 64) ||
1965 (abs(usage_contrib
) > cfs_rq
->tg_usage_contrib
/ 64)) {
1966 atomic_add(contrib
, &tg
->runnable_avg
);
1967 cfs_rq
->tg_runnable_contrib
+= contrib
;
1969 atomic_add(usage_contrib
, &tg
->usage_avg
);
1970 cfs_rq
->tg_usage_contrib
+= usage_contrib
;
1974 static inline void __update_group_entity_contrib(struct sched_entity
*se
)
1976 struct cfs_rq
*cfs_rq
= group_cfs_rq(se
);
1977 struct task_group
*tg
= cfs_rq
->tg
;
1982 contrib
= cfs_rq
->tg_load_contrib
* tg
->shares
;
1983 se
->avg
.load_avg_contrib
= div_u64(contrib
,
1984 atomic_long_read(&tg
->load_avg
) + 1);
1987 * For group entities we need to compute a correction term in the case
1988 * that they are consuming <1 cpu so that we would contribute the same
1989 * load as a task of equal weight.
1991 * Explicitly co-ordinating this measurement would be expensive, but
1992 * fortunately the sum of each cpus contribution forms a usable
1993 * lower-bound on the true value.
1995 * Consider the aggregate of 2 contributions. Either they are disjoint
1996 * (and the sum represents true value) or they are disjoint and we are
1997 * understating by the aggregate of their overlap.
1999 * Extending this to N cpus, for a given overlap, the maximum amount we
2000 * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
2001 * cpus that overlap for this interval and w_i is the interval width.
2003 * On a small machine; the first term is well-bounded which bounds the
2004 * total error since w_i is a subset of the period. Whereas on a
2005 * larger machine, while this first term can be larger, if w_i is the
2006 * of consequential size guaranteed to see n_i*w_i quickly converge to
2007 * our upper bound of 1-cpu.
2009 runnable_avg
= atomic_read(&tg
->runnable_avg
);
2010 if (runnable_avg
< NICE_0_LOAD
) {
2011 se
->avg
.load_avg_contrib
*= runnable_avg
;
2012 se
->avg
.load_avg_contrib
>>= NICE_0_SHIFT
;
/* !CONFIG_FAIR_GROUP_SCHED: group-accounting hooks compile away. */
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
						   int force_update) {}
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
					    struct cfs_rq *cfs_rq) {}
static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2023 static inline void __update_task_entity_contrib(struct sched_entity
*se
)
2027 /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2028 contrib
= se
->avg
.runnable_avg_sum
* scale_load_down(se
->load
.weight
);
2029 contrib
/= (se
->avg
.runnable_avg_period
+ 1);
2030 se
->avg
.load_avg_contrib
= scale_load(contrib
);
2033 /* Compute the current contribution to load_avg by se, return any delta */
2034 static long __update_entity_load_avg_contrib(struct sched_entity
*se
)
2036 long old_contrib
= se
->avg
.load_avg_contrib
;
2038 if (entity_is_task(se
)) {
2039 __update_task_entity_contrib(se
);
2041 __update_tg_runnable_avg(&se
->avg
, group_cfs_rq(se
));
2042 __update_group_entity_contrib(se
);
2045 return se
->avg
.load_avg_contrib
- old_contrib
;
2048 #if defined(CONFIG_MTK_SCHED_CMP) || defined(CONFIG_SCHED_HMP_ENHANCEMENT)
2049 /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */
2050 static long __update_task_entity_ratio(struct sched_entity
*se
)
2052 long old_ratio
= se
->avg
.load_avg_ratio
;
2055 ratio
= se
->avg
.runnable_avg_sum
* scale_load_down(NICE_0_LOAD
);
2056 ratio
/= (se
->avg
.runnable_avg_period
+ 1);
2057 se
->avg
.load_avg_ratio
= scale_load(ratio
);
2059 return se
->avg
.load_avg_ratio
- old_ratio
;
/* Ratio tracking disabled in this configuration: always reports no change. */
static inline long __update_task_entity_ratio(struct sched_entity *se)
{
	return 0;
}
2065 static inline void subtract_blocked_load_contrib(struct cfs_rq
*cfs_rq
,
2068 if (likely(load_contrib
< cfs_rq
->blocked_load_avg
))
2069 cfs_rq
->blocked_load_avg
-= load_contrib
;
2071 cfs_rq
->blocked_load_avg
= 0;
2074 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2075 unsigned int hmp_up_prio
= NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL
);
2078 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
2079 /* Schedule entity */
2080 #define se_pid(se) ((se != NULL && entity_is_task(se))? \
2081 container_of(se,struct task_struct,se)->pid:-1)
2082 #define se_load(se) se->avg.load_avg_ratio
2083 #define se_contrib(se) se->avg.load_avg_contrib
2085 /* CPU related : load information */
2086 #define cfs_pending_load(cpu) cpu_rq(cpu)->cfs.avg.pending_load
2087 #define cfs_load(cpu) cpu_rq(cpu)->cfs.avg.load_avg_ratio
2088 #define cfs_contrib(cpu) cpu_rq(cpu)->cfs.avg.load_avg_contrib
2090 /* CPU related : the number of tasks */
2091 #define cfs_nr_normal_prio(cpu) cpu_rq(cpu)->cfs.avg.nr_normal_prio
2092 #define cfs_nr_pending(cpu) cpu_rq(cpu)->cfs.avg.nr_pending
2093 #define cfs_length(cpu) cpu_rq(cpu)->cfs.h_nr_running
2094 #define rq_length(cpu) (cpu_rq(cpu)->nr_running + cfs_nr_pending(cpu))
2096 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2097 #define task_low_priority(prio) ((prio >= hmp_up_prio)?1:0)
2098 #define cfs_nr_dequeuing_low_prio(cpu) \
2099 cpu_rq(cpu)->cfs.avg.nr_dequeuing_low_prio
2100 #define cfs_reset_nr_dequeuing_low_prio(cpu) \
2101 (cfs_nr_dequeuing_low_prio(cpu) = 0)
2103 #define task_low_priority(prio) (0)
2104 #define cfs_reset_nr_dequeuing_low_prio(cpu)
2105 #endif /* CONFIG_SCHED_HMP_PRIO_FILTER */
2106 #endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
2108 static inline u64
cfs_rq_clock_task(struct cfs_rq
*cfs_rq
);
2110 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2111 int group_leader_is_empty(struct task_struct
*p
) {
2113 struct task_struct
*tg
= p
->group_leader
;
2115 if (SIGNAL_GROUP_EXIT
& p
->signal
->flags
){
2116 // pr_warn("[%s] (0x%p/0x%p)(#%d/%s) leader: pid(%d) state(%d) exit_state(%d)signal_flags=%x p->signal->flags=%x group_exit_code=%x\n", __func__,
2117 // p, tg, get_nr_threads(p), thread_group_empty(p) ? "empty" : "not empty",
2118 // p->tgid, tg->state, tg->exit_state, tg->state, p->signal->flags, p->signal->group_exit_code);
2122 // workaround debug codes
2123 if(tg
->state
== 0x6b6b6b6b){
2124 // pr_warn("[%s] (0x%p/0x%p)(#%d/%s) leader: state(%d) exit_state(%d)\n", __func__,
2125 // p, tg, get_nr_threads(p), thread_group_empty(p) ? "empty" : "not empty",
2126 // tg->state, tg->exit_state);
2133 static inline void update_tg_info(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
, long ratio_delta
)
2135 struct task_struct
*p
= task_of(se
);
2136 struct task_struct
*tg
= p
->group_leader
;
2138 unsigned long flags
;
2140 if (group_leader_is_empty(p
))
2142 id
= get_cluster_id(cfs_rq
->rq
->cpu
);
2143 if (unlikely(WARN_ON(id
< 0)))
2146 raw_spin_lock_irqsave(&tg
->thread_group_info_lock
, flags
);
2147 tg
->thread_group_info
[id
].load_avg_ratio
+= ratio_delta
;
2148 raw_spin_unlock_irqrestore(&tg
->thread_group_info_lock
, flags
);
2150 #ifdef CONFIG_MT_SCHED_INFO
2151 mt_sched_printf("update_tg_info %d:%s %d:%s %ld %ld %d %d %lu:%lu:%lu update",
2152 tg
->pid
, tg
->comm
, p
->pid
, p
->comm
,
2153 se
->avg
.load_avg_ratio
, ratio_delta
,
2154 cfs_rq
->rq
->cpu
, id
,
2155 tg
->thread_group_info
[id
].nr_running
,
2156 tg
->thread_group_info
[id
].cfs_nr_running
,
2157 tg
->thread_group_info
[id
].load_avg_ratio
);
2159 mt_sched_printf("update %d:%s %d:%s %ld %ld %d %d %lu %lu %lu, %lu %lu %lu",
2160 tg->pid, tg->comm, p->pid, p->comm,
2161 se->avg.load_avg_ratio, ratio_delta,
2162 id, cfs_rq->rq->cpu,
2163 tg->thread_group_info[0].nr_running,
2164 tg->thread_group_info[0].cfs_nr_running,
2165 tg->thread_group_info[0].load_avg_ratio,
2166 tg->thread_group_info[1].nr_running,
2167 tg->thread_group_info[1].cfs_nr_running,
2168 tg->thread_group_info[1].load_avg_ratio);
2175 /* Update a sched_entity's runnable average */
2176 static inline void update_entity_load_avg(struct sched_entity
*se
,
2179 struct cfs_rq
*cfs_rq
= cfs_rq_of(se
);
2182 long ratio_delta
= 0;
2183 int cpu
= -1; /* not used in normal case */
2185 #if defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE) \
2186 || defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
2187 cpu
= cfs_rq
->rq
->cpu
;
2191 * For a group entity we need to use their owned cfs_rq_clock_task() in
2192 * case they are the parent of a throttled hierarchy.
2194 if (entity_is_task(se
))
2195 now
= cfs_rq_clock_task(cfs_rq
);
2197 now
= cfs_rq_clock_task(group_cfs_rq(se
));
2199 if (!__update_entity_runnable_avg(now
, &se
->avg
, se
->on_rq
,
2200 cfs_rq
->curr
== se
, cpu
)) {
2202 if (entity_is_task(se
)) {
2203 ratio_delta
= __update_task_entity_ratio(se
);
2206 cpu
= cfs_rq
->rq
->cpu
;
2207 cpu_rq(cpu
)->cfs
.avg
.load_avg_ratio
+= ratio_delta
;
2208 #ifdef CONFIG_HMP_TRACER
2209 trace_sched_cfs_load_update(task_of(se
),se_load(se
),ratio_delta
, cpu
);
2210 #endif /* CONFIG_HMP_TRACER */
2213 trace_sched_task_entity_avg(2, task_of(se
), &se
->avg
);
2214 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2216 update_tg_info(cfs_rq
, se
, ratio_delta
);
2224 contrib_delta
= __update_entity_load_avg_contrib(se
);
2226 /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */
2227 if (entity_is_task(se
)) {
2228 ratio_delta
= __update_task_entity_ratio(se
);
2230 * ratio is re-estimated just for entity of task; as
2231 * for contrib, mark tracer here for task entity while
2232 * mining tg's at __update_group_entity_contrib().
2234 * track running usage in passing.
2236 trace_sched_task_entity_avg(3, task_of(se
), &se
->avg
);
2243 cfs_rq
->runnable_load_avg
+= contrib_delta
;
2244 if (entity_is_task(se
)) {
2245 cpu
= cfs_rq
->rq
->cpu
;
2246 cpu_rq(cpu
)->cfs
.avg
.load_avg_ratio
+= ratio_delta
;
2247 cpu_rq(cpu
)->cfs
.avg
.load_avg_contrib
+= contrib_delta
;
2248 #ifdef CONFIG_HMP_TRACER
2249 trace_sched_cfs_load_update(task_of(se
),se_load(se
),ratio_delta
,cpu
);
2250 #endif /* CONFIG_HMP_TRACER */
2251 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2252 update_tg_info(cfs_rq
, se
, ratio_delta
);
2257 subtract_blocked_load_contrib(cfs_rq
, -contrib_delta
);
2262 * Decay the load contributed by all blocked children and account this so that
2263 * their contribution may appropriately discounted when they wake up.
2265 static void update_cfs_rq_blocked_load(struct cfs_rq
*cfs_rq
, int force_update
)
2267 u64 now
= cfs_rq_clock_task(cfs_rq
) >> 20;
2270 decays
= now
- cfs_rq
->last_decay
;
2271 if (!decays
&& !force_update
)
2274 if (atomic_long_read(&cfs_rq
->removed_load
)) {
2275 unsigned long removed_load
;
2276 removed_load
= atomic_long_xchg(&cfs_rq
->removed_load
, 0);
2277 subtract_blocked_load_contrib(cfs_rq
, removed_load
);
2281 cfs_rq
->blocked_load_avg
= decay_load(cfs_rq
->blocked_load_avg
,
2283 atomic64_add(decays
, &cfs_rq
->decay_counter
);
2284 cfs_rq
->last_decay
= now
;
2287 __update_cfs_rq_tg_load_contrib(cfs_rq
, force_update
);
2290 static inline void update_rq_runnable_avg(struct rq
*rq
, int runnable
)
2293 int cpu
= -1; /* not used in normal case */
2295 #if defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE) \
2296 || defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
2299 __update_entity_runnable_avg(rq
->clock_task
, &rq
->avg
, runnable
,
2301 __update_tg_runnable_avg(&rq
->avg
, &rq
->cfs
);
2302 contrib
= rq
->avg
.runnable_avg_sum
* scale_load_down(1024);
2303 contrib
/= (rq
->avg
.runnable_avg_period
+ 1);
2304 trace_sched_rq_runnable_ratio(cpu_of(rq
), scale_load(contrib
));
2305 trace_sched_rq_runnable_load(cpu_of(rq
), rq
->cfs
.runnable_load_avg
);
2308 /* Add the load generated by se into cfs_rq's child load-average */
2309 static inline void enqueue_entity_load_avg(struct cfs_rq
*cfs_rq
,
2310 struct sched_entity
*se
,
2313 int cpu
= cfs_rq
->rq
->cpu
;
2316 * We track migrations using entity decay_count <= 0, on a wake-up
2317 * migration we use a negative decay count to track the remote decays
2318 * accumulated while sleeping.
2320 * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
2321 * are seen by enqueue_entity_load_avg() as a migration with an already
2322 * constructed load_avg_contrib.
2324 if (unlikely(se
->avg
.decay_count
<= 0)) {
2325 se
->avg
.last_runnable_update
= rq_of(cfs_rq
)->clock_task
;
2326 if (se
->avg
.decay_count
) {
2328 * In a wake-up migration we have to approximate the
2329 * time sleeping. This is because we can't synchronize
2330 * clock_task between the two cpus, and it is not
2331 * guaranteed to be read-safe. Instead, we can
2332 * approximate this using our carried decays, which are
2333 * explicitly atomically readable.
2335 se
->avg
.last_runnable_update
-= (-se
->avg
.decay_count
)
2337 update_entity_load_avg(se
, 0);
2338 /* Indicate that we're now synchronized and on-rq */
2339 se
->avg
.decay_count
= 0;
2340 #ifdef CONFIG_MTK_SCHED_CMP
2342 if (entity_is_task(se
))
2343 trace_sched_task_entity_avg(1, task_of(se
), &se
->avg
);
2348 __synchronize_entity_decay(se
);
2351 /* migrated tasks did not contribute to our blocked load */
2353 subtract_blocked_load_contrib(cfs_rq
, se
->avg
.load_avg_contrib
);
2354 update_entity_load_avg(se
, 0);
2357 cfs_rq
->runnable_load_avg
+= se
->avg
.load_avg_contrib
;
2358 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2359 if(entity_is_task(se
)){
2360 update_tg_info(cfs_rq
, se
, se
->avg
.load_avg_ratio
);
2364 if (entity_is_task(se
)) {
2365 cpu_rq(cpu
)->cfs
.avg
.load_avg_contrib
+= se
->avg
.load_avg_contrib
;
2366 cpu_rq(cpu
)->cfs
.avg
.load_avg_ratio
+= se
->avg
.load_avg_ratio
;
2367 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
2368 cfs_nr_pending(cpu
) = 0;
2369 cfs_pending_load(cpu
) = 0;
2371 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2372 if(!task_low_priority(task_of(se
)->prio
))
2373 cfs_nr_normal_prio(cpu
)++;
2375 #ifdef CONFIG_HMP_TRACER
2376 trace_sched_cfs_enqueue_task(task_of(se
),se_load(se
),cpu
);
2380 /* we force update consideration on load-balancer moves */
2381 update_cfs_rq_blocked_load(cfs_rq
, !wakeup
);
2385 * Remove se's load from this cfs_rq child load-average, if the entity is
2386 * transitioning to a blocked state we track its projected decay using
2389 static inline void dequeue_entity_load_avg(struct cfs_rq
*cfs_rq
,
2390 struct sched_entity
*se
,
2393 int cpu
= cfs_rq
->rq
->cpu
;
2395 update_entity_load_avg(se
, 1);
2396 /* we force update consideration on load-balancer moves */
2397 update_cfs_rq_blocked_load(cfs_rq
, !sleep
);
2399 cfs_rq
->runnable_load_avg
-= se
->avg
.load_avg_contrib
;
2400 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2401 if(entity_is_task(se
)){
2402 update_tg_info(cfs_rq
, se
, -se
->avg
.load_avg_ratio
);
2406 if (entity_is_task(se
)) {
2407 cpu_rq(cpu
)->cfs
.avg
.load_avg_contrib
-= se
->avg
.load_avg_contrib
;
2408 cpu_rq(cpu
)->cfs
.avg
.load_avg_ratio
-= se
->avg
.load_avg_ratio
;
2409 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2410 cfs_reset_nr_dequeuing_low_prio(cpu
);
2411 if(!task_low_priority(task_of(se
)->prio
))
2412 cfs_nr_normal_prio(cpu
)--;
2414 #ifdef CONFIG_HMP_TRACER
2415 trace_sched_cfs_dequeue_task(task_of(se
),se_load(se
),cpu
);
2420 cfs_rq
->blocked_load_avg
+= se
->avg
.load_avg_contrib
;
2421 se
->avg
.decay_count
= atomic64_read(&cfs_rq
->decay_counter
);
2422 } /* migrations, e.g. sleep=0 leave decay_count == 0 */
/*
 * Update the rq's load with the elapsed running time before entering
 * idle. if the last scheduled task is not a CFS task, idle_enter will
 * be the only way to update the runnable statistic.
 */
void idle_enter_fair(struct rq *this_rq)
{
	update_rq_runnable_avg(this_rq, 1);
}
/*
 * Update the rq's load with the elapsed idle time before a task is
 * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
 * be the only way to update the runnable statistic.
 */
void idle_exit_fair(struct rq *this_rq)
{
	update_rq_runnable_avg(this_rq, 0);
}
/* !CONFIG_SMP stubs: per-entity load tracking is compiled out on UP. */
static inline void update_entity_load_avg(struct sched_entity *se,
					  int update_cfs_rq) {}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int wakeup) {}
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
					   struct sched_entity *se,
					   int sleep) {}
static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
					      int force_update) {}
/* Record sleep/block statistics for a waking entity (schedstats only). */
static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
	struct task_struct *tsk = NULL;

	if (entity_is_task(se))
		tsk = task_of(se);

	if (se->statistics.sleep_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.sleep_max))
			se->statistics.sleep_max = delta;

		se->statistics.sleep_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			account_scheduler_latency(tsk, delta >> 10, 1);
			trace_sched_stat_sleep(tsk, delta);
		}
	}
	if (se->statistics.block_start) {
		u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;

		if ((s64)delta < 0)
			delta = 0;

		if (unlikely(delta > se->statistics.block_max))
			se->statistics.block_max = delta;

		se->statistics.block_start = 0;
		se->statistics.sum_sleep_runtime += delta;

		if (tsk) {
			if (tsk->in_iowait) {
				se->statistics.iowait_sum += delta;
				se->statistics.iowait_count++;
				trace_sched_stat_iowait(tsk, delta);
			}

			trace_sched_stat_blocked(tsk, delta);

			/*
			 * Blocking time is in units of nanosecs, so shift by
			 * 20 to get a milliseconds-range estimation of the
			 * amount of time that the task spent sleeping:
			 */
			if (unlikely(prof_on == SLEEP_PROFILING)) {
				profile_hits(SLEEP_PROFILING,
						(void *)get_wchan(tsk),
						delta >> 20);
			}
			account_scheduler_latency(tsk, delta >> 10, 0);
		}
	}
#endif
}
/* Debug statistic: count entities placed far (>3 latencies) from min_vruntime. */
static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
	s64 d = se->vruntime - cfs_rq->min_vruntime;

	if (d < 0)
		d = -d;

	if (d > 3*sysctl_sched_latency)
		schedstat_inc(cfs_rq, nr_spread_over);
#endif
}
2535 place_entity(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
, int initial
)
2537 u64 vruntime
= cfs_rq
->min_vruntime
;
2540 * The 'current' period is already promised to the current tasks,
2541 * however the extra weight of the new task will slow them down a
2542 * little, place the new task so that it fits in the slot that
2543 * stays open at the end.
2545 if (initial
&& sched_feat(START_DEBIT
))
2546 vruntime
+= sched_vslice(cfs_rq
, se
);
2548 /* sleeps up to a single latency don't count. */
2550 unsigned long thresh
= sysctl_sched_latency
;
2553 * Halve their sleep time's effect, to allow
2554 * for a gentler effect of sleepers:
2556 if (sched_feat(GENTLE_FAIR_SLEEPERS
))
2562 /* ensure we never gain time by being placed backwards. */
2563 se
->vruntime
= max_vruntime(se
->vruntime
, vruntime
);
2566 static void check_enqueue_throttle(struct cfs_rq
*cfs_rq
);
2569 enqueue_entity(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
, int flags
)
2572 * Update the normalized vruntime before updating min_vruntime
2573 * through calling update_curr().
2575 if (!(flags
& ENQUEUE_WAKEUP
) || (flags
& ENQUEUE_WAKING
))
2576 se
->vruntime
+= cfs_rq
->min_vruntime
;
2579 * Update run-time statistics of the 'current'.
2581 update_curr(cfs_rq
);
2582 enqueue_entity_load_avg(cfs_rq
, se
, flags
& ENQUEUE_WAKEUP
);
2583 account_entity_enqueue(cfs_rq
, se
);
2584 update_cfs_shares(cfs_rq
);
2586 if (flags
& ENQUEUE_WAKEUP
) {
2587 place_entity(cfs_rq
, se
, 0);
2588 enqueue_sleeper(cfs_rq
, se
);
2591 update_stats_enqueue(cfs_rq
, se
);
2592 check_spread(cfs_rq
, se
);
2593 if (se
!= cfs_rq
->curr
)
2594 __enqueue_entity(cfs_rq
, se
);
2597 if (cfs_rq
->nr_running
== 1) {
2598 list_add_leaf_cfs_rq(cfs_rq
);
2599 check_enqueue_throttle(cfs_rq
);
2603 static void __clear_buddies_last(struct sched_entity
*se
)
2605 for_each_sched_entity(se
) {
2606 struct cfs_rq
*cfs_rq
= cfs_rq_of(se
);
2607 if (cfs_rq
->last
== se
)
2608 cfs_rq
->last
= NULL
;
2614 static void __clear_buddies_next(struct sched_entity
*se
)
2616 for_each_sched_entity(se
) {
2617 struct cfs_rq
*cfs_rq
= cfs_rq_of(se
);
2618 if (cfs_rq
->next
== se
)
2619 cfs_rq
->next
= NULL
;
2625 static void __clear_buddies_skip(struct sched_entity
*se
)
2627 for_each_sched_entity(se
) {
2628 struct cfs_rq
*cfs_rq
= cfs_rq_of(se
);
2629 if (cfs_rq
->skip
== se
)
2630 cfs_rq
->skip
= NULL
;
2636 static void clear_buddies(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
2638 if (cfs_rq
->last
== se
)
2639 __clear_buddies_last(se
);
2641 if (cfs_rq
->next
== se
)
2642 __clear_buddies_next(se
);
2644 if (cfs_rq
->skip
== se
)
2645 __clear_buddies_skip(se
);
2648 static __always_inline
void return_cfs_rq_runtime(struct cfs_rq
*cfs_rq
);
2651 dequeue_entity(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
, int flags
)
2654 * Update run-time statistics of the 'current'.
2656 update_curr(cfs_rq
);
2657 dequeue_entity_load_avg(cfs_rq
, se
, flags
& DEQUEUE_SLEEP
);
2659 update_stats_dequeue(cfs_rq
, se
);
2660 if (flags
& DEQUEUE_SLEEP
) {
2661 #ifdef CONFIG_SCHEDSTATS
2662 if (entity_is_task(se
)) {
2663 struct task_struct
*tsk
= task_of(se
);
2665 if (tsk
->state
& TASK_INTERRUPTIBLE
)
2666 se
->statistics
.sleep_start
= rq_of(cfs_rq
)->clock
;
2667 if (tsk
->state
& TASK_UNINTERRUPTIBLE
)
2668 se
->statistics
.block_start
= rq_of(cfs_rq
)->clock
;
2673 clear_buddies(cfs_rq
, se
);
2675 if (se
!= cfs_rq
->curr
)
2676 __dequeue_entity(cfs_rq
, se
);
2678 account_entity_dequeue(cfs_rq
, se
);
2681 * Normalize the entity after updating the min_vruntime because the
2682 * update can refer to the ->curr item and we need to reflect this
2683 * movement in our normalized position.
2685 if (!(flags
& DEQUEUE_SLEEP
))
2686 se
->vruntime
-= cfs_rq
->min_vruntime
;
2688 /* return excess runtime on last dequeue */
2689 return_cfs_rq_runtime(cfs_rq
);
2691 update_min_vruntime(cfs_rq
);
2692 update_cfs_shares(cfs_rq
);
2696 * Preempt the current task with a newly woken task if needed:
2699 check_preempt_tick(struct cfs_rq
*cfs_rq
, struct sched_entity
*curr
)
2701 unsigned long ideal_runtime
, delta_exec
;
2702 struct sched_entity
*se
;
2705 ideal_runtime
= sched_slice(cfs_rq
, curr
);
2706 delta_exec
= curr
->sum_exec_runtime
- curr
->prev_sum_exec_runtime
;
2707 if (delta_exec
> ideal_runtime
) {
2708 resched_task(rq_of(cfs_rq
)->curr
);
2710 * The current task ran long enough, ensure it doesn't get
2711 * re-elected due to buddy favours.
2713 clear_buddies(cfs_rq
, curr
);
2718 * Ensure that a task that missed wakeup preemption by a
2719 * narrow margin doesn't have to wait for a full slice.
2720 * This also mitigates buddy induced latencies under load.
2722 if (delta_exec
< sysctl_sched_min_granularity
)
2725 se
= __pick_first_entity(cfs_rq
);
2726 delta
= curr
->vruntime
- se
->vruntime
;
2731 if (delta
> ideal_runtime
)
2732 resched_task(rq_of(cfs_rq
)->curr
);
2736 set_next_entity(struct cfs_rq
*cfs_rq
, struct sched_entity
*se
)
2738 /* 'current' is not kept within the tree. */
2741 * Any task has to be enqueued before it get to execute on
2742 * a CPU. So account for the time it spent waiting on the
2745 update_stats_wait_end(cfs_rq
, se
);
2746 __dequeue_entity(cfs_rq
, se
);
2747 update_entity_load_avg(se
, 1);
2750 update_stats_curr_start(cfs_rq
, se
);
2752 #ifdef CONFIG_SCHEDSTATS
2754 * Track our maximum slice length, if the CPU's load is at
2755 * least twice that of our own weight (i.e. dont track it
2756 * when there are only lesser-weight tasks around):
2758 if (rq_of(cfs_rq
)->load
.weight
>= 2*se
->load
.weight
) {
2759 se
->statistics
.slice_max
= max(se
->statistics
.slice_max
,
2760 se
->sum_exec_runtime
- se
->prev_sum_exec_runtime
);
2763 se
->prev_sum_exec_runtime
= se
->sum_exec_runtime
;
2767 wakeup_preempt_entity(struct sched_entity
*curr
, struct sched_entity
*se
);
2770 * Pick the next process, keeping these things in mind, in this order:
2771 * 1) keep things fair between processes/task groups
2772 * 2) pick the "next" process, since someone really wants that to run
2773 * 3) pick the "last" process, for cache locality
2774 * 4) do not run the "skip" process, if something else is available
2776 static struct sched_entity
*pick_next_entity(struct cfs_rq
*cfs_rq
)
2778 struct sched_entity
*se
= __pick_first_entity(cfs_rq
);
2779 struct sched_entity
*left
= se
;
2782 * Avoid running the skip buddy, if running something else can
2783 * be done without getting too unfair.
2785 if (cfs_rq
->skip
== se
) {
2786 struct sched_entity
*second
= __pick_next_entity(se
);
2787 if (second
&& wakeup_preempt_entity(second
, left
) < 1)
2792 * Prefer last buddy, try to return the CPU to a preempted task.
2794 if (cfs_rq
->last
&& wakeup_preempt_entity(cfs_rq
->last
, left
) < 1)
2798 * Someone really wants this to run. If it's not unfair, run it.
2800 if (cfs_rq
->next
&& wakeup_preempt_entity(cfs_rq
->next
, left
) < 1)
2803 clear_buddies(cfs_rq
, se
);
2808 static void check_cfs_rq_runtime(struct cfs_rq
*cfs_rq
);
2810 static void put_prev_entity(struct cfs_rq
*cfs_rq
, struct sched_entity
*prev
)
2813 * If still on the runqueue then deactivate_task()
2814 * was not called and update_curr() has to be done:
2817 update_curr(cfs_rq
);
2819 /* throttle cfs_rqs exceeding runtime */
2820 check_cfs_rq_runtime(cfs_rq
);
2822 check_spread(cfs_rq
, prev
);
2824 update_stats_wait_start(cfs_rq
, prev
);
2825 /* Put 'current' back into the tree. */
2826 __enqueue_entity(cfs_rq
, prev
);
2827 /* in !on_rq case, update occurred at dequeue */
2828 update_entity_load_avg(prev
, 1);
2830 cfs_rq
->curr
= NULL
;
2834 entity_tick(struct cfs_rq
*cfs_rq
, struct sched_entity
*curr
, int queued
)
2837 * Update run-time statistics of the 'current'.
2839 update_curr(cfs_rq
);
2842 * Ensure that runnable average is periodically updated.
2844 update_entity_load_avg(curr
, 1);
2845 update_cfs_rq_blocked_load(cfs_rq
, 1);
2846 update_cfs_shares(cfs_rq
);
2848 #ifdef CONFIG_SCHED_HRTICK
2850 * queued ticks are scheduled to match the slice, so don't bother
2851 * validating it and just reschedule.
2854 resched_task(rq_of(cfs_rq
)->curr
);
2858 * don't let the period tick interfere with the hrtick preemption
2860 if (!sched_feat(DOUBLE_TICK
) &&
2861 hrtimer_active(&rq_of(cfs_rq
)->hrtick_timer
))
2865 if (cfs_rq
->nr_running
> 1)
2866 check_preempt_tick(cfs_rq
, curr
);
2870 /**************************************************
2871 * CFS bandwidth control machinery
2874 #ifdef CONFIG_CFS_BANDWIDTH
2876 #ifdef HAVE_JUMP_LABEL
2877 static struct static_key __cfs_bandwidth_used
;
2879 static inline bool cfs_bandwidth_used(void)
2881 return static_key_false(&__cfs_bandwidth_used
);
2884 void cfs_bandwidth_usage_inc(void)
2886 static_key_slow_inc(&__cfs_bandwidth_used
);
2889 void cfs_bandwidth_usage_dec(void)
2891 static_key_slow_dec(&__cfs_bandwidth_used
);
2893 #else /* HAVE_JUMP_LABEL */
2894 static bool cfs_bandwidth_used(void)
2899 void cfs_bandwidth_usage_inc(void) {}
2900 void cfs_bandwidth_usage_dec(void) {}
2901 #endif /* HAVE_JUMP_LABEL */
2904 * default period for cfs group bandwidth.
2905 * default: 0.1s, units: nanoseconds
2907 static inline u64
default_cfs_period(void)
2909 return 100000000ULL;
2912 static inline u64
sched_cfs_bandwidth_slice(void)
2914 return (u64
)sysctl_sched_cfs_bandwidth_slice
* NSEC_PER_USEC
;
2918 * Replenish runtime according to assigned quota and update expiration time.
2919 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2920 * additional synchronization around rq->lock.
2922 * requires cfs_b->lock
2924 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth
*cfs_b
)
2928 if (cfs_b
->quota
== RUNTIME_INF
)
2931 now
= sched_clock_cpu(smp_processor_id());
2932 cfs_b
->runtime
= cfs_b
->quota
;
2933 cfs_b
->runtime_expires
= now
+ ktime_to_ns(cfs_b
->period
);
2936 static inline struct cfs_bandwidth
*tg_cfs_bandwidth(struct task_group
*tg
)
2938 return &tg
->cfs_bandwidth
;
2941 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
2942 static inline u64
cfs_rq_clock_task(struct cfs_rq
*cfs_rq
)
2944 if (unlikely(cfs_rq
->throttle_count
))
2945 return cfs_rq
->throttled_clock_task
;
2947 return rq_of(cfs_rq
)->clock_task
- cfs_rq
->throttled_clock_task_time
;
2950 /* returns 0 on failure to allocate runtime */
2951 static int assign_cfs_rq_runtime(struct cfs_rq
*cfs_rq
)
2953 struct task_group
*tg
= cfs_rq
->tg
;
2954 struct cfs_bandwidth
*cfs_b
= tg_cfs_bandwidth(tg
);
2955 u64 amount
= 0, min_amount
, expires
;
2957 /* note: this is a positive sum as runtime_remaining <= 0 */
2958 min_amount
= sched_cfs_bandwidth_slice() - cfs_rq
->runtime_remaining
;
2960 raw_spin_lock(&cfs_b
->lock
);
2961 if (cfs_b
->quota
== RUNTIME_INF
)
2962 amount
= min_amount
;
2965 * If the bandwidth pool has become inactive, then at least one
2966 * period must have elapsed since the last consumption.
2967 * Refresh the global state and ensure bandwidth timer becomes
2970 if (!cfs_b
->timer_active
) {
2971 __refill_cfs_bandwidth_runtime(cfs_b
);
2972 __start_cfs_bandwidth(cfs_b
);
2975 if (cfs_b
->runtime
> 0) {
2976 amount
= min(cfs_b
->runtime
, min_amount
);
2977 cfs_b
->runtime
-= amount
;
2981 expires
= cfs_b
->runtime_expires
;
2982 raw_spin_unlock(&cfs_b
->lock
);
2984 cfs_rq
->runtime_remaining
+= amount
;
2986 * we may have advanced our local expiration to account for allowed
2987 * spread between our sched_clock and the one on which runtime was
2990 if ((s64
)(expires
- cfs_rq
->runtime_expires
) > 0)
2991 cfs_rq
->runtime_expires
= expires
;
2993 return cfs_rq
->runtime_remaining
> 0;
2997 * Note: This depends on the synchronization provided by sched_clock and the
2998 * fact that rq->clock snapshots this value.
3000 static void expire_cfs_rq_runtime(struct cfs_rq
*cfs_rq
)
3002 struct cfs_bandwidth
*cfs_b
= tg_cfs_bandwidth(cfs_rq
->tg
);
3003 struct rq
*rq
= rq_of(cfs_rq
);
3005 /* if the deadline is ahead of our clock, nothing to do */
3006 if (likely((s64
)(rq
->clock
- cfs_rq
->runtime_expires
) < 0))
3009 if (cfs_rq
->runtime_remaining
< 0)
3013 * If the local deadline has passed we have to consider the
3014 * possibility that our sched_clock is 'fast' and the global deadline
3015 * has not truly expired.
3017 * Fortunately we can check determine whether this the case by checking
3018 * whether the global deadline has advanced.
3021 if ((s64
)(cfs_rq
->runtime_expires
- cfs_b
->runtime_expires
) >= 0) {
3022 /* extend local deadline, drift is bounded above by 2 ticks */
3023 cfs_rq
->runtime_expires
+= TICK_NSEC
;
3025 /* global deadline is ahead, expiration has passed */
3026 cfs_rq
->runtime_remaining
= 0;
3030 static void __account_cfs_rq_runtime(struct cfs_rq
*cfs_rq
,
3031 unsigned long delta_exec
)
3033 /* dock delta_exec before expiring quota (as it could span periods) */
3034 cfs_rq
->runtime_remaining
-= delta_exec
;
3035 expire_cfs_rq_runtime(cfs_rq
);
3037 if (likely(cfs_rq
->runtime_remaining
> 0))
3041 * if we're unable to extend our runtime we resched so that the active
3042 * hierarchy can be throttled
3044 if (!assign_cfs_rq_runtime(cfs_rq
) && likely(cfs_rq
->curr
))
3045 resched_task(rq_of(cfs_rq
)->curr
);
3048 static __always_inline
3049 void account_cfs_rq_runtime(struct cfs_rq
*cfs_rq
, unsigned long delta_exec
)
3051 if (!cfs_bandwidth_used() || !cfs_rq
->runtime_enabled
)
3054 __account_cfs_rq_runtime(cfs_rq
, delta_exec
);
3057 static inline int cfs_rq_throttled(struct cfs_rq
*cfs_rq
)
3059 return cfs_bandwidth_used() && cfs_rq
->throttled
;
3062 /* check whether cfs_rq, or any parent, is throttled */
3063 static inline int throttled_hierarchy(struct cfs_rq
*cfs_rq
)
3065 return cfs_bandwidth_used() && cfs_rq
->throttle_count
;
3069 * Ensure that neither of the group entities corresponding to src_cpu or
3070 * dest_cpu are members of a throttled hierarchy when performing group
3071 * load-balance operations.
3073 static inline int throttled_lb_pair(struct task_group
*tg
,
3074 int src_cpu
, int dest_cpu
)
3076 struct cfs_rq
*src_cfs_rq
, *dest_cfs_rq
;
3078 src_cfs_rq
= tg
->cfs_rq
[src_cpu
];
3079 dest_cfs_rq
= tg
->cfs_rq
[dest_cpu
];
3081 return throttled_hierarchy(src_cfs_rq
) ||
3082 throttled_hierarchy(dest_cfs_rq
);
3085 /* updated child weight may affect parent so we have to do this bottom up */
3086 static int tg_unthrottle_up(struct task_group
*tg
, void *data
)
3088 struct rq
*rq
= data
;
3089 struct cfs_rq
*cfs_rq
= tg
->cfs_rq
[cpu_of(rq
)];
3091 cfs_rq
->throttle_count
--;
3093 if (!cfs_rq
->throttle_count
) {
3094 /* adjust cfs_rq_clock_task() */
3095 cfs_rq
->throttled_clock_task_time
+= rq
->clock_task
-
3096 cfs_rq
->throttled_clock_task
;
3103 static int tg_throttle_down(struct task_group
*tg
, void *data
)
3105 struct rq
*rq
= data
;
3106 struct cfs_rq
*cfs_rq
= tg
->cfs_rq
[cpu_of(rq
)];
3108 /* group is entering throttled state, stop time */
3109 if (!cfs_rq
->throttle_count
)
3110 cfs_rq
->throttled_clock_task
= rq
->clock_task
;
3111 cfs_rq
->throttle_count
++;
3116 static void throttle_cfs_rq(struct cfs_rq
*cfs_rq
)
3118 struct rq
*rq
= rq_of(cfs_rq
);
3119 struct cfs_bandwidth
*cfs_b
= tg_cfs_bandwidth(cfs_rq
->tg
);
3120 struct sched_entity
*se
;
3121 long task_delta
, dequeue
= 1;
3123 se
= cfs_rq
->tg
->se
[cpu_of(rq_of(cfs_rq
))];
3125 /* freeze hierarchy runnable averages while throttled */
3127 walk_tg_tree_from(cfs_rq
->tg
, tg_throttle_down
, tg_nop
, (void *)rq
);
3130 task_delta
= cfs_rq
->h_nr_running
;
3131 for_each_sched_entity(se
) {
3132 struct cfs_rq
*qcfs_rq
= cfs_rq_of(se
);
3133 /* throttled entity or throttle-on-deactivate */
3138 dequeue_entity(qcfs_rq
, se
, DEQUEUE_SLEEP
);
3139 qcfs_rq
->h_nr_running
-= task_delta
;
3141 if (qcfs_rq
->load
.weight
)
3146 rq
->nr_running
-= task_delta
;
3148 cfs_rq
->throttled
= 1;
3149 cfs_rq
->throttled_clock
= rq
->clock
;
3150 raw_spin_lock(&cfs_b
->lock
);
3151 list_add_tail_rcu(&cfs_rq
->throttled_list
, &cfs_b
->throttled_cfs_rq
);
3152 if (!cfs_b
->timer_active
)
3153 __start_cfs_bandwidth(cfs_b
);
3154 raw_spin_unlock(&cfs_b
->lock
);
3157 void unthrottle_cfs_rq(struct cfs_rq
*cfs_rq
)
3159 struct rq
*rq
= rq_of(cfs_rq
);
3160 struct cfs_bandwidth
*cfs_b
= tg_cfs_bandwidth(cfs_rq
->tg
);
3161 struct sched_entity
*se
;
3165 se
= cfs_rq
->tg
->se
[cpu_of(rq
)];
3167 cfs_rq
->throttled
= 0;
3168 raw_spin_lock(&cfs_b
->lock
);
3169 cfs_b
->throttled_time
+= rq
->clock
- cfs_rq
->throttled_clock
;
3170 list_del_rcu(&cfs_rq
->throttled_list
);
3171 raw_spin_unlock(&cfs_b
->lock
);
3173 update_rq_clock(rq
);
3174 /* update hierarchical throttle state */
3175 walk_tg_tree_from(cfs_rq
->tg
, tg_nop
, tg_unthrottle_up
, (void *)rq
);
3177 if (!cfs_rq
->load
.weight
)
3180 task_delta
= cfs_rq
->h_nr_running
;
3181 for_each_sched_entity(se
) {
3185 cfs_rq
= cfs_rq_of(se
);
3187 enqueue_entity(cfs_rq
, se
, ENQUEUE_WAKEUP
);
3188 cfs_rq
->h_nr_running
+= task_delta
;
3190 if (cfs_rq_throttled(cfs_rq
))
3195 rq
->nr_running
+= task_delta
;
3197 /* determine whether we need to wake up potentially idle cpu */
3198 if (rq
->curr
== rq
->idle
&& rq
->cfs
.nr_running
)
3199 resched_task(rq
->curr
);
3202 static u64
distribute_cfs_runtime(struct cfs_bandwidth
*cfs_b
,
3203 u64 remaining
, u64 expires
)
3205 struct cfs_rq
*cfs_rq
;
3206 u64 runtime
= remaining
;
3209 list_for_each_entry_rcu(cfs_rq
, &cfs_b
->throttled_cfs_rq
,
3211 struct rq
*rq
= rq_of(cfs_rq
);
3213 raw_spin_lock(&rq
->lock
);
3214 if (!cfs_rq_throttled(cfs_rq
))
3217 runtime
= -cfs_rq
->runtime_remaining
+ 1;
3218 if (runtime
> remaining
)
3219 runtime
= remaining
;
3220 remaining
-= runtime
;
3222 cfs_rq
->runtime_remaining
+= runtime
;
3223 cfs_rq
->runtime_expires
= expires
;
3225 /* we check whether we're throttled above */
3226 if (cfs_rq
->runtime_remaining
> 0)
3227 unthrottle_cfs_rq(cfs_rq
);
3230 raw_spin_unlock(&rq
->lock
);
3241 * Responsible for refilling a task_group's bandwidth and unthrottling its
3242 * cfs_rqs as appropriate. If there has been no activity within the last
3243 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3244 * used to track this state.
3246 static int do_sched_cfs_period_timer(struct cfs_bandwidth
*cfs_b
, int overrun
)
3248 u64 runtime
, runtime_expires
;
3249 int idle
= 1, throttled
;
3251 raw_spin_lock(&cfs_b
->lock
);
3252 /* no need to continue the timer with no bandwidth constraint */
3253 if (cfs_b
->quota
== RUNTIME_INF
)
3256 throttled
= !list_empty(&cfs_b
->throttled_cfs_rq
);
3257 /* idle depends on !throttled (for the case of a large deficit) */
3258 idle
= cfs_b
->idle
&& !throttled
;
3259 cfs_b
->nr_periods
+= overrun
;
3261 /* if we're going inactive then everything else can be deferred */
3266 * if we have relooped after returning idle once, we need to update our
3267 * status as actually running, so that other cpus doing
3268 * __start_cfs_bandwidth will stop trying to cancel us.
3270 cfs_b
->timer_active
= 1;
3272 __refill_cfs_bandwidth_runtime(cfs_b
);
3275 /* mark as potentially idle for the upcoming period */
3280 /* account preceding periods in which throttling occurred */
3281 cfs_b
->nr_throttled
+= overrun
;
3284 * There are throttled entities so we must first use the new bandwidth
3285 * to unthrottle them before making it generally available. This
3286 * ensures that all existing debts will be paid before a new cfs_rq is
3289 runtime
= cfs_b
->runtime
;
3290 runtime_expires
= cfs_b
->runtime_expires
;
3294 * This check is repeated as we are holding onto the new bandwidth
3295 * while we unthrottle. This can potentially race with an unthrottled
3296 * group trying to acquire new bandwidth from the global pool.
3298 while (throttled
&& runtime
> 0) {
3299 raw_spin_unlock(&cfs_b
->lock
);
3300 /* we can't nest cfs_b->lock while distributing bandwidth */
3301 runtime
= distribute_cfs_runtime(cfs_b
, runtime
,
3303 raw_spin_lock(&cfs_b
->lock
);
3305 throttled
= !list_empty(&cfs_b
->throttled_cfs_rq
);
3308 /* return (any) remaining runtime */
3309 cfs_b
->runtime
= runtime
;
3311 * While we are ensured activity in the period following an
3312 * unthrottle, this also covers the case in which the new bandwidth is
3313 * insufficient to cover the existing bandwidth deficit. (Forcing the
3314 * timer to remain active while there are any throttled entities.)
3319 cfs_b
->timer_active
= 0;
3320 raw_spin_unlock(&cfs_b
->lock
);
3325 /* a cfs_rq won't donate quota below this amount */
3326 static const u64 min_cfs_rq_runtime
= 1 * NSEC_PER_MSEC
;
3327 /* minimum remaining period time to redistribute slack quota */
3328 static const u64 min_bandwidth_expiration
= 2 * NSEC_PER_MSEC
;
3329 /* how long we wait to gather additional slack before distributing */
3330 static const u64 cfs_bandwidth_slack_period
= 5 * NSEC_PER_MSEC
;
3333 * Are we near the end of the current quota period?
3335 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3336 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3337 * migrate_hrtimers, base is never cleared, so we are fine.
3339 static int runtime_refresh_within(struct cfs_bandwidth
*cfs_b
, u64 min_expire
)
3341 struct hrtimer
*refresh_timer
= &cfs_b
->period_timer
;
3344 /* if the call-back is running a quota refresh is already occurring */
3345 if (hrtimer_callback_running(refresh_timer
))
3348 /* is a quota refresh about to occur? */
3349 remaining
= ktime_to_ns(hrtimer_expires_remaining(refresh_timer
));
3350 if (remaining
< min_expire
)
3356 static void start_cfs_slack_bandwidth(struct cfs_bandwidth
*cfs_b
)
3358 u64 min_left
= cfs_bandwidth_slack_period
+ min_bandwidth_expiration
;
3360 /* if there's a quota refresh soon don't bother with slack */
3361 if (runtime_refresh_within(cfs_b
, min_left
))
3364 start_bandwidth_timer(&cfs_b
->slack_timer
,
3365 ns_to_ktime(cfs_bandwidth_slack_period
));
3368 /* we know any runtime found here is valid as update_curr() precedes return */
3369 static void __return_cfs_rq_runtime(struct cfs_rq
*cfs_rq
)
3371 struct cfs_bandwidth
*cfs_b
= tg_cfs_bandwidth(cfs_rq
->tg
);
3372 s64 slack_runtime
= cfs_rq
->runtime_remaining
- min_cfs_rq_runtime
;
3374 if (slack_runtime
<= 0)
3377 raw_spin_lock(&cfs_b
->lock
);
3378 if (cfs_b
->quota
!= RUNTIME_INF
&&
3379 cfs_rq
->runtime_expires
== cfs_b
->runtime_expires
) {
3380 cfs_b
->runtime
+= slack_runtime
;
3382 /* we are under rq->lock, defer unthrottling using a timer */
3383 if (cfs_b
->runtime
> sched_cfs_bandwidth_slice() &&
3384 !list_empty(&cfs_b
->throttled_cfs_rq
))
3385 start_cfs_slack_bandwidth(cfs_b
);
3387 raw_spin_unlock(&cfs_b
->lock
);
3389 /* even if it's not valid for return we don't want to try again */
3390 cfs_rq
->runtime_remaining
-= slack_runtime
;
3393 static __always_inline
void return_cfs_rq_runtime(struct cfs_rq
*cfs_rq
)
3395 if (!cfs_bandwidth_used())
3398 if (!cfs_rq
->runtime_enabled
|| cfs_rq
->nr_running
)
3401 __return_cfs_rq_runtime(cfs_rq
);
3405 * This is done with a timer (instead of inline with bandwidth return) since
3406 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3408 static void do_sched_cfs_slack_timer(struct cfs_bandwidth
*cfs_b
)
3410 u64 runtime
= 0, slice
= sched_cfs_bandwidth_slice();
3413 /* confirm we're still not at a refresh boundary */
3414 raw_spin_lock(&cfs_b
->lock
);
3415 if (runtime_refresh_within(cfs_b
, min_bandwidth_expiration
)) {
3416 raw_spin_unlock(&cfs_b
->lock
);
3420 if (cfs_b
->quota
!= RUNTIME_INF
&& cfs_b
->runtime
> slice
) {
3421 runtime
= cfs_b
->runtime
;
3424 expires
= cfs_b
->runtime_expires
;
3425 raw_spin_unlock(&cfs_b
->lock
);
3430 runtime
= distribute_cfs_runtime(cfs_b
, runtime
, expires
);
3432 raw_spin_lock(&cfs_b
->lock
);
3433 if (expires
== cfs_b
->runtime_expires
)
3434 cfs_b
->runtime
= runtime
;
3435 raw_spin_unlock(&cfs_b
->lock
);
3439 * When a group wakes up we want to make sure that its quota is not already
3440 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
3441 * runtime as update_curr() throttling can not not trigger until it's on-rq.
3443 static void check_enqueue_throttle(struct cfs_rq
*cfs_rq
)
3445 if (!cfs_bandwidth_used())
3448 /* an active group must be handled by the update_curr()->put() path */
3449 if (!cfs_rq
->runtime_enabled
|| cfs_rq
->curr
)
3452 /* ensure the group is not already throttled */
3453 if (cfs_rq_throttled(cfs_rq
))
3456 /* update runtime allocation */
3457 account_cfs_rq_runtime(cfs_rq
, 0);
3458 if (cfs_rq
->runtime_remaining
<= 0)
3459 throttle_cfs_rq(cfs_rq
);
3462 /* conditionally throttle active cfs_rq's from put_prev_entity() */
3463 static void check_cfs_rq_runtime(struct cfs_rq
*cfs_rq
)
3465 if (!cfs_bandwidth_used())
3468 if (likely(!cfs_rq
->runtime_enabled
|| cfs_rq
->runtime_remaining
> 0))
3472 * it's possible for a throttled entity to be forced into a running
3473 * state (e.g. set_curr_task), in this case we're finished.
3475 if (cfs_rq_throttled(cfs_rq
))
3478 throttle_cfs_rq(cfs_rq
);
3481 static inline u64
default_cfs_period(void);
3482 static int do_sched_cfs_period_timer(struct cfs_bandwidth
*cfs_b
, int overrun
);
3483 static void do_sched_cfs_slack_timer(struct cfs_bandwidth
*cfs_b
);
3485 static enum hrtimer_restart
sched_cfs_slack_timer(struct hrtimer
*timer
)
3487 struct cfs_bandwidth
*cfs_b
=
3488 container_of(timer
, struct cfs_bandwidth
, slack_timer
);
3489 do_sched_cfs_slack_timer(cfs_b
);
3491 return HRTIMER_NORESTART
;
3494 static enum hrtimer_restart
sched_cfs_period_timer(struct hrtimer
*timer
)
3496 struct cfs_bandwidth
*cfs_b
=
3497 container_of(timer
, struct cfs_bandwidth
, period_timer
);
3503 now
= hrtimer_cb_get_time(timer
);
3504 overrun
= hrtimer_forward(timer
, now
, cfs_b
->period
);
3509 idle
= do_sched_cfs_period_timer(cfs_b
, overrun
);
3512 return idle
? HRTIMER_NORESTART
: HRTIMER_RESTART
;
3515 void init_cfs_bandwidth(struct cfs_bandwidth
*cfs_b
)
3517 raw_spin_lock_init(&cfs_b
->lock
);
3519 cfs_b
->quota
= RUNTIME_INF
;
3520 cfs_b
->period
= ns_to_ktime(default_cfs_period());
3522 INIT_LIST_HEAD(&cfs_b
->throttled_cfs_rq
);
3523 hrtimer_init(&cfs_b
->period_timer
, CLOCK_MONOTONIC
, HRTIMER_MODE_REL
);
3524 cfs_b
->period_timer
.function
= sched_cfs_period_timer
;
3525 hrtimer_init(&cfs_b
->slack_timer
, CLOCK_MONOTONIC
, HRTIMER_MODE_REL
);
3526 cfs_b
->slack_timer
.function
= sched_cfs_slack_timer
;
3529 static void init_cfs_rq_runtime(struct cfs_rq
*cfs_rq
)
3531 cfs_rq
->runtime_enabled
= 0;
3532 INIT_LIST_HEAD(&cfs_rq
->throttled_list
);
3535 /* requires cfs_b->lock, may release to reprogram timer */
3536 void __start_cfs_bandwidth(struct cfs_bandwidth
*cfs_b
)
3539 * The timer may be active because we're trying to set a new bandwidth
3540 * period or because we're racing with the tear-down path
3541 * (timer_active==0 becomes visible before the hrtimer call-back
3542 * terminates). In either case we ensure that it's re-programmed
3544 while (unlikely(hrtimer_active(&cfs_b
->period_timer
)) &&
3545 hrtimer_try_to_cancel(&cfs_b
->period_timer
) < 0) {
3546 /* bounce the lock to allow do_sched_cfs_period_timer to run */
3547 raw_spin_unlock(&cfs_b
->lock
);
3549 raw_spin_lock(&cfs_b
->lock
);
3550 /* if someone else restarted the timer then we're done */
3551 if (cfs_b
->timer_active
)
3555 cfs_b
->timer_active
= 1;
3556 start_bandwidth_timer(&cfs_b
->period_timer
, cfs_b
->period
);
3559 static void destroy_cfs_bandwidth(struct cfs_bandwidth
*cfs_b
)
3561 hrtimer_cancel(&cfs_b
->period_timer
);
3562 hrtimer_cancel(&cfs_b
->slack_timer
);
3565 static void __maybe_unused
unthrottle_offline_cfs_rqs(struct rq
*rq
)
3567 struct cfs_rq
*cfs_rq
;
3569 for_each_leaf_cfs_rq(rq
, cfs_rq
) {
3570 struct cfs_bandwidth
*cfs_b
= tg_cfs_bandwidth(cfs_rq
->tg
);
3572 if (!cfs_rq
->runtime_enabled
)
3576 * clock_task is not advancing so we just need to make sure
3577 * there's some valid quota amount
3579 cfs_rq
->runtime_remaining
= cfs_b
->quota
;
3580 if (cfs_rq_throttled(cfs_rq
))
3581 unthrottle_cfs_rq(cfs_rq
);
3585 #else /* CONFIG_CFS_BANDWIDTH */
3586 static inline u64
cfs_rq_clock_task(struct cfs_rq
*cfs_rq
)
3588 return rq_of(cfs_rq
)->clock_task
;
3591 static void account_cfs_rq_runtime(struct cfs_rq
*cfs_rq
,
3592 unsigned long delta_exec
) {}
3593 static void check_cfs_rq_runtime(struct cfs_rq
*cfs_rq
) {}
3594 static void check_enqueue_throttle(struct cfs_rq
*cfs_rq
) {}
3595 static __always_inline
void return_cfs_rq_runtime(struct cfs_rq
*cfs_rq
) {}
3597 static inline int cfs_rq_throttled(struct cfs_rq
*cfs_rq
)
3602 static inline int throttled_hierarchy(struct cfs_rq
*cfs_rq
)
3607 static inline int throttled_lb_pair(struct task_group
*tg
,
3608 int src_cpu
, int dest_cpu
)
3613 void init_cfs_bandwidth(struct cfs_bandwidth
*cfs_b
) {}
3615 #ifdef CONFIG_FAIR_GROUP_SCHED
3616 static void init_cfs_rq_runtime(struct cfs_rq
*cfs_rq
) {}
3619 static inline struct cfs_bandwidth
*tg_cfs_bandwidth(struct task_group
*tg
)
3623 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth
*cfs_b
) {}
3624 static inline void unthrottle_offline_cfs_rqs(struct rq
*rq
) {}
3626 #endif /* CONFIG_CFS_BANDWIDTH */
/**************************************************
 * CFS operations on tasks:
 */
3632 #ifdef CONFIG_SCHED_HRTICK
3633 static void hrtick_start_fair(struct rq
*rq
, struct task_struct
*p
)
3635 struct sched_entity
*se
= &p
->se
;
3636 struct cfs_rq
*cfs_rq
= cfs_rq_of(se
);
3638 WARN_ON(task_rq(p
) != rq
);
3640 if (cfs_rq
->nr_running
> 1) {
3641 u64 slice
= sched_slice(cfs_rq
, se
);
3642 u64 ran
= se
->sum_exec_runtime
- se
->prev_sum_exec_runtime
;
3643 s64 delta
= slice
- ran
;
3652 * Don't schedule slices shorter than 10000ns, that just
3653 * doesn't make sense. Rely on vruntime for fairness.
3656 delta
= max_t(s64
, 10000LL, delta
);
3658 hrtick_start(rq
, delta
);
3663 * called from enqueue/dequeue and updates the hrtick when the
3664 * current task is from our class and nr_running is low enough
3667 static void hrtick_update(struct rq
*rq
)
3669 struct task_struct
*curr
= rq
->curr
;
3671 if (!hrtick_enabled(rq
) || curr
->sched_class
!= &fair_sched_class
)
3674 if (cfs_rq_of(&curr
->se
)->nr_running
< sched_nr_latency
)
3675 hrtick_start_fair(rq
, curr
);
3677 #else /* !CONFIG_SCHED_HRTICK */
3679 hrtick_start_fair(struct rq
*rq
, struct task_struct
*p
)
3683 static inline void hrtick_update(struct rq
*rq
)
3688 #if defined(CONFIG_SCHED_HMP) || defined(CONFIG_MTK_SCHED_CMP)
3690 /* CPU cluster statistics for task migration control */
3691 #define HMP_GB (0x1000)
3692 #define HMP_SELECT_RQ (0x2000)
3693 #define HMP_LB (0x4000)
3694 #define HMP_MAX_LOAD (NICE_0_LOAD - 1)
3698 struct clb_stats bstats
;
3699 struct clb_stats lstats
;
3700 int btarget
, ltarget
;
3702 struct cpumask
*bcpus
;
3703 struct cpumask
*lcpus
;
3707 int status
; /* Details of this migration check */
3708 int result
; /* Indicate whether we should perform this task migration */
3712 unsigned long __weak
arch_scale_freq_power(struct sched_domain
*sd
, int cpu
);
3714 static void collect_cluster_stats(struct clb_stats
*clbs
,
3715 struct cpumask
*cluster_cpus
, int target
)
3717 #define HMP_RESOLUTION_SCALING (4)
3718 #define hmp_scale_down(w) ((w) >> HMP_RESOLUTION_SCALING)
3720 /* Update cluster informatics */
3722 for_each_cpu(cpu
, cluster_cpus
) {
3723 if(cpu_online(cpu
)) {
3725 clbs
->ntask
+= cpu_rq(cpu
)->cfs
.h_nr_running
;
3726 clbs
->load_avg
+= cpu_rq(cpu
)->cfs
.avg
.load_avg_ratio
;
3727 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
3728 clbs
->nr_normal_prio_task
+= cfs_nr_normal_prio(cpu
);
3729 clbs
->nr_dequeuing_low_prio
+= cfs_nr_dequeuing_low_prio(cpu
);
3734 if(!clbs
->ncpu
|| NR_CPUS
== target
|| !cpumask_test_cpu(target
,cluster_cpus
))
3737 clbs
->cpu_power
= (int) arch_scale_freq_power(NULL
, target
);
3739 /* Scale current CPU compute capacity in accordance with frequency */
3740 clbs
->cpu_capacity
= HMP_MAX_LOAD
;
3741 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
3742 if (hmp_data
.freqinvar_load_scale_enabled
) {
3743 cpu
= cpumask_any(cluster_cpus
);
3744 if (freq_scale
[cpu
].throttling
== 1){
3745 clbs
->cpu_capacity
*= freq_scale
[cpu
].curr_scale
;
3747 clbs
->cpu_capacity
*= freq_scale
[cpu
].max
;
3749 clbs
->cpu_capacity
>>= SCHED_FREQSCALE_SHIFT
;
3751 if (clbs
->cpu_capacity
> HMP_MAX_LOAD
){
3752 clbs
->cpu_capacity
= HMP_MAX_LOAD
;
3755 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
3756 if (topology_cpu_inv_power_en()) {
3757 cpu
= cpumask_any(cluster_cpus
);
3758 if (topology_cpu_throttling(cpu
))
3759 clbs
->cpu_capacity
*=
3760 (topology_cpu_capacity(cpu
) << CPUPOWER_FREQSCALE_SHIFT
)
3761 / (topology_max_cpu_capacity(cpu
)+1);
3763 clbs
->cpu_capacity
*= topology_max_cpu_capacity(cpu
);
3764 clbs
->cpu_capacity
>>= CPUPOWER_FREQSCALE_SHIFT
;
3766 if (clbs
->cpu_capacity
> HMP_MAX_LOAD
){
3767 clbs
->cpu_capacity
= HMP_MAX_LOAD
;
3773 * Calculate available CPU capacity
3774 * Calculate available task space
3776 * Why load ratio should be multiplied by the number of task ?
3777 * The task is the entity of scheduling unit so that we should consider
3778 * it in scheduler. Only considering task load is not enough.
3779 * Thus, multiplying the number of tasks can adjust load ratio to a more
3782 clbs
->load_avg
/= clbs
->ncpu
;
3783 clbs
->acap
= clbs
->cpu_capacity
- cpu_rq(target
)->cfs
.avg
.load_avg_ratio
;
3784 clbs
->scaled_acap
= hmp_scale_down(clbs
->acap
);
3785 clbs
->scaled_atask
= cpu_rq(target
)->cfs
.h_nr_running
* cpu_rq(target
)->cfs
.avg
.load_avg_ratio
;
3786 clbs
->scaled_atask
= clbs
->cpu_capacity
- clbs
->scaled_atask
;
3787 clbs
->scaled_atask
= hmp_scale_down(clbs
->scaled_atask
);
3789 mt_sched_printf("[%s] cpu/cluster:%d/%02lx load/len:%lu/%u stats:%d,%d,%d,%d,%d,%d,%d,%d\n", __func__
,
3790 target
, *cpumask_bits(cluster_cpus
),
3791 cpu_rq(target
)->cfs
.avg
.load_avg_ratio
, cpu_rq(target
)->cfs
.h_nr_running
,
3792 clbs
->ncpu
, clbs
->ntask
, clbs
->load_avg
, clbs
->cpu_capacity
,
3793 clbs
->acap
, clbs
->scaled_acap
, clbs
->scaled_atask
, clbs
->threshold
);
3796 //#define USE_HMP_DYNAMIC_THRESHOLD
3797 #if defined(CONFIG_SCHED_HMP) && defined(USE_HMP_DYNAMIC_THRESHOLD)
3798 static inline void hmp_dynamic_threshold(struct clb_env
*clbenv
);
3802 * Task Dynamic Migration Threshold Adjustment.
3804 * If the workload between clusters is not balanced, adjust migration
3805 * threshold in an attempt to move task precisely.
3807 * Diff. = Max Threshold - Min Threshold
3809 * Dynamic UP-Threshold =
3811 * Max Threshold - Diff. x ----------------- x -------------------
3812 * B_nacap + L_nacap B_natask + L_natask
3815 * Dynamic Down-Threshold =
3817 * Min Threshold + Diff. x ----------------- x -------------------
3818 * B_nacap + L_nacap B_natask + L_natask
3820 static void adj_threshold(struct clb_env
*clbenv
)
3822 #define TSKLD_SHIFT (2)
3823 #define POSITIVE(x) ((int)(x) < 0 ? 0 : (x))
3826 unsigned long b_cap
=0, l_cap
=0;
3827 unsigned long b_load
=0, l_load
=0;
3828 unsigned long b_task
=0, l_task
=0;
3829 int b_nacap
, l_nacap
, b_natask
, l_natask
;
3831 #if defined(CONFIG_SCHED_HMP) && defined(USE_HMP_DYNAMIC_THRESHOLD)
3832 hmp_dynamic_threshold(clbenv
);
3836 bcpu
= clbenv
->btarget
;
3837 lcpu
= clbenv
->ltarget
;
3838 if (bcpu
< nr_cpu_ids
) {
3839 b_load
= cpu_rq(bcpu
)->cfs
.avg
.load_avg_ratio
;
3840 b_task
= cpu_rq(bcpu
)->cfs
.h_nr_running
;
3842 if (lcpu
< nr_cpu_ids
) {
3843 l_load
= cpu_rq(lcpu
)->cfs
.avg
.load_avg_ratio
;
3844 l_task
= cpu_rq(lcpu
)->cfs
.h_nr_running
;
3847 #ifdef CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY
3848 if (bcpu
< nr_cpu_ids
) {
3849 b_cap
= topology_cpu_capacity(bcpu
);
3851 if (lcpu
< nr_cpu_ids
) {
3852 l_cap
= topology_cpu_capacity(lcpu
);
3855 b_nacap
= POSITIVE(b_cap
- b_load
);
3856 b_natask
= POSITIVE(b_cap
- ((b_task
* b_load
) >> TSKLD_SHIFT
));
3857 l_nacap
= POSITIVE(l_cap
- l_load
);
3858 l_natask
= POSITIVE(l_cap
- ((l_task
* l_load
) >> TSKLD_SHIFT
));
3859 #else /* !CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY */
3860 b_cap
= clbenv
->bstats
.cpu_power
;
3861 l_cap
= clbenv
->lstats
.cpu_power
;
3862 b_nacap
= POSITIVE(clbenv
->bstats
.scaled_acap
*
3863 clbenv
->bstats
.cpu_power
/ (clbenv
->lstats
.cpu_power
+1));
3864 b_natask
= POSITIVE(clbenv
->bstats
.scaled_atask
*
3865 clbenv
->bstats
.cpu_power
/ (clbenv
->lstats
.cpu_power
+1));
3866 l_nacap
= POSITIVE(clbenv
->lstats
.scaled_acap
);
3867 l_natask
= POSITIVE(clbenv
->bstats
.scaled_atask
);
3869 #endif /* CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY */
3871 clbenv
->bstats
.threshold
= HMP_MAX_LOAD
- HMP_MAX_LOAD
* b_nacap
* b_natask
/
3872 ((b_nacap
+ l_nacap
) * (b_natask
+ l_natask
)+1);
3873 clbenv
->lstats
.threshold
= HMP_MAX_LOAD
* l_nacap
* l_natask
/
3874 ((b_nacap
+ l_nacap
) * (b_natask
+ l_natask
)+1);
3876 mt_sched_printf("[%s]\tup/dl:%4d/%4d L(%d:%4lu,%4lu/%4lu) b(%d:%4lu,%4lu/%4lu)\n", __func__
,
3877 clbenv
->bstats
.threshold
, clbenv
->lstats
.threshold
,
3878 lcpu
, l_load
, l_task
, l_cap
,
3879 bcpu
, b_load
, b_task
, b_cap
);
3882 static void sched_update_clbstats(struct clb_env
*clbenv
)
3884 collect_cluster_stats(&clbenv
->bstats
, clbenv
->bcpus
, clbenv
->btarget
);
3885 collect_cluster_stats(&clbenv
->lstats
, clbenv
->lcpus
, clbenv
->ltarget
);
3886 adj_threshold(clbenv
);
3888 #endif /* #if defined(CONFIG_SCHED_HMP) || defined(CONFIG_SCHED_CMP) */
3891 #ifdef CONFIG_SCHED_HMP
3893 * Heterogenous multiprocessor (HMP) optimizations
3895 * The cpu types are distinguished using a list of hmp_domains
3896 * which each represent one cpu type using a cpumask.
3897 * The list is assumed ordered by compute capacity with the
3898 * fastest domain first.
3900 DEFINE_PER_CPU(struct hmp_domain
*, hmp_cpu_domain
);
3901 /* We need to know which cpus are fast and slow. */
3902 extern struct cpumask hmp_fast_cpu_mask
;
3903 extern struct cpumask hmp_slow_cpu_mask
;
3905 extern void __init
arch_get_hmp_domains(struct list_head
*hmp_domains_list
);
3907 /* Setup hmp_domains */
3908 static int __init
hmp_cpu_mask_setup(void)
3911 struct hmp_domain
*domain
;
3912 struct list_head
*pos
;
3915 #if defined(CONFIG_SCHED_HMP_ENHANCEMENT) || \
3916 defined(CONFIG_MT_RT_SCHED) || defined(CONFIG_MT_RT_SCHED_LOG)
3917 cpumask_clear(&hmp_fast_cpu_mask
);
3918 cpumask_clear(&hmp_slow_cpu_mask
);
3921 pr_debug("Initializing HMP scheduler:\n");
3923 /* Initialize hmp_domains using platform code */
3924 arch_get_hmp_domains(&hmp_domains
);
3925 if (list_empty(&hmp_domains
)) {
3926 pr_debug("HMP domain list is empty!\n");
3930 /* Print hmp_domains */
3932 list_for_each(pos
, &hmp_domains
) {
3933 domain
= list_entry(pos
, struct hmp_domain
, hmp_domains
);
3934 cpulist_scnprintf(buf
, 64, &domain
->possible_cpus
);
3935 pr_debug(" HMP domain %d: %s\n", dc
, buf
);
3938 * According to the description in "arch_get_hmp_domains",
3939 * Fastest domain is at head of list. Thus, the fast-cpu mask should
3940 * be initialized first, followed by slow-cpu mask.
3942 #if defined(CONFIG_SCHED_HMP_ENHANCEMENT) || \
3943 defined(CONFIG_MT_RT_SCHED) || defined(CONFIG_MT_RT_SCHED_LOG)
3944 if(cpumask_empty(&hmp_fast_cpu_mask
)) {
3945 cpumask_copy(&hmp_fast_cpu_mask
,&domain
->possible_cpus
);
3946 for_each_cpu(cpu
, &hmp_fast_cpu_mask
)
3947 pr_debug(" HMP fast cpu : %d\n",cpu
);
3948 } else if (cpumask_empty(&hmp_slow_cpu_mask
)){
3949 cpumask_copy(&hmp_slow_cpu_mask
,&domain
->possible_cpus
);
3950 for_each_cpu(cpu
, &hmp_slow_cpu_mask
)
3951 pr_debug(" HMP slow cpu : %d\n",cpu
);
3955 for_each_cpu_mask(cpu
, domain
->possible_cpus
) {
3956 per_cpu(hmp_cpu_domain
, cpu
) = domain
;
3964 static struct hmp_domain
*hmp_get_hmp_domain_for_cpu(int cpu
)
3966 struct hmp_domain
*domain
;
3967 struct list_head
*pos
;
3969 list_for_each(pos
, &hmp_domains
) {
3970 domain
= list_entry(pos
, struct hmp_domain
, hmp_domains
);
3971 if(cpumask_test_cpu(cpu
, &domain
->possible_cpus
))
3977 static void hmp_online_cpu(int cpu
)
3979 struct hmp_domain
*domain
= hmp_get_hmp_domain_for_cpu(cpu
);
3982 cpumask_set_cpu(cpu
, &domain
->cpus
);
3985 static void hmp_offline_cpu(int cpu
)
3987 struct hmp_domain
*domain
= hmp_get_hmp_domain_for_cpu(cpu
);
3990 cpumask_clear_cpu(cpu
, &domain
->cpus
);
/*
 * Migration thresholds should be in the range [0..1023]
 * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
 * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
 * The default values (512, 256) offer good responsiveness, but may need
 * tweaking suit particular needs.
 *
 * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio)
 * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
 * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
 */
#ifdef CONFIG_HMP_DYNAMIC_THRESHOLD
unsigned int hmp_up_threshold = 1023;
unsigned int hmp_down_threshold = 0;
#else
unsigned int hmp_up_threshold = 512;
unsigned int hmp_down_threshold = 256;
#endif

unsigned int hmp_next_up_threshold = 4096;
unsigned int hmp_next_down_threshold = 4096;
#ifdef CONFIG_SCHED_HMP_ENHANCEMENT
#define hmp_last_up_migration(cpu) \
	cpu_rq(cpu)->cfs.avg.hmp_last_up_migration
#define hmp_last_down_migration(cpu) \
	cpu_rq(cpu)->cfs.avg.hmp_last_down_migration
static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p,
			int prev_cpu, int new_cpu);
#endif
static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se);
static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);

static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
						int *min_cpu);
4028 /* Check if cpu is in fastest hmp_domain */
4029 static inline unsigned int hmp_cpu_is_fastest(int cpu
)
4031 struct list_head
*pos
;
4033 pos
= &hmp_cpu_domain(cpu
)->hmp_domains
;
4034 return pos
== hmp_domains
.next
;
4037 /* Check if cpu is in slowest hmp_domain */
4038 static inline unsigned int hmp_cpu_is_slowest(int cpu
)
4040 struct list_head
*pos
;
4042 pos
= &hmp_cpu_domain(cpu
)->hmp_domains
;
4043 return list_is_last(pos
, &hmp_domains
);
4046 /* Next (slower) hmp_domain relative to cpu */
4047 static inline struct hmp_domain
*hmp_slower_domain(int cpu
)
4049 struct list_head
*pos
;
4051 pos
= &hmp_cpu_domain(cpu
)->hmp_domains
;
4052 return list_entry(pos
->next
, struct hmp_domain
, hmp_domains
);
4055 /* Previous (faster) hmp_domain relative to cpu */
4056 static inline struct hmp_domain
*hmp_faster_domain(int cpu
)
4058 struct list_head
*pos
;
4060 pos
= &hmp_cpu_domain(cpu
)->hmp_domains
;
4061 return list_entry(pos
->prev
, struct hmp_domain
, hmp_domains
);
4065 * Selects a cpu in previous (faster) hmp_domain
4066 * Note that cpumask_any_and() returns the first cpu in the cpumask
4068 static inline unsigned int hmp_select_faster_cpu(struct task_struct
*tsk
,
4071 int lowest_cpu
=NR_CPUS
;
4072 __always_unused
int lowest_ratio
= hmp_domain_min_load(hmp_faster_domain(cpu
), &lowest_cpu
);
4074 * If the lowest-loaded CPU in the domain is allowed by the task affinity
4075 * select that one, otherwise select one which is allowed
4077 if(lowest_cpu
< nr_cpu_ids
&& cpumask_test_cpu(lowest_cpu
,tsk_cpus_allowed(tsk
)))
4080 return cpumask_any_and(&hmp_faster_domain(cpu
)->cpus
,
4081 tsk_cpus_allowed(tsk
));
4085 * Selects a cpu in next (slower) hmp_domain
4086 * Note that cpumask_any_and() returns the first cpu in the cpumask
4088 static inline unsigned int hmp_select_slower_cpu(struct task_struct
*tsk
,
4091 int lowest_cpu
=NR_CPUS
;
4092 __always_unused
int lowest_ratio
= hmp_domain_min_load(hmp_slower_domain(cpu
), &lowest_cpu
);
4094 * If the lowest-loaded CPU in the domain is allowed by the task affinity
4095 * select that one, otherwise select one which is allowed
4097 if(lowest_cpu
< nr_cpu_ids
&& cpumask_test_cpu(lowest_cpu
,tsk_cpus_allowed(tsk
)))
4100 return cpumask_any_and(&hmp_slower_domain(cpu
)->cpus
,
4101 tsk_cpus_allowed(tsk
));
4104 static inline void hmp_next_up_delay(struct sched_entity
*se
, int cpu
)
4106 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
4107 struct cfs_rq
*cfs_rq
= &cpu_rq(cpu
)->cfs
;
4108 hmp_last_up_migration(cpu
) = cfs_rq_clock_task(cfs_rq
);
4109 hmp_last_down_migration(cpu
) = 0;
4111 struct cfs_rq
*cfs_rq
= &cpu_rq(cpu
)->cfs
;
4113 se
->avg
.hmp_last_up_migration
= cfs_rq_clock_task(cfs_rq
);
4114 se
->avg
.hmp_last_down_migration
= 0;
4118 static inline void hmp_next_down_delay(struct sched_entity
*se
, int cpu
)
4120 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
4121 struct cfs_rq
*cfs_rq
= &cpu_rq(cpu
)->cfs
;
4122 hmp_last_down_migration(cpu
) = cfs_rq_clock_task(cfs_rq
);
4123 hmp_last_up_migration(cpu
) = 0;
4125 struct cfs_rq
*cfs_rq
= &cpu_rq(cpu
)->cfs
;
4127 se
->avg
.hmp_last_down_migration
= cfs_rq_clock_task(cfs_rq
);
4128 se
->avg
.hmp_last_up_migration
= 0;
4132 #ifdef CONFIG_HMP_VARIABLE_SCALE
4134 * Heterogenous multiprocessor (HMP) optimizations
4136 * These functions allow to change the growing speed of the load_avg_ratio
4137 * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms
4138 * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
4140 * These functions also allow to change the up and down threshold of HMP
4141 * using /sys/kernel/hmp/{up,down}_threshold.
4142 * Both must be between 0 and 1023. The threshold that is compared
4143 * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
4145 * For instance, if load_avg_period = 64 and up_threshold = 512, an idle
4146 * task with a load of 0 will reach the threshold after 64ms of busy loop.
4148 * Changing load_avg_periods_ms has the same effect than changing the
4149 * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
4150 * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the last one
4151 * could trigger overflows.
4152 * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
4153 * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
4154 * could be overflowed for a weight > 2^12 even is the load_avg_contrib
4155 * should still be a 32bits result. This would not happen by multiplicating
4156 * delta time by 1/22 and setting load_avg_period_ms = 706.
4160 * By scaling the delta time it end-up increasing or decrease the
4161 * growing speed of the per entity load_avg_ratio
4162 * The scale factor hmp_data.multiplier is a fixed point
4163 * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
4165 static u64
hmp_variable_scale_convert(u64 delta
)
4167 u64 high
= delta
>> 32ULL;
4168 u64 low
= delta
& 0xffffffffULL
;
4169 low
*= hmp_data
.multiplier
;
4170 high
*= hmp_data
.multiplier
;
4171 return (low
>> HMP_VARIABLE_SCALE_SHIFT
)
4172 + (high
<< (32ULL - HMP_VARIABLE_SCALE_SHIFT
));
4175 static ssize_t
hmp_show(struct kobject
*kobj
,
4176 struct attribute
*attr
, char *buf
)
4179 struct hmp_global_attr
*hmp_attr
=
4180 container_of(attr
, struct hmp_global_attr
, attr
);
4181 int temp
= *(hmp_attr
->value
);
4182 if (hmp_attr
->to_sysfs
!= NULL
)
4183 temp
= hmp_attr
->to_sysfs(temp
);
4184 ret
= sprintf(buf
, "%d\n", temp
);
4188 static ssize_t
hmp_store(struct kobject
*a
, struct attribute
*attr
,
4189 const char *buf
, size_t count
)
4192 ssize_t ret
= count
;
4193 struct hmp_global_attr
*hmp_attr
=
4194 container_of(attr
, struct hmp_global_attr
, attr
);
4195 char *str
= vmalloc(count
+ 1);
4198 memcpy(str
, buf
, count
);
4200 if (sscanf(str
, "%d", &temp
) < 1)
4203 if (hmp_attr
->from_sysfs
!= NULL
)
4204 temp
= hmp_attr
->from_sysfs(temp
);
4208 *(hmp_attr
->value
) = temp
;
4214 static int hmp_period_tofrom_sysfs(int value
)
4216 return (LOAD_AVG_PERIOD
<< HMP_VARIABLE_SCALE_SHIFT
) / value
;
/* max value for threshold is 1024 */
static int hmp_theshold_from_sysfs(int value)
{
	if (value > 1024)
		return -1;	/* hmp_store treats negative as -EINVAL */
	return value;
}
#ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
/* freqinvar control is only 0,1 off/on */
static int hmp_freqinvar_from_sysfs(int value)
{
	if (value < 0 || value > 1)
		return -1;	/* hmp_store treats negative as -EINVAL */
	return value;
}
#endif
4235 static void hmp_attr_add(
4238 int (*to_sysfs
)(int),
4239 int (*from_sysfs
)(int))
4242 while (hmp_data
.attributes
[i
] != NULL
) {
4244 if (i
>= HMP_DATA_SYSFS_MAX
)
4247 hmp_data
.attr
[i
].attr
.mode
= 0644;
4248 hmp_data
.attr
[i
].show
= hmp_show
;
4249 hmp_data
.attr
[i
].store
= hmp_store
;
4250 hmp_data
.attr
[i
].attr
.name
= name
;
4251 hmp_data
.attr
[i
].value
= value
;
4252 hmp_data
.attr
[i
].to_sysfs
= to_sysfs
;
4253 hmp_data
.attr
[i
].from_sysfs
= from_sysfs
;
4254 hmp_data
.attributes
[i
] = &hmp_data
.attr
[i
].attr
;
4255 hmp_data
.attributes
[i
+ 1] = NULL
;
4258 static int hmp_attr_init(void)
4261 memset(&hmp_data
, sizeof(hmp_data
), 0);
4262 /* by default load_avg_period_ms == LOAD_AVG_PERIOD
4265 /* LOAD_AVG_PERIOD is too short to trigger heavy task indicator
4266 so we change it to LOAD_AVG_VARIABLE_PERIOD */
4267 hmp_data
.multiplier
= hmp_period_tofrom_sysfs(LOAD_AVG_VARIABLE_PERIOD
);
4269 hmp_attr_add("load_avg_period_ms",
4270 &hmp_data
.multiplier
,
4271 hmp_period_tofrom_sysfs
,
4272 hmp_period_tofrom_sysfs
);
4273 hmp_attr_add("up_threshold",
4276 hmp_theshold_from_sysfs
);
4277 hmp_attr_add("down_threshold",
4278 &hmp_down_threshold
,
4280 hmp_theshold_from_sysfs
);
4281 hmp_attr_add("init_task_load_period",
4282 &init_task_load_period
,
4285 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
4286 /* default frequency-invariant scaling ON */
4287 hmp_data
.freqinvar_load_scale_enabled
= 1;
4288 hmp_attr_add("frequency_invariant_load_scale",
4289 &hmp_data
.freqinvar_load_scale_enabled
,
4291 hmp_freqinvar_from_sysfs
);
4293 hmp_data
.attr_group
.name
= "hmp";
4294 hmp_data
.attr_group
.attrs
= hmp_data
.attributes
;
4295 ret
= sysfs_create_group(kernel_kobj
,
4296 &hmp_data
.attr_group
);
4299 late_initcall(hmp_attr_init
);
4300 #endif /* CONFIG_HMP_VARIABLE_SCALE */
4302 static inline unsigned int hmp_domain_min_load(struct hmp_domain
*hmpd
,
4306 int min_cpu_runnable_temp
= NR_CPUS
;
4307 unsigned long min_runnable_load
= INT_MAX
;
4308 unsigned long contrib
;
4310 for_each_cpu_mask(cpu
, hmpd
->cpus
) {
4311 /* don't use the divisor in the loop, just at the end */
4312 contrib
= cpu_rq(cpu
)->avg
.runnable_avg_sum
* scale_load_down(1024);
4313 if (contrib
< min_runnable_load
) {
4314 min_runnable_load
= contrib
;
4315 min_cpu_runnable_temp
= cpu
;
4320 *min_cpu
= min_cpu_runnable_temp
;
4322 /* domain will often have at least one empty CPU */
4323 return min_runnable_load
? min_runnable_load
/ (LOAD_AVG_MAX
+ 1) : 0;
4327 * Calculate the task starvation
4328 * This is the ratio of actually running time vs. runnable time.
4329 * If the two are equal the task is getting the cpu time it needs or
4330 * it is alone on the cpu and the cpu is fully utilized.
4332 static inline unsigned int hmp_task_starvation(struct sched_entity
*se
)
4336 starvation
= se
->avg
.usage_avg_sum
* scale_load_down(NICE_0_LOAD
);
4337 starvation
/= (se
->avg
.runnable_avg_sum
+ 1);
4339 return scale_load(starvation
);
4342 static inline unsigned int hmp_offload_down(int cpu
, struct sched_entity
*se
)
4345 int dest_cpu
= NR_CPUS
;
4347 if (hmp_cpu_is_slowest(cpu
))
4350 /* Is the current domain fully loaded? */
4352 min_usage
= hmp_domain_min_load(hmp_cpu_domain(cpu
), NULL
);
4353 if (min_usage
< (NICE_0_LOAD
>>1))
4356 /* Is the task alone on the cpu? */
4357 if (cpu_rq(cpu
)->cfs
.nr_running
< 2)
4360 /* Is the task actually starving? */
4361 /* >=25% ratio running/runnable = starving */
4362 if (hmp_task_starvation(se
) > 768)
4365 /* Does the slower domain have spare cycles? */
4366 min_usage
= hmp_domain_min_load(hmp_slower_domain(cpu
), &dest_cpu
);
4368 if (min_usage
> NICE_0_LOAD
/2)
4371 if (cpumask_test_cpu(dest_cpu
, &hmp_slower_domain(cpu
)->cpus
))
4376 #endif /* CONFIG_SCHED_HMP */
#ifdef CONFIG_MTK_SCHED_CMP
/* CMP migration thresholds, range [0..1023] (see HMP thresholds above) */
unsigned int cmp_up_threshold = 512;
unsigned int cmp_down_threshold = 256;
#endif /* CONFIG_MTK_SCHED_CMP */
#ifdef CONFIG_MTK_SCHED_CMP_TGS
/* bump the group leader's per-cluster cfs running count on enqueue */
static void sched_tg_enqueue_fair(struct rq *rq, struct task_struct *p)
{
	int id;
	unsigned long flags;
	struct task_struct *tg = p->group_leader;

	if (group_leader_is_empty(p))
		return;
	id = get_cluster_id(rq->cpu);
	if (unlikely(WARN_ON(id < 0)))
		return;

	raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags);
	tg->thread_group_info[id].cfs_nr_running++;
	raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags);
}
#endif /* CONFIG_MTK_SCHED_CMP_TGS */
#ifdef CONFIG_MTK_SCHED_CMP_TGS
/* drop the group leader's per-cluster cfs running count on dequeue */
static void sched_tg_dequeue_fair(struct rq *rq, struct task_struct *p)
{
	int id;
	unsigned long flags;
	struct task_struct *tg = p->group_leader;

	if (group_leader_is_empty(p))
		return;
	id = get_cluster_id(rq->cpu);
	if (unlikely(WARN_ON(id < 0)))
		return;

	raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags);
	tg->thread_group_info[id].cfs_nr_running--;
	raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags);
}
#endif /* CONFIG_MTK_SCHED_CMP_TGS */
4422 * The enqueue_task method is called before nr_running is
4423 * increased. Here we update the fair scheduling stats and
4424 * then put the task into the rbtree:
4427 enqueue_task_fair(struct rq
*rq
, struct task_struct
*p
, int flags
)
4429 struct cfs_rq
*cfs_rq
;
4430 struct sched_entity
*se
= &p
->se
;
4432 for_each_sched_entity(se
) {
4435 cfs_rq
= cfs_rq_of(se
);
4436 enqueue_entity(cfs_rq
, se
, flags
);
4439 * end evaluation on encountering a throttled cfs_rq
4441 * note: in the case of encountering a throttled cfs_rq we will
4442 * post the final h_nr_running increment below.
4444 if (cfs_rq_throttled(cfs_rq
))
4446 cfs_rq
->h_nr_running
++;
4448 flags
= ENQUEUE_WAKEUP
;
4451 for_each_sched_entity(se
) {
4452 cfs_rq
= cfs_rq_of(se
);
4453 cfs_rq
->h_nr_running
++;
4455 if (cfs_rq_throttled(cfs_rq
))
4458 update_cfs_shares(cfs_rq
);
4459 update_entity_load_avg(se
, 1);
4463 update_rq_runnable_avg(rq
, rq
->nr_running
);
4465 #ifndef CONFIG_CFS_BANDWIDTH
4466 BUG_ON(rq
->cfs
.nr_running
> rq
->cfs
.h_nr_running
);
4470 #ifdef CONFIG_HMP_TRACER
4471 trace_sched_runqueue_length(rq
->cpu
,rq
->nr_running
);
4472 trace_sched_cfs_length(rq
->cpu
,rq
->cfs
.h_nr_running
);
4474 #ifdef CONFIG_MET_SCHED_HMP
4475 RqLen(rq
->cpu
,rq
->nr_running
);
4476 CfsLen(rq
->cpu
,rq
->cfs
.h_nr_running
);
4479 #ifdef CONFIG_MTK_SCHED_CMP_TGS
4480 sched_tg_enqueue_fair(rq
, p
);
static void set_next_buddy(struct sched_entity *se);
4487 * The dequeue_task method is called before nr_running is
4488 * decreased. We remove the task from the rbtree and
4489 * update the fair scheduling stats:
4491 static void dequeue_task_fair(struct rq
*rq
, struct task_struct
*p
, int flags
)
4493 struct cfs_rq
*cfs_rq
;
4494 struct sched_entity
*se
= &p
->se
;
4495 int task_sleep
= flags
& DEQUEUE_SLEEP
;
4497 for_each_sched_entity(se
) {
4498 cfs_rq
= cfs_rq_of(se
);
4499 dequeue_entity(cfs_rq
, se
, flags
);
4502 * end evaluation on encountering a throttled cfs_rq
4504 * note: in the case of encountering a throttled cfs_rq we will
4505 * post the final h_nr_running decrement below.
4507 if (cfs_rq_throttled(cfs_rq
))
4509 cfs_rq
->h_nr_running
--;
4511 /* Don't dequeue parent if it has other entities besides us */
4512 if (cfs_rq
->load
.weight
) {
4514 * Bias pick_next to pick a task from this cfs_rq, as
4515 * p is sleeping when it is within its sched_slice.
4517 if (task_sleep
&& parent_entity(se
))
4518 set_next_buddy(parent_entity(se
));
4520 /* avoid re-evaluating load for this entity */
4521 se
= parent_entity(se
);
4524 flags
|= DEQUEUE_SLEEP
;
4527 for_each_sched_entity(se
) {
4528 cfs_rq
= cfs_rq_of(se
);
4529 cfs_rq
->h_nr_running
--;
4531 if (cfs_rq_throttled(cfs_rq
))
4534 update_cfs_shares(cfs_rq
);
4535 update_entity_load_avg(se
, 1);
4540 #ifndef CONFIG_CFS_BANDWIDTH
4541 BUG_ON(rq
->cfs
.nr_running
> rq
->cfs
.h_nr_running
);
4543 update_rq_runnable_avg(rq
, 1);
4546 #ifdef CONFIG_HMP_TRACER
4547 trace_sched_runqueue_length(rq
->cpu
,rq
->nr_running
);
4548 trace_sched_cfs_length(rq
->cpu
,rq
->cfs
.h_nr_running
);
4550 #ifdef CONFIG_MET_SCHED_HMP
4551 RqLen(rq
->cpu
,rq
->nr_running
);
4552 CfsLen(rq
->cpu
,rq
->cfs
.h_nr_running
);
4555 #ifdef CONFIG_MTK_SCHED_CMP_TGS
4556 sched_tg_dequeue_fair(rq
, p
);
4561 /* Used instead of source_load when we know the type == 0 */
4562 static unsigned long weighted_cpuload(const int cpu
)
4564 return cpu_rq(cpu
)->cfs
.runnable_load_avg
;
4568 * Return a low guess at the load of a migration-source cpu weighted
4569 * according to the scheduling class and "nice" value.
4571 * We want to under-estimate the load of migration sources, to
4572 * balance conservatively.
4574 static unsigned long source_load(int cpu
, int type
)
4576 struct rq
*rq
= cpu_rq(cpu
);
4577 unsigned long total
= weighted_cpuload(cpu
);
4579 if (type
== 0 || !sched_feat(LB_BIAS
))
4582 return min(rq
->cpu_load
[type
-1], total
);
4586 * Return a high guess at the load of a migration-target cpu weighted
4587 * according to the scheduling class and "nice" value.
4589 static unsigned long target_load(int cpu
, int type
)
4591 struct rq
*rq
= cpu_rq(cpu
);
4592 unsigned long total
= weighted_cpuload(cpu
);
4594 if (type
== 0 || !sched_feat(LB_BIAS
))
4597 return max(rq
->cpu_load
[type
-1], total
);
4600 static unsigned long power_of(int cpu
)
4602 return cpu_rq(cpu
)->cpu_power
;
4605 static unsigned long cpu_avg_load_per_task(int cpu
)
4607 struct rq
*rq
= cpu_rq(cpu
);
4608 unsigned long nr_running
= ACCESS_ONCE(rq
->nr_running
);
4609 unsigned long load_avg
= rq
->cfs
.runnable_load_avg
;
4612 return load_avg
/ nr_running
;
4618 static void task_waking_fair(struct task_struct
*p
)
4620 struct sched_entity
*se
= &p
->se
;
4621 struct cfs_rq
*cfs_rq
= cfs_rq_of(se
);
4624 #ifndef CONFIG_64BIT
4625 u64 min_vruntime_copy
;
4628 min_vruntime_copy
= cfs_rq
->min_vruntime_copy
;
4630 min_vruntime
= cfs_rq
->min_vruntime
;
4631 } while (min_vruntime
!= min_vruntime_copy
);
4633 min_vruntime
= cfs_rq
->min_vruntime
;
4636 se
->vruntime
-= min_vruntime
;
4639 #ifdef CONFIG_FAIR_GROUP_SCHED
4641 * effective_load() calculates the load change as seen from the root_task_group
4643 * Adding load to a group doesn't make a group heavier, but can cause movement
4644 * of group shares between cpus. Assuming the shares were perfectly aligned one
4645 * can calculate the shift in shares.
4647 * Calculate the effective load difference if @wl is added (subtracted) to @tg
4648 * on this @cpu and results in a total addition (subtraction) of @wg to the
4649 * total group weight.
4651 * Given a runqueue weight distribution (rw_i) we can compute a shares
4652 * distribution (s_i) using:
4654 * s_i = rw_i / \Sum rw_j (1)
4656 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4657 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4658 * shares distribution (s_i):
4660 * rw_i = { 2, 4, 1, 0 }
4661 * s_i = { 2/7, 4/7, 1/7, 0 }
4663 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4664 * task used to run on and the CPU the waker is running on), we need to
4665 * compute the effect of waking a task on either CPU and, in case of a sync
4666 * wakeup, compute the effect of the current task going to sleep.
4668 * So for a change of @wl to the local @cpu with an overall group weight change
4669 * of @wl we can compute the new shares distribution (s'_i) using:
4671 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
4673 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4674 * differences in waking a task to CPU 0. The additional task changes the
4675 * weight and shares distributions like:
4677 * rw'_i = { 3, 4, 1, 0 }
4678 * s'_i = { 3/8, 4/8, 1/8, 0 }
4680 * We can then compute the difference in effective weight by using:
4682 * dw_i = S * (s'_i - s_i) (3)
4684 * Where 'S' is the group weight as seen by its parent.
4686 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4687 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4688 * 4/7) times the weight of the group.
4690 static long effective_load(struct task_group
*tg
, int cpu
, long wl
, long wg
)
4692 struct sched_entity
*se
= tg
->se
[cpu
];
4694 if (!tg
->parent
) /* the trivial, non-cgroup case */
4697 for_each_sched_entity(se
) {
4703 * W = @wg + \Sum rw_j
4705 W
= wg
+ calc_tg_weight(tg
, se
->my_q
);
4710 w
= se
->my_q
->load
.weight
+ wl
;
4713 * wl = S * s'_i; see (2)
4716 wl
= (w
* tg
->shares
) / W
;
4721 * Per the above, wl is the new se->load.weight value; since
4722 * those are clipped to [MIN_SHARES, ...) do so now. See
4723 * calc_cfs_shares().
4725 if (wl
< MIN_SHARES
)
4729 * wl = dw_i = S * (s'_i - s_i); see (3)
4731 wl
-= se
->load
.weight
;
4734 * Recursively apply this logic to all parent groups to compute
4735 * the final effective load change on the root group. Since
4736 * only the @tg group gets extra weight, all parent groups can
4737 * only redistribute existing shares. @wl is the shift in shares
4738 * resulting from this level per the above.
4747 static inline unsigned long effective_load(struct task_group
*tg
, int cpu
,
4748 unsigned long wl
, unsigned long wg
)
4755 static int wake_affine(struct sched_domain
*sd
, struct task_struct
*p
, int sync
)
4757 s64 this_load
, load
;
4758 int idx
, this_cpu
, prev_cpu
;
4759 unsigned long tl_per_task
;
4760 struct task_group
*tg
;
4761 unsigned long weight
;
4765 this_cpu
= smp_processor_id();
4766 prev_cpu
= task_cpu(p
);
4767 load
= source_load(prev_cpu
, idx
);
4768 this_load
= target_load(this_cpu
, idx
);
4771 * If sync wakeup then subtract the (maximum possible)
4772 * effect of the currently running task from the load
4773 * of the current CPU:
4776 tg
= task_group(current
);
4777 weight
= current
->se
.load
.weight
;
4779 this_load
+= effective_load(tg
, this_cpu
, -weight
, -weight
);
4780 load
+= effective_load(tg
, prev_cpu
, 0, -weight
);
4784 weight
= p
->se
.load
.weight
;
4787 * In low-load situations, where prev_cpu is idle and this_cpu is idle
4788 * due to the sync cause above having dropped this_load to 0, we'll
4789 * always have an imbalance, but there's really nothing you can do
4790 * about that, so that's good too.
4792 * Otherwise check if either cpus are near enough in load to allow this
4793 * task to be woken on this_cpu.
4795 if (this_load
> 0) {
4796 s64 this_eff_load
, prev_eff_load
;
4798 this_eff_load
= 100;
4799 this_eff_load
*= power_of(prev_cpu
);
4800 this_eff_load
*= this_load
+
4801 effective_load(tg
, this_cpu
, weight
, weight
);
4803 prev_eff_load
= 100 + (sd
->imbalance_pct
- 100) / 2;
4804 prev_eff_load
*= power_of(this_cpu
);
4805 prev_eff_load
*= load
+ effective_load(tg
, prev_cpu
, 0, weight
);
4807 balanced
= this_eff_load
<= prev_eff_load
;
4812 * If the currently running task will sleep within
4813 * a reasonable amount of time then attract this newly
4816 if (sync
&& balanced
)
4819 schedstat_inc(p
, se
.statistics
.nr_wakeups_affine_attempts
);
4820 tl_per_task
= cpu_avg_load_per_task(this_cpu
);
4823 (this_load
<= load
&&
4824 this_load
+ target_load(prev_cpu
, idx
) <= tl_per_task
)) {
4826 * This domain has SD_WAKE_AFFINE and
4827 * p is cache cold in this domain, and
4828 * there is no bad imbalance.
4830 schedstat_inc(sd
, ttwu_move_affine
);
4831 schedstat_inc(p
, se
.statistics
.nr_wakeups_affine
);
4839 * find_idlest_group finds and returns the least busy CPU group within the
4842 static struct sched_group
*
4843 find_idlest_group(struct sched_domain
*sd
, struct task_struct
*p
,
4844 int this_cpu
, int load_idx
)
4846 struct sched_group
*idlest
= NULL
, *group
= sd
->groups
;
4847 unsigned long min_load
= ULONG_MAX
, this_load
= 0;
4848 int imbalance
= 100 + (sd
->imbalance_pct
-100)/2;
4851 unsigned long load
, avg_load
;
4855 /* Skip over this group if it has no CPUs allowed */
4856 if (!cpumask_intersects(sched_group_cpus(group
),
4857 tsk_cpus_allowed(p
)))
4860 local_group
= cpumask_test_cpu(this_cpu
,
4861 sched_group_cpus(group
));
4863 /* Tally up the load of all CPUs in the group */
4866 for_each_cpu(i
, sched_group_cpus(group
)) {
4867 /* Bias balancing toward cpus of our domain */
4869 load
= source_load(i
, load_idx
);
4871 load
= target_load(i
, load_idx
);
4875 mt_sched_printf("find_idlest_group cpu=%d avg=%lu",
4879 /* Adjust by relative CPU power of the group */
4880 avg_load
= (avg_load
* SCHED_POWER_SCALE
) / group
->sgp
->power
;
4883 this_load
= avg_load
;
4884 mt_sched_printf("find_idlest_group this_load=%lu",
4886 } else if (avg_load
< min_load
) {
4887 min_load
= avg_load
;
4889 mt_sched_printf("find_idlest_group min_load=%lu",
4892 } while (group
= group
->next
, group
!= sd
->groups
);
4894 if (!idlest
|| 100*this_load
< imbalance
*min_load
){
4895 mt_sched_printf("find_idlest_group fail this_load=%lu min_load=%lu, imbalance=%d",
4896 this_load
, min_load
, imbalance
);
4903 * find_idlest_cpu - find the idlest cpu among the cpus in group.
4906 find_idlest_cpu(struct sched_group
*group
, struct task_struct
*p
, int this_cpu
)
4908 unsigned long load
, min_load
= ULONG_MAX
;
4912 /* Traverse only the allowed CPUs */
4913 for_each_cpu_and(i
, sched_group_cpus(group
), tsk_cpus_allowed(p
)) {
4914 load
= weighted_cpuload(i
);
4916 if (load
< min_load
|| (load
== min_load
&& i
== this_cpu
)) {
4926 * Try and locate an idle CPU in the sched_domain.
4928 static int select_idle_sibling(struct task_struct
*p
, int target
)
4930 struct sched_domain
*sd
;
4931 struct sched_group
*sg
;
4932 int i
= task_cpu(p
);
4934 if (idle_cpu(target
))
4938 * If the prevous cpu is cache affine and idle, don't be stupid.
4940 if (i
!= target
&& cpus_share_cache(i
, target
) && idle_cpu(i
))
4944 * Otherwise, iterate the domains and find an elegible idle cpu.
4946 sd
= rcu_dereference(per_cpu(sd_llc
, target
));
4947 for_each_lower_domain(sd
) {
4950 if (!cpumask_intersects(sched_group_cpus(sg
),
4951 tsk_cpus_allowed(p
)))
4954 for_each_cpu(i
, sched_group_cpus(sg
)) {
4955 if (i
== target
|| !idle_cpu(i
))
4959 target
= cpumask_first_and(sched_group_cpus(sg
),
4960 tsk_cpus_allowed(p
));
4964 } while (sg
!= sd
->groups
);
4970 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
4972 * @p: the task want to be located at.
4973 * @clid: the CPU cluster id to be search for the target CPU
4974 * @target: the appropriate CPU for task p, updated by this function.
4979 * 0 if target CPU is not found in this CPU cluster
4981 static int cmp_find_idle_cpu(struct task_struct
*p
, int clid
, int *target
)
4983 struct cpumask cls_cpus
;
4986 get_cluster_cpus(&cls_cpus
, clid
, true);
4987 *target
= cpumask_any_and(&cls_cpus
, tsk_cpus_allowed(p
));
4988 for_each_cpu(j
, &cls_cpus
) {
4989 if (idle_cpu(j
) && cpumask_test_cpu(j
, tsk_cpus_allowed(p
))) {
4994 if (*target
>= nr_cpu_ids
)
4995 return 0; // task is not allow in this CPU cluster
4996 mt_sched_printf("wakeup %d %s cpu=%d, max_clid/max_idle_clid=%d",
4997 p
->pid
, p
->comm
, *target
, clid
);
5002 #if !defined(CONFIG_SCHED_HMP)
5003 #define TGS_WAKEUP_EXPERIMENT
5005 static int cmp_select_task_rq_fair(struct task_struct
*p
, int sd_flag
, int *cpu
)
5008 int max_cnt
=0, tskcnt
;
5010 int idle_cnt
, max_idle_cnt
=0;
5011 int in_prev
=0, prev_cluster
=0;
5012 struct cpumask cls_cpus
;
5015 num_cluster
=arch_get_nr_clusters();
5016 for(i
=0; i
< num_cluster
; i
++) {
5017 tskcnt
= p
->group_leader
->thread_group_info
[i
].nr_running
;
5019 get_cluster_cpus(&cls_cpus
, i
, true);
5021 for_each_cpu(j
, &cls_cpus
) {
5022 #ifdef TGS_WAKEUP_EXPERIMENT
5023 if (arch_is_big_little()) {
5024 int bcpu
= arch_cpu_is_big(j
);
5025 if (bcpu
&& p
->se
.avg
.load_avg_ratio
>= cmp_up_threshold
) {
5028 mt_sched_printf("[heavy task] wakeup load=%ld up_th=%u pid=%d name=%s cpu=%d, tgs_clid=%d in_prev=%d",
5029 p
->se
.avg
.load_avg_ratio
, cmp_up_threshold
, p
->pid
, p
->comm
, *cpu
, tgs_clid
, in_prev
);
5032 if (!bcpu
&& p
->se
.avg
.load_avg_ratio
< cmp_down_threshold
) {
5035 mt_sched_printf("[light task] wakeup load=%ld down_th=%u pid=%d name=%s cpu=%d, tgs_clid=%d in_prev=%d",
5036 p
->se
.avg
.load_avg_ratio
, cmp_down_threshold
, p
->pid
, p
->comm
, *cpu
, tgs_clid
, in_prev
);
5044 mt_sched_printf("wakeup load=%ld pid=%d name=%s clid=%d idle_cnt=%d tskcnt=%d max_cnt=%d, cls_cpus=%02lx, onlineCPU=%02lx",
5045 p
->se
.avg
.load_avg_ratio
, p
->pid
, p
->comm
, i
, idle_cnt
, tskcnt
, max_cnt
,
5046 *cpumask_bits(&cls_cpus
), *cpumask_bits(cpu_online_mask
));
5051 if (i
== get_cluster_id(*cpu
))
5055 if ( (tskcnt
> max_cnt
) || ((tskcnt
== max_cnt
) && prev_cluster
)) {
5056 in_prev
= prev_cluster
;
5060 } else if (0 == max_cnt
) {
5061 if ((idle_cnt
> max_idle_cnt
) || ((idle_cnt
== max_idle_cnt
) && prev_cluster
)) {
5062 in_prev
= prev_cluster
;
5064 max_idle_cnt
= idle_cnt
;
5068 mt_sched_printf("wakeup %d %s i=%d idle_cnt=%d tgs_clid=%d max_cnt=%d max_idle_cnt=%d in_prev=%d",
5069 p
->pid
, p
->comm
, i
, idle_cnt
, tgs_clid
, max_cnt
, max_idle_cnt
, in_prev
);
5072 #ifdef TGS_WAKEUP_EXPERIMENT
5075 mt_sched_printf("wakeup %d %s cpu=%d, tgs_clid=%d in_prev=%d",
5076 p
->pid
, p
->comm
, *cpu
, tgs_clid
, in_prev
);
5078 if(-1 != tgs_clid
&& !in_prev
&& cmp_find_idle_cpu(p
, tgs_clid
, cpu
))
5085 #ifdef CONFIG_MTK_SCHED_TRACERS
5087 #define LB_AFFINITY 0x10
5088 #define LB_BUDDY 0x20
5089 #define LB_FORK 0x30
5090 #define LB_CMP_SHIFT 8
5091 #define LB_CMP 0x4000
5092 #define LB_SMP_SHIFT 16
5093 #define LB_SMP 0x500000
5094 #define LB_HMP_SHIFT 24
5095 #define LB_HMP 0x60000000
5099 * sched_balance_self: balance the current task (running on cpu) in domains
5100 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
5103 * Balance, ie. select the least loaded group.
5105 * Returns the target CPU number, or the same CPU if no balancing is needed.
5107 * preempt must be disabled.
5110 select_task_rq_fair(struct task_struct
*p
, int sd_flag
, int wake_flags
)
5112 struct sched_domain
*tmp
, *affine_sd
= NULL
, *sd
= NULL
;
5113 int cpu
= smp_processor_id();
5114 int prev_cpu
= task_cpu(p
);
5116 int want_affine
= 0;
5117 int sync
= wake_flags
& WF_SYNC
;
5118 #if defined(CONFIG_SCHED_HMP) && !defined(CONFIG_SCHED_HMP_ENHANCEMENT)
5119 int target_cpu
= nr_cpu_ids
;
5121 #ifdef CONFIG_MTK_SCHED_TRACERS
5124 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
5126 int cmp_cpu_found
=0;
5128 #ifdef CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK
5129 int buddy_cpu
= per_cpu(sd_pack_buddy
, cpu
);
5132 if (p
->nr_cpus_allowed
== 1)
5134 #ifdef CONFIG_MTK_SCHED_TRACERS
5135 trace_sched_select_task_rq(p
, (LB_AFFINITY
| prev_cpu
), prev_cpu
, prev_cpu
);
5140 #ifdef CONFIG_HMP_PACK_SMALL_TASK
5141 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
5142 if (check_pack_buddy(cpu
, p
) && PA_ENABLE
) {
5143 PACK_FROM_CPUX_TO_CPUY_COUNT
[cpu
][per_cpu(sd_pack_buddy
, cpu
)]++;
5145 #ifdef CONFIG_HMP_TRACER
5146 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_PACK_FORM_CPUX_TO_CPUY
, p
->pid
, cpu
, per_cpu(sd_pack_buddy
, cpu
));
5147 #endif /* CONFIG_HMP_TRACER */
5150 if(strcmp(p
->comm
, PA_MON
) == 0 && cpu
!= per_cpu(sd_pack_buddy
, cpu
)) {
5151 printk(KERN_EMERG
"[PA] %s PACK From CPU%d to CPU%d\n", p
->comm
, cpu
, per_cpu(sd_pack_buddy
, cpu
));
5152 printk(KERN_EMERG
"[PA] Buddy RQ Usage = %u, Period = %u, NR = %u\n",
5153 per_cpu(BUDDY_CPU_RQ_USAGE
, per_cpu(sd_pack_buddy
, cpu
)),
5154 per_cpu(BUDDY_CPU_RQ_PERIOD
, per_cpu(sd_pack_buddy
, cpu
)),
5155 per_cpu(BUDDY_CPU_RQ_NR
, per_cpu(sd_pack_buddy
, cpu
)));
5156 printk(KERN_EMERG
"[PA] Task Usage = %u, Period = %u\n",
5157 per_cpu(TASK_USGAE
, cpu
),
5158 per_cpu(TASK_PERIOD
, cpu
));
5161 #else /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5162 if (check_pack_buddy(cpu
, p
)) {
5163 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5164 #ifdef CONFIG_MTK_SCHED_TRACERS
5165 new_cpu
= per_cpu(sd_pack_buddy
, cpu
);
5166 trace_sched_select_task_rq(p
, (LB_BUDDY
| new_cpu
), prev_cpu
, new_cpu
);
5168 return per_cpu(sd_pack_buddy
, cpu
);
5170 #elif defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK)
5171 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
5172 if (PA_ENABLE
&& (sd_flag
& SD_BALANCE_WAKE
) && (check_pack_buddy(buddy_cpu
, p
))) {
5174 if ((sd_flag
& SD_BALANCE_WAKE
) && (check_pack_buddy(buddy_cpu
, p
))) {
5176 struct thread_group_info_t
*src_tginfo
, *dst_tginfo
;
5177 src_tginfo
= &p
->group_leader
->thread_group_info
[get_cluster_id(prev_cpu
)]; //Compare with previous cpu(Not current cpu)
5178 dst_tginfo
= &p
->group_leader
->thread_group_info
[get_cluster_id(buddy_cpu
)];
5179 if((get_cluster_id(prev_cpu
) == get_cluster_id(buddy_cpu
)) ||
5180 (src_tginfo
->nr_running
< dst_tginfo
->nr_running
))
5182 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
5183 PACK_FROM_CPUX_TO_CPUY_COUNT
[cpu
][buddy_cpu
]++;
5184 mt_sched_printf("[PA]pid=%d, Pack to CPU%d(CPU%d's buddy)\n", p
->pid
,buddy_cpu
,cpu
);
5188 if(strcmp(p
->comm
, &PA_MON
[i
][0]) == 0) {
5189 TASK_PACK_CPU_COUNT
[i
][buddy_cpu
]++;
5190 printk(KERN_EMERG
"[PA] %s PACK to CPU%d(CPU%d's buddy), pre(cpu%d)\n", p
->comm
, buddy_cpu
,cpu
, prev_cpu
);
5191 printk(KERN_EMERG
"[PA] Buddy RQ Usage = %u, Period = %u, NR = %u\n",
5192 per_cpu(BUDDY_CPU_RQ_USAGE
, buddy_cpu
),
5193 per_cpu(BUDDY_CPU_RQ_PERIOD
, buddy_cpu
),
5194 per_cpu(BUDDY_CPU_RQ_NR
, buddy_cpu
));
5195 printk(KERN_EMERG
"[PA] Task Usage = %u, Period = %u\n",
5196 per_cpu(TASK_USGAE
, cpu
),
5197 per_cpu(TASK_PERIOD
, cpu
));
5202 #endif //CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
5203 #ifdef CONFIG_MTK_SCHED_TRACERS
5204 trace_sched_select_task_rq(p
, (LB_BUDDY
| buddy_cpu
), prev_cpu
, buddy_cpu
);
5209 #endif /* CONFIG_HMP_PACK_SMALL_TASK */
5211 #ifdef CONFIG_SCHED_HMP
5212 /* always put non-kernel forking tasks on a big domain */
5213 if (p
->mm
&& (sd_flag
& SD_BALANCE_FORK
)) {
5214 if(hmp_cpu_is_fastest(prev_cpu
)) {
5215 struct hmp_domain
*hmpdom
= list_entry(&hmp_cpu_domain(prev_cpu
)->hmp_domains
, struct hmp_domain
, hmp_domains
);
5216 __always_unused
int lowest_ratio
= hmp_domain_min_load(hmpdom
, &new_cpu
);
5217 if(new_cpu
< nr_cpu_ids
&& cpumask_test_cpu(new_cpu
,tsk_cpus_allowed(p
)))
5219 #ifdef CONFIG_MTK_SCHED_TRACERS
5220 trace_sched_select_task_rq(p
, (LB_FORK
| new_cpu
), prev_cpu
, new_cpu
);
5226 new_cpu
= cpumask_any_and(&hmp_faster_domain(cpu
)->cpus
,
5227 tsk_cpus_allowed(p
));
5228 if(new_cpu
< nr_cpu_ids
)
5230 #ifdef CONFIG_MTK_SCHED_TRACERS
5231 trace_sched_select_task_rq(p
, (LB_FORK
| new_cpu
), prev_cpu
, new_cpu
);
5237 new_cpu
= hmp_select_faster_cpu(p
, prev_cpu
);
5238 if (new_cpu
< nr_cpu_ids
)
5240 #ifdef CONFIG_MTK_SCHED_TRACERS
5241 trace_sched_select_task_rq(p
, (LB_FORK
| new_cpu
), prev_cpu
, new_cpu
);
5246 // to recover new_cpu value
5247 if (new_cpu
>= nr_cpu_ids
)
5252 if (sd_flag
& SD_BALANCE_WAKE
) {
5253 if (cpumask_test_cpu(cpu
, tsk_cpus_allowed(p
)))
5258 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
5260 cmp_cpu_found
= cmp_select_task_rq_fair(p
, sd_flag
, &cmp_cpu
);
5261 if (cmp_cpu_found
&& (cmp_cpu
< nr_cpu_ids
)) {
5264 #ifdef CONFIG_MTK_SCHED_TRACERS
5265 policy
|= (new_cpu
<< LB_CMP_SHIFT
);
5268 mt_sched_printf("wakeup %d %s sd_flag=%x cmp_cpu_found=%d, cpu=%d, want_affine=%d ",
5269 p
->pid
, p
->comm
, sd_flag
, cmp_cpu_found
, cpu
, want_affine
);
5274 for_each_domain(cpu
, tmp
) {
5275 mt_sched_printf("wakeup %d %s tmp->flags=%x, cpu=%d, prev_cpu=%d, new_cpu=%d",
5276 p
->pid
, p
->comm
, tmp
->flags
, cpu
, prev_cpu
, new_cpu
);
5278 if (!(tmp
->flags
& SD_LOAD_BALANCE
))
5282 * If both cpu and prev_cpu are part of this domain,
5283 * cpu is a valid SD_WAKE_AFFINE target.
5285 if (want_affine
&& (tmp
->flags
& SD_WAKE_AFFINE
) &&
5286 cpumask_test_cpu(prev_cpu
, sched_domain_span(tmp
))) {
5291 if (tmp
->flags
& sd_flag
)
5296 if (cpu
!= prev_cpu
&& wake_affine(affine_sd
, p
, sync
))
5299 new_cpu
= select_idle_sibling(p
, prev_cpu
);
5303 mt_sched_printf("wakeup %d %s sd=%p", p
->pid
, p
->comm
, sd
);
5306 int load_idx
= sd
->forkexec_idx
;
5307 struct sched_group
*group
;
5310 mt_sched_printf("wakeup %d %s find_idlest_group cpu=%d sd->flags=%x sd_flag=%x",
5311 p
->pid
, p
->comm
, cpu
, sd
->flags
, sd_flag
);
5313 if (!(sd
->flags
& sd_flag
)) {
5318 if (sd_flag
& SD_BALANCE_WAKE
)
5319 load_idx
= sd
->wake_idx
;
5321 mt_sched_printf("wakeup %d %s find_idlest_group cpu=%d",
5322 p
->pid
, p
->comm
, cpu
);
5323 group
= find_idlest_group(sd
, p
, cpu
, load_idx
);
5326 mt_sched_printf("wakeup %d %s find_idlest_group child",
5331 new_cpu
= find_idlest_cpu(group
, p
, cpu
);
5332 if (new_cpu
== -1 || new_cpu
== cpu
) {
5333 /* Now try balancing at a lower domain level of cpu */
5335 mt_sched_printf("wakeup %d %s find_idlest_cpu sd->child=%p",
5336 p
->pid
, p
->comm
, sd
);
5340 /* Now try balancing at a lower domain level of new_cpu */
5341 mt_sched_printf("wakeup %d %s find_idlest_cpu cpu=%d sd=%p",
5342 p
->pid
, p
->comm
, new_cpu
, sd
);
5344 weight
= sd
->span_weight
;
5346 for_each_domain(cpu
, tmp
) {
5347 if (weight
<= tmp
->span_weight
)
5349 if (tmp
->flags
& sd_flag
)
5351 mt_sched_printf("wakeup %d %s sd=%p weight=%d, tmp->span_weight=%d",
5352 p
->pid
, p
->comm
, sd
, weight
, tmp
->span_weight
);
5354 /* while loop will break here if sd == NULL */
5357 #ifdef CONFIG_MTK_SCHED_TRACERS
5358 policy
|= (new_cpu
<< LB_SMP_SHIFT
);
5364 mt_sched_printf("wakeup %d %s new_cpu=%x", p
->pid
, p
->comm
, new_cpu
);
5366 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
5370 #ifdef CONFIG_SCHED_HMP
5371 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
5372 new_cpu
= hmp_select_task_rq_fair(sd_flag
, p
, prev_cpu
, new_cpu
);
5373 #ifdef CONFIG_MTK_SCHED_TRACERS
5374 policy
|= (new_cpu
<< LB_HMP_SHIFT
);
5379 if (hmp_up_migration(prev_cpu
, &target_cpu
, &p
->se
)) {
5380 new_cpu
= hmp_select_faster_cpu(p
, prev_cpu
);
5381 hmp_next_up_delay(&p
->se
, new_cpu
);
5382 trace_sched_hmp_migrate(p
, new_cpu
, 0);
5385 if (hmp_down_migration(prev_cpu
, &p
->se
)) {
5386 new_cpu
= hmp_select_slower_cpu(p
, prev_cpu
);
5387 hmp_next_down_delay(&p
->se
, new_cpu
);
5388 trace_sched_hmp_migrate(p
, new_cpu
, 0);
5391 /* Make sure that the task stays in its previous hmp domain */
5392 if (!cpumask_test_cpu(new_cpu
, &hmp_cpu_domain(prev_cpu
)->cpus
))
5394 #endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
5395 #endif /* CONFIG_SCHED_HMP */
5397 #ifdef CONFIG_MTK_SCHED_TRACERS
5398 trace_sched_select_task_rq(p
, policy
, prev_cpu
, new_cpu
);
5401 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
5403 if(strcmp(p
->comm
, PA_MON
) == 0 && cpu
!= new_cpu
) {
5404 printk(KERN_EMERG
"[PA] %s Select From CPU%d to CPU%d\n", p
->comm
, cpu
, new_cpu
);
5407 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5413 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5414 * cfs_rq_of(p) references at time of call are still valid and identify the
5415 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
5416 * other assumptions, including the state of rq->lock, should be made.
5419 migrate_task_rq_fair(struct task_struct
*p
, int next_cpu
)
5421 struct sched_entity
*se
= &p
->se
;
5422 struct cfs_rq
*cfs_rq
= cfs_rq_of(se
);
5425 * Load tracking: accumulate removed load so that it can be processed
5426 * when we next update owning cfs_rq under rq->lock. Tasks contribute
5427 * to blocked load iff they have a positive decay-count. It can never
5428 * be negative here since on-rq tasks have decay-count == 0.
5430 if (se
->avg
.decay_count
) {
5431 se
->avg
.decay_count
= -__synchronize_entity_decay(se
);
5432 atomic_long_add(se
->avg
.load_avg_contrib
,
5433 &cfs_rq
->removed_load
);
5436 #endif /* CONFIG_SMP */
5438 static unsigned long
5439 wakeup_gran(struct sched_entity
*curr
, struct sched_entity
*se
)
5441 unsigned long gran
= sysctl_sched_wakeup_granularity
;
5444 * Since its curr running now, convert the gran from real-time
5445 * to virtual-time in his units.
5447 * By using 'se' instead of 'curr' we penalize light tasks, so
5448 * they get preempted easier. That is, if 'se' < 'curr' then
5449 * the resulting gran will be larger, therefore penalizing the
5450 * lighter, if otoh 'se' > 'curr' then the resulting gran will
5451 * be smaller, again penalizing the lighter task.
5453 * This is especially important for buddies when the leftmost
5454 * task is higher priority than the buddy.
5456 return calc_delta_fair(gran
, se
);
5460 * Should 'se' preempt 'curr'.
5474 wakeup_preempt_entity(struct sched_entity
*curr
, struct sched_entity
*se
)
5476 s64 gran
, vdiff
= curr
->vruntime
- se
->vruntime
;
5481 gran
= wakeup_gran(curr
, se
);
5488 static void set_last_buddy(struct sched_entity
*se
)
5490 if (entity_is_task(se
) && unlikely(task_of(se
)->policy
== SCHED_IDLE
))
5493 for_each_sched_entity(se
)
5494 cfs_rq_of(se
)->last
= se
;
5497 static void set_next_buddy(struct sched_entity
*se
)
5499 if (entity_is_task(se
) && unlikely(task_of(se
)->policy
== SCHED_IDLE
))
5502 for_each_sched_entity(se
)
5503 cfs_rq_of(se
)->next
= se
;
5506 static void set_skip_buddy(struct sched_entity
*se
)
5508 for_each_sched_entity(se
)
5509 cfs_rq_of(se
)->skip
= se
;
5513 * Preempt the current task with a newly woken task if needed:
5515 static void check_preempt_wakeup(struct rq
*rq
, struct task_struct
*p
, int wake_flags
)
5517 struct task_struct
*curr
= rq
->curr
;
5518 struct sched_entity
*se
= &curr
->se
, *pse
= &p
->se
;
5519 struct cfs_rq
*cfs_rq
= task_cfs_rq(curr
);
5520 int scale
= cfs_rq
->nr_running
>= sched_nr_latency
;
5521 int next_buddy_marked
= 0;
5523 if (unlikely(se
== pse
))
5527 * This is possible from callers such as move_task(), in which we
5528 * unconditionally check_prempt_curr() after an enqueue (which may have
5529 * lead to a throttle). This both saves work and prevents false
5530 * next-buddy nomination below.
5532 if (unlikely(throttled_hierarchy(cfs_rq_of(pse
))))
5535 if (sched_feat(NEXT_BUDDY
) && scale
&& !(wake_flags
& WF_FORK
)) {
5536 set_next_buddy(pse
);
5537 next_buddy_marked
= 1;
5541 * We can come here with TIF_NEED_RESCHED already set from new task
5544 * Note: this also catches the edge-case of curr being in a throttled
5545 * group (e.g. via set_curr_task), since update_curr() (in the
5546 * enqueue of curr) will have resulted in resched being set. This
5547 * prevents us from potentially nominating it as a false LAST_BUDDY
5550 if (test_tsk_need_resched(curr
))
5553 /* Idle tasks are by definition preempted by non-idle tasks. */
5554 if (unlikely(curr
->policy
== SCHED_IDLE
) &&
5555 likely(p
->policy
!= SCHED_IDLE
))
5559 * Batch and idle tasks do not preempt non-idle tasks (their preemption
5560 * is driven by the tick):
5562 if (unlikely(p
->policy
!= SCHED_NORMAL
) || !sched_feat(WAKEUP_PREEMPTION
))
5565 find_matching_se(&se
, &pse
);
5566 update_curr(cfs_rq_of(se
));
5568 if (wakeup_preempt_entity(se
, pse
) == 1) {
5570 * Bias pick_next to pick the sched entity that is
5571 * triggering this preemption.
5573 if (!next_buddy_marked
)
5574 set_next_buddy(pse
);
5583 * Only set the backward buddy when the current task is still
5584 * on the rq. This can happen when a wakeup gets interleaved
5585 * with schedule on the ->pre_schedule() or idle_balance()
5586 * point, either of which can * drop the rq lock.
5588 * Also, during early boot the idle thread is in the fair class,
5589 * for obvious reasons its a bad idea to schedule back to it.
5591 if (unlikely(!se
->on_rq
|| curr
== rq
->idle
))
5594 if (sched_feat(LAST_BUDDY
) && scale
&& entity_is_task(se
))
5598 static struct task_struct
*pick_next_task_fair(struct rq
*rq
)
5600 struct task_struct
*p
;
5601 struct cfs_rq
*cfs_rq
= &rq
->cfs
;
5602 struct sched_entity
*se
;
5604 // in case nr_running!=0 but h_nr_running==0
5605 if (!cfs_rq
->nr_running
|| !cfs_rq
->h_nr_running
)
5609 se
= pick_next_entity(cfs_rq
);
5610 set_next_entity(cfs_rq
, se
);
5611 cfs_rq
= group_cfs_rq(se
);
5615 if (hrtick_enabled(rq
))
5616 hrtick_start_fair(rq
, p
);
5622 * Account for a descheduled task:
5624 static void put_prev_task_fair(struct rq
*rq
, struct task_struct
*prev
)
5626 struct sched_entity
*se
= &prev
->se
;
5627 struct cfs_rq
*cfs_rq
;
5629 for_each_sched_entity(se
) {
5630 cfs_rq
= cfs_rq_of(se
);
5631 put_prev_entity(cfs_rq
, se
);
5636 * sched_yield() is very simple
5638 * The magic of dealing with the ->skip buddy is in pick_next_entity.
5640 static void yield_task_fair(struct rq
*rq
)
5642 struct task_struct
*curr
= rq
->curr
;
5643 struct cfs_rq
*cfs_rq
= task_cfs_rq(curr
);
5644 struct sched_entity
*se
= &curr
->se
;
5647 * Are we the only task in the tree?
5649 if (unlikely(rq
->nr_running
== 1))
5652 clear_buddies(cfs_rq
, se
);
5654 if (curr
->policy
!= SCHED_BATCH
) {
5655 update_rq_clock(rq
);
5657 * Update run-time statistics of the 'current'.
5659 update_curr(cfs_rq
);
5661 * Tell update_rq_clock() that we've just updated,
5662 * so we don't do microscopic update in schedule()
5663 * and double the fastpath cost.
5665 rq
->skip_clock_update
= 1;
5671 static bool yield_to_task_fair(struct rq
*rq
, struct task_struct
*p
, bool preempt
)
5673 struct sched_entity
*se
= &p
->se
;
5675 /* throttled hierarchies are not runnable */
5676 if (!se
->on_rq
|| throttled_hierarchy(cfs_rq_of(se
)))
5679 /* Tell the scheduler that we'd really like pse to run next. */
5682 yield_task_fair(rq
);
/**************************************************
 * Fair scheduling class load-balancing methods.
 *
 * BASICS
 *
 * The purpose of load-balancing is to achieve the same basic fairness the
 * per-cpu scheduler provides, namely provide a proportional amount of compute
 * time to each task. This is expressed in the following equation:
 *
 *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
 *
 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
 * W_i,0 is defined as:
 *
 *   W_i,0 = \Sum_j w_i,j                                             (2)
 *
 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
 * is derived from the nice value as per prio_to_weight[].
 *
 * The weight average is an exponential decay average of the instantaneous
 * weight:
 *
 *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
 *
 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
 * can also include other factors [XXX].
 *
 * To achieve this balance we define a measure of imbalance which follows
 * directly from (1):
 *
 *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
 *
 * We then move tasks around to minimize the imbalance. In the continuous
 * function space it is obvious this converges, in the discrete case we get
 * a few fun cases generally called infeasible weight scenarios.
 *
 * [XXX expand on:
 *     - infeasible weights;
 *     - local vs global optima in the discrete case. ]
 *
 *
 * SCHED DOMAINS
 *
 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
 * for all i,j solution, we create a tree of cpus that follows the hardware
 * topology where each level pairs two lower groups (or better). This results
 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
 * tree to only the first of the previous level and we decrease the frequency
 * of load-balance at each level inv. proportional to the number of cpus in
 * the groups.
 *
 * This yields:
 *
 *     log_2 n     1     n
 *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
 *     i = 0      2^i  2^i
 *                               `- size of each group
 *         |         |     `- number of cpus doing load-balance
 *         |         `- frequency of the load-balance at this level
 *         `- sum over all levels
 *
 * Coupled with a limit on how many tasks we can migrate every balance pass,
 * this makes (5) the runtime complexity of the balancer.
 *
 * An important property here is that each CPU is still (indirectly) connected
 * to every other cpu in at most O(log n) steps:
 *
 * The adjacency matrix of the resulting graph is given by:
 *
 *             log_2 n
 *   A_i,j = \Union    (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)   (6)
 *             k = 0
 *
 * And you'll find that:
 *
 *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
 *
 * Showing there's indeed a path between every cpu in at most O(log n) steps.
 * The task movement gives a factor of O(m), giving a convergence complexity
 * of:
 *
 *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
 *
 *
 * WORK CONSERVING
 *
 * In order to avoid CPUs going idle while there's still work to do, new idle
 * balancing is more aggressive and has the newly idle cpu iterate up the domain
 * tree itself instead of relying on other CPUs to bring it work.
 *
 * This adds some complexity to both (5) and (8) but it reduces the total idle
 * time.
 *
 *
 * CGROUPS
 *
 * Cgroups make a horror show out of (2), instead of a simple sum we get:
 *
 *                                s_k,i
 *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
 *                                 S_k
 *
 * Where
 *
 *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
 *
 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
 *
 * The big problem is S_k, its a global sum needed to compute a local (W_i)
 * property.
 *
 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
 *   rewrite all of this once again.]
 */
/* Cap on a sched_domain's balance interval: at most 100ms (HZ/10 jiffies). */
5806 static unsigned long __read_mostly max_load_balance_interval
= HZ
/10;
/* lb_env->flags bits describing the state of one balance attempt. */
5808 #define LBF_ALL_PINNED 0x01
5809 #define LBF_NEED_BREAK 0x02
5810 #define LBF_SOME_PINNED 0x04
/*
 * struct lb_env - parameters/state of one load-balance pass.
 * NOTE(review): several fields (src/dst cpu and rq, flags, loop counters,
 * new_dst_cpu, imbalance) are missing from this extract — confirm against
 * the full file before relying on the layout.
 */
/* domain being balanced */
5813 struct sched_domain
*sd
;
/* cpus of the destination group, used to retry a pinned task elsewhere */
5821 struct cpumask
*dst_grpmask
;
/* idle state of the balancing cpu (CPU_IDLE/CPU_NOT_IDLE/CPU_NEWLY_IDLE) */
5823 enum cpu_idle_type idle
;
5825 /* The set of CPUs under consideration for load-balancing */
5826 struct cpumask
*cpus
;
/* iteration bookkeeping: breather threshold and hard cap on scanned tasks */
5831 unsigned int loop_break
;
5832 unsigned int loop_max
;
5833 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
/* MTK: when set, task_hot() also applies its cache check on idle pulls */
5834 int mt_check_cache_in_idle
;
5836 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
/* MTK profiler: bitmask recording why migration attempts failed */
5837 unsigned int fail_reason
;
5842 * move_task - move a task from one runqueue to another runqueue.
5843 * Both runqueues must be locked.
5845 static void move_task(struct task_struct
*p
, struct lb_env
*env
)
/* dequeue from the source rq, retarget, then enqueue on the destination */
5847 deactivate_task(env
->src_rq
, p
, 0);
5848 set_task_cpu(p
, env
->dst_cpu
);
5849 activate_task(env
->dst_rq
, p
, 0);
/* the migrated task may now preempt whatever runs on the destination */
5850 check_preempt_curr(env
->dst_rq
, p
, 0);
5852 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
/* MTK PA monitor: log migrations of the watched task name.
 * NOTE(review): upstream MTK code iterates a PA_MON table here; the
 * surrounding loop is missing from this extract — confirm. */
5854 if(strcmp(p
->comm
, PA_MON
) == 0) {
5855 printk(KERN_EMERG
"[PA] %s Balance From CPU%d to CPU%d\n", p
->comm
, env
->src_rq
->cpu
, env
->dst_rq
->cpu
);
5858 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5863 * Is this task likely cache-hot:
5865 #if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT)
5867 task_hot(struct task_struct
*p
, u64 now
, struct sched_domain
*sd
, int mt_check_cache_in_idle
)
5870 task_hot(struct task_struct
*p
, u64 now
, struct sched_domain
*sd
)
5875 if (p
->sched_class
!= &fair_sched_class
)
5878 if (unlikely(p
->policy
== SCHED_IDLE
))
5882 * Buddy candidates are cache hot:
5884 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
5885 if (!mt_check_cache_in_idle
){
5886 if ( !this_rq()->nr_running
&& (task_rq(p
)->nr_running
>= 2) )
5890 if (sched_feat(CACHE_HOT_BUDDY
) && this_rq()->nr_running
&&
5891 (&p
->se
== cfs_rq_of(&p
->se
)->next
||
5892 &p
->se
== cfs_rq_of(&p
->se
)->last
))
5895 if (sysctl_sched_migration_cost
== -1)
5897 if (sysctl_sched_migration_cost
== 0)
5900 delta
= now
- p
->se
.exec_start
;
5902 return delta
< (s64
)sysctl_sched_migration_cost
;
5906 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
5909 int can_migrate_task(struct task_struct
*p
, struct lb_env
*env
)
5911 int tsk_cache_hot
= 0;
5913 * We do not migrate tasks that are:
5914 * 1) throttled_lb_pair, or
5915 * 2) cannot be migrated to this CPU due to cpus_allowed, or
5916 * 3) running (obviously), or
5917 * 4) are cache-hot on their current CPU.
5919 if (throttled_lb_pair(task_group(p
), env
->src_cpu
, env
->dst_cpu
))
5922 if (!cpumask_test_cpu(env
->dst_cpu
, tsk_cpus_allowed(p
))) {
5925 schedstat_inc(p
, se
.statistics
.nr_failed_migrations_affine
);
5926 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5927 mt_lbprof_stat_or(env
->fail_reason
, MT_LBPROF_AFFINITY
);
5928 if(mt_lbprof_lt (env
->sd
->mt_lbprof_nr_balance_failed
, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND
)){
5929 char strings
[128]="";
5930 snprintf(strings
, 128, "%d:balance fail:affinity:%d:%d:%s:0x%lu"
5931 , env
->dst_cpu
, env
->src_cpu
, p
->pid
, p
->comm
, p
->cpus_allowed
.bits
[0]);
5932 trace_sched_lbprof_log(strings
);
5937 * Remember if this task can be migrated to any other cpu in
5938 * our sched_group. We may want to revisit it if we couldn't
5939 * meet load balance goals by pulling other tasks on src_cpu.
5941 * Also avoid computing new_dst_cpu if we have already computed
5942 * one in current iteration.
5944 if (!env
->dst_grpmask
|| (env
->flags
& LBF_SOME_PINNED
))
5947 /* Prevent to re-select dst_cpu via env's cpus */
5948 for_each_cpu_and(cpu
, env
->dst_grpmask
, env
->cpus
) {
5949 if (cpumask_test_cpu(cpu
, tsk_cpus_allowed(p
))) {
5950 env
->flags
|= LBF_SOME_PINNED
;
5951 env
->new_dst_cpu
= cpu
;
5959 /* Record that we found atleast one task that could run on dst_cpu */
5960 env
->flags
&= ~LBF_ALL_PINNED
;
5962 if (task_running(env
->src_rq
, p
)) {
5963 schedstat_inc(p
, se
.statistics
.nr_failed_migrations_running
);
5964 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5965 mt_lbprof_stat_or(env
->fail_reason
, MT_LBPROF_RUNNING
);
5966 if( mt_lbprof_lt (env
->sd
->mt_lbprof_nr_balance_failed
, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND
)){
5967 char strings
[128]="";
5968 snprintf(strings
, 128, "%d:balance fail:running:%d:%d:%s"
5969 , env
->dst_cpu
, env
->src_cpu
, p
->pid
, p
->comm
);
5970 trace_sched_lbprof_log(strings
);
5977 * Aggressive migration if:
5978 * 1) task is cache cold, or
5979 * 2) too many balance attempts have failed.
5981 #if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT)
5982 tsk_cache_hot
= task_hot(p
, env
->src_rq
->clock_task
, env
->sd
, env
->mt_check_cache_in_idle
);
5984 tsk_cache_hot
= task_hot(p
, env
->src_rq
->clock_task
, env
->sd
);
5986 if (!tsk_cache_hot
||
5987 env
->sd
->nr_balance_failed
> env
->sd
->cache_nice_tries
) {
5989 if (tsk_cache_hot
) {
5990 schedstat_inc(env
->sd
, lb_hot_gained
[env
->idle
]);
5991 schedstat_inc(p
, se
.statistics
.nr_forced_migrations
);
5997 schedstat_inc(p
, se
.statistics
.nr_failed_migrations_hot
);
5998 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5999 mt_lbprof_stat_or(env
->fail_reason
, MT_LBPROF_CACHEHOT
);
6000 if(mt_lbprof_lt (env
->sd
->mt_lbprof_nr_balance_failed
, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND
)){
6001 char strings
[128]="";
6002 snprintf(strings
, 128, "%d:balance fail:cache hot:%d:%d:%s"
6003 , env
->dst_cpu
, env
->src_cpu
, p
->pid
, p
->comm
);
6004 trace_sched_lbprof_log(strings
);
6011 * move_one_task tries to move exactly one task from busiest to this_rq, as
6012 * part of active balancing operations within "domain".
6013 * Returns 1 if successful and 0 otherwise.
6015 * Called with both runqueues locked.
/*
 * move_one_task - try to pull exactly one migratable task from env->src_rq.
 * Returns 1 on success, 0 if no task could be moved (per the header comment
 * elsewhere in this file). Both runqueues are locked by the caller.
 */
6017 static int move_one_task(struct lb_env
*env
)
6019 struct task_struct
*p
, *n
;
6020 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
/* active balance: always apply the cache-hot check, even when idle */
6021 env
->mt_check_cache_in_idle
= 1;
6023 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
/* reset the per-attempt failure-reason bitmask */
6024 mt_lbprof_stat_set(env
->fail_reason
, MT_LBPROF_NO_TRIGGER
);
/* scan the source rq's cfs task list for the first migratable task */
6027 list_for_each_entry_safe(p
, n
, &env
->src_rq
->cfs_tasks
, se
.group_node
) {
6028 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
/* MTK: skip light tasks whose source is a non-busy buddy cpu */
6029 if(need_lazy_balance(env
->dst_cpu
, env
->src_cpu
, p
))
6032 if (!can_migrate_task(p
, env
))
6037 * Right now, this is only the second place move_task()
6038 * is called, so we can safely collect move_task()
6039 * stats here rather than inside move_task().
6041 schedstat_inc(env
->sd
, lb_gained
[env
->idle
]);
/* forward declaration: hierarchical (cgroup-aware) load of one task */
6047 static unsigned long task_h_load(struct task_struct
*p
);
/* take a breather in move_tasks() after scanning this many tasks */
6049 static const unsigned int sched_nr_migrate_break
= 32;
6051 /* in second round load balance, we migrate heavy load_weight task
6052 as long as RT tasks exist in busy cpu*/
6053 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
/* true when moving a task of weight lw would overshoot imbalance im;
 * the MTK variant relaxes this when the busy cpu still runs RT tasks */
6054 #define over_imbalance(lw, im) \
6055 (((lw)/2 > (im)) && \
6056 ((env->mt_check_cache_in_idle==1) || \
6057 (env->src_rq->rt.rt_nr_running==0) || \
/* plain variant: overshoot iff half the task's weight exceeds im */
6060 #define over_imbalance(lw, im) (((lw) / 2) > (im))
6064 * move_tasks tries to move up to imbalance weighted load from busiest to
6065 * this_rq, as part of a balancing operation within domain "sd".
6066 * Returns 1 if successful and 0 otherwise.
6068 * Called with both runqueues locked.
6070 static int move_tasks(struct lb_env
*env
)
6072 struct list_head
*tasks
= &env
->src_rq
->cfs_tasks
;
6073 struct task_struct
*p
;
6077 if (env
->imbalance
<= 0)
6080 mt_sched_printf("move_tasks start ");
6082 while (!list_empty(tasks
)) {
6083 p
= list_first_entry(tasks
, struct task_struct
, se
.group_node
);
6086 /* We've more or less seen every task there is, call it quits */
6087 if (env
->loop
> env
->loop_max
)
6090 /* take a breather every nr_migrate tasks */
6091 if (env
->loop
> env
->loop_break
) {
6092 env
->loop_break
+= sched_nr_migrate_break
;
6093 env
->flags
|= LBF_NEED_BREAK
;
6096 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6097 if(need_lazy_balance(env
->dst_cpu
, env
->src_cpu
, p
))
6100 if (!can_migrate_task(p
, env
))
6103 load
= task_h_load(p
);
6105 if (sched_feat(LB_MIN
) && load
< 16 && !env
->sd
->nr_balance_failed
)
6108 if (over_imbalance(load
, env
->imbalance
))
6115 env
->imbalance
-= load
;
6117 #ifdef CONFIG_PREEMPT
6119 * NEWIDLE balancing is a source of latency, so preemptible
6120 * kernels will stop after the first task is pulled to minimize
6121 * the critical section.
6123 if (env
->idle
== CPU_NEWLY_IDLE
)
6128 * We only want to steal up to the prescribed amount of
6131 if (env
->imbalance
<= 0)
6136 list_move_tail(&p
->se
.group_node
, tasks
);
6140 * Right now, this is one of only two places move_task() is called,
6141 * so we can safely collect move_task() stats here rather than
6142 * inside move_task().
6144 schedstat_add(env
->sd
, lb_gained
[env
->idle
], pulled
);
6146 mt_sched_printf("move_tasks end");
6151 #ifdef CONFIG_MTK_SCHED_CMP
6152 #ifdef CONFIG_MTK_SCHED_CMP_TGS
6153 static int cmp_can_migrate_task(struct task_struct
*p
, struct lb_env
*env
)
6155 struct sched_domain
*sd
= env
->sd
;
6159 if (!(sd
->flags
& SD_BALANCE_TG
))
6162 if (arch_is_multi_cluster()) {
6163 int src_clid
, dst_clid
;
6165 struct thread_group_info_t
*src_tginfo
, *dst_tginfo
;
6167 src_clid
= get_cluster_id(env
->src_cpu
);
6168 dst_clid
= get_cluster_id(env
->dst_cpu
);
6169 BUG_ON(dst_clid
== -1 || src_clid
== -1);
6170 BUG_ON(p
== NULL
|| p
->group_leader
== NULL
);
6171 src_tginfo
= &p
->group_leader
->thread_group_info
[src_clid
];
6172 dst_tginfo
= &p
->group_leader
->thread_group_info
[dst_clid
];
6173 src_nr_cpus
= nr_cpus_in_cluster(src_clid
, false);
6175 #ifdef CONFIG_MT_SCHED_INFO
6176 mt_sched_printf("check rule0: pid=%d comm=%s load=%ld src:clid=%d tginfo->nr_running=%ld nr_cpus=%d load_avg_ratio=%ld",
6177 p
->pid
, p
->comm
, p
->se
.avg
.load_avg_ratio
,
6178 src_clid
, src_tginfo
->nr_running
, src_nr_cpus
,
6179 src_tginfo
->load_avg_ratio
);
6181 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
6182 if ( (!thread_group_empty(p
)) &&
6183 (src_tginfo
->nr_running
<= src_nr_cpus
) &&
6184 (src_tginfo
->nr_running
> dst_tginfo
->nr_running
)){
6185 mt_sched_printf("hit ruleA: bypass pid=%d comm=%s src:nr_running=%lu nr_cpus=%d dst:nr_running=%lu",
6186 p
->pid
, p
->comm
, src_tginfo
->nr_running
, src_nr_cpus
, dst_tginfo
->nr_running
);
6194 static int need_migrate_task_immediately(struct task_struct
*p
,
6195 struct lb_env
*env
, struct clb_env
*clbenv
)
6197 struct sched_domain
*sd
= env
->sd
;
6201 if (arch_is_big_little()) {
6202 mt_sched_printf("[%s] b.L arch", __func__
);
6203 #ifdef CONFIG_MT_SCHED_INFO
6204 mt_sched_printf("check rule0: pid=%d comm=%s src=%d dst=%d p->prio=%d p->se.avg.load_avg_ratio=%ld",
6205 p
->pid
, p
->comm
, env
->src_cpu
, env
->dst_cpu
, p
->prio
, p
->se
.avg
.load_avg_ratio
);
6207 /* from LITTLE to big */
6208 if (arch_cpu_is_little(env
->src_cpu
) && arch_cpu_is_big(env
->dst_cpu
)) {
6209 BUG_ON(env
->src_cpu
!= clbenv
->ltarget
);
6210 if (p
->se
.avg
.load_avg_ratio
>= clbenv
->bstats
.threshold
)
6213 /* from big to LITTLE */
6214 } else if (arch_cpu_is_big(env
->src_cpu
) && arch_cpu_is_little(env
->dst_cpu
)) {
6215 BUG_ON(env
->src_cpu
!= clbenv
->btarget
);
6216 if (p
->se
.avg
.load_avg_ratio
< clbenv
->lstats
.threshold
)
6222 if (arch_is_multi_cluster() && (sd
->flags
& SD_BALANCE_TG
)) {
6223 int src_clid
, dst_clid
;
6225 struct thread_group_info_t
*src_tginfo
, *dst_tginfo
;
6227 src_clid
= get_cluster_id(env
->src_cpu
);
6228 dst_clid
= get_cluster_id(env
->dst_cpu
);
6229 BUG_ON(dst_clid
== -1 || src_clid
== -1);
6230 BUG_ON(p
== NULL
|| p
->group_leader
== NULL
);
6231 src_tginfo
= &p
->group_leader
->thread_group_info
[src_clid
];
6232 dst_tginfo
= &p
->group_leader
->thread_group_info
[dst_clid
];
6233 src_nr_cpus
= nr_cpus_in_cluster(src_clid
, false);
6234 mt_sched_printf("[%s] L.L arch", __func__
);
6236 if ((p
->se
.avg
.load_avg_ratio
*4 >= NICE_0_LOAD
*3) &&
6237 src_tginfo
->nr_running
> src_nr_cpus
&&
6238 src_tginfo
->load_avg_ratio
*10 > NICE_0_LOAD
*src_nr_cpus
*9) {
6239 //pr_warn("[%s] hit rule0, candidate_load_move/load_move (%ld/%ld)\n",
6240 // __func__, candidate_load_move, env->imbalance);
6250 * move_tasks tries to move up to load_move weighted load from busiest to
6251 * this_rq, as part of a balancing operation within domain "sd".
6252 * Returns 1 if successful and 0 otherwise.
6254 * Called with both runqueues locked.
6256 static int cmp_move_tasks(struct sched_domain
*sd
, struct lb_env
*env
)
6258 struct list_head
*tasks
= &env
->src_rq
->cfs_tasks
;
6259 struct task_struct
*p
;
6260 unsigned long load
= 0;
6263 long tg_load_move
, other_load_move
;
6264 struct list_head tg_tasks
, other_tasks
;
6265 int src_clid
, dst_clid
;
6266 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
6267 struct cpumask tmp
, *cpus
= &tmp
;
6272 struct clb_env clbenv
;
6273 struct cpumask srcmask
, dstmask
;
6275 if (env
->imbalance
<= 0)
6278 other_load_move
= env
->imbalance
;
6279 INIT_LIST_HEAD(&other_tasks
);
6281 // if (sd->flags & SD_BALANCE_TG) {
6282 tg_load_move
= env
->imbalance
;
6283 INIT_LIST_HEAD(&tg_tasks
);
6284 src_clid
= get_cluster_id(env
->src_cpu
);
6285 dst_clid
= get_cluster_id(env
->dst_cpu
);
6286 BUG_ON(dst_clid
== -1 || src_clid
== -1);
6288 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
6289 get_cluster_cpus(cpus
, src_clid
, true);
6291 mt_sched_printf("move_tasks_tg start: src:cpu=%d clid=%d runnable_load=%lu dst:cpu=%d clid=%d runnable_load=%lu imbalance=%ld curr->on_rq=%d",
6292 env
->src_cpu
, src_clid
, cpu_rq(env
->src_cpu
)->cfs
.runnable_load_avg
,
6293 env
->dst_cpu
, dst_clid
, cpu_rq(env
->dst_cpu
)->cfs
.runnable_load_avg
,
6294 env
->imbalance
, env
->dst_rq
->curr
->on_rq
);
6297 mt_sched_printf("max=%d busiest->nr_running=%d",
6298 env
->loop_max
, cpu_rq(env
->src_cpu
)->nr_running
);
6300 if (arch_is_big_little()) {
6301 get_cluster_cpus(&srcmask
, src_clid
, true);
6302 get_cluster_cpus(&dstmask
, dst_clid
, true);
6303 memset(&clbenv
, 0, sizeof(clbenv
));
6304 clbenv
.flags
|= HMP_LB
;
6305 clbenv
.ltarget
= arch_cpu_is_little(env
->src_cpu
) ? env
->src_cpu
: env
->dst_cpu
;
6306 clbenv
.btarget
= arch_cpu_is_big(env
->src_cpu
) ? env
->src_cpu
: env
->dst_cpu
;
6307 clbenv
.lcpus
= arch_cpu_is_little(env
->src_cpu
) ? &srcmask
: &dstmask
;
6308 clbenv
.bcpus
= arch_cpu_is_big(env
->src_cpu
) ? &srcmask
: &dstmask
;
6309 sched_update_clbstats(&clbenv
);
6312 while (!list_empty(tasks
)) {
6313 struct thread_group_info_t
*src_tginfo
, *dst_tginfo
;
6315 p
= list_first_entry(tasks
, struct task_struct
, se
.group_node
);
6317 #ifdef CONFIG_MT_SCHED_INFO
6318 mt_sched_printf("check: pid=%d comm=%s load_avg_contrib=%lu h_load=%lu runnable_load_avg=%lu loop=%d, env->imbalance=%ld tg_load_move=%ld",
6319 p
->pid
, p
->comm
, p
->se
.avg
.load_avg_contrib
,
6320 task_cfs_rq(p
)->h_load
, task_cfs_rq(p
)->runnable_load_avg
,
6321 env
->loop
, env
->imbalance
, tg_load_move
);
6324 /* We've more or less seen every task there is, call it quits */
6325 if (env
->loop
> env
->loop_max
)
6329 /* take a breather every nr_migrate tasks */
6330 if (env
->loop
> env
->loop_break
) {
6331 env
->loop_break
+= sched_nr_migrate_break
;
6332 env
->flags
|= LBF_NEED_BREAK
;
6336 BUG_ON(p
== NULL
|| p
->group_leader
== NULL
);
6337 src_tginfo
= &p
->group_leader
->thread_group_info
[src_clid
];
6338 dst_tginfo
= &p
->group_leader
->thread_group_info
[dst_clid
];
6341 if (!can_migrate_task(p
, env
)) {
6342 mt_sched_printf("can not migrate: pid=%d comm=%s",
6347 load
= task_h_load(p
);
6349 if (sched_feat(LB_MIN
) && load
< 16 && !env
->sd
->nr_balance_failed
) {
6350 mt_sched_printf("can not migrate: pid=%d comm=%s sched_feat",
6355 if (over_imbalance(load
, env
->imbalance
)) {
6356 mt_sched_printf("can not migrate: pid=%d comm=%s load=%ld imbalance=%ld",
6357 p
->pid
, p
->comm
, load
, env
->imbalance
);
6361 /* meet rule0 , migrate immediately */
6362 if (need_migrate_task_immediately(p
, env
, &clbenv
)) {
6364 env
->imbalance
-= load
;
6365 tg_load_move
-= load
;
6366 other_load_move
-= load
;
6367 mt_sched_printf("hit rule0: pid=%d comm=%s load=%ld imbalance=%ld tg_imbalance=%ld other_load_move=%ld",
6368 p
->pid
, p
->comm
, load
, env
->imbalance
, tg_load_move
, other_load_move
);
6370 if (env
->imbalance
<= 0)
6376 if (!cmp_can_migrate_task(p
, env
))
6379 if (sd
->flags
& SD_BALANCE_TG
){
6380 if (over_imbalance(load
, tg_load_move
)) {
6381 mt_sched_printf("can not migrate: pid=%d comm=%s load=%ld imbalance=%ld",
6382 p
->pid
, p
->comm
, load
, tg_load_move
);
6387 if (candidate_load_move
<= 0) {
6388 mt_sched_printf("check: pid=%d comm=%s candidate_load_move=%d",
6389 p
->pid
, p
->comm
, candidate_load_move
);
6394 /* rule1, single thread */
6395 #ifdef CONFIG_MT_SCHED_INFO
6396 mt_sched_printf("check rule1: pid=%d p->comm=%s thread_group_cnt=%lu thread_group_empty(p)=%d",
6398 p
->group_leader
->thread_group_info
[0].nr_running
+
6399 p
->group_leader
->thread_group_info
[1].nr_running
,
6400 thread_group_empty(p
));
6403 if (thread_group_empty(p
)) {
6404 list_move_tail(&p
->se
.group_node
, &tg_tasks
);
6405 tg_load_move
-= load
;
6406 other_load_move
-= load
;
6407 mt_sched_printf("hit rule1: pid=%d p->comm=%s load=%ld tg_imbalance=%ld",
6408 p
->pid
, p
->comm
, load
, tg_load_move
);
6413 #ifdef CONFIG_MT_SCHED_INFO
6414 mt_sched_printf("check rule2: pid=%d p->comm=%s %ld, %ld, %ld, %ld, %ld",
6415 p
->pid
, p
->comm
, src_tginfo
->nr_running
, src_tginfo
->cfs_nr_running
, dst_tginfo
->nr_running
,
6416 p
->se
.avg
.load_avg_ratio
, src_tginfo
->load_avg_ratio
);
6418 if ((src_tginfo
->nr_running
< dst_tginfo
->nr_running
) &&
6419 ((p
->se
.avg
.load_avg_ratio
* src_tginfo
->cfs_nr_running
) <=
6420 src_tginfo
->load_avg_ratio
)) {
6421 list_move_tail(&p
->se
.group_node
, &tg_tasks
);
6422 tg_load_move
-= load
;
6423 other_load_move
-= load
;
6424 mt_sched_printf("hit rule2: pid=%d p->comm=%s load=%ld tg_imbalance=%ld",
6425 p
->pid
, p
->comm
, load
, tg_load_move
);
6429 if (over_imbalance(load
, other_load_move
))
6432 if (other_load_move <= 0)
6436 list_move_tail(&p
->se
.group_node
, &other_tasks
);
6437 other_load_move
-= load
;
6440 list_move_tail(&p
->se
.group_node
, &other_tasks
);
6441 other_load_move
-= load
;
6446 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6447 if(need_lazy_balance(env
->dst_cpu
, env
->src_cpu
, p
))
6453 list_move_tail(&p
->se
.group_node
, tasks
);
6456 if ( sd
->flags
& SD_BALANCE_TG
){
6457 while (!list_empty(&tg_tasks
)) {
6458 p
= list_first_entry(&tg_tasks
, struct task_struct
, se
.group_node
);
6459 list_move_tail(&p
->se
.group_node
, tasks
);
6461 if (env
->imbalance
> 0) {
6462 load
= task_h_load(p
);
6463 if (over_imbalance(load
, env
->imbalance
)){
6464 mt_sched_printf("overload rule1,2: pid=%d p->comm=%s load=%ld imbalance=%ld",
6465 p
->pid
, p
->comm
, load
, env
->imbalance
);
6474 env
->imbalance
-= load
;
6477 mt_sched_printf("migrate hit rule1,2: pid=%d p->comm=%s load=%ld imbalance=%ld",
6478 p
->pid
, p
->comm
, load
, env
->imbalance
);
6483 mt_sched_printf("move_tasks_tg finish rule migrate");
6485 while (!list_empty(&other_tasks
)) {
6486 p
= list_first_entry(&other_tasks
, struct task_struct
, se
.group_node
);
6487 list_move_tail(&p
->se
.group_node
, tasks
);
6490 if (!flag
&& (env
->imbalance
> 0)) {
6492 if (env
->imbalance
> 0) {
6494 load
= task_h_load(p
);
6496 if (over_imbalance(load
, env
->imbalance
)){
6497 mt_sched_printf("overload others: pid=%d p->comm=%s load=%ld imbalance=%ld",
6498 p
->pid
, p
->comm
, load
, env
->imbalance
);
6503 env
->imbalance
-= load
;
6506 mt_sched_printf("migrate others: pid=%d p->comm=%s load=%ld imbalance=%ld",
6507 p
->pid
, p
->comm
, load
, env
->imbalance
);
6512 * Right now, this is one of only two places move_task() is called,
6513 * so we can safely collect move_task() stats here rather than
6514 * inside move_task().
6516 schedstat_add(env
->sd
, lb_gained
[env
->idle
], pulled
);
6518 mt_sched_printf("move_tasks_tg finish pulled=%d imbalance=%ld", pulled
, env
->imbalance
);
6523 #endif /* CONFIG_MTK_SCHED_CMP */
6526 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6527 static int need_lazy_balance(int dst_cpu
, int src_cpu
, struct task_struct
*p
)
6529 /* Lazy balnace for small task
6530 1. src cpu is buddy cpu
6531 2. src cpu is not busy cpu
6534 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
6535 if ( PA_ENABLE
&& cpumask_test_cpu(src_cpu
, &buddy_cpu_map
) &&
6536 !is_buddy_busy(src_cpu
) && is_light_task(p
)) {
6538 if (cpumask_test_cpu(src_cpu
, &buddy_cpu_map
) &&
6539 !is_buddy_busy(src_cpu
) && is_light_task(p
)) {
6541 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
6543 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT
[src_cpu
][dst_cpu
]++;
6544 mt_sched_printf("[PA]pid=%d, Lazy balance from CPU%d to CPU%d\n)\n", p
->pid
, src_cpu
, dst_cpu
);
6546 if(PA_MON_ENABLE
&& (strcmp(p
->comm
, &PA_MON
[i
][0]) == 0)) {
6547 printk(KERN_EMERG
"[PA] %s Lazy balance from CPU%d to CPU%d\n", p
->comm
, src_cpu
, dst_cpu
);
6548 // printk(KERN_EMERG "[PA] src_cpu RQ Usage = %u, Period = %u, NR = %u\n",
6549 // per_cpu(BUDDY_CPU_RQ_USAGE, src_cpu),
6550 // per_cpu(BUDDY_CPU_RQ_PERIOD, src_cpu),
6551 // per_cpu(BUDDY_CPU_RQ_NR, src_cpu));
6552 // printk(KERN_EMERG "[PA] Task Usage = %u, Period = %u\n",
6553 // p->se.avg.usage_avg_sum,
6554 // p->se.avg.runnable_avg_period);
6564 #ifdef CONFIG_FAIR_GROUP_SCHED
6566 * update tg->load_weight by folding this cpu's load_avg
6568 static void __update_blocked_averages_cpu(struct task_group
*tg
, int cpu
)
6570 struct sched_entity
*se
= tg
->se
[cpu
];
6571 struct cfs_rq
*cfs_rq
= tg
->cfs_rq
[cpu
];
6573 /* throttled entities do not contribute to load */
6574 if (throttled_hierarchy(cfs_rq
))
6577 update_cfs_rq_blocked_load(cfs_rq
, 1);
6580 update_entity_load_avg(se
, 1);
6582 * We pivot on our runnable average having decayed to zero for
6583 * list removal. This generally implies that all our children
6584 * have also been removed (modulo rounding error or bandwidth
6585 * control); however, such cases are rare and we can fix these
6588 * TODO: fix up out-of-order children on enqueue.
6590 if (!se
->avg
.runnable_avg_sum
&& !cfs_rq
->nr_running
)
6591 list_del_leaf_cfs_rq(cfs_rq
);
6593 struct rq
*rq
= rq_of(cfs_rq
);
6594 update_rq_runnable_avg(rq
, rq
->nr_running
);
6598 static void update_blocked_averages(int cpu
)
6600 struct rq
*rq
= cpu_rq(cpu
);
6601 struct cfs_rq
*cfs_rq
;
6602 unsigned long flags
;
6604 raw_spin_lock_irqsave(&rq
->lock
, flags
);
6605 update_rq_clock(rq
);
6607 * Iterates the task_group tree in a bottom up fashion, see
6608 * list_add_leaf_cfs_rq() for details.
6610 for_each_leaf_cfs_rq(rq
, cfs_rq
) {
6612 * Note: We may want to consider periodically releasing
6613 * rq->lock about these updates so that creating many task
6614 * groups does not result in continually extending hold time.
6616 __update_blocked_averages_cpu(cfs_rq
->tg
, rq
->cpu
);
6619 raw_spin_unlock_irqrestore(&rq
->lock
, flags
);
6623 * Compute the cpu's hierarchical load factor for each task group.
6624 * This needs to be done in a top-down fashion because the load of a child
6625 * group is a fraction of its parents load.
6627 static int tg_load_down(struct task_group
*tg
, void *data
)
6630 long cpu
= (long)data
;
6634 * rq's sched_avg is not updated accordingly. adopt rq's
6635 * corresponding cfs_rq runnable loading instead.
6637 * a003a25b sched: Consider runnable load average...
6640 load = cpu_rq(cpu)->avg.load_avg_contrib;
6643 load
= cpu_rq(cpu
)->cfs
.runnable_load_avg
;
6645 load
= tg
->parent
->cfs_rq
[cpu
]->h_load
;
6646 load
= div64_ul(load
* tg
->se
[cpu
]->avg
.load_avg_contrib
,
6647 tg
->parent
->cfs_rq
[cpu
]->runnable_load_avg
+ 1);
6650 tg
->cfs_rq
[cpu
]->h_load
= load
;
/*
 * update_h_load - refresh hierarchical load (h_load) for every task group
 * on @cpu via a top-down tree walk. NOTE(review): upstream wraps this walk
 * in rcu_read_lock() and a per-jiffy throttle; those lines are missing
 * from this extract — confirm against the full file.
 */
6655 static void update_h_load(long cpu
)
6658 walk_tg_tree(tg_load_down
, tg_nop
, (void *)cpu
);
6662 static unsigned long task_h_load(struct task_struct
*p
)
6664 struct cfs_rq
*cfs_rq
= task_cfs_rq(p
);
6666 return div64_ul(p
->se
.avg
.load_avg_contrib
* cfs_rq
->h_load
,
6667 cfs_rq
->runnable_load_avg
+ 1);
6670 static inline void update_blocked_averages(int cpu
)
6674 static inline void update_h_load(long cpu
)
6678 static unsigned long task_h_load(struct task_struct
*p
)
6680 return p
->se
.avg
.load_avg_contrib
;
6684 /********** Helpers for find_busiest_group ************************/
6686 * sd_lb_stats - Structure to store the statistics of a sched_domain
6687 * during load balancing.
6689 struct sd_lb_stats
{
6690 struct sched_group
*busiest
; /* Busiest group in this sd */
6691 struct sched_group
*this; /* Local group in this sd */
6692 unsigned long total_load
; /* Total load of all groups in sd */
6693 unsigned long total_pwr
; /* Total power of all groups in sd */
6694 unsigned long avg_load
; /* Average load across all groups in sd */
6696 /** Statistics of this group */
6697 unsigned long this_load
; /* load of the local group */
6698 unsigned long this_load_per_task
; /* avg load per task, local group */
6699 unsigned long this_nr_running
; /* nr runnable tasks, local group */
6700 unsigned long this_has_capacity
; /* local group has spare capacity */
6701 unsigned int this_idle_cpus
; /* idle cpus in the local group */
6703 /* Statistics of the busiest group */
6704 unsigned int busiest_idle_cpus
; /* idle cpus in the busiest group */
6705 unsigned long max_load
; /* load of the busiest group */
6706 unsigned long busiest_load_per_task
; /* avg load per task, busiest group */
6707 unsigned long busiest_nr_running
; /* nr runnable tasks, busiest group */
6708 unsigned long busiest_group_capacity
; /* task capacity of busiest group */
6709 unsigned long busiest_has_capacity
; /* busiest group has spare capacity */
6710 unsigned int busiest_group_weight
; /* nr cpus in the busiest group */
6712 int group_imb
; /* Is there imbalance in this sd */
6716 * sg_lb_stats - stats of a sched_group required for load_balancing
6718 struct sg_lb_stats
{
6719 unsigned long avg_load
; /*Avg load across the CPUs of the group */
6720 unsigned long group_load
; /* Total load over the CPUs of the group */
6721 unsigned long sum_nr_running
; /* Nr tasks running in the group */
6722 unsigned long sum_weighted_load
; /* Weighted load of group's tasks */
6723 unsigned long group_capacity
; /* nr tasks the group can run at full power */
6724 unsigned long idle_cpus
; /* nr idle cpus in the group */
6725 unsigned long group_weight
; /* nr cpus in the group */
6726 int group_imb
; /* Is there an imbalance in the group ? */
6727 int group_has_capacity
; /* Is there extra capacity in the group? */
6731 * get_sd_load_idx - Obtain the load index for a given sched domain.
6732 * @sd: The sched_domain whose load_idx is to be obtained.
6733 * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
6735 static inline int get_sd_load_idx(struct sched_domain
*sd
,
6736 enum cpu_idle_type idle
)
6742 load_idx
= sd
->busy_idx
;
6745 case CPU_NEWLY_IDLE
:
6746 load_idx
= sd
->newidle_idx
;
6749 load_idx
= sd
->idle_idx
;
6756 static unsigned long default_scale_freq_power(struct sched_domain
*sd
, int cpu
)
6758 return SCHED_POWER_SCALE
;
6761 unsigned long __weak
arch_scale_freq_power(struct sched_domain
*sd
, int cpu
)
6763 return default_scale_freq_power(sd
, cpu
);
6766 static unsigned long default_scale_smt_power(struct sched_domain
*sd
, int cpu
)
6768 unsigned long weight
= sd
->span_weight
;
6769 unsigned long smt_gain
= sd
->smt_gain
;
6776 unsigned long __weak
arch_scale_smt_power(struct sched_domain
*sd
, int cpu
)
6778 return default_scale_smt_power(sd
, cpu
);
6781 static unsigned long scale_rt_power(int cpu
)
6783 struct rq
*rq
= cpu_rq(cpu
);
6784 u64 total
, available
, age_stamp
, avg
;
6787 * Since we're reading these variables without serialization make sure
6788 * we read them once before doing sanity checks on them.
6790 age_stamp
= ACCESS_ONCE(rq
->age_stamp
);
6791 avg
= ACCESS_ONCE(rq
->rt_avg
);
6793 total
= sched_avg_period() + (rq
->clock
- age_stamp
);
6795 if (unlikely(total
< avg
)) {
6796 /* Ensures that power won't end up being negative */
6799 available
= total
- avg
;
6802 if (unlikely((s64
)total
< SCHED_POWER_SCALE
))
6803 total
= SCHED_POWER_SCALE
;
6805 total
>>= SCHED_POWER_SHIFT
;
6807 return div_u64(available
, total
);
6810 static void update_cpu_power(struct sched_domain
*sd
, int cpu
)
6812 unsigned long weight
= sd
->span_weight
;
6813 unsigned long power
= SCHED_POWER_SCALE
;
6814 struct sched_group
*sdg
= sd
->groups
;
6816 if ((sd
->flags
& SD_SHARE_CPUPOWER
) && weight
> 1) {
6817 if (sched_feat(ARCH_POWER
))
6818 power
*= arch_scale_smt_power(sd
, cpu
);
6820 power
*= default_scale_smt_power(sd
, cpu
);
6822 power
>>= SCHED_POWER_SHIFT
;
6825 sdg
->sgp
->power_orig
= power
;
6827 if (sched_feat(ARCH_POWER
))
6828 power
*= arch_scale_freq_power(sd
, cpu
);
6830 power
*= default_scale_freq_power(sd
, cpu
);
6832 power
>>= SCHED_POWER_SHIFT
;
6834 power
*= scale_rt_power(cpu
);
6835 power
>>= SCHED_POWER_SHIFT
;
6840 cpu_rq(cpu
)->cpu_power
= power
;
6841 sdg
->sgp
->power
= power
;
6844 void update_group_power(struct sched_domain
*sd
, int cpu
)
6846 struct sched_domain
*child
= sd
->child
;
6847 struct sched_group
*group
, *sdg
= sd
->groups
;
6848 unsigned long power
;
6849 unsigned long interval
;
6851 interval
= msecs_to_jiffies(sd
->balance_interval
);
6852 interval
= clamp(interval
, 1UL, max_load_balance_interval
);
6853 sdg
->sgp
->next_update
= jiffies
+ interval
;
6856 update_cpu_power(sd
, cpu
);
6862 if (child
->flags
& SD_OVERLAP
) {
6864 * SD_OVERLAP domains cannot assume that child groups
6865 * span the current group.
6868 for_each_cpu(cpu
, sched_group_cpus(sdg
))
6869 power
+= power_of(cpu
);
6872 * !SD_OVERLAP domains can assume that child groups
6873 * span the current group.
6876 group
= child
->groups
;
6878 power
+= group
->sgp
->power
;
6879 group
= group
->next
;
6880 } while (group
!= child
->groups
);
6883 sdg
->sgp
->power_orig
= sdg
->sgp
->power
= power
;
6887 * Try and fix up capacity for tiny siblings, this is needed when
6888 * things like SD_ASYM_PACKING need f_b_g to select another sibling
6889 * which on its own isn't powerful enough.
6891 * See update_sd_pick_busiest() and check_asym_packing().
6894 fix_small_capacity(struct sched_domain
*sd
, struct sched_group
*group
)
6897 * Only siblings can have significantly less than SCHED_POWER_SCALE
6899 if (!(sd
->flags
& SD_SHARE_CPUPOWER
))
6903 * If ~90% of the cpu_power is still there, we're good.
6905 if (group
->sgp
->power
* 32 > group
->sgp
->power_orig
* 29)
6912 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
6913 * @env: The load balancing environment.
6914 * @group: sched_group whose statistics are to be updated.
6915 * @load_idx: Load index of sched_domain of this_cpu for load calc.
6916 * @local_group: Does group contain this_cpu.
6917 * @balance: Should we balance.
6918 * @sgs: variable to hold the statistics for this group.
6920 static inline void update_sg_lb_stats(struct lb_env
*env
,
6921 struct sched_group
*group
, int load_idx
,
6922 int local_group
, int *balance
, struct sg_lb_stats
*sgs
)
6924 unsigned long nr_running
, max_nr_running
, min_nr_running
;
6925 unsigned long load
, max_cpu_load
, min_cpu_load
;
6926 unsigned int balance_cpu
= -1, first_idle_cpu
= 0;
6927 unsigned long avg_load_per_task
= 0;
6931 balance_cpu
= group_balance_cpu(group
);
6933 /* Tally up the load of all CPUs in the group */
6935 min_cpu_load
= ~0UL;
6937 min_nr_running
= ~0UL;
6939 for_each_cpu_and(i
, sched_group_cpus(group
), env
->cpus
) {
6940 struct rq
*rq
= cpu_rq(i
);
6942 nr_running
= rq
->nr_running
;
6944 /* Bias balancing toward cpus of our domain */
6946 if (idle_cpu(i
) && !first_idle_cpu
&&
6947 cpumask_test_cpu(i
, sched_group_mask(group
))) {
6952 load
= target_load(i
, load_idx
);
6954 load
= source_load(i
, load_idx
);
6955 if (load
> max_cpu_load
)
6956 max_cpu_load
= load
;
6957 if (min_cpu_load
> load
)
6958 min_cpu_load
= load
;
6960 if (nr_running
> max_nr_running
)
6961 max_nr_running
= nr_running
;
6962 if (min_nr_running
> nr_running
)
6963 min_nr_running
= nr_running
;
6965 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
6966 if((load_idx
> 0) && (load
== cpu_rq(i
)->cpu_load
[load_idx
-1]))
6967 mt_lbprof_stat_or(env
->fail_reason
, MT_LBPROF_HISTORY
);
6971 sgs
->group_load
+= load
;
6972 sgs
->sum_nr_running
+= nr_running
;
6973 sgs
->sum_weighted_load
+= weighted_cpuload(i
);
6979 * First idle cpu or the first cpu(busiest) in this sched group
6980 * is eligible for doing load balancing at this and above
6981 * domains. In the newly idle case, we will allow all the cpu's
6982 * to do the newly idle load balance.
6985 if (env
->idle
!= CPU_NEWLY_IDLE
) {
6986 if (balance_cpu
!= env
->dst_cpu
) {
6990 update_group_power(env
->sd
, env
->dst_cpu
);
6991 } else if (time_after_eq(jiffies
, group
->sgp
->next_update
))
6992 update_group_power(env
->sd
, env
->dst_cpu
);
6995 /* Adjust by relative CPU power of the group */
6996 sgs
->avg_load
= (sgs
->group_load
*SCHED_POWER_SCALE
) / group
->sgp
->power
;
6999 * Consider the group unbalanced when the imbalance is larger
7000 * than the average weight of a task.
7002 * APZ: with cgroup the avg task weight can vary wildly and
7003 * might not be a suitable number - should we keep a
7004 * normalized nr_running number somewhere that negates
7007 if (sgs
->sum_nr_running
)
7008 avg_load_per_task
= sgs
->sum_weighted_load
/ sgs
->sum_nr_running
;
7010 if ((max_cpu_load
- min_cpu_load
) >= avg_load_per_task
&&
7011 (max_nr_running
- min_nr_running
) > 1)
7014 sgs
->group_capacity
= DIV_ROUND_CLOSEST(group
->sgp
->power
,
7016 if (!sgs
->group_capacity
)
7017 sgs
->group_capacity
= fix_small_capacity(env
->sd
, group
);
7018 sgs
->group_weight
= group
->group_weight
;
7020 if (sgs
->group_capacity
> sgs
->sum_nr_running
)
7021 sgs
->group_has_capacity
= 1;
7025 * update_sd_pick_busiest - return 1 on busiest group
7026 * @env: The load balancing environment.
7027 * @sds: sched_domain statistics
7028 * @sg: sched_group candidate to be checked for being the busiest
7029 * @sgs: sched_group statistics
7031 * Determine if @sg is a busier group than the previously selected
7034 static bool update_sd_pick_busiest(struct lb_env
*env
,
7035 struct sd_lb_stats
*sds
,
7036 struct sched_group
*sg
,
7037 struct sg_lb_stats
*sgs
)
7039 if (sgs
->avg_load
<= sds
->max_load
) {
7040 mt_lbprof_stat_or(env
->fail_reason
, MT_LBPROF_PICK_BUSIEST_FAIL_1
);
7044 if (sgs
->sum_nr_running
> sgs
->group_capacity
)
7051 * ASYM_PACKING needs to move all the work to the lowest
7052 * numbered CPUs in the group, therefore mark all groups
7053 * higher than ourself as busy.
7055 if ((env
->sd
->flags
& SD_ASYM_PACKING
) && sgs
->sum_nr_running
&&
7056 env
->dst_cpu
< group_first_cpu(sg
)) {
7060 if (group_first_cpu(sds
->busiest
) > group_first_cpu(sg
))
7064 mt_lbprof_stat_or(env
->fail_reason
, MT_LBPROF_PICK_BUSIEST_FAIL_2
);
7069 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
7070 * @env: The load balancing environment.
7071 * @balance: Should we balance.
7072 * @sds: variable to hold the statistics for this sched_domain.
7074 static inline void update_sd_lb_stats(struct lb_env
*env
,
7075 int *balance
, struct sd_lb_stats
*sds
)
7077 struct sched_domain
*child
= env
->sd
->child
;
7078 struct sched_group
*sg
= env
->sd
->groups
;
7079 struct sg_lb_stats sgs
;
7080 int load_idx
, prefer_sibling
= 0;
7082 if (child
&& child
->flags
& SD_PREFER_SIBLING
)
7085 load_idx
= get_sd_load_idx(env
->sd
, env
->idle
);
7090 local_group
= cpumask_test_cpu(env
->dst_cpu
, sched_group_cpus(sg
));
7091 memset(&sgs
, 0, sizeof(sgs
));
7092 update_sg_lb_stats(env
, sg
, load_idx
, local_group
, balance
, &sgs
);
7094 if (local_group
&& !(*balance
))
7097 sds
->total_load
+= sgs
.group_load
;
7098 sds
->total_pwr
+= sg
->sgp
->power
;
7101 * In case the child domain prefers tasks go to siblings
7102 * first, lower the sg capacity to one so that we'll try
7103 * and move all the excess tasks away. We lower the capacity
7104 * of a group only if the local group has the capacity to fit
7105 * these excess tasks, i.e. nr_running < group_capacity. The
7106 * extra check prevents the case where you always pull from the
7107 * heaviest group when it is already under-utilized (possible
7108 * with a large weight task outweighs the tasks on the system).
7110 if (prefer_sibling
&& !local_group
&& sds
->this_has_capacity
)
7111 sgs
.group_capacity
= min(sgs
.group_capacity
, 1UL);
7114 sds
->this_load
= sgs
.avg_load
;
7116 sds
->this_nr_running
= sgs
.sum_nr_running
;
7117 sds
->this_load_per_task
= sgs
.sum_weighted_load
;
7118 sds
->this_has_capacity
= sgs
.group_has_capacity
;
7119 sds
->this_idle_cpus
= sgs
.idle_cpus
;
7120 } else if (update_sd_pick_busiest(env
, sds
, sg
, &sgs
)) {
7121 sds
->max_load
= sgs
.avg_load
;
7123 sds
->busiest_nr_running
= sgs
.sum_nr_running
;
7124 sds
->busiest_idle_cpus
= sgs
.idle_cpus
;
7125 sds
->busiest_group_capacity
= sgs
.group_capacity
;
7126 sds
->busiest_load_per_task
= sgs
.sum_weighted_load
;
7127 sds
->busiest_has_capacity
= sgs
.group_has_capacity
;
7128 sds
->busiest_group_weight
= sgs
.group_weight
;
7129 sds
->group_imb
= sgs
.group_imb
;
7133 } while (sg
!= env
->sd
->groups
);
7137 * check_asym_packing - Check to see if the group is packed into the
7140 * This is primarily intended to used at the sibling level. Some
7141 * cores like POWER7 prefer to use lower numbered SMT threads. In the
7142 * case of POWER7, it can move to lower SMT modes only when higher
7143 * threads are idle. When in lower SMT modes, the threads will
7144 * perform better since they share less core resources. Hence when we
7145 * have idle threads, we want them to be the higher ones.
7147 * This packing function is run on idle threads. It checks to see if
7148 * the busiest CPU in this domain (core in the P7 case) has a higher
7149 * CPU number than the packing function is being run on. Here we are
7150 * assuming lower CPU number will be equivalent to lower a SMT thread
7153 * Returns 1 when packing is required and a task should be moved to
7154 * this CPU. The amount of the imbalance is returned in *imbalance.
7156 * @env: The load balancing environment.
7157 * @sds: Statistics of the sched_domain which is to be packed
7159 static int check_asym_packing(struct lb_env
*env
, struct sd_lb_stats
*sds
)
7163 if (!(env
->sd
->flags
& SD_ASYM_PACKING
))
7169 busiest_cpu
= group_first_cpu(sds
->busiest
);
7170 if (env
->dst_cpu
> busiest_cpu
)
7173 env
->imbalance
= DIV_ROUND_CLOSEST(
7174 sds
->max_load
* sds
->busiest
->sgp
->power
, SCHED_POWER_SCALE
);
7180 * fix_small_imbalance - Calculate the minor imbalance that exists
7181 * amongst the groups of a sched_domain, during
7183 * @env: The load balancing environment.
7184 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
7187 void fix_small_imbalance(struct lb_env
*env
, struct sd_lb_stats
*sds
)
7189 unsigned long tmp
, pwr_now
= 0, pwr_move
= 0;
7190 unsigned int imbn
= 2;
7191 unsigned long scaled_busy_load_per_task
;
7193 if (sds
->this_nr_running
) {
7194 sds
->this_load_per_task
/= sds
->this_nr_running
;
7195 if (sds
->busiest_load_per_task
>
7196 sds
->this_load_per_task
)
7199 sds
->this_load_per_task
=
7200 cpu_avg_load_per_task(env
->dst_cpu
);
7203 scaled_busy_load_per_task
= sds
->busiest_load_per_task
7204 * SCHED_POWER_SCALE
;
7205 scaled_busy_load_per_task
/= sds
->busiest
->sgp
->power
;
7207 if (sds
->max_load
- sds
->this_load
+ scaled_busy_load_per_task
>=
7208 (scaled_busy_load_per_task
* imbn
)) {
7209 env
->imbalance
= sds
->busiest_load_per_task
;
7214 * OK, we don't have enough imbalance to justify moving tasks,
7215 * however we may be able to increase total CPU power used by
7219 pwr_now
+= sds
->busiest
->sgp
->power
*
7220 min(sds
->busiest_load_per_task
, sds
->max_load
);
7221 pwr_now
+= sds
->this->sgp
->power
*
7222 min(sds
->this_load_per_task
, sds
->this_load
);
7223 pwr_now
/= SCHED_POWER_SCALE
;
7225 /* Amount of load we'd subtract */
7226 tmp
= (sds
->busiest_load_per_task
* SCHED_POWER_SCALE
) /
7227 sds
->busiest
->sgp
->power
;
7228 if (sds
->max_load
> tmp
)
7229 pwr_move
+= sds
->busiest
->sgp
->power
*
7230 min(sds
->busiest_load_per_task
, sds
->max_load
- tmp
);
7232 /* Amount of load we'd add */
7233 if (sds
->max_load
* sds
->busiest
->sgp
->power
<
7234 sds
->busiest_load_per_task
* SCHED_POWER_SCALE
)
7235 tmp
= (sds
->max_load
* sds
->busiest
->sgp
->power
) /
7236 sds
->this->sgp
->power
;
7238 tmp
= (sds
->busiest_load_per_task
* SCHED_POWER_SCALE
) /
7239 sds
->this->sgp
->power
;
7240 pwr_move
+= sds
->this->sgp
->power
*
7241 min(sds
->this_load_per_task
, sds
->this_load
+ tmp
);
7242 pwr_move
/= SCHED_POWER_SCALE
;
7244 /* Move if we gain throughput */
7245 if (pwr_move
> pwr_now
)
7246 env
->imbalance
= sds
->busiest_load_per_task
;
7250 * calculate_imbalance - Calculate the amount of imbalance present within the
7251 * groups of a given sched_domain during load balance.
7252 * @env: load balance environment
7253 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
7255 static inline void calculate_imbalance(struct lb_env
*env
, struct sd_lb_stats
*sds
)
7257 unsigned long max_pull
, load_above_capacity
= ~0UL;
7259 sds
->busiest_load_per_task
/= sds
->busiest_nr_running
;
7260 if (sds
->group_imb
) {
7261 sds
->busiest_load_per_task
=
7262 min(sds
->busiest_load_per_task
, sds
->avg_load
);
7266 * In the presence of smp nice balancing, certain scenarios can have
7267 * max load less than avg load(as we skip the groups at or below
7268 * its cpu_power, while calculating max_load..)
7270 if (sds
->max_load
< sds
->avg_load
) {
7272 return fix_small_imbalance(env
, sds
);
7275 if (!sds
->group_imb
) {
7277 * Don't want to pull so many tasks that a group would go idle.
7279 load_above_capacity
= (sds
->busiest_nr_running
-
7280 sds
->busiest_group_capacity
);
7282 load_above_capacity
*= (SCHED_LOAD_SCALE
* SCHED_POWER_SCALE
);
7284 load_above_capacity
/= sds
->busiest
->sgp
->power
;
7288 * We're trying to get all the cpus to the average_load, so we don't
7289 * want to push ourselves above the average load, nor do we wish to
7290 * reduce the max loaded cpu below the average load. At the same time,
7291 * we also don't want to reduce the group load below the group capacity
7292 * (so that we can implement power-savings policies etc). Thus we look
7293 * for the minimum possible imbalance.
7294 * Be careful of negative numbers as they'll appear as very large values
7295 * with unsigned longs.
7297 max_pull
= min(sds
->max_load
- sds
->avg_load
, load_above_capacity
);
7299 /* How much load to actually move to equalise the imbalance */
7300 env
->imbalance
= min(max_pull
* sds
->busiest
->sgp
->power
,
7301 (sds
->avg_load
- sds
->this_load
) * sds
->this->sgp
->power
)
7302 / SCHED_POWER_SCALE
;
7305 * if *imbalance is less than the average load per runnable task
7306 * there is no guarantee that any tasks will be moved so we'll have
7307 * a think about bumping its value to force at least one task to be
7310 if (env
->imbalance
< sds
->busiest_load_per_task
)
7311 return fix_small_imbalance(env
, sds
);
7315 /******* find_busiest_group() helpers end here *********************/
7318 * find_busiest_group - Returns the busiest group within the sched_domain
7319 * if there is an imbalance. If there isn't an imbalance, and
7320 * the user has opted for power-savings, it returns a group whose
7321 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
7322 * such a group exists.
7324 * Also calculates the amount of weighted load which should be moved
7325 * to restore balance.
7327 * @env: The load balancing environment.
7328 * @balance: Pointer to a variable indicating if this_cpu
7329 * is the appropriate cpu to perform load balancing at this_level.
7331 * Returns: - the busiest group if imbalance exists.
7332 * - If no imbalance and user has opted for power-savings balance,
7333 * return the least loaded group whose CPUs can be
7334 * put to idle by rebalancing its tasks onto our group.
7336 static struct sched_group
*
7337 find_busiest_group(struct lb_env
*env
, int *balance
)
7339 struct sd_lb_stats sds
;
7341 memset(&sds
, 0, sizeof(sds
));
7344 * Compute the various statistics relavent for load balancing at
7347 update_sd_lb_stats(env
, balance
, &sds
);
7350 * this_cpu is not the appropriate cpu to perform load balancing at
7354 mt_lbprof_stat_or(env
->fail_reason
, MT_LBPROF_BALANCE
);
7358 if ((env
->idle
== CPU_IDLE
|| env
->idle
== CPU_NEWLY_IDLE
) &&
7359 check_asym_packing(env
, &sds
))
7362 /* There is no busy sibling group to pull tasks from */
7363 if (!sds
.busiest
|| sds
.busiest_nr_running
== 0){
7365 mt_lbprof_stat_or(env
->fail_reason
, MT_LBPROF_NOBUSYG_BUSIEST_NO_TASK
);
7367 mt_lbprof_stat_or(env
->fail_reason
, MT_LBPROF_NOBUSYG_NO_BUSIEST
);
7372 sds
.avg_load
= (SCHED_POWER_SCALE
* sds
.total_load
) / sds
.total_pwr
;
7375 * If the busiest group is imbalanced the below checks don't
7376 * work because they assumes all things are equal, which typically
7377 * isn't true due to cpus_allowed constraints and the like.
7382 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
7383 if (env
->idle
== CPU_NEWLY_IDLE
&& sds
.this_has_capacity
&&
7384 !sds
.busiest_has_capacity
)
7388 * If the local group is more busy than the selected busiest group
7389 * don't try and pull any tasks.
7391 if (sds
.this_load
>= sds
.max_load
){
7392 mt_lbprof_stat_or(env
->fail_reason
, MT_LBPROF_NOBUSYG_NO_LARGER_THAN
);
7397 * Don't pull any tasks if this group is already above the domain
7400 if (sds
.this_load
>= sds
.avg_load
){
7401 mt_lbprof_stat_or(env
->fail_reason
, MT_LBPROF_NOBUSYG_NO_LARGER_THAN
);
7405 if (env
->idle
== CPU_IDLE
) {
7407 * This cpu is idle. If the busiest group load doesn't
7408 * have more tasks than the number of available cpu's and
7409 * there is no imbalance between this and busiest group
7410 * wrt to idle cpu's, it is balanced.
7412 if ((sds
.this_idle_cpus
<= sds
.busiest_idle_cpus
+ 1) &&
7413 sds
.busiest_nr_running
<= sds
.busiest_group_weight
)
7417 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
7418 * imbalance_pct to be conservative.
7420 if (100 * sds
.max_load
<= env
->sd
->imbalance_pct
* sds
.this_load
){
7421 mt_lbprof_stat_or(env
->fail_reason
, MT_LBPROF_NOBUSYG_CHECK_FAIL
);
7427 /* Looks like there is an imbalance. Compute it */
7428 calculate_imbalance(env
, &sds
);
7438 * find_busiest_queue - find the busiest runqueue among the cpus in group.
7440 static struct rq
*find_busiest_queue(struct lb_env
*env
,
7441 struct sched_group
*group
)
7443 struct rq
*busiest
= NULL
, *rq
;
7444 unsigned long max_load
= 0;
7447 for_each_cpu(i
, sched_group_cpus(group
)) {
7448 unsigned long power
= power_of(i
);
7449 unsigned long capacity
= DIV_ROUND_CLOSEST(power
,
7454 capacity
= fix_small_capacity(env
->sd
, group
);
7456 if (!cpumask_test_cpu(i
, env
->cpus
))
7460 wl
= weighted_cpuload(i
);
7463 * When comparing with imbalance, use weighted_cpuload()
7464 * which is not scaled with the cpu power.
7466 if (capacity
&& rq
->nr_running
== 1 && wl
> env
->imbalance
)
7470 * For the load comparisons with the other cpu's, consider
7471 * the weighted_cpuload() scaled with the cpu power, so that
7472 * the load can be moved away from the cpu that is potentially
7473 * running at a lower capacity.
7475 wl
= (wl
* SCHED_POWER_SCALE
) / power
;
7477 if (wl
> max_load
) {
7487 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
7488 * so long as it is large enough.
7490 #define MAX_PINNED_INTERVAL 512
7492 /* Working cpumask for load_balance and load_balance_newidle. */
7493 DEFINE_PER_CPU(cpumask_var_t
, load_balance_mask
);
7495 static int need_active_balance(struct lb_env
*env
)
7497 struct sched_domain
*sd
= env
->sd
;
7499 if (env
->idle
== CPU_NEWLY_IDLE
) {
7502 * ASYM_PACKING needs to force migrate tasks from busy but
7503 * higher numbered CPUs in order to pack all tasks in the
7504 * lowest numbered CPUs.
7506 if ((sd
->flags
& SD_ASYM_PACKING
) && env
->src_cpu
> env
->dst_cpu
)
7510 return unlikely(sd
->nr_balance_failed
> sd
->cache_nice_tries
+2);
7513 static int active_load_balance_cpu_stop(void *data
);
7516 * Check this_cpu to ensure it is balanced within domain. Attempt to move
7517 * tasks if there is an imbalance.
7519 static int load_balance(int this_cpu
, struct rq
*this_rq
,
7520 struct sched_domain
*sd
, enum cpu_idle_type idle
,
7523 int ld_moved
, cur_ld_moved
, active_balance
= 0;
7524 struct sched_group
*group
;
7526 unsigned long flags
;
7527 struct cpumask
*cpus
= __get_cpu_var(load_balance_mask
);
7529 struct lb_env env
= {
7531 .dst_cpu
= this_cpu
,
7533 .dst_grpmask
= sched_group_cpus(sd
->groups
),
7535 .loop_break
= sched_nr_migrate_break
,
7537 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
7538 .fail_reason
= MT_LBPROF_NO_TRIGGER
,
7543 * For NEWLY_IDLE load_balancing, we don't need to consider
7544 * other cpus in our group
7546 if (idle
== CPU_NEWLY_IDLE
)
7547 env
.dst_grpmask
= NULL
;
7549 cpumask_copy(cpus
, cpu_active_mask
);
7551 schedstat_inc(sd
, lb_count
[idle
]);
7554 group
= find_busiest_group(&env
, balance
);
7560 schedstat_inc(sd
, lb_nobusyg
[idle
]);
7561 if(mt_lbprof_test(env
.fail_reason
, MT_LBPROF_HISTORY
)){
7563 for_each_cpu(tmp_cpu
, cpu_possible_mask
){
7564 if (tmp_cpu
== this_rq
->cpu
)
7566 mt_lbprof_update_state(tmp_cpu
, MT_LBPROF_BALANCE_FAIL_STATE
);
7572 busiest
= find_busiest_queue(&env
, group
);
7574 schedstat_inc(sd
, lb_nobusyq
[idle
]);
7575 mt_lbprof_stat_or(env
.fail_reason
, MT_LBPROF_NOBUSYQ
);
7579 #ifdef CONFIG_HMP_LAZY_BALANCE
7581 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
7582 if (PA_ENABLE
&& LB_ENABLE
) {
7583 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
7585 if (per_cpu(sd_pack_buddy
, this_cpu
) == busiest
->cpu
&& !is_buddy_busy(per_cpu(sd_pack_buddy
, this_cpu
))) {
7587 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
7588 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT
[this_cpu
][busiest
->cpu
]++;
7590 #ifdef CONFIG_HMP_TRACER
7591 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_BALANCE_FORM_CPUX_TO_CPUY
, 0, this_cpu
, busiest
->cpu
);
7592 #endif /* CONFIG_HMP_TRACER */
7594 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
7596 schedstat_inc(sd
, lb_nobusyq
[idle
]);
7600 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
7602 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
7604 #endif /* CONFIG_HMP_LAZY_BALANCE */
7606 BUG_ON(busiest
== env
.dst_rq
);
7608 schedstat_add(sd
, lb_imbalance
[idle
], env
.imbalance
);
7611 if (busiest
->nr_running
> 1) {
7613 * Attempt to move tasks. If find_busiest_group has found
7614 * an imbalance but busiest->nr_running <= 1, the group is
7615 * still unbalanced. ld_moved simply stays zero, so it is
7616 * correctly treated as an imbalance.
7618 env
.flags
|= LBF_ALL_PINNED
;
7619 env
.src_cpu
= busiest
->cpu
;
7620 env
.src_rq
= busiest
;
7621 env
.loop_max
= min(sysctl_sched_nr_migrate
, busiest
->nr_running
);
7622 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7623 env
.mt_check_cache_in_idle
= 1;
7626 update_h_load(env
.src_cpu
);
7628 local_irq_save(flags
);
7629 double_rq_lock(env
.dst_rq
, busiest
);
7630 #ifdef CONFIG_MTK_SCHED_CMP
7631 env
.loop_max
= min_t(unsigned long, sysctl_sched_nr_migrate
, busiest
->nr_running
);
7632 mt_sched_printf("1 env.loop_max=%d, busiest->nr_running=%d src=%d, dst=%d, cpus_share_cache=%d",
7633 env
.loop_max
, busiest
->nr_running
, env
.src_cpu
, env
.dst_cpu
, cpus_share_cache(env
.src_cpu
, env
.dst_cpu
));
7634 #endif /* CONFIG_MTK_SCHED_CMP */
7636 * cur_ld_moved - load moved in current iteration
7637 * ld_moved - cumulative load moved across iterations
7639 #ifdef CONFIG_MTK_SCHED_CMP
7640 if (!cpus_share_cache(env
.src_cpu
, env
.dst_cpu
))
7641 cur_ld_moved
= cmp_move_tasks(sd
, &env
);
7643 cur_ld_moved
= move_tasks(&env
);
7644 #else /* !CONFIG_MTK_SCHED_CMP */
7645 cur_ld_moved
= move_tasks(&env
);
7646 #endif /* CONFIG_MTK_SCHED_CMP */
7647 ld_moved
+= cur_ld_moved
;
7648 double_rq_unlock(env
.dst_rq
, busiest
);
7649 local_irq_restore(flags
);
7652 * some other cpu did the load balance for us.
7654 if (cur_ld_moved
&& env
.dst_cpu
!= smp_processor_id())
7655 resched_cpu(env
.dst_cpu
);
7657 if (env
.flags
& LBF_NEED_BREAK
) {
7658 env
.flags
&= ~LBF_NEED_BREAK
;
7663 * Revisit (affine) tasks on src_cpu that couldn't be moved to
7664 * us and move them to an alternate dst_cpu in our sched_group
7665 * where they can run. The upper limit on how many times we
7666 * iterate on same src_cpu is dependent on number of cpus in our
7669 * This changes load balance semantics a bit on who can move
7670 * load to a given_cpu. In addition to the given_cpu itself
7671 * (or a ilb_cpu acting on its behalf where given_cpu is
7672 * nohz-idle), we now have balance_cpu in a position to move
7673 * load to given_cpu. In rare situations, this may cause
7674 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
7675 * _independently_ and at _same_ time to move some load to
7676 * given_cpu) causing exceess load to be moved to given_cpu.
7677 * This however should not happen so much in practice and
7678 * moreover subsequent load balance cycles should correct the
7679 * excess load moved.
7681 if ((env
.flags
& LBF_SOME_PINNED
) && env
.imbalance
> 0) {
7683 env
.dst_rq
= cpu_rq(env
.new_dst_cpu
);
7684 env
.dst_cpu
= env
.new_dst_cpu
;
7685 env
.flags
&= ~LBF_SOME_PINNED
;
7687 env
.loop_break
= sched_nr_migrate_break
;
7689 /* Prevent to re-select dst_cpu via env's cpus */
7690 cpumask_clear_cpu(env
.dst_cpu
, env
.cpus
);
7693 * Go back to "more_balance" rather than "redo" since we
7694 * need to continue with same src_cpu.
7699 /* All tasks on this runqueue were pinned by CPU affinity */
7700 if (unlikely(env
.flags
& LBF_ALL_PINNED
)) {
7701 mt_lbprof_update_state(busiest
->cpu
, MT_LBPROF_ALLPINNED
);
7702 cpumask_clear_cpu(cpu_of(busiest
), cpus
);
7703 if (!cpumask_empty(cpus
)) {
7705 env
.loop_break
= sched_nr_migrate_break
;
7711 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7712 /* when move tasks fil, force migration no matter cache-hot */
7713 /* use mt_check_cache_in_idle */
7714 if (!ld_moved
&& ((CPU_NEWLY_IDLE
== idle
) || (CPU_IDLE
== idle
) ) ) {
7715 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
7716 mt_lbprof_stat_set(env
.fail_reason
, MT_LBPROF_DO_LB
);
7718 env
.mt_check_cache_in_idle
= 0;
7720 local_irq_save(flags
);
7721 double_rq_lock(env
.dst_rq
, busiest
);
7722 #ifdef CONFIG_MTK_SCHED_CMP
7723 env
.loop_max
= min_t(unsigned long, sysctl_sched_nr_migrate
, busiest
->nr_running
);
7724 mt_sched_printf("2 env.loop_max=%d, busiest->nr_running=%d",
7725 env
.loop_max
, busiest
->nr_running
);
7726 #endif /* CONFIG_MTK_SCHED_CMP */
7728 update_h_load(env
.src_cpu
);
7729 #ifdef CONFIG_MTK_SCHED_CMP_TGS
7730 if (!cpus_share_cache(env
.src_cpu
, env
.dst_cpu
))
7731 ld_moved
= cmp_move_tasks(sd
, &env
);
7733 ld_moved
= move_tasks(&env
);
7735 #else /* !CONFIG_MTK_SCHED_CMP_TGS */
7736 ld_moved
= move_tasks(&env
);
7737 #endif /* CONFIG_MTK_SCHED_CMP_TGS */
7738 double_rq_unlock(env
.dst_rq
, busiest
);
7739 local_irq_restore(flags
);
7742 * some other cpu did the load balance for us.
7744 if (ld_moved
&& this_cpu
!= smp_processor_id())
7745 resched_cpu(this_cpu
);
7751 schedstat_inc(sd
, lb_failed
[idle
]);
7752 mt_lbprof_stat_or(env
.fail_reason
, MT_LBPROF_FAILED
);
7753 if ( mt_lbprof_test(env
.fail_reason
, MT_LBPROF_AFFINITY
) ) {
7754 mt_lbprof_update_state(busiest
->cpu
, MT_LBPROF_FAILURE_STATE
);
7755 }else if ( mt_lbprof_test(env
.fail_reason
, MT_LBPROF_CACHEHOT
) ) {
7756 mt_lbprof_update_state(busiest
->cpu
, MT_LBPROF_FAILURE_STATE
);
7760 * Increment the failure counter only on periodic balance.
7761 * We do not want newidle balance, which can be very
7762 * frequent, pollute the failure counter causing
7763 * excessive cache_hot migrations and active balances.
7765 if (idle
!= CPU_NEWLY_IDLE
)
7766 sd
->nr_balance_failed
++;
7767 mt_lbprof_stat_inc(sd
, mt_lbprof_nr_balance_failed
);
7769 if (need_active_balance(&env
)) {
7770 raw_spin_lock_irqsave(&busiest
->lock
, flags
);
7772 /* don't kick the active_load_balance_cpu_stop,
7773 * if the curr task on busiest cpu can't be
7776 if (!cpumask_test_cpu(this_cpu
,
7777 tsk_cpus_allowed(busiest
->curr
))) {
7778 raw_spin_unlock_irqrestore(&busiest
->lock
,
7780 env
.flags
|= LBF_ALL_PINNED
;
7781 goto out_one_pinned
;
7785 * ->active_balance synchronizes accesses to
7786 * ->active_balance_work. Once set, it's cleared
7787 * only after active load balance is finished.
7789 if (!busiest
->active_balance
) {
7790 busiest
->active_balance
= 1;
7791 busiest
->push_cpu
= this_cpu
;
7794 raw_spin_unlock_irqrestore(&busiest
->lock
, flags
);
7796 if (active_balance
) {
7797 stop_one_cpu_nowait(cpu_of(busiest
),
7798 active_load_balance_cpu_stop
, busiest
,
7799 &busiest
->active_balance_work
);
7803 * We've kicked active balancing, reset the failure
7806 sd
->nr_balance_failed
= sd
->cache_nice_tries
+1;
7809 sd
->nr_balance_failed
= 0;
7811 if (likely(!active_balance
)) {
7812 /* We were unbalanced, so reset the balancing interval */
7813 sd
->balance_interval
= sd
->min_interval
;
7816 * If we've begun active balancing, start to back off. This
7817 * case may not be covered by the all_pinned logic if there
7818 * is only 1 task on the busy runqueue (because we don't call
7821 if (sd
->balance_interval
< sd
->max_interval
)
7822 sd
->balance_interval
*= 2;
7828 schedstat_inc(sd
, lb_balanced
[idle
]);
7830 sd
->nr_balance_failed
= 0;
7831 mt_lbprof_stat_set(sd
->mt_lbprof_nr_balance_failed
, 0);
7834 /* tune up the balancing interval */
7835 if (((env
.flags
& LBF_ALL_PINNED
) &&
7836 sd
->balance_interval
< MAX_PINNED_INTERVAL
) ||
7837 (sd
->balance_interval
< sd
->max_interval
))
7838 sd
->balance_interval
*= 2;
7843 mt_lbprof_stat_or(env
.fail_reason
, MT_LBPROF_SUCCESS
);
7844 mt_lbprof_stat_set(sd
->mt_lbprof_nr_balance_failed
, 0);
7847 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
7848 if( CPU_NEWLY_IDLE
== idle
){
7849 char strings
[128]="";
7850 snprintf(strings
, 128, "%d:idle balance:%d:0x%x ", this_cpu
, ld_moved
, env
.fail_reason
);
7851 mt_lbprof_rqinfo(strings
);
7852 trace_sched_lbprof_log(strings
);
7854 char strings
[128]="";
7855 snprintf(strings
, 128, "%d:periodic balance:%d:0x%x ", this_cpu
, ld_moved
, env
.fail_reason
);
7856 mt_lbprof_rqinfo(strings
);
7857 trace_sched_lbprof_log(strings
);
7865 * idle_balance is called by schedule() if this_cpu is about to become
7866 * idle. Attempts to pull tasks from other CPUs.
7868 void idle_balance(int this_cpu
, struct rq
*this_rq
)
7870 struct sched_domain
*sd
;
7871 int pulled_task
= 0;
7872 unsigned long next_balance
= jiffies
+ HZ
;
7873 #if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT) || defined(CONFIG_MT_LOAD_BALANCE_PROFILER)
7874 unsigned long counter
= 0;
7877 this_rq
->idle_stamp
= this_rq
->clock
;
7879 mt_lbprof_update_state_has_lock(this_cpu
, MT_LBPROF_UPDATE_STATE
);
7880 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7881 #ifdef CONFIG_LOCAL_TIMERS
7882 counter
= localtimer_get_counter();
7883 if ( counter
>= 260000 ) // 20ms
7885 if ( time_before(jiffies
+ 2, this_rq
->next_balance
) ) // 20ms
7890 if (this_rq
->avg_idle
< sysctl_sched_migration_cost
){
7891 #if defined(CONFIG_MT_LOAD_BALANCE_PROFILER)
7892 char strings
[128]="";
7893 mt_lbprof_update_state_has_lock(this_cpu
, MT_LBPROF_ALLOW_UNBLANCE_STATE
);
7894 snprintf(strings
, 128, "%d:idle balance bypass: %llu %lu ", this_cpu
, this_rq
->avg_idle
, counter
);
7895 mt_lbprof_rqinfo(strings
);
7896 trace_sched_lbprof_log(strings
);
7901 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7906 * Drop the rq->lock, but keep IRQ/preempt disabled.
7908 raw_spin_unlock(&this_rq
->lock
);
7910 mt_lbprof_update_status();
7911 update_blocked_averages(this_cpu
);
7913 for_each_domain(this_cpu
, sd
) {
7914 unsigned long interval
;
7917 if (!(sd
->flags
& SD_LOAD_BALANCE
))
7920 if (sd
->flags
& SD_BALANCE_NEWIDLE
) {
7921 /* If we've pulled tasks over stop searching: */
7922 pulled_task
= load_balance(this_cpu
, this_rq
,
7923 sd
, CPU_NEWLY_IDLE
, &balance
);
7926 interval
= msecs_to_jiffies(sd
->balance_interval
);
7927 if (time_after(next_balance
, sd
->last_balance
+ interval
))
7928 next_balance
= sd
->last_balance
+ interval
;
7930 this_rq
->idle_stamp
= 0;
7936 raw_spin_lock(&this_rq
->lock
);
7938 if (pulled_task
|| time_after(jiffies
, this_rq
->next_balance
)) {
7940 * We are going idle. next_balance may be set based on
7941 * a busy processor. So reset next_balance.
7943 this_rq
->next_balance
= next_balance
;
7948 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
7949 * running tasks off the busiest CPU onto idle CPUs. It requires at
7950 * least 1 task to be running on each physical CPU where possible, and
7951 * avoids physical / logical imbalances.
7953 static int active_load_balance_cpu_stop(void *data
)
7955 struct rq
*busiest_rq
= data
;
7956 int busiest_cpu
= cpu_of(busiest_rq
);
7957 int target_cpu
= busiest_rq
->push_cpu
;
7958 struct rq
*target_rq
= cpu_rq(target_cpu
);
7959 struct sched_domain
*sd
;
7961 raw_spin_lock_irq(&busiest_rq
->lock
);
7963 /* make sure the requested cpu hasn't gone down in the meantime */
7964 if (unlikely(busiest_cpu
!= smp_processor_id() ||
7965 !busiest_rq
->active_balance
))
7968 /* Is there any task to move? */
7969 if (busiest_rq
->nr_running
<= 1)
7973 * This condition is "impossible", if it occurs
7974 * we need to fix it. Originally reported by
7975 * Bjorn Helgaas on a 128-cpu setup.
7977 BUG_ON(busiest_rq
== target_rq
);
7979 /* move a task from busiest_rq to target_rq */
7980 double_lock_balance(busiest_rq
, target_rq
);
7982 /* Search for an sd spanning us and the target CPU. */
7984 for_each_domain(target_cpu
, sd
) {
7985 if ((sd
->flags
& SD_LOAD_BALANCE
) &&
7986 cpumask_test_cpu(busiest_cpu
, sched_domain_span(sd
)))
7991 struct lb_env env
= {
7993 .dst_cpu
= target_cpu
,
7994 .dst_rq
= target_rq
,
7995 .src_cpu
= busiest_rq
->cpu
,
7996 .src_rq
= busiest_rq
,
8000 schedstat_inc(sd
, alb_count
);
8002 if (move_one_task(&env
))
8003 schedstat_inc(sd
, alb_pushed
);
8005 schedstat_inc(sd
, alb_failed
);
8008 double_unlock_balance(busiest_rq
, target_rq
);
8010 busiest_rq
->active_balance
= 0;
8011 raw_spin_unlock_irq(&busiest_rq
->lock
);
8015 #ifdef CONFIG_NO_HZ_COMMON
8017 * idle load balancing details
8018 * - When one of the busy CPUs notice that there may be an idle rebalancing
8019 * needed, they will kick the idle load balancer, which then does idle
8020 * load balancing for all the idle CPUs.
8023 cpumask_var_t idle_cpus_mask
;
8025 unsigned long next_balance
; /* in jiffy units */
8026 } nohz ____cacheline_aligned
;
8029 static inline int find_new_ilb(int call_cpu
)
8031 #ifdef CONFIG_HMP_PACK_SMALL_TASK
8033 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
8035 struct sched_domain
*sd
;
8037 int ilb_new
= nr_cpu_ids
;
8041 int ilb
= cpumask_first(nohz
.idle_cpus_mask
);
8046 int buddy
= per_cpu(sd_pack_buddy
, call_cpu
);
8049 * If we have a pack buddy CPU, we try to run load balance on a CPU
8050 * that is close to the buddy.
8053 for_each_domain(buddy
, sd
) {
8054 if (sd
->flags
& SD_SHARE_CPUPOWER
)
8057 ilb_new
= cpumask_first_and(sched_domain_span(sd
),
8058 nohz
.idle_cpus_mask
);
8060 if (ilb_new
< nr_cpu_ids
)
8066 if (ilb
< nr_cpu_ids
&& idle_cpu(ilb
)) {
8070 if (ilb_new
< nr_cpu_ids
) {
8071 if (idle_cpu(ilb_new
)) {
8072 if(PA_ENABLE
&& ilb_return
&& ilb_new
!= ilb
) {
8073 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT
[call_cpu
][ilb
]++;
8075 #ifdef CONFIG_HMP_TRACER
8076 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_WAKE_UP_FORM_CPUX_TO_CPUY
, 0, call_cpu
, ilb
);
8077 #endif /* CONFIG_HMP_TRACER */
8090 #else /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
8092 struct sched_domain
*sd
;
8093 int ilb
= cpumask_first(nohz
.idle_cpus_mask
);
8094 int buddy
= per_cpu(sd_pack_buddy
, call_cpu
);
8097 * If we have a pack buddy CPU, we try to run load balance on a CPU
8098 * that is close to the buddy.
8101 for_each_domain(buddy
, sd
) {
8102 if (sd
->flags
& SD_SHARE_CPUPOWER
)
8105 ilb
= cpumask_first_and(sched_domain_span(sd
),
8106 nohz
.idle_cpus_mask
);
8108 if (ilb
< nr_cpu_ids
)
8112 if (ilb
< nr_cpu_ids
&& idle_cpu(ilb
))
8117 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
8119 #else /* CONFIG_HMP_PACK_SMALL_TASK */
8121 int ilb
= cpumask_first(nohz
.idle_cpus_mask
);
8122 #ifdef CONFIG_MTK_SCHED_CMP_TGS
8123 /* Find nohz balancing to occur in the same cluster firstly */
8126 //Find idle cpu with online one
8127 get_cluster_cpus(&tmp
, get_cluster_id(call_cpu
), true);
8128 new_ilb
= cpumask_first_and(nohz
.idle_cpus_mask
, &tmp
);
8129 if (new_ilb
< nr_cpu_ids
&& idle_cpu(new_ilb
))
8131 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
8134 mt_sched_printf("[PA]find_new_ilb(cpu%x), new_ilb = %d, ilb = %d\n", call_cpu
, new_ilb
, ilb
);
8135 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT
[call_cpu
][ilb
]++;
8140 #endif /* CONFIG_MTK_SCHED_CMP_TGS */
8142 if (ilb
< nr_cpu_ids
&& idle_cpu(ilb
))
8147 #endif /* CONFIG_HMP_PACK_SMALL_TASK */
8153 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
8154 * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
8155 * CPU (if there is one).
8157 static void nohz_balancer_kick(int cpu
)
8161 nohz
.next_balance
++;
8163 ilb_cpu
= find_new_ilb(cpu
);
8165 if (ilb_cpu
>= nr_cpu_ids
)
8168 if (test_and_set_bit(NOHZ_BALANCE_KICK
, nohz_flags(ilb_cpu
)))
8171 * Use smp_send_reschedule() instead of resched_cpu().
8172 * This way we generate a sched IPI on the target cpu which
8173 * is idle. And the softirq performing nohz idle load balance
8174 * will be run before returning from the IPI.
8176 smp_send_reschedule(ilb_cpu
);
8180 static inline void nohz_balance_exit_idle(int cpu
)
8182 if (unlikely(test_bit(NOHZ_TICK_STOPPED
, nohz_flags(cpu
)))) {
8183 cpumask_clear_cpu(cpu
, nohz
.idle_cpus_mask
);
8184 atomic_dec(&nohz
.nr_cpus
);
8185 clear_bit(NOHZ_TICK_STOPPED
, nohz_flags(cpu
));
8189 static inline void set_cpu_sd_state_busy(void)
8191 struct sched_domain
*sd
;
8192 int cpu
= smp_processor_id();
8195 sd
= rcu_dereference_check_sched_domain(cpu_rq(cpu
)->sd
);
8197 if (!sd
|| !sd
->nohz_idle
)
8201 for (; sd
; sd
= sd
->parent
)
8202 atomic_inc(&sd
->groups
->sgp
->nr_busy_cpus
);
8207 void set_cpu_sd_state_idle(void)
8209 struct sched_domain
*sd
;
8210 int cpu
= smp_processor_id();
8213 sd
= rcu_dereference_check_sched_domain(cpu_rq(cpu
)->sd
);
8215 if (!sd
|| sd
->nohz_idle
)
8219 for (; sd
; sd
= sd
->parent
)
8220 atomic_dec(&sd
->groups
->sgp
->nr_busy_cpus
);
8226 * This routine will record that the cpu is going idle with tick stopped.
8227 * This info will be used in performing idle load balancing in the future.
8229 void nohz_balance_enter_idle(int cpu
)
8232 * If this cpu is going down, then nothing needs to be done.
8234 if (!cpu_active(cpu
))
8237 if (test_bit(NOHZ_TICK_STOPPED
, nohz_flags(cpu
)))
8240 cpumask_set_cpu(cpu
, nohz
.idle_cpus_mask
);
8241 atomic_inc(&nohz
.nr_cpus
);
8242 set_bit(NOHZ_TICK_STOPPED
, nohz_flags(cpu
));
8245 static int __cpuinit
sched_ilb_notifier(struct notifier_block
*nfb
,
8246 unsigned long action
, void *hcpu
)
8248 switch (action
& ~CPU_TASKS_FROZEN
) {
8250 nohz_balance_exit_idle(smp_processor_id());
/* Serializes balancing of domains with SD_SERIALIZE set (see rebalance_domains()). */
static DEFINE_SPINLOCK(balancing);
8261 * Scale the max load_balance interval with the number of CPUs in the system.
8262 * This trades load-balance latency on larger machines for less cross talk.
8264 void update_max_interval(void)
8266 max_load_balance_interval
= HZ
*num_online_cpus()/10;
8270 * It checks each scheduling domain to see if it is due to be balanced,
8271 * and initiates a balancing operation if so.
8273 * Balancing parameters are set up in init_sched_domains.
8275 static void rebalance_domains(int cpu
, enum cpu_idle_type idle
)
8278 struct rq
*rq
= cpu_rq(cpu
);
8279 unsigned long interval
;
8280 struct sched_domain
*sd
;
8281 /* Earliest time when we have to do rebalance again */
8282 unsigned long next_balance
= jiffies
+ 60*HZ
;
8283 int update_next_balance
= 0;
8286 update_blocked_averages(cpu
);
8289 for_each_domain(cpu
, sd
) {
8290 if (!(sd
->flags
& SD_LOAD_BALANCE
))
8293 interval
= sd
->balance_interval
;
8294 if (idle
!= CPU_IDLE
)
8295 interval
*= sd
->busy_factor
;
8297 /* scale ms to jiffies */
8298 interval
= msecs_to_jiffies(interval
);
8299 interval
= clamp(interval
, 1UL, max_load_balance_interval
);
8301 need_serialize
= sd
->flags
& SD_SERIALIZE
;
8303 if (need_serialize
) {
8304 if (!spin_trylock(&balancing
))
8308 if (time_after_eq(jiffies
, sd
->last_balance
+ interval
)) {
8309 if (load_balance(cpu
, rq
, sd
, idle
, &balance
)) {
8311 * The LBF_SOME_PINNED logic could have changed
8312 * env->dst_cpu, so we can't know our idle
8313 * state even if we migrated tasks. Update it.
8315 idle
= idle_cpu(cpu
) ? CPU_IDLE
: CPU_NOT_IDLE
;
8317 sd
->last_balance
= jiffies
;
8320 spin_unlock(&balancing
);
8322 if (time_after(next_balance
, sd
->last_balance
+ interval
)) {
8323 next_balance
= sd
->last_balance
+ interval
;
8324 update_next_balance
= 1;
8328 * Stop the load balance at this level. There is another
8329 * CPU in our sched group which is doing load balancing more
8338 * next_balance will be updated only when there is a need.
8339 * When the cpu is attached to null domain for ex, it will not be
8342 if (likely(update_next_balance
))
8343 rq
->next_balance
= next_balance
;
8346 #ifdef CONFIG_NO_HZ_COMMON
8348 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
8349 * rebalancing for all the cpus for whom scheduler ticks are stopped.
8351 static void nohz_idle_balance(int this_cpu
, enum cpu_idle_type idle
)
8353 struct rq
*this_rq
= cpu_rq(this_cpu
);
8357 if (idle
!= CPU_IDLE
||
8358 !test_bit(NOHZ_BALANCE_KICK
, nohz_flags(this_cpu
)))
8361 for_each_cpu(balance_cpu
, nohz
.idle_cpus_mask
) {
8362 if (balance_cpu
== this_cpu
|| !idle_cpu(balance_cpu
))
8366 * If this cpu gets work to do, stop the load balancing
8367 * work being done for other cpus. Next load
8368 * balancing owner will pick it up.
8373 rq
= cpu_rq(balance_cpu
);
8375 raw_spin_lock_irq(&rq
->lock
);
8376 update_rq_clock(rq
);
8377 update_idle_cpu_load(rq
);
8378 raw_spin_unlock_irq(&rq
->lock
);
8380 rebalance_domains(balance_cpu
, CPU_IDLE
);
8382 if (time_after(this_rq
->next_balance
, rq
->next_balance
))
8383 this_rq
->next_balance
= rq
->next_balance
;
8385 nohz
.next_balance
= this_rq
->next_balance
;
8387 clear_bit(NOHZ_BALANCE_KICK
, nohz_flags(this_cpu
));
8391 * Current heuristic for kicking the idle load balancer in the presence
8392 * of an idle cpu is the system.
8393 * - This rq has more than one task.
8394 * - At any scheduler domain level, this cpu's scheduler group has multiple
8395 * busy cpu's exceeding the group's power.
8396 * - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
8397 * domain span are idle.
8399 static inline int nohz_kick_needed(struct rq
*rq
, int cpu
)
8401 unsigned long now
= jiffies
;
8402 struct sched_domain
*sd
;
8404 if (unlikely(idle_cpu(cpu
)))
8408 * We may be recently in ticked or tickless idle mode. At the first
8409 * busy tick after returning from idle, we will update the busy stats.
8411 set_cpu_sd_state_busy();
8412 nohz_balance_exit_idle(cpu
);
8415 * None are in tickless mode and hence no need for NOHZ idle load
8418 if (likely(!atomic_read(&nohz
.nr_cpus
)))
8421 if (time_before(now
, nohz
.next_balance
))
8424 #ifdef CONFIG_SCHED_HMP
8426 * Bail out if there are no nohz CPUs in our
8427 * HMP domain, since we will move tasks between
8428 * domains through wakeup and force balancing
8429 * as necessary based upon task load.
8431 if (cpumask_first_and(nohz
.idle_cpus_mask
,
8432 &((struct hmp_domain
*)hmp_cpu_domain(cpu
))->cpus
) >= nr_cpu_ids
)
8436 if (rq
->nr_running
>= 2)
8440 for_each_domain(cpu
, sd
) {
8441 struct sched_group
*sg
= sd
->groups
;
8442 struct sched_group_power
*sgp
= sg
->sgp
;
8443 int nr_busy
= atomic_read(&sgp
->nr_busy_cpus
);
8445 if (sd
->flags
& SD_SHARE_PKG_RESOURCES
&& nr_busy
> 1)
8446 goto need_kick_unlock
;
8448 if (sd
->flags
& SD_ASYM_PACKING
&& nr_busy
!= sg
->group_weight
8449 && (cpumask_first_and(nohz
.idle_cpus_mask
,
8450 sched_domain_span(sd
)) < cpu
))
8451 goto need_kick_unlock
;
8453 if (!(sd
->flags
& (SD_SHARE_PKG_RESOURCES
| SD_ASYM_PACKING
)))
8465 static void nohz_idle_balance(int this_cpu
, enum cpu_idle_type idle
) { }
8468 #ifdef CONFIG_SCHED_HMP
8469 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
8472 * Heterogenous Multi-Processor (HMP) - Declaration and Useful Macro
8475 /* Function Declaration */
8476 static int hmp_up_stable(int cpu
);
8477 static int hmp_down_stable(int cpu
);
8478 static unsigned int hmp_up_migration(int cpu
, int *target_cpu
, struct sched_entity
*se
,
8479 struct clb_env
*clbenv
);
8480 static unsigned int hmp_down_migration(int cpu
, int *target_cpu
, struct sched_entity
*se
,
8481 struct clb_env
*clbenv
);
8483 #define hmp_caller_is_gb(caller) ((HMP_GB == caller)?1:0)
8485 #define hmp_cpu_is_fast(cpu) cpumask_test_cpu(cpu,&hmp_fast_cpu_mask)
8486 #define hmp_cpu_is_slow(cpu) cpumask_test_cpu(cpu,&hmp_slow_cpu_mask)
8487 #define hmp_cpu_stable(cpu) (hmp_cpu_is_fast(cpu)? \
8488 hmp_up_stable(cpu):hmp_down_stable(cpu))
8490 #define hmp_inc(v) ((v) + 1)
8491 #define hmp_dec(v) ((v) - 1)
8492 #define hmp_pos(v) ((v) < (0) ? (0) : (v))
8494 #define task_created(f) ((SD_BALANCE_EXEC == f || SD_BALANCE_FORK == f)?1:0)
8495 #define task_cpus_allowed(mask,p) cpumask_intersects(mask,tsk_cpus_allowed(p))
8496 #define task_slow_cpu_allowed(p) task_cpus_allowed(&hmp_slow_cpu_mask,p)
8497 #define task_fast_cpu_allowed(p) task_cpus_allowed(&hmp_fast_cpu_mask,p)
8500 * Heterogenous Multi-Processor (HMP) - Utility Function
8504 * These functions add next up/down migration delay that prevents the task from
8505 * doing another migration in the same direction until the delay has expired.
8507 static int hmp_up_stable(int cpu
)
8509 struct cfs_rq
*cfs_rq
= &cpu_rq(cpu
)->cfs
;
8510 u64 now
= cfs_rq_clock_task(cfs_rq
);
8511 if (((now
- hmp_last_up_migration(cpu
)) >> 10) < hmp_next_up_threshold
)
8516 static int hmp_down_stable(int cpu
)
8518 struct cfs_rq
*cfs_rq
= &cpu_rq(cpu
)->cfs
;
8519 u64 now
= cfs_rq_clock_task(cfs_rq
);
8520 if (((now
- hmp_last_down_migration(cpu
)) >> 10) < hmp_next_down_threshold
)
8525 /* Select the most appropriate CPU from hmp cluster */
8526 static unsigned int hmp_select_cpu(unsigned int caller
, struct task_struct
*p
,
8527 struct cpumask
*mask
, int prev
)
8530 int target
= NR_CPUS
;
8531 unsigned long curr_wload
= 0;
8532 unsigned long target_wload
= 0;
8533 struct cpumask srcp
;
8534 cpumask_and(&srcp
, cpu_online_mask
, mask
);
8535 target
= cpumask_any_and(&srcp
, tsk_cpus_allowed(p
));
8536 if (NR_CPUS
== target
)
8540 * RT class is taken into account because CPU load is multiplied
8541 * by the total number of CPU runnable tasks that includes RT tasks.
8543 target_wload
= hmp_inc(cfs_load(target
));
8544 target_wload
+= cfs_pending_load(target
);
8545 target_wload
*= rq_length(target
);
8546 for_each_cpu(curr
, mask
) {
8547 /* Check CPU status and task affinity */
8548 if(!cpu_online(curr
) || !cpumask_test_cpu(curr
, tsk_cpus_allowed(p
)))
8551 /* For global load balancing, unstable CPU will be bypassed */
8552 if(hmp_caller_is_gb(caller
) && !hmp_cpu_stable(curr
))
8555 curr_wload
= hmp_inc(cfs_load(curr
));
8556 curr_wload
+= cfs_pending_load(curr
);
8557 curr_wload
*= rq_length(curr
);
8558 if(curr_wload
< target_wload
) {
8559 target_wload
= curr_wload
;
8561 } else if(curr_wload
== target_wload
&& curr
== prev
) {
/*
 * Heterogenous Multi-Processor (HMP) - Task Runqueue Selection
 */

/*
 * This function enhances the original task selection function:
 * it re-targets the wakeup to a big or LITTLE cluster candidate
 * based on cluster load statistics and the up/down migration checks.
 *
 * NOTE(review): labels/steps in this region were extraction-damaged and
 * restored from the visible fragments — verify against the original tree.
 */
static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p,
			int prev_cpu, int new_cpu)
{
#ifdef CONFIG_HMP_TASK_ASSIGNMENT
	int step = 0;			/* which decision step picked the CPU (for tracing) */
	struct sched_entity *se = &p->se;
	int B_target = NR_CPUS;		/* candidate in the big cluster */
	int L_target = NR_CPUS;		/* candidate in the LITTLE cluster */
	struct clb_env clbenv;

#ifdef CONFIG_HMP_TRACER
	int cpu = 0;
	for_each_online_cpu(cpu)
		trace_sched_cfs_runnable_load(cpu, cfs_load(cpu), cfs_length(cpu));
#endif

	/* error handling */
	if (prev_cpu >= NR_CPUS)
		return new_cpu;

	/*
	 * Skip all the checks if only one CPU is online.
	 * Otherwise, select the most appropriate CPU from cluster.
	 */
	if (num_online_cpus() == 1)
		goto out;
	B_target = hmp_select_cpu(HMP_SELECT_RQ, p, &hmp_fast_cpu_mask, prev_cpu);
	L_target = hmp_select_cpu(HMP_SELECT_RQ, p, &hmp_slow_cpu_mask, prev_cpu);

	/*
	 * Only one cluster exists or only one cluster is allowed for this task
	 * Case 1: return the runqueue whose load is minimum
	 * Case 2: return original CFS runqueue selection result
	 */
#ifdef CONFIG_HMP_DISCARD_CFS_SELECTION_RESULT
	if (NR_CPUS == B_target && NR_CPUS == L_target)
		goto out;
	if (NR_CPUS == B_target)
		goto select_slow;
	if (NR_CPUS == L_target)
		goto select_fast;
#else
	if (NR_CPUS == B_target || NR_CPUS == L_target)
		goto out;
#endif

	/*
	 * Two clusters exist and both clusters are allowed for this task
	 * Step 1: Move newly created task to the cpu where no tasks are running
	 * Step 2: Migrate heavy-load task to big
	 * Step 3: Migrate light-load task to LITTLE
	 * Step 4: Make sure the task stays in its previous hmp domain
	 */
	step = 1;
	if (task_created(sd_flag) && !task_low_priority(p->prio)) {
		if (!rq_length(B_target))
			goto select_fast;
		if (!rq_length(L_target))
			goto select_slow;
	}
	memset(&clbenv, 0, sizeof(clbenv));
	clbenv.flags |= HMP_SELECT_RQ;
	clbenv.lcpus = &hmp_slow_cpu_mask;
	clbenv.bcpus = &hmp_fast_cpu_mask;
	clbenv.ltarget = L_target;
	clbenv.btarget = B_target;
	sched_update_clbstats(&clbenv);
	step = 2;
	if (hmp_up_migration(L_target, &B_target, se, &clbenv))
		goto select_fast;
	step = 3;
	if (hmp_down_migration(B_target, &L_target, se, &clbenv))
		goto select_slow;
	step = 4;
	if (hmp_cpu_is_slow(prev_cpu))
		goto select_slow;
	goto select_fast;

select_fast:
	new_cpu = B_target;
	goto out;
select_slow:
	new_cpu = L_target;
	goto out;

out:
	/* it happens when num_online_cpus=1 */
	if (new_cpu >= nr_cpu_ids)
		new_cpu = prev_cpu;

	/* account the not-yet-enqueued wakeup so parallel selections see it */
	cfs_nr_pending(new_cpu)++;
	cfs_pending_load(new_cpu) += se_load(se);
#ifdef CONFIG_HMP_TRACER
	trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
	trace_sched_hmp_select_task_rq(p, step, sd_flag, prev_cpu, new_cpu,
			se_load(se), &clbenv.bstats, &clbenv.lstats);
#endif
#ifdef CONFIG_MET_SCHED_HMP
	HmpLoad(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
#endif
	return new_cpu;
#else /* !CONFIG_HMP_TASK_ASSIGNMENT */
	return new_cpu;
#endif /* CONFIG_HMP_TASK_ASSIGNMENT */
}
/*
 * Heterogenous Multi-Processor (HMP) - Task Dynamic Migration Threshold
 *
 * If the workload between clusters is not balanced, adjust migration
 * threshold in an attempt to move task to the cluster where the workload
 * is lighter.
 */

/*
 * According to ARM's cpu_efficiency table, the computing power of CA15 and
 * CA7 are 3891 and 2048 respectively. Thus, we assume big has twice the
 * computing power of LITTLE
 */
#define HMP_RATIO(v) ((v)*17/10)

#define hmp_fast_cpu_has_spare_cycles(B,cpu_load) (cpu_load < \
			(HMP_RATIO(B->cpu_capacity) - (B->cpu_capacity >> 2)))

#define hmp_task_fast_cpu_afford(B,se,cpu) (B->acap > 0 \
			&& hmp_fast_cpu_has_spare_cycles(B,se_load(se) + cfs_load(cpu)))

#define hmp_fast_cpu_oversubscribed(caller,B,se,cpu) \
			(hmp_caller_is_gb(caller)? \
			!hmp_fast_cpu_has_spare_cycles(B,cfs_load(cpu)): \
			!hmp_task_fast_cpu_afford(B,se,cpu))

#define hmp_task_slow_cpu_afford(L,se) \
			(L->acap > 0 && L->acap >= se_load(se))

/* Macro used by low-priorty task filter */
#define hmp_low_prio_task_up_rejected(p,B,L) \
			(task_low_priority(p->prio) && \
			(B->ntask >= B->ncpu || 0 != L->nr_normal_prio_task) && \
			(p->se.avg.load_avg_ratio < 800))

#define hmp_low_prio_task_down_allowed(p,B,L) \
			(task_low_priority(p->prio) && !B->nr_dequeuing_low_prio && \
			B->ntask >= B->ncpu && 0 != L->nr_normal_prio_task && \
			(p->se.avg.load_avg_ratio < 800))

/* Migration check result (bit flags recorded in mcheck->status) */
#define HMP_BIG_NOT_OVERSUBSCRIBED           (0x01)
#define HMP_BIG_CAPACITY_INSUFFICIENT        (0x02)
#define HMP_LITTLE_CAPACITY_INSUFFICIENT     (0x04)
#define HMP_LOW_PRIORITY_FILTER              (0x08)
#define HMP_BIG_BUSY_LITTLE_IDLE             (0x10)
#define HMP_BIG_IDLE                         (0x20)
#define HMP_MIGRATION_APPROVED              (0x100)
#define HMP_TASK_UP_MIGRATION               (0x200)
#define HMP_TASK_DOWN_MIGRATION             (0x400)

/* Migration statistics */
#ifdef CONFIG_HMP_TRACER
struct hmp_statisic hmp_stats;
#endif
8742 static inline void hmp_dynamic_threshold(struct clb_env
*clbenv
)
8744 struct clb_stats
*L
= &clbenv
->lstats
;
8745 struct clb_stats
*B
= &clbenv
->bstats
;
8746 unsigned int hmp_threshold_diff
= hmp_up_threshold
- hmp_down_threshold
;
8747 unsigned int B_normalized_acap
= hmp_pos(HMP_RATIO(B
->scaled_acap
));
8748 unsigned int B_normalized_atask
= hmp_pos(HMP_RATIO(B
->scaled_atask
));
8749 unsigned int L_normalized_acap
= hmp_pos(L
->scaled_acap
);
8750 unsigned int L_normalized_atask
= hmp_pos(L
->scaled_atask
);
8752 #ifdef CONFIG_HMP_DYNAMIC_THRESHOLD
8753 L
->threshold
= hmp_threshold_diff
;
8754 L
->threshold
*= hmp_inc(L_normalized_acap
) * hmp_inc(L_normalized_atask
);
8755 L
->threshold
/= hmp_inc(B_normalized_acap
+ L_normalized_acap
);
8756 L
->threshold
/= hmp_inc(B_normalized_atask
+ L_normalized_atask
);
8757 L
->threshold
= hmp_down_threshold
+ L
->threshold
;
8759 B
->threshold
= hmp_threshold_diff
;
8760 B
->threshold
*= hmp_inc(B_normalized_acap
) * hmp_inc(B_normalized_atask
);
8761 B
->threshold
/= hmp_inc(B_normalized_acap
+ L_normalized_acap
);
8762 B
->threshold
/= hmp_inc(B_normalized_atask
+ L_normalized_atask
);
8763 B
->threshold
= hmp_up_threshold
- B
->threshold
;
8764 #else /* !CONFIG_HMP_DYNAMIC_THRESHOLD */
8765 clbenv
->lstats
.threshold
= hmp_down_threshold
; // down threshold
8766 clbenv
->bstats
.threshold
= hmp_up_threshold
; // up threshold
8767 #endif /* CONFIG_HMP_DYNAMIC_THRESHOLD */
8769 mt_sched_printf("[%s]\tup/dl:%4d/%4d bcpu(%d):%d/%d, lcpu(%d):%d/%d\n", __func__
,
8770 B
->threshold
, L
->threshold
,
8771 clbenv
->btarget
, clbenv
->bstats
.cpu_capacity
, clbenv
->bstats
.cpu_power
,
8772 clbenv
->ltarget
, clbenv
->lstats
.cpu_capacity
, clbenv
->lstats
.cpu_power
);
8776 * Check whether this task should be migrated to big
8777 * Briefly summarize the flow as below;
8778 * 1) Migration stabilizing
8779 * 1.5) Keep all cpu busy
8780 * 2) Filter low-priorty task
8781 * 3) Check CPU capacity
8782 * 4) Check dynamic migration threshold
8784 static unsigned int hmp_up_migration(int cpu
, int *target_cpu
, struct sched_entity
*se
,
8785 struct clb_env
*clbenv
)
8787 struct task_struct
*p
= task_of(se
);
8788 struct clb_stats
*L
, *B
;
8789 struct mcheck
*check
;
8791 unsigned int caller
= clbenv
->flags
;
8793 L
= &clbenv
->lstats
;
8794 B
= &clbenv
->bstats
;
8795 check
= &clbenv
->mcheck
;
8797 check
->status
= clbenv
->flags
;
8798 check
->status
|= HMP_TASK_UP_MIGRATION
;
8802 * No migration is needed if
8803 * 1) There is only one cluster
8804 * 2) Task is already in big cluster
8805 * 3) It violates task affinity
8807 if (!L
->ncpu
|| !B
->ncpu
8808 || cpumask_test_cpu(curr_cpu
, clbenv
->bcpus
)
8809 || !cpumask_intersects(clbenv
->bcpus
, tsk_cpus_allowed(p
)))
8813 * [1] Migration stabilizing
8814 * Let the task load settle before doing another up migration.
8815 * It can prevent a bunch of tasks from migrating to a unstable CPU.
8817 if (!hmp_up_stable(*target_cpu
))
8820 /* [2] Filter low-priorty task */
8821 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
8822 if (hmp_low_prio_task_up_rejected(p
,B
,L
)) {
8823 check
->status
|= HMP_LOW_PRIORITY_FILTER
;
8828 // [2.5]if big is idle, just go to big
8829 if (rq_length(*target_cpu
)==0)
8831 check
->status
|= HMP_BIG_IDLE
;
8832 check
->status
|= HMP_MIGRATION_APPROVED
;
8838 * [3] Check CPU capacity
8839 * Forbid up-migration if big CPU can't handle this task
8841 if (!hmp_task_fast_cpu_afford(B
,se
,*target_cpu
)) {
8842 check
->status
|= HMP_BIG_CAPACITY_INSUFFICIENT
;
8847 * [4] Check dynamic migration threshold
8848 * Migrate task from LITTLE to big if load is greater than up-threshold
8850 if (se_load(se
) > B
->threshold
) {
8851 check
->status
|= HMP_MIGRATION_APPROVED
;
8856 #ifdef CONFIG_HMP_TRACER
8857 if(check
->result
&& hmp_caller_is_gb(caller
))
8858 hmp_stats
.nr_force_up
++;
8859 trace_sched_hmp_stats(&hmp_stats
);
8860 trace_sched_dynamic_threshold(task_of(se
),B
->threshold
,check
->status
,
8861 curr_cpu
,*target_cpu
,se_load(se
),B
,L
);
8863 #ifdef CONFIG_MET_SCHED_HMP
8864 TaskTh(B
->threshold
,L
->threshold
);
8865 HmpStat(&hmp_stats
);
8868 return check
->result
;
8872 * Check whether this task should be migrated to LITTLE
8873 * Briefly summarize the flow as below;
8874 * 1) Migration stabilizing
8875 * 1.5) Keep all cpu busy
8876 * 2) Filter low-priorty task
8877 * 3) Check CPU capacity
8878 * 4) Check dynamic migration threshold
8880 static unsigned int hmp_down_migration(int cpu
, int *target_cpu
, struct sched_entity
*se
,
8881 struct clb_env
*clbenv
)
8883 struct task_struct
*p
= task_of(se
);
8884 struct clb_stats
*L
, *B
;
8885 struct mcheck
*check
;
8887 unsigned int caller
= clbenv
->flags
;
8889 L
= &clbenv
->lstats
;
8890 B
= &clbenv
->bstats
;
8891 check
= &clbenv
->mcheck
;
8893 check
->status
= caller
;
8894 check
->status
|= HMP_TASK_DOWN_MIGRATION
;
8898 * No migration is needed if
8899 * 1) There is only one cluster
8900 * 2) Task is already in LITTLE cluster
8901 * 3) It violates task affinity
8903 if (!L
->ncpu
|| !B
->ncpu
8904 || cpumask_test_cpu(curr_cpu
, clbenv
->lcpus
)
8905 || !cpumask_intersects(clbenv
->lcpus
, tsk_cpus_allowed(p
)))
8909 * [1] Migration stabilizing
8910 * Let the task load settle before doing another down migration.
8911 * It can prevent a bunch of tasks from migrating to a unstable CPU.
8913 if (!hmp_down_stable(*target_cpu
))
8916 // [1.5]if big is busy and little is idle, just go to little
8917 if (rq_length(*target_cpu
)==0 && caller
== HMP_SELECT_RQ
&& rq_length(curr_cpu
)>0)
8919 check
->status
|= HMP_BIG_BUSY_LITTLE_IDLE
;
8920 check
->status
|= HMP_MIGRATION_APPROVED
;
8925 /* [2] Filter low-priorty task */
8926 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
8927 if (hmp_low_prio_task_down_allowed(p
,B
,L
)) {
8928 cfs_nr_dequeuing_low_prio(curr_cpu
)++;
8929 check
->status
|= HMP_LOW_PRIORITY_FILTER
;
8930 check
->status
|= HMP_MIGRATION_APPROVED
;
8937 * [3] Check CPU capacity
8938 * Forbid down-migration if either of the following conditions is true
8939 * 1) big cpu is not oversubscribed (if big CPU seems to have spare
8940 * cycles, do not force this task to run on LITTLE CPU, but
8941 * keep it staying in its previous cluster instead)
8942 * 2) LITTLE cpu doesn't have available capacity for this new task
8944 if (!hmp_fast_cpu_oversubscribed(caller
,B
,se
,curr_cpu
)) {
8945 check
->status
|= HMP_BIG_NOT_OVERSUBSCRIBED
;
8949 if (!hmp_task_slow_cpu_afford(L
,se
)) {
8950 check
->status
|= HMP_LITTLE_CAPACITY_INSUFFICIENT
;
8955 * [4] Check dynamic migration threshold
8956 * Migrate task from big to LITTLE if load ratio is less than
8957 * or equal to down-threshold
8959 if (L
->threshold
>= se_load(se
)) {
8960 check
->status
|= HMP_MIGRATION_APPROVED
;
8965 #ifdef CONFIG_HMP_TRACER
8966 if (check
->result
&& hmp_caller_is_gb(caller
))
8967 hmp_stats
.nr_force_down
++;
8968 trace_sched_hmp_stats(&hmp_stats
);
8969 trace_sched_dynamic_threshold(task_of(se
),L
->threshold
,check
->status
,
8970 curr_cpu
,*target_cpu
,se_load(se
),B
,L
);
8972 #ifdef CONFIG_MET_SCHED_HMP
8973 TaskTh(B
->threshold
,L
->threshold
);
8974 HmpStat(&hmp_stats
);
8977 return check
->result
;
8979 #else /* CONFIG_SCHED_HMP_ENHANCEMENT */
8980 /* Check if task should migrate to a faster cpu */
8981 static unsigned int hmp_up_migration(int cpu
, int *target_cpu
, struct sched_entity
*se
)
8983 struct task_struct
*p
= task_of(se
);
8984 struct cfs_rq
*cfs_rq
= &cpu_rq(cpu
)->cfs
;
8988 *target_cpu
= NR_CPUS
;
8990 if (hmp_cpu_is_fastest(cpu
))
8993 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
8994 /* Filter by task priority */
8995 if (p
->prio
>= hmp_up_prio
)
8998 if (se
->avg
.load_avg_ratio
< hmp_up_threshold
)
9001 /* Let the task load settle before doing another up migration */
9002 now
= cfs_rq_clock_task(cfs_rq
);
9003 if (((now
- se
->avg
.hmp_last_up_migration
) >> 10)
9004 < hmp_next_up_threshold
)
9007 /* Target domain load < 94% */
9008 if (hmp_domain_min_load(hmp_faster_domain(cpu
), target_cpu
)
9012 if (cpumask_intersects(&hmp_faster_domain(cpu
)->cpus
,
9013 tsk_cpus_allowed(p
)))
9019 /* Check if task should migrate to a slower cpu */
9020 static unsigned int hmp_down_migration(int cpu
, struct sched_entity
*se
)
9022 struct task_struct
*p
= task_of(se
);
9023 struct cfs_rq
*cfs_rq
= &cpu_rq(cpu
)->cfs
;
9026 if (hmp_cpu_is_slowest(cpu
))
9029 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
9030 /* Filter by task priority */
9031 if ((p
->prio
>= hmp_up_prio
) &&
9032 cpumask_intersects(&hmp_slower_domain(cpu
)->cpus
,
9033 tsk_cpus_allowed(p
))) {
9038 /* Let the task load settle before doing another down migration */
9039 now
= cfs_rq_clock_task(cfs_rq
);
9040 if (((now
- se
->avg
.hmp_last_down_migration
) >> 10)
9041 < hmp_next_down_threshold
)
9044 if (cpumask_intersects(&hmp_slower_domain(cpu
)->cpus
,
9045 tsk_cpus_allowed(p
))
9046 && se
->avg
.load_avg_ratio
< hmp_down_threshold
) {
9051 #endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
9054 * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
9055 * Ideally this function should be merged with can_migrate_task() to avoid
9058 static int hmp_can_migrate_task(struct task_struct
*p
, struct lb_env
*env
)
9060 int tsk_cache_hot
= 0;
9063 * We do not migrate tasks that are:
9064 * 1) running (obviously), or
9065 * 2) cannot be migrated to this CPU due to cpus_allowed
9067 if (!cpumask_test_cpu(env
->dst_cpu
, tsk_cpus_allowed(p
))) {
9068 schedstat_inc(p
, se
.statistics
.nr_failed_migrations_affine
);
9071 env
->flags
&= ~LBF_ALL_PINNED
;
9073 if (task_running(env
->src_rq
, p
)) {
9074 schedstat_inc(p
, se
.statistics
.nr_failed_migrations_running
);
9079 * Aggressive migration if:
9080 * 1) task is cache cold, or
9081 * 2) too many balance attempts have failed.
9084 #if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT)
9085 tsk_cache_hot
= task_hot(p
, env
->src_rq
->clock_task
, env
->sd
, env
->mt_check_cache_in_idle
);
9087 tsk_cache_hot
= task_hot(p
, env
->src_rq
->clock_task
, env
->sd
);
9089 if (!tsk_cache_hot
||
9090 env
->sd
->nr_balance_failed
> env
->sd
->cache_nice_tries
) {
9091 #ifdef CONFIG_SCHEDSTATS
9092 if (tsk_cache_hot
) {
9093 schedstat_inc(env
->sd
, lb_hot_gained
[env
->idle
]);
9094 schedstat_inc(p
, se
.statistics
.nr_forced_migrations
);
9104 * move_specific_task tries to move a specific task.
9105 * Returns 1 if successful and 0 otherwise.
9106 * Called with both runqueues locked.
9108 static int move_specific_task(struct lb_env
*env
, struct task_struct
*pm
)
9110 struct task_struct
*p
, *n
;
9112 list_for_each_entry_safe(p
, n
, &env
->src_rq
->cfs_tasks
, se
.group_node
) {
9113 if (throttled_lb_pair(task_group(p
), env
->src_rq
->cpu
,
9117 if (!hmp_can_migrate_task(p
, env
))
9119 /* Check if we found the right task */
9125 * Right now, this is only the third place move_task()
9126 * is called, so we can safely collect move_task()
9127 * stats here rather than inside move_task().
9129 schedstat_inc(env
->sd
, lb_gained
[env
->idle
]);
9136 * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
9137 * migrate a specific task from one runqueue to another.
9138 * hmp_force_up_migration uses this to push a currently running task
9140 * Based on active_load_balance_stop_cpu and can potentially be merged.
9142 static int hmp_active_task_migration_cpu_stop(void *data
)
9144 struct rq
*busiest_rq
= data
;
9145 struct task_struct
*p
= busiest_rq
->migrate_task
;
9146 int busiest_cpu
= cpu_of(busiest_rq
);
9147 int target_cpu
= busiest_rq
->push_cpu
;
9148 struct rq
*target_rq
= cpu_rq(target_cpu
);
9149 struct sched_domain
*sd
;
9151 raw_spin_lock_irq(&busiest_rq
->lock
);
9152 /* make sure the requested cpu hasn't gone down in the meantime */
9153 if (unlikely(busiest_cpu
!= smp_processor_id() ||
9154 !busiest_rq
->active_balance
)) {
9157 /* Is there any task to move? */
9158 if (busiest_rq
->nr_running
<= 1)
9160 /* Task has migrated meanwhile, abort forced migration */
9161 if (task_rq(p
) != busiest_rq
)
9164 * This condition is "impossible", if it occurs
9165 * we need to fix it. Originally reported by
9166 * Bjorn Helgaas on a 128-cpu setup.
9168 BUG_ON(busiest_rq
== target_rq
);
9170 /* move a task from busiest_rq to target_rq */
9171 double_lock_balance(busiest_rq
, target_rq
);
9173 /* Search for an sd spanning us and the target CPU. */
9175 for_each_domain(target_cpu
, sd
) {
9176 if (cpumask_test_cpu(busiest_cpu
, sched_domain_span(sd
)))
9181 struct lb_env env
= {
9183 .dst_cpu
= target_cpu
,
9184 .dst_rq
= target_rq
,
9185 .src_cpu
= busiest_rq
->cpu
,
9186 .src_rq
= busiest_rq
,
9190 schedstat_inc(sd
, alb_count
);
9192 if (move_specific_task(&env
, p
))
9193 schedstat_inc(sd
, alb_pushed
);
9195 schedstat_inc(sd
, alb_failed
);
9198 double_unlock_balance(busiest_rq
, target_rq
);
9200 busiest_rq
->active_balance
= 0;
9201 raw_spin_unlock_irq(&busiest_rq
->lock
);
/* Serializes HMP force-migration passes across CPUs. */
static DEFINE_SPINLOCK(hmp_force_migration);
9206 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
9208 * Heterogenous Multi-Processor (HMP) Global Load Balance
9212 * According to Linaro's comment, we should only check the currently running
9213 * tasks because selecting other tasks for migration will require extensive
9216 #ifdef CONFIG_HMP_GLOBAL_BALANCE
9217 static void hmp_force_down_migration(int this_cpu
)
9219 int curr_cpu
, target_cpu
;
9220 struct sched_entity
*se
;
9222 unsigned long flags
;
9224 struct task_struct
*p
;
9225 struct clb_env clbenv
;
9227 /* Migrate light task from big to LITTLE */
9228 for_each_cpu(curr_cpu
, &hmp_fast_cpu_mask
) {
9229 /* Check whether CPU is online */
9230 if(!cpu_online(curr_cpu
))
9234 target
= cpu_rq(curr_cpu
);
9235 raw_spin_lock_irqsave(&target
->lock
, flags
);
9236 se
= target
->cfs
.curr
;
9238 raw_spin_unlock_irqrestore(&target
->lock
, flags
);
9242 /* Find task entity */
9243 if (!entity_is_task(se
)) {
9244 struct cfs_rq
*cfs_rq
;
9245 cfs_rq
= group_cfs_rq(se
);
9248 cfs_rq
= group_cfs_rq(se
);
9253 target_cpu
= hmp_select_cpu(HMP_GB
,p
,&hmp_slow_cpu_mask
,-1);
9254 if(NR_CPUS
== target_cpu
) {
9255 raw_spin_unlock_irqrestore(&target
->lock
, flags
);
9259 /* Collect cluster information */
9260 memset(&clbenv
, 0, sizeof(clbenv
));
9261 clbenv
.flags
|= HMP_GB
;
9262 clbenv
.btarget
= curr_cpu
;
9263 clbenv
.ltarget
= target_cpu
;
9264 clbenv
.lcpus
= &hmp_slow_cpu_mask
;
9265 clbenv
.bcpus
= &hmp_fast_cpu_mask
;
9266 sched_update_clbstats(&clbenv
);
9268 /* Check migration threshold */
9269 if (!target
->active_balance
&&
9270 hmp_down_migration(curr_cpu
, &target_cpu
, se
, &clbenv
)) {
9271 target
->active_balance
= 1;
9272 target
->push_cpu
= target_cpu
;
9273 target
->migrate_task
= p
;
9275 trace_sched_hmp_migrate(p
, target
->push_cpu
, 1);
9276 hmp_next_down_delay(&p
->se
, target
->push_cpu
);
9278 raw_spin_unlock_irqrestore(&target
->lock
, flags
);
9280 stop_one_cpu_nowait(cpu_of(target
),
9281 hmp_active_task_migration_cpu_stop
,
9282 target
, &target
->active_balance_work
);
9286 #endif /* CONFIG_HMP_GLOBAL_BALANCE */
#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
/* Per (source, destination) CPU-pair counter of up-migrations that the
 * power-aware controller decided to avoid (exported for statistics).
 */
u32 AVOID_FORCE_UP_MIGRATION_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9291 static void hmp_force_up_migration(int this_cpu
)
9293 int curr_cpu
, target_cpu
;
9294 struct sched_entity
*se
;
9296 unsigned long flags
;
9298 struct task_struct
*p
;
9299 struct clb_env clbenv
;
9300 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9304 if (!spin_trylock(&hmp_force_migration
))
9307 #ifdef CONFIG_HMP_TRACER
9308 for_each_online_cpu(curr_cpu
)
9309 trace_sched_cfs_runnable_load(curr_cpu
,cfs_load(curr_cpu
),
9310 cfs_length(curr_cpu
));
9313 /* Migrate heavy task from LITTLE to big */
9314 for_each_cpu(curr_cpu
, &hmp_slow_cpu_mask
) {
9315 /* Check whether CPU is online */
9316 if(!cpu_online(curr_cpu
))
9320 target
= cpu_rq(curr_cpu
);
9321 raw_spin_lock_irqsave(&target
->lock
, flags
);
9322 se
= target
->cfs
.curr
;
9324 raw_spin_unlock_irqrestore(&target
->lock
, flags
);
9328 /* Find task entity */
9329 if (!entity_is_task(se
)) {
9330 struct cfs_rq
*cfs_rq
;
9331 cfs_rq
= group_cfs_rq(se
);
9334 cfs_rq
= group_cfs_rq(se
);
9339 target_cpu
= hmp_select_cpu(HMP_GB
,p
,&hmp_fast_cpu_mask
,-1);
9340 if(NR_CPUS
== target_cpu
) {
9341 raw_spin_unlock_irqrestore(&target
->lock
, flags
);
9345 /* Collect cluster information */
9346 memset(&clbenv
, 0, sizeof(clbenv
));
9347 clbenv
.flags
|= HMP_GB
;
9348 clbenv
.ltarget
= curr_cpu
;
9349 clbenv
.btarget
= target_cpu
;
9350 clbenv
.lcpus
= &hmp_slow_cpu_mask
;
9351 clbenv
.bcpus
= &hmp_fast_cpu_mask
;
9352 sched_update_clbstats(&clbenv
);
9354 #ifdef CONFIG_HMP_LAZY_BALANCE
9355 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9356 if (PA_ENABLE
&& LB_ENABLE
) {
9357 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9358 if (is_light_task(p
) && !is_buddy_busy(per_cpu(sd_pack_buddy
, curr_cpu
))) {
9359 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9360 push_cpu
= hmp_select_cpu(HMP_GB
,p
,&hmp_fast_cpu_mask
,-1);
9361 if (hmp_cpu_is_fast(push_cpu
)) {
9362 AVOID_FORCE_UP_MIGRATION_FROM_CPUX_TO_CPUY_COUNT
[curr_cpu
][push_cpu
]++;
9363 #ifdef CONFIG_HMP_TRACER
9364 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_FORCE_UP_FORM_CPUX_TO_CPUY
, p
->pid
, curr_cpu
, push_cpu
);
9365 #endif /* CONFIG_HMP_TRACER */
9367 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9370 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9372 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9373 #endif /* CONFIG_HMP_LAZY_BALANCE */
9375 /* Check migration threshold */
9376 if (!target
->active_balance
&&
9377 hmp_up_migration(curr_cpu
, &target_cpu
, se
, &clbenv
)) {
9378 target
->active_balance
= 1;
9379 target
->push_cpu
= target_cpu
;
9380 target
->migrate_task
= p
;
9382 trace_sched_hmp_migrate(p
, target
->push_cpu
, 1);
9383 hmp_next_up_delay(&p
->se
, target
->push_cpu
);
9386 #ifdef CONFIG_HMP_LAZY_BALANCE
9388 #endif /* CONFIG_HMP_LAZY_BALANCE */
9390 raw_spin_unlock_irqrestore(&target
->lock
, flags
);
9392 stop_one_cpu_nowait(cpu_of(target
),
9393 hmp_active_task_migration_cpu_stop
,
9394 target
, &target
->active_balance_work
);
9398 #ifdef CONFIG_HMP_GLOBAL_BALANCE
9399 hmp_force_down_migration(this_cpu
);
9401 #ifdef CONFIG_HMP_TRACER
9402 trace_sched_hmp_load(clbenv
.bstats
.load_avg
, clbenv
.lstats
.load_avg
);
9404 spin_unlock(&hmp_force_migration
);
#else /* CONFIG_SCHED_HMP_ENHANCEMENT */
/*
 * hmp_force_up_migration checks runqueues for tasks that need to
 * be actively migrated to a faster cpu.
 */
9411 static void hmp_force_up_migration(int this_cpu
)
9413 int cpu
, target_cpu
;
9414 struct sched_entity
*curr
;
9416 unsigned long flags
;
9418 struct task_struct
*p
;
9420 if (!spin_trylock(&hmp_force_migration
))
9422 for_each_online_cpu(cpu
) {
9424 target
= cpu_rq(cpu
);
9425 raw_spin_lock_irqsave(&target
->lock
, flags
);
9426 curr
= target
->cfs
.curr
;
9428 raw_spin_unlock_irqrestore(&target
->lock
, flags
);
9431 if (!entity_is_task(curr
)) {
9432 struct cfs_rq
*cfs_rq
;
9434 cfs_rq
= group_cfs_rq(curr
);
9436 curr
= cfs_rq
->curr
;
9437 cfs_rq
= group_cfs_rq(curr
);
9441 if (hmp_up_migration(cpu
, &target_cpu
, curr
)) {
9442 if (!target
->active_balance
) {
9443 target
->active_balance
= 1;
9444 target
->push_cpu
= target_cpu
;
9445 target
->migrate_task
= p
;
9447 trace_sched_hmp_migrate(p
, target
->push_cpu
, 1);
9448 hmp_next_up_delay(&p
->se
, target
->push_cpu
);
9451 if (!force
&& !target
->active_balance
) {
9453 * For now we just check the currently running task.
9454 * Selecting the lightest task for offloading will
9455 * require extensive book keeping.
9457 target
->push_cpu
= hmp_offload_down(cpu
, curr
);
9458 if (target
->push_cpu
< NR_CPUS
) {
9459 target
->active_balance
= 1;
9460 target
->migrate_task
= p
;
9462 trace_sched_hmp_migrate(p
, target
->push_cpu
, 2);
9463 hmp_next_down_delay(&p
->se
, target
->push_cpu
);
9466 raw_spin_unlock_irqrestore(&target
->lock
, flags
);
9468 stop_one_cpu_nowait(cpu_of(target
),
9469 hmp_active_task_migration_cpu_stop
,
9470 target
, &target
->active_balance_work
);
9472 spin_unlock(&hmp_force_migration
);
9474 #endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
9476 static void hmp_force_up_migration(int this_cpu
) { }
9477 #endif /* CONFIG_SCHED_HMP */
9480 * run_rebalance_domains is triggered when needed from the scheduler tick.
9481 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
9483 static void run_rebalance_domains(struct softirq_action
*h
)
9485 int this_cpu
= smp_processor_id();
9486 struct rq
*this_rq
= cpu_rq(this_cpu
);
9487 enum cpu_idle_type idle
= this_rq
->idle_balance
?
9488 CPU_IDLE
: CPU_NOT_IDLE
;
9490 hmp_force_up_migration(this_cpu
);
9492 rebalance_domains(this_cpu
, idle
);
9495 * If this cpu has a pending nohz_balance_kick, then do the
9496 * balancing on behalf of the other idle cpus whose ticks are
9499 nohz_idle_balance(this_cpu
, idle
);
9502 static inline int on_null_domain(int cpu
)
9504 return !rcu_dereference_sched(cpu_rq(cpu
)->sd
);
9508 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
9510 void trigger_load_balance(struct rq
*rq
, int cpu
)
9512 /* Don't need to rebalance while attached to NULL domain */
9513 if (time_after_eq(jiffies
, rq
->next_balance
) &&
9514 likely(!on_null_domain(cpu
)))
9515 raise_softirq(SCHED_SOFTIRQ
);
9516 #ifdef CONFIG_NO_HZ_COMMON
9517 if (nohz_kick_needed(rq
, cpu
) && likely(!on_null_domain(cpu
)))
9518 nohz_balancer_kick(cpu
);
/* Runqueue came online: refresh tunables and tell the HMP layer. */
static void rq_online_fair(struct rq *rq)
{
#ifdef CONFIG_SCHED_HMP
	hmp_online_cpu(rq->cpu);
#endif
	update_sysctl();
}
/* Runqueue going offline: refresh tunables and release throttled groups. */
static void rq_offline_fair(struct rq *rq)
{
#ifdef CONFIG_SCHED_HMP
	hmp_offline_cpu(rq->cpu);
#endif
	update_sysctl();

	/* Ensure any throttled groups are reachable by pick_next_task */
	unthrottle_offline_cfs_rqs(rq);
}
9541 #endif /* CONFIG_SMP */
9544 * scheduler tick hitting a task of our scheduling class:
9546 static void task_tick_fair(struct rq
*rq
, struct task_struct
*curr
, int queued
)
9548 struct cfs_rq
*cfs_rq
;
9549 struct sched_entity
*se
= &curr
->se
;
9551 for_each_sched_entity(se
) {
9552 cfs_rq
= cfs_rq_of(se
);
9553 entity_tick(cfs_rq
, se
, queued
);
9556 if (sched_feat_numa(NUMA
))
9557 task_tick_numa(rq
, curr
);
9559 update_rq_runnable_avg(rq
, 1);
9563 * called on fork with the child task as argument from the parent's context
9564 * - child not yet on the tasklist
9565 * - preemption disabled
9567 static void task_fork_fair(struct task_struct
*p
)
9569 struct cfs_rq
*cfs_rq
;
9570 struct sched_entity
*se
= &p
->se
, *curr
;
9571 int this_cpu
= smp_processor_id();
9572 struct rq
*rq
= this_rq();
9573 unsigned long flags
;
9575 raw_spin_lock_irqsave(&rq
->lock
, flags
);
9577 update_rq_clock(rq
);
9579 cfs_rq
= task_cfs_rq(current
);
9580 curr
= cfs_rq
->curr
;
9583 * Not only the cpu but also the task_group of the parent might have
9584 * been changed after parent->se.parent,cfs_rq were copied to
9585 * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
9586 * of child point to valid ones.
9589 __set_task_cpu(p
, this_cpu
);
9592 update_curr(cfs_rq
);
9595 se
->vruntime
= curr
->vruntime
;
9596 place_entity(cfs_rq
, se
, 1);
9598 if (sysctl_sched_child_runs_first
&& curr
&& entity_before(curr
, se
)) {
9600 * Upon rescheduling, sched_class::put_prev_task() will place
9601 * 'current' within the tree based on its new key value.
9603 swap(curr
->vruntime
, se
->vruntime
);
9604 resched_task(rq
->curr
);
9607 se
->vruntime
-= cfs_rq
->min_vruntime
;
9609 raw_spin_unlock_irqrestore(&rq
->lock
, flags
);
9613 * Priority of the task has changed. Check to see if we preempt
9617 prio_changed_fair(struct rq
*rq
, struct task_struct
*p
, int oldprio
)
9623 * Reschedule if we are currently running on this runqueue and
9624 * our priority decreased, or if we are not currently running on
9625 * this runqueue and our priority is higher than the current's
9627 if (rq
->curr
== p
) {
9628 if (p
->prio
> oldprio
)
9629 resched_task(rq
->curr
);
9631 check_preempt_curr(rq
, p
, 0);
9634 static void switched_from_fair(struct rq
*rq
, struct task_struct
*p
)
9636 struct sched_entity
*se
= &p
->se
;
9637 struct cfs_rq
*cfs_rq
= cfs_rq_of(se
);
9640 * Ensure the task's vruntime is normalized, so that when it's
9641 * switched back to the fair class the enqueue_entity(.flags=0) will
9642 * do the right thing.
9644 * If it's on_rq, then the dequeue_entity(.flags=0) will already
9645 * have normalized the vruntime, if it's !on_rq, then only when
9646 * the task is sleeping will it still have non-normalized vruntime.
9648 if (!p
->on_rq
&& p
->state
!= TASK_RUNNING
) {
9650 * Fix up our vruntime so that the current sleep doesn't
9651 * cause 'unlimited' sleep bonus.
9653 place_entity(cfs_rq
, se
, 0);
9654 se
->vruntime
-= cfs_rq
->min_vruntime
;
9659 * Remove our load from contribution when we leave sched_fair
9660 * and ensure we don't carry in an old decay_count if we
9663 if (p
->se
.avg
.decay_count
) {
9664 struct cfs_rq
*cfs_rq
= cfs_rq_of(&p
->se
);
9665 __synchronize_entity_decay(&p
->se
);
9666 subtract_blocked_load_contrib(cfs_rq
,
9667 p
->se
.avg
.load_avg_contrib
);
9673 * We switched to the sched_fair class.
9675 static void switched_to_fair(struct rq
*rq
, struct task_struct
*p
)
9681 * We were most likely switched from sched_rt, so
9682 * kick off the schedule if running, otherwise just see
9683 * if we can still preempt the current task.
9686 resched_task(rq
->curr
);
9689 When task p change priority form RT to normal priority
9690 in switch_from_rt(), it might call pull_rt_task
9691 and potentially double_lock_balance will unlock rq.
9692 Task p might migrate to other CPU and result in task p is NOT at rq.
9693 In this case, it is not necessary to check preempt for rq.
9694 (Because task p is NOT at rq anymore)
9695 and the migrate flow for task p will check preempt in enqueue flow.
9696 So bypass the check_preempt_curr.
9698 if (rq
== task_rq(p
)) {
9699 check_preempt_curr(rq
, p
, 0);
9704 /* Account for a task changing its policy or group.
9706 * This routine is mostly called to set cfs_rq->curr field when a task
9707 * migrates between groups/classes.
9709 static void set_curr_task_fair(struct rq
*rq
)
9711 struct sched_entity
*se
= &rq
->curr
->se
;
9713 for_each_sched_entity(se
) {
9714 struct cfs_rq
*cfs_rq
= cfs_rq_of(se
);
9716 set_next_entity(cfs_rq
, se
);
9717 /* ensure bandwidth has been allocated on our new cfs_rq */
9718 account_cfs_rq_runtime(cfs_rq
, 0);
9722 void init_cfs_rq(struct cfs_rq
*cfs_rq
)
9724 cfs_rq
->tasks_timeline
= RB_ROOT
;
9725 cfs_rq
->min_vruntime
= (u64
)(-(1LL << 20));
9726 #ifndef CONFIG_64BIT
9727 cfs_rq
->min_vruntime_copy
= cfs_rq
->min_vruntime
;
9730 atomic64_set(&cfs_rq
->decay_counter
, 1);
9731 atomic_long_set(&cfs_rq
->removed_load
, 0);
9735 #ifdef CONFIG_FAIR_GROUP_SCHED
9736 static void task_move_group_fair(struct task_struct
*p
, int on_rq
)
9738 struct cfs_rq
*cfs_rq
;
9740 * If the task was not on the rq at the time of this cgroup movement
9741 * it must have been asleep, sleeping tasks keep their ->vruntime
9742 * absolute on their old rq until wakeup (needed for the fair sleeper
9743 * bonus in place_entity()).
9745 * If it was on the rq, we've just 'preempted' it, which does convert
9746 * ->vruntime to a relative base.
9748 * Make sure both cases convert their relative position when migrating
9749 * to another cgroup's rq. This does somewhat interfere with the
9750 * fair sleeper stuff for the first placement, but who cares.
9753 * When !on_rq, vruntime of the task has usually NOT been normalized.
9754 * But there are some cases where it has already been normalized:
9756 * - Moving a forked child which is waiting for being woken up by
9757 * wake_up_new_task().
9758 * - Moving a task which has been woken up by try_to_wake_up() and
9759 * waiting for actually being woken up by sched_ttwu_pending().
9761 * To prevent boost or penalty in the new cfs_rq caused by delta
9762 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
9764 if (!on_rq
&& (!p
->se
.sum_exec_runtime
|| p
->state
== TASK_WAKING
))
9768 p
->se
.vruntime
-= cfs_rq_of(&p
->se
)->min_vruntime
;
9769 set_task_rq(p
, task_cpu(p
));
9771 cfs_rq
= cfs_rq_of(&p
->se
);
9772 p
->se
.vruntime
+= cfs_rq
->min_vruntime
;
9775 * migrate_task_rq_fair() will have removed our previous
9776 * contribution, but we must synchronize for ongoing future
9779 p
->se
.avg
.decay_count
= atomic64_read(&cfs_rq
->decay_counter
);
9780 cfs_rq
->blocked_load_avg
+= p
->se
.avg
.load_avg_contrib
;
9785 void free_fair_sched_group(struct task_group
*tg
)
9789 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg
));
9791 for_each_possible_cpu(i
) {
9793 kfree(tg
->cfs_rq
[i
]);
9802 int alloc_fair_sched_group(struct task_group
*tg
, struct task_group
*parent
)
9804 struct cfs_rq
*cfs_rq
;
9805 struct sched_entity
*se
;
9808 tg
->cfs_rq
= kzalloc(sizeof(cfs_rq
) * nr_cpu_ids
, GFP_KERNEL
);
9811 tg
->se
= kzalloc(sizeof(se
) * nr_cpu_ids
, GFP_KERNEL
);
9815 tg
->shares
= NICE_0_LOAD
;
9817 init_cfs_bandwidth(tg_cfs_bandwidth(tg
));
9819 for_each_possible_cpu(i
) {
9820 cfs_rq
= kzalloc_node(sizeof(struct cfs_rq
),
9821 GFP_KERNEL
, cpu_to_node(i
));
9825 se
= kzalloc_node(sizeof(struct sched_entity
),
9826 GFP_KERNEL
, cpu_to_node(i
));
9830 init_cfs_rq(cfs_rq
);
9831 init_tg_cfs_entry(tg
, cfs_rq
, se
, i
, parent
->se
[i
]);
9842 void unregister_fair_sched_group(struct task_group
*tg
, int cpu
)
9844 struct rq
*rq
= cpu_rq(cpu
);
9845 unsigned long flags
;
9848 * Only empty task groups can be destroyed; so we can speculatively
9849 * check on_list without danger of it being re-added.
9851 if (!tg
->cfs_rq
[cpu
]->on_list
)
9854 raw_spin_lock_irqsave(&rq
->lock
, flags
);
9855 list_del_leaf_cfs_rq(tg
->cfs_rq
[cpu
]);
9856 raw_spin_unlock_irqrestore(&rq
->lock
, flags
);
9859 void init_tg_cfs_entry(struct task_group
*tg
, struct cfs_rq
*cfs_rq
,
9860 struct sched_entity
*se
, int cpu
,
9861 struct sched_entity
*parent
)
9863 struct rq
*rq
= cpu_rq(cpu
);
9867 init_cfs_rq_runtime(cfs_rq
);
9869 tg
->cfs_rq
[cpu
] = cfs_rq
;
9872 /* se could be NULL for root_task_group */
9877 se
->cfs_rq
= &rq
->cfs
;
9879 se
->cfs_rq
= parent
->my_q
;
9882 /* guarantee group entities always have weight */
9883 update_load_set(&se
->load
, NICE_0_LOAD
);
9884 se
->parent
= parent
;
/* Serializes updates to task_group->shares. */
static DEFINE_MUTEX(shares_mutex);
9889 int sched_group_set_shares(struct task_group
*tg
, unsigned long shares
)
9892 unsigned long flags
;
9895 * We can't change the weight of the root cgroup.
9900 shares
= clamp(shares
, scale_load(MIN_SHARES
), scale_load(MAX_SHARES
));
9902 mutex_lock(&shares_mutex
);
9903 if (tg
->shares
== shares
)
9906 tg
->shares
= shares
;
9907 for_each_possible_cpu(i
) {
9908 struct rq
*rq
= cpu_rq(i
);
9909 struct sched_entity
*se
;
9912 /* Propagate contribution to hierarchy */
9913 raw_spin_lock_irqsave(&rq
->lock
, flags
);
9914 for_each_sched_entity(se
)
9915 update_cfs_shares(group_cfs_rq(se
));
9916 raw_spin_unlock_irqrestore(&rq
->lock
, flags
);
9920 mutex_unlock(&shares_mutex
);
9923 #else /* CONFIG_FAIR_GROUP_SCHED */
9925 void free_fair_sched_group(struct task_group
*tg
) { }
9927 int alloc_fair_sched_group(struct task_group
*tg
, struct task_group
*parent
)
9932 void unregister_fair_sched_group(struct task_group
*tg
, int cpu
) { }
9934 #endif /* CONFIG_FAIR_GROUP_SCHED */
9937 static unsigned int get_rr_interval_fair(struct rq
*rq
, struct task_struct
*task
)
9939 struct sched_entity
*se
= &task
->se
;
9940 unsigned int rr_interval
= 0;
9943 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
9946 if (rq
->cfs
.load
.weight
)
9947 rr_interval
= NS_TO_JIFFIES(sched_slice(cfs_rq_of(se
), se
));
9953 * All the scheduling class methods:
9955 const struct sched_class fair_sched_class
= {
9956 .next
= &idle_sched_class
,
9957 .enqueue_task
= enqueue_task_fair
,
9958 .dequeue_task
= dequeue_task_fair
,
9959 .yield_task
= yield_task_fair
,
9960 .yield_to_task
= yield_to_task_fair
,
9962 .check_preempt_curr
= check_preempt_wakeup
,
9964 .pick_next_task
= pick_next_task_fair
,
9965 .put_prev_task
= put_prev_task_fair
,
9968 .select_task_rq
= select_task_rq_fair
,
9969 .migrate_task_rq
= migrate_task_rq_fair
,
9971 .rq_online
= rq_online_fair
,
9972 .rq_offline
= rq_offline_fair
,
9974 .task_waking
= task_waking_fair
,
9977 .set_curr_task
= set_curr_task_fair
,
9978 .task_tick
= task_tick_fair
,
9979 .task_fork
= task_fork_fair
,
9981 .prio_changed
= prio_changed_fair
,
9982 .switched_from
= switched_from_fair
,
9983 .switched_to
= switched_to_fair
,
9985 .get_rr_interval
= get_rr_interval_fair
,
9987 #ifdef CONFIG_FAIR_GROUP_SCHED
9988 .task_move_group
= task_move_group_fair
,
#ifdef CONFIG_SCHED_DEBUG
/* Dump every leaf cfs_rq of @cpu into seq_file @m (debugfs/procfs). */
void print_cfs_stats(struct seq_file *m, int cpu)
{
	struct cfs_rq *cfs_rq;

	rcu_read_lock();
	for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
		print_cfs_rq(m, cpu, cfs_rq);
	rcu_read_unlock();
}
#endif
10004 __init
void init_sched_fair_class(void)
10007 open_softirq(SCHED_SOFTIRQ
, run_rebalance_domains
);
10009 #ifdef CONFIG_NO_HZ_COMMON
10010 nohz
.next_balance
= jiffies
;
10011 zalloc_cpumask_var(&nohz
.idle_cpus_mask
, GFP_NOWAIT
);
10012 cpu_notifier(sched_ilb_notifier
, 0);
10015 cmp_cputopo_domain_setup();
10016 #ifdef CONFIG_SCHED_HMP
10017 hmp_cpu_mask_setup();
10022 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
10023 static u32
cpufreq_calc_scale(u32 min
, u32 max
, u32 curr
)
10025 u32 result
= curr
/ max
;
#ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
/* Last cpufreq-reported frequency per CPU, for the power-aware controller. */
DEFINE_PER_CPU(u32, FREQ_CPU);
#endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
10033 /* Called when the CPU Frequency is changed.
10034 * Once for each CPU.
10036 static int cpufreq_callback(struct notifier_block
*nb
,
10037 unsigned long val
, void *data
)
10039 struct cpufreq_freqs
*freq
= data
;
10040 int cpu
= freq
->cpu
;
10041 struct cpufreq_extents
*extents
;
10042 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10043 struct cpumask
* mask
;
10047 if (freq
->flags
& CPUFREQ_CONST_LOOPS
)
10050 if (val
!= CPUFREQ_POSTCHANGE
)
10053 /* if dynamic load scale is disabled, set the load scale to 1.0 */
10054 if (!hmp_data
.freqinvar_load_scale_enabled
) {
10055 freq_scale
[cpu
].curr_scale
= 1024;
10059 extents
= &freq_scale
[cpu
];
10060 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10061 if (extents
->max
< extents
->const_max
){
10062 extents
->throttling
=1;
10065 extents
->throttling
=0;
10068 if (extents
->flags
& SCHED_LOAD_FREQINVAR_SINGLEFREQ
) {
10069 /* If our governor was recognised as a single-freq governor,
10072 extents
->curr_scale
= 1024;
10074 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10075 extents
->curr_scale
= cpufreq_calc_scale(extents
->min
,
10076 extents
->const_max
, freq
->new);
10078 extents
->curr_scale
= cpufreq_calc_scale(extents
->min
,
10079 extents
->max
, freq
->new);
10083 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10084 mask
= arch_cpu_is_big(cpu
)?&hmp_fast_cpu_mask
:&hmp_slow_cpu_mask
;
10085 for_each_cpu(id
, mask
)
10086 freq_scale
[id
].curr_scale
= extents
->curr_scale
;
10090 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10094 (extents
+ 1)->curr_scale
= extents
->curr_scale
;
10099 (extents
- 1)->curr_scale
= extents
->curr_scale
;
10109 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
10110 per_cpu(FREQ_CPU
, cpu
) = freq
->new;
10111 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
10115 /* Called when the CPUFreq governor is changed.
10116 * Only called for the CPUs which are actually changed by the
10119 static int cpufreq_policy_callback(struct notifier_block
*nb
,
10120 unsigned long event
, void *data
)
10122 struct cpufreq_policy
*policy
= data
;
10123 struct cpufreq_extents
*extents
;
10124 int cpu
, singleFreq
= 0;
10125 static const char performance_governor
[] = "performance";
10126 static const char powersave_governor
[] = "powersave";
10128 if (event
== CPUFREQ_START
)
10131 if (event
!= CPUFREQ_INCOMPATIBLE
)
10134 /* CPUFreq governors do not accurately report the range of
10135 * CPU Frequencies they will choose from.
10136 * We recognise performance and powersave governors as
10137 * single-frequency only.
10139 if (!strncmp(policy
->governor
->name
, performance_governor
,
10140 strlen(performance_governor
)) ||
10141 !strncmp(policy
->governor
->name
, powersave_governor
,
10142 strlen(powersave_governor
)))
10145 /* Make sure that all CPUs impacted by this policy are
10146 * updated since we will only get a notification when the
10147 * user explicitly changes the policy on a CPU.
10149 for_each_cpu(cpu
, policy
->cpus
) {
10150 extents
= &freq_scale
[cpu
];
10151 extents
->max
= policy
->max
>> SCHED_FREQSCALE_SHIFT
;
10152 extents
->min
= policy
->min
>> SCHED_FREQSCALE_SHIFT
;
10153 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10154 extents
->const_max
= policy
->cpuinfo
.max_freq
>> SCHED_FREQSCALE_SHIFT
;
10156 if (!hmp_data
.freqinvar_load_scale_enabled
) {
10157 extents
->curr_scale
= 1024;
10158 } else if (singleFreq
) {
10159 extents
->flags
|= SCHED_LOAD_FREQINVAR_SINGLEFREQ
;
10160 extents
->curr_scale
= 1024;
10162 extents
->flags
&= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ
;
10163 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10164 extents
->curr_scale
= cpufreq_calc_scale(extents
->min
,
10165 extents
->const_max
, policy
->cur
);
10167 extents
->curr_scale
= cpufreq_calc_scale(extents
->min
,
10168 extents
->max
, policy
->cur
);
10176 static struct notifier_block cpufreq_notifier
= {
10177 .notifier_call
= cpufreq_callback
,
10179 static struct notifier_block cpufreq_policy_notifier
= {
10180 .notifier_call
= cpufreq_policy_callback
,
10183 static int __init
register_sched_cpufreq_notifier(void)
10187 /* init safe defaults since there are no policies at registration */
10188 for (ret
= 0; ret
< CONFIG_NR_CPUS
; ret
++) {
10189 /* safe defaults */
10190 freq_scale
[ret
].max
= 1024;
10191 freq_scale
[ret
].min
= 1024;
10192 freq_scale
[ret
].curr_scale
= 1024;
10195 pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
10196 ret
= cpufreq_register_notifier(&cpufreq_policy_notifier
,
10197 CPUFREQ_POLICY_NOTIFIER
);
10199 if (ret
!= -EINVAL
)
10200 ret
= cpufreq_register_notifier(&cpufreq_notifier
,
10201 CPUFREQ_TRANSITION_NOTIFIER
);
10206 core_initcall(register_sched_cpufreq_notifier
);
10207 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
10209 #ifdef CONFIG_HEVTASK_INTERFACE
10211 * * This allows printing both to /proc/task_detect and
10214 #ifndef CONFIG_KGDB_KDB
10215 #define SEQ_printf(m, x...) \
10218 seq_printf(m, x); \
10223 #define SEQ_printf(m, x...) \
10226 seq_printf(m, x); \
10227 else if (__get_cpu_var(kdb_in_use) == 1) \
10234 static int task_detect_show(struct seq_file
*m
, void *v
)
10236 struct task_struct
*p
;
10237 unsigned long flags
;
10240 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
10241 for(i
=0;i
<NR_CPUS
;i
++){
10242 SEQ_printf(m
,"%5d ",freq_scale
[i
].curr_scale
);
10246 SEQ_printf(m
, "\n%lu\n ",jiffies_to_cputime(jiffies
));
10248 for(i
=0;i
<NR_CPUS
;i
++){
10249 raw_spin_lock_irqsave(&cpu_rq(i
)->lock
,flags
);
10251 list_for_each_entry(p
,&cpu_rq(i
)->cfs_tasks
,se
.group_node
){
10252 SEQ_printf(m
, "%lu %5d %5d %lu (%15s)\n ",
10253 p
->se
.avg
.load_avg_ratio
,p
->pid
,task_cpu(p
),
10254 (p
->utime
+p
->stime
),p
->comm
);
10257 raw_spin_unlock_irqrestore(&cpu_rq(i
)->lock
,flags
);
10264 static int task_detect_open(struct inode
*inode
, struct file
*filp
)
10266 return single_open(filp
, task_detect_show
, NULL
);
10269 static const struct file_operations task_detect_fops
= {
10270 .open
= task_detect_open
,
10272 .llseek
= seq_lseek
,
10273 .release
= single_release
,
10276 static int __init
init_task_detect_procfs(void)
10278 struct proc_dir_entry
*pe
;
10280 pe
= proc_create("task_detect", 0444, NULL
, &task_detect_fops
);
10286 __initcall(init_task_detect_procfs
);
10287 #endif /* CONFIG_HEVTASK_INTERFACE */