Linux 3.10.54 - kernel/sched/fair.c
/*
 * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
 *
 *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
 *
 *  Interactivity improvements by Mike Galbraith
 *  (C) 2007 Mike Galbraith <efault@gmx.de>
 *
 *  Various enhancements by Dmitry Adamushko.
 *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
 *
 *  Group scheduling enhancements by Srivatsa Vaddagiri
 *  Copyright IBM Corporation, 2007
 *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
 *
 *  Scaled math optimizations by Thomas Gleixner
 *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
 *
 *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
 *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
 */

#include <linux/latencytop.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/slab.h>
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
#include <linux/migrate.h>
#include <linux/task_work.h>

#include <trace/events/sched.h>

#include "sched.h"

/*
 * Targeted preemption latency for CPU-bound tasks:
 * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * NOTE: this latency value is not the same as the concept of
 * 'timeslice length' - timeslices in CFS are of variable length
 * and have no persistent notion like in traditional, time-slice
 * based scheduling concepts.
 *
 * (to see the precise effective timeslice length of your workload,
 *  run vmstat and monitor the context-switches (cs) field)
 */
unsigned int sysctl_sched_latency = 6000000ULL;
unsigned int normalized_sysctl_sched_latency = 6000000ULL;

/*
 * The initial- and re-scaling of tunables is configurable
 * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
 *
 * Options are:
 * SCHED_TUNABLESCALING_NONE - unscaled, always *1
 * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
 * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
 */
enum sched_tunable_scaling sysctl_sched_tunable_scaling
        = SCHED_TUNABLESCALING_LOG;

/*
 * Minimal preemption granularity for CPU-bound tasks:
 * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
 */
unsigned int sysctl_sched_min_granularity = 750000ULL;
unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;

/*
 * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 */
static unsigned int sched_nr_latency = 8;

/*
 * After fork, child runs first. If set to 0 (default) then
 * parent will (try to) run first.
 */
unsigned int sysctl_sched_child_runs_first __read_mostly;

/*
 * SCHED_OTHER wake-up granularity.
 * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 *
 * This option delays the preemption effects of decoupled workloads
 * and reduces their over-scheduling. Synchronous workloads will still
 * have immediate wakeup/sleep latencies.
 */
unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;

const_debug unsigned int sysctl_sched_migration_cost = 500000UL;

/*
 * The exponential sliding window over which load is averaged for shares
 * distribution.
 * (default: 10msec)
 */
unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;

#ifdef CONFIG_CFS_BANDWIDTH
/*
 * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 * each time a cfs_rq requests quota.
 *
 * Note: in the case that the slice exceeds the runtime remaining (either due
 * to consumption or the quota being specified to be smaller than the slice)
 * we will always only issue the remaining available time.
 *
 * default: 5 msec, units: microseconds
 */
unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
#endif

/*
 * Increase the granularity value when there are more CPUs,
 * because with more CPUs the 'effective latency' as visible
 * to users decreases. But the relationship is not linear,
 * so pick a second-best guess by going with the log2 of the
 * number of CPUs.
 *
 * This idea comes from the SD scheduler of Con Kolivas:
 */
static int get_update_sysctl_factor(void)
{
        unsigned int cpus = min_t(int, num_online_cpus(), 8);
        unsigned int factor;

        switch (sysctl_sched_tunable_scaling) {
        case SCHED_TUNABLESCALING_NONE:
                factor = 1;
                break;
        case SCHED_TUNABLESCALING_LINEAR:
                factor = cpus;
                break;
        case SCHED_TUNABLESCALING_LOG:
        default:
                factor = 1 + ilog2(cpus);
                break;
        }

        return factor;
}
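/*
 * Example: with 8 or more online CPUs and the default
 * SCHED_TUNABLESCALING_LOG policy, factor = 1 + ilog2(8) = 4, so
 * update_sysctl() below scales sched_latency to 24ms and
 * sched_min_granularity to 3ms from the 6ms / 0.75ms normalized values.
 */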

static void update_sysctl(void)
{
        unsigned int factor = get_update_sysctl_factor();

#define SET_SYSCTL(name) \
        (sysctl_##name = (factor) * normalized_sysctl_##name)
        SET_SYSCTL(sched_min_granularity);
        SET_SYSCTL(sched_latency);
        SET_SYSCTL(sched_wakeup_granularity);
#undef SET_SYSCTL
}

void sched_init_granularity(void)
{
        update_sysctl();
}

#if BITS_PER_LONG == 32
# define WMULT_CONST    (~0UL)
#else
# define WMULT_CONST    (1UL << 32)
#endif

#define WMULT_SHIFT     32

/*
 * Shift right and round:
 */
#define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
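/* e.g. SRR(10, 2) = (10 + 2) >> 2 = 3: 10/4 = 2.5 rounded to nearest. */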

/*
 * delta *= weight / lw
 */
static unsigned long
calc_delta_mine(unsigned long delta_exec, unsigned long weight,
                struct load_weight *lw)
{
        u64 tmp;

        /*
         * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
         * entities since MIN_SHARES = 2. Treat weight as 1 if less than
         * 2^SCHED_LOAD_RESOLUTION.
         */
        if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
                tmp = (u64)delta_exec * scale_load_down(weight);
        else
                tmp = (u64)delta_exec;

        if (!lw->inv_weight) {
                unsigned long w = scale_load_down(lw->weight);

                if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
                        lw->inv_weight = 1;
                else if (unlikely(!w))
                        lw->inv_weight = WMULT_CONST;
                else
                        lw->inv_weight = WMULT_CONST / w;
        }

        /*
         * Check whether we'd overflow the 64-bit multiplication:
         */
        if (unlikely(tmp > WMULT_CONST))
                tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
                        WMULT_SHIFT/2);
        else
                tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);

        return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
}
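/*
 * Example: if se->load.weight is half of the queue's total load weight,
 * calc_delta_mine() scales a 4ms delta_exec down to roughly 2ms, i.e.
 * delta * weight / lw evaluated in fixed point with SRR() rounding.
 */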


const struct sched_class fair_sched_class;

/**************************************************************
 * CFS operations on generic schedulable entities:
 */

#ifdef CONFIG_FAIR_GROUP_SCHED

/* cpu runqueue to which this cfs_rq is attached */
static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
        return cfs_rq->rq;
}

/* An entity is a task if it doesn't "own" a runqueue */
#define entity_is_task(se)      (!se->my_q)

static inline struct task_struct *task_of(struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
        WARN_ON_ONCE(!entity_is_task(se));
#endif
        return container_of(se, struct task_struct, se);
}

/* Walk up scheduling entities hierarchy */
#define for_each_sched_entity(se) \
                for (; se; se = se->parent)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
        return p->se.cfs_rq;
}

/* runqueue on which this entity is (to be) queued */
static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
        return se->cfs_rq;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
        return grp->my_q;
}

static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
                                       int force_update);

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
        if (!cfs_rq->on_list) {
                /*
                 * Ensure we either appear before our parent (if already
                 * enqueued) or force our parent to appear after us when it is
                 * enqueued. The fact that we always enqueue bottom-up
                 * reduces this to two cases.
                 */
                if (cfs_rq->tg->parent &&
                    cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
                        list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
                                &rq_of(cfs_rq)->leaf_cfs_rq_list);
                } else {
                        list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
                                &rq_of(cfs_rq)->leaf_cfs_rq_list);
                }

                cfs_rq->on_list = 1;
                /* We should have no load, but we need to update last_decay. */
                update_cfs_rq_blocked_load(cfs_rq, 0);
        }
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
        if (cfs_rq->on_list) {
                list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
                cfs_rq->on_list = 0;
        }
}

/* Iterate thr' all leaf cfs_rq's on a runqueue */
#define for_each_leaf_cfs_rq(rq, cfs_rq) \
        list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)

/* Do the two (enqueued) entities belong to the same group ? */
static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
        if (se->cfs_rq == pse->cfs_rq)
                return 1;

        return 0;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
        return se->parent;
}

/* return depth at which a sched entity is present in the hierarchy */
static inline int depth_se(struct sched_entity *se)
{
        int depth = 0;

        for_each_sched_entity(se)
                depth++;

        return depth;
}

static void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
        int se_depth, pse_depth;

        /*
         * preemption test can be made between sibling entities who are in the
         * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
         * both tasks until we find their ancestors who are siblings of common
         * parent.
         */

        /* First walk up until both entities are at same depth */
        se_depth = depth_se(*se);
        pse_depth = depth_se(*pse);

        while (se_depth > pse_depth) {
                se_depth--;
                *se = parent_entity(*se);
        }

        while (pse_depth > se_depth) {
                pse_depth--;
                *pse = parent_entity(*pse);
        }

        while (!is_same_group(*se, *pse)) {
                *se = parent_entity(*se);
                *pse = parent_entity(*pse);
        }
}

#else   /* !CONFIG_FAIR_GROUP_SCHED */

static inline struct task_struct *task_of(struct sched_entity *se)
{
        return container_of(se, struct task_struct, se);
}

static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
{
        return container_of(cfs_rq, struct rq, cfs);
}

#define entity_is_task(se)      1

#define for_each_sched_entity(se) \
                for (; se; se = NULL)

static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
{
        return &task_rq(p)->cfs;
}

static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
{
        struct task_struct *p = task_of(se);
        struct rq *rq = task_rq(p);

        return &rq->cfs;
}

/* runqueue "owned" by this group */
static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
{
        return NULL;
}

static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
{
}

#define for_each_leaf_cfs_rq(rq, cfs_rq) \
                for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)

static inline int
is_same_group(struct sched_entity *se, struct sched_entity *pse)
{
        return 1;
}

static inline struct sched_entity *parent_entity(struct sched_entity *se)
{
        return NULL;
}

static inline void
find_matching_se(struct sched_entity **se, struct sched_entity **pse)
{
}

#endif  /* CONFIG_FAIR_GROUP_SCHED */

static __always_inline
void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);

/**************************************************************
 * Scheduling class tree data structure manipulation methods:
 */

static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
{
        s64 delta = (s64)(vruntime - max_vruntime);
        if (delta > 0)
                max_vruntime = vruntime;

        return max_vruntime;
}

static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
{
        s64 delta = (s64)(vruntime - min_vruntime);
        if (delta < 0)
                min_vruntime = vruntime;

        return min_vruntime;
}

static inline int entity_before(struct sched_entity *a,
                                struct sched_entity *b)
{
        return (s64)(a->vruntime - b->vruntime) < 0;
}
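/*
 * Note: the three helpers above compare vruntimes through a signed
 * difference rather than directly, so the ordering stays correct even
 * after the u64 vruntime values eventually wrap around.
 */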

static void update_min_vruntime(struct cfs_rq *cfs_rq)
{
        u64 vruntime = cfs_rq->min_vruntime;

        if (cfs_rq->curr)
                vruntime = cfs_rq->curr->vruntime;

        if (cfs_rq->rb_leftmost) {
                struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
                                                   struct sched_entity,
                                                   run_node);

                if (!cfs_rq->curr)
                        vruntime = se->vruntime;
                else
                        vruntime = min_vruntime(vruntime, se->vruntime);
        }

        /* ensure we never gain time by being placed backwards. */
        cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
#ifndef CONFIG_64BIT
        smp_wmb();
        cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
#endif
}

/*
 * Enqueue an entity into the rb-tree:
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
        struct rb_node *parent = NULL;
        struct sched_entity *entry;
        int leftmost = 1;

        /*
         * Find the right place in the rbtree:
         */
        while (*link) {
                parent = *link;
                entry = rb_entry(parent, struct sched_entity, run_node);
                /*
                 * We dont care about collisions. Nodes with
                 * the same key stay together.
                 */
                if (entity_before(se, entry)) {
                        link = &parent->rb_left;
                } else {
                        link = &parent->rb_right;
                        leftmost = 0;
                }
        }

        /*
         * Maintain a cache of leftmost tree entries (it is frequently
         * used):
         */
        if (leftmost)
                cfs_rq->rb_leftmost = &se->run_node;

        rb_link_node(&se->run_node, parent, link);
        rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
}

static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        if (cfs_rq->rb_leftmost == &se->run_node) {
                struct rb_node *next_node;

                next_node = rb_next(&se->run_node);
                cfs_rq->rb_leftmost = next_node;
        }

        rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
}

struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
{
        struct rb_node *left = cfs_rq->rb_leftmost;

        if (!left)
                return NULL;

        return rb_entry(left, struct sched_entity, run_node);
}

static struct sched_entity *__pick_next_entity(struct sched_entity *se)
{
        struct rb_node *next = rb_next(&se->run_node);

        if (!next)
                return NULL;

        return rb_entry(next, struct sched_entity, run_node);
}

#ifdef CONFIG_SCHED_DEBUG
struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
{
        struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);

        if (!last)
                return NULL;

        return rb_entry(last, struct sched_entity, run_node);
}

/**************************************************************
 * Scheduling class statistics methods:
 */

int sched_proc_update_handler(struct ctl_table *table, int write,
                void __user *buffer, size_t *lenp,
                loff_t *ppos)
{
        int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
        int factor = get_update_sysctl_factor();

        if (ret || !write)
                return ret;

        sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
                                        sysctl_sched_min_granularity);

#define WRT_SYSCTL(name) \
        (normalized_sysctl_##name = sysctl_##name / (factor))
        WRT_SYSCTL(sched_min_granularity);
        WRT_SYSCTL(sched_latency);
        WRT_SYSCTL(sched_wakeup_granularity);
#undef WRT_SYSCTL

        return 0;
}
#endif

/*
 * delta /= w
 */
static inline unsigned long
calc_delta_fair(unsigned long delta, struct sched_entity *se)
{
        if (unlikely(se->load.weight != NICE_0_LOAD))
                delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);

        return delta;
}

/*
 * The idea is to set a period in which each task runs once.
 *
 * When there are too many tasks (sched_nr_latency) we have to stretch
 * this period because otherwise the slices get too small.
 *
 * p = (nr <= nl) ? l : l*nr/nl
 */
static u64 __sched_period(unsigned long nr_running)
{
        u64 period = sysctl_sched_latency;
        unsigned long nr_latency = sched_nr_latency;

        if (unlikely(nr_running > nr_latency)) {
                period = sysctl_sched_min_granularity;
                period *= nr_running;
        }

        return period;
}
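/*
 * Example: with the unscaled defaults (l = 6ms, nl = 8, minimum granularity
 * 0.75ms), 4 runnable tasks share a 6ms period, while 12 runnable tasks
 * stretch the period to 12 * 0.75ms = 9ms so each slice stays >= 0.75ms.
 */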

/*
 * We calculate the wall-time slice from the period by taking a part
 * proportional to the weight.
 *
 * s = p*P[w/rw]
 */
static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);

        for_each_sched_entity(se) {
                struct load_weight *load;
                struct load_weight lw;

                cfs_rq = cfs_rq_of(se);
                load = &cfs_rq->load;

                if (unlikely(!se->on_rq)) {
                        lw = cfs_rq->load;

                        update_load_add(&lw, se->load.weight);
                        load = &lw;
                }
                slice = calc_delta_mine(slice, se->load.weight, load);
        }
        return slice;
}
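/*
 * Example: two runnable nice-0 tasks on one cfs_rq split a 6ms period into
 * two 3ms wall-time slices; a task holding twice the weight of its sibling
 * would instead get 4ms versus 2ms.
 */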

/*
 * We calculate the vruntime slice of a to-be-inserted task.
 *
 * vs = s/w
 */
static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        return calc_delta_fair(sched_slice(cfs_rq, se), se);
}

/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 */
static inline void
__update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
              unsigned long delta_exec)
{
        unsigned long delta_exec_weighted;

        schedstat_set(curr->statistics.exec_max,
                      max((u64)delta_exec, curr->statistics.exec_max));

        curr->sum_exec_runtime += delta_exec;
        schedstat_add(cfs_rq, exec_clock, delta_exec);
        delta_exec_weighted = calc_delta_fair(delta_exec, curr);

        curr->vruntime += delta_exec_weighted;
        update_min_vruntime(cfs_rq);
}
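/*
 * Note: for a nice-0 task calc_delta_fair() is an identity, so its vruntime
 * advances at wall-clock rate; heavier tasks accrue vruntime more slowly and
 * lighter tasks faster, which is what keeps the rb-tree ordering fair.
 */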

static void update_curr(struct cfs_rq *cfs_rq)
{
        struct sched_entity *curr = cfs_rq->curr;
        u64 now = rq_of(cfs_rq)->clock_task;
        unsigned long delta_exec;

        if (unlikely(!curr))
                return;

        /*
         * Get the amount of time the current task was running
         * since the last time we changed load (this cannot
         * overflow on 32 bits):
         */
        delta_exec = (unsigned long)(now - curr->exec_start);
        if (!delta_exec)
                return;

        __update_curr(cfs_rq, curr, delta_exec);
        curr->exec_start = now;

        if (entity_is_task(curr)) {
                struct task_struct *curtask = task_of(curr);

                trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
                cpuacct_charge(curtask, delta_exec);
                account_group_exec_runtime(curtask, delta_exec);
        }

        account_cfs_rq_runtime(cfs_rq, delta_exec);
}

static inline void
update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
}

/*
 * Task is being enqueued - update stats:
 */
static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        /*
         * Are we enqueueing a waiting task? (for current tasks
         * a dequeue/enqueue event is a NOP)
         */
        if (se != cfs_rq->curr)
                update_stats_wait_start(cfs_rq, se);
}

static void
update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
                        rq_of(cfs_rq)->clock - se->statistics.wait_start));
        schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
        schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
                        rq_of(cfs_rq)->clock - se->statistics.wait_start);
#ifdef CONFIG_SCHEDSTATS
        if (entity_is_task(se)) {
                trace_sched_stat_wait(task_of(se),
                        rq_of(cfs_rq)->clock - se->statistics.wait_start);
        }
#endif
        schedstat_set(se->statistics.wait_start, 0);
}

static inline void
update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        /*
         * Mark the end of the wait period if dequeueing a
         * waiting task:
         */
        if (se != cfs_rq->curr)
                update_stats_wait_end(cfs_rq, se);
}

/*
 * We are picking a new current task - update its stats:
 */
static inline void
update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        /*
         * We are starting a new run period:
         */
        se->exec_start = rq_of(cfs_rq)->clock_task;
}

/**************************************************
 * Scheduling class queueing methods:
 */

#ifdef CONFIG_NUMA_BALANCING
/*
 * numa task sample period in ms
 */
unsigned int sysctl_numa_balancing_scan_period_min = 100;
unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;

/* Portion of address space to scan in MB */
unsigned int sysctl_numa_balancing_scan_size = 256;

/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;

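/*
 * With the defaults above a task starts NUMA scanning 1000ms after it is
 * forked, covers 256MB of its address space per pass, and re-scans every
 * 100ms at the fastest, backing off towards 5000ms (100*50) when pages are
 * already well placed; the back-off is reset every 60s (100*600).
 */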
static void task_numa_placement(struct task_struct *p)
{
        int seq;

        if (!p->mm)     /* for example, ksmd faulting in a user's mm */
                return;
        seq = ACCESS_ONCE(p->mm->numa_scan_seq);
        if (p->numa_scan_seq == seq)
                return;
        p->numa_scan_seq = seq;

        /* FIXME: Scheduling placement policy hints go here */
}

/*
 * Got a PROT_NONE fault for a page on @node.
 */
void task_numa_fault(int node, int pages, bool migrated)
{
        struct task_struct *p = current;

        if (!sched_feat_numa(NUMA))
                return;

        /* FIXME: Allocate task-specific structure for placement policy here */

        /*
         * If pages are properly placed (did not migrate) then scan slower.
         * This is reset periodically in case of phase changes
         */
        if (!migrated)
                p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
                        p->numa_scan_period + jiffies_to_msecs(10));

        task_numa_placement(p);
}

static void reset_ptenuma_scan(struct task_struct *p)
{
        ACCESS_ONCE(p->mm->numa_scan_seq)++;
        p->mm->numa_scan_offset = 0;
}

/*
 * The expensive part of numa migration is done from task_work context.
 * Triggered from task_tick_numa().
 */
void task_numa_work(struct callback_head *work)
{
        unsigned long migrate, next_scan, now = jiffies;
        struct task_struct *p = current;
        struct mm_struct *mm = p->mm;
        struct vm_area_struct *vma;
        unsigned long start, end;
        long pages;

        WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));

        work->next = work; /* protect against double add */
        /*
         * Who cares about NUMA placement when they're dying.
         *
         * NOTE: make sure not to dereference p->mm before this check,
         * exit_task_work() happens _after_ exit_mm() so we could be called
         * without p->mm even though we still had it when we enqueued this
         * work.
         */
        if (p->flags & PF_EXITING)
                return;

        /*
         * We do not care about task placement until a task runs on a node
         * other than the first one used by the address space. This is
         * largely because migrations are driven by what CPU the task
         * is running on. If it's never scheduled on another node, it'll
         * not migrate so why bother trapping the fault.
         */
        if (mm->first_nid == NUMA_PTE_SCAN_INIT)
                mm->first_nid = numa_node_id();
        if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
                /* Are we running on a new node yet? */
                if (numa_node_id() == mm->first_nid &&
                    !sched_feat_numa(NUMA_FORCE))
                        return;

                mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
        }

        /*
         * Reset the scan period if enough time has gone by. Objective is that
         * scanning will be reduced if pages are properly placed. As tasks
         * can enter different phases this needs to be re-examined. Lacking
         * proper tracking of reference behaviour, this blunt hammer is used.
         */
        migrate = mm->numa_next_reset;
        if (time_after(now, migrate)) {
                p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
                next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
                xchg(&mm->numa_next_reset, next_scan);
        }

        /*
         * Enforce maximal scan/migration frequency..
         */
        migrate = mm->numa_next_scan;
        if (time_before(now, migrate))
                return;

        if (p->numa_scan_period == 0)
                p->numa_scan_period = sysctl_numa_balancing_scan_period_min;

        next_scan = now + msecs_to_jiffies(p->numa_scan_period);
        if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
                return;

        /*
         * Do not set pte_numa if the current running node is rate-limited.
         * This loses statistics on the fault but if we are unwilling to
         * migrate to this node, it is less likely we can do useful work
         */
        if (migrate_ratelimited(numa_node_id()))
                return;

        start = mm->numa_scan_offset;
        pages = sysctl_numa_balancing_scan_size;
        pages <<= 20 - PAGE_SHIFT; /* MB in pages */
        if (!pages)
                return;

        down_read(&mm->mmap_sem);
        vma = find_vma(mm, start);
        if (!vma) {
                reset_ptenuma_scan(p);
                start = 0;
                vma = mm->mmap;
        }
        for (; vma; vma = vma->vm_next) {
                if (!vma_migratable(vma))
                        continue;

                /* Skip small VMAs. They are not likely to be of relevance */
                if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
                        continue;

                /*
                 * Skip inaccessible VMAs to avoid any confusion between
                 * PROT_NONE and NUMA hinting ptes
                 */
                if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
                        continue;

                do {
                        start = max(start, vma->vm_start);
                        end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
                        end = min(end, vma->vm_end);
                        pages -= change_prot_numa(vma, start, end);

                        start = end;
                        if (pages <= 0)
                                goto out;
                } while (end != vma->vm_end);
        }

out:
        /*
         * It is possible to reach the end of the VMA list but the last few
         * VMAs are not guaranteed to the vma_migratable. If they are not, we
         * would find the !migratable VMA on the next scan but not reset the
         * scanner to the start so check it now.
         */
        if (vma)
                mm->numa_scan_offset = start;
        else
                reset_ptenuma_scan(p);
        up_read(&mm->mmap_sem);
}

/*
 * Drive the periodic memory faults..
 */
void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
        struct callback_head *work = &curr->numa_work;
        u64 period, now;

        /*
         * We don't care about NUMA placement if we don't have memory.
         */
        if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
                return;

        /*
         * Using runtime rather than walltime has the dual advantage that
         * we (mostly) drive the selection from busy threads and that the
         * task needs to have done some actual work before we bother with
         * NUMA placement.
         */
        now = curr->se.sum_exec_runtime;
        period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;

        if (now - curr->node_stamp > period) {
                if (!curr->node_stamp)
                        curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
                curr->node_stamp = now;

                if (!time_before(jiffies, curr->mm->numa_next_scan)) {
                        init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
                        task_work_add(curr, work, true);
                }
        }
}
#else
static void task_tick_numa(struct rq *rq, struct task_struct *curr)
{
}
#endif /* CONFIG_NUMA_BALANCING */

static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        update_load_add(&cfs_rq->load, se->load.weight);
        if (!parent_entity(se))
                update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
#ifdef CONFIG_SMP
        if (entity_is_task(se))
                list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
#endif
        cfs_rq->nr_running++;
}

static void
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
        update_load_sub(&cfs_rq->load, se->load.weight);
        if (!parent_entity(se))
                update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
        if (entity_is_task(se))
                list_del_init(&se->group_node);
        cfs_rq->nr_running--;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
# ifdef CONFIG_SMP
static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
{
        long tg_weight;

        /*
         * Use this CPU's actual weight instead of the last load_contribution
         * to gain a more accurate current total weight. See
         * update_cfs_rq_load_contribution().
         */
        tg_weight = atomic64_read(&tg->load_avg);
        tg_weight -= cfs_rq->tg_load_contrib;
        tg_weight += cfs_rq->load.weight;

        return tg_weight;
}

static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
        long tg_weight, load, shares;

        tg_weight = calc_tg_weight(tg, cfs_rq);
        load = cfs_rq->load.weight;

        shares = (tg->shares * load);
        if (tg_weight)
                shares /= tg_weight;

        if (shares < MIN_SHARES)
                shares = MIN_SHARES;
        if (shares > tg->shares)
                shares = tg->shares;

        return shares;
}
# else /* CONFIG_SMP */
static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
{
        return tg->shares;
}
# endif /* CONFIG_SMP */
static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
                            unsigned long weight)
{
        if (se->on_rq) {
                /* commit outstanding execution time */
                if (cfs_rq->curr == se)
                        update_curr(cfs_rq);
                account_entity_dequeue(cfs_rq, se);
        }

        update_load_set(&se->load, weight);

        if (se->on_rq)
                account_entity_enqueue(cfs_rq, se);
}

static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);

static void update_cfs_shares(struct cfs_rq *cfs_rq)
{
        struct task_group *tg;
        struct sched_entity *se;
        long shares;

        tg = cfs_rq->tg;
        se = tg->se[cpu_of(rq_of(cfs_rq))];
        if (!se || throttled_hierarchy(cfs_rq))
                return;
#ifndef CONFIG_SMP
        if (likely(se->load.weight == tg->shares))
                return;
#endif
        shares = calc_cfs_shares(cfs_rq, tg);

        reweight_entity(cfs_rq_of(se), se, shares);
}
#else /* CONFIG_FAIR_GROUP_SCHED */
static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
{
}
#endif /* CONFIG_FAIR_GROUP_SCHED */

/* Only depends on SMP, FAIR_GROUP_SCHED may be removed when useful in lb */
#if defined(CONFIG_SMP) && defined(CONFIG_FAIR_GROUP_SCHED)
/*
 * We choose a half-life close to 1 scheduling period.
 * Note: The tables below are dependent on this value.
 */
#define LOAD_AVG_PERIOD 32
#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
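/*
 * These constants follow from the chosen half-life: with y = 0.5^(1/32), the
 * geometric series 1024 * (1 + y + y^2 + ...) converges to roughly 47.8k,
 * which the implementation (using the floored tables below) caps at
 * LOAD_AVG_MAX = 47742; after about 345 fully-runnable 1ms periods the
 * tracked sum has effectively saturated.
 */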

/* Precomputed fixed inverse multiplies for multiplication by y^n */
static const u32 runnable_avg_yN_inv[] = {
        0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
        0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
        0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
        0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
        0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
        0x85aac367, 0x82cd8698,
};

/*
 * Precomputed \Sum y^k { 1<=k<=n }. These are floor(true_value) to prevent
 * over-estimates when re-combining.
 */
static const u32 runnable_avg_yN_sum[] = {
            0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
         9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
        17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
};

/*
 * Approximate:
 *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
 */
static __always_inline u64 decay_load(u64 val, u64 n)
{
        unsigned int local_n;

        if (!n)
                return val;
        else if (unlikely(n > LOAD_AVG_PERIOD * 63))
                return 0;

        /* after bounds checking we can collapse to 32-bit */
        local_n = n;

        /*
         * As y^PERIOD = 1/2, we can combine
         *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
         * With a look-up table which covers k^n (n<PERIOD)
         *
         * To achieve constant time decay_load.
         */
        if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
                val >>= local_n / LOAD_AVG_PERIOD;
                local_n %= LOAD_AVG_PERIOD;
        }

        val *= runnable_avg_yN_inv[local_n];
        /* We don't use SRR here since we always want to round down. */
        return val >> 32;
}
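/*
 * Example: decay_load(1024, 32) returns about 512, since 32 periods is one
 * half-life; decay_load(1024, 16) multiplies by the fixed-point inverse
 * runnable_avg_yN_inv[16] = 0xb504f333, i.e. roughly 1024 * 0.7071 = 724.
 */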

/*
 * For updates fully spanning n periods, the contribution to runnable
 * average will be: \Sum 1024*y^n
 *
 * We can compute this reasonably efficiently by combining:
 *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for n <PERIOD}
 */
static u32 __compute_runnable_contrib(u64 n)
{
        u32 contrib = 0;

        if (likely(n <= LOAD_AVG_PERIOD))
                return runnable_avg_yN_sum[n];
        else if (unlikely(n >= LOAD_AVG_MAX_N))
                return LOAD_AVG_MAX;

        /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
        do {
                contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
                contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];

                n -= LOAD_AVG_PERIOD;
        } while (n > LOAD_AVG_PERIOD);

        contrib = decay_load(contrib, n);
        return contrib + runnable_avg_yN_sum[n];
}
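/*
 * Example: a window of 40 fully-runnable periods is split as 32 + 8, giving
 * runnable_avg_yN_sum[32] decayed by y^8 plus runnable_avg_yN_sum[8], i.e.
 * the same \Sum 1024*y^k series evaluated in constant time.
 */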

/*
 * We can represent the historical contribution to runnable average as the
 * coefficients of a geometric series.  To do this we sub-divide our runnable
 * history into segments of approximately 1ms (1024us); label the segment that
 * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
 *
 * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
 *       p0            p1           p2
 *      (now)       (~1ms ago)  (~2ms ago)
 *
 * Let u_i denote the fraction of p_i that the entity was runnable.
 *
 * We then designate the fractions u_i as our co-efficients, yielding the
 * following representation of historical load:
 *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
 *
 * We choose y based on the width of a reasonable scheduling period, fixing:
 *   y^32 = 0.5
 *
 * This means that the contribution to load ~32ms ago (u_32) will be weighted
 * approximately half as much as the contribution to load within the last ms
 * (u_0).
 *
 * When a period "rolls over" and we have new u_0`, multiplying the previous
 * sum again by y is sufficient to update:
 *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
 *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
 */
static __always_inline int __update_entity_runnable_avg(u64 now,
                                                        struct sched_avg *sa,
                                                        int runnable)
{
        u64 delta, periods;
        u32 runnable_contrib;
        int delta_w, decayed = 0;

        delta = now - sa->last_runnable_update;
        /*
         * This should only happen when time goes backwards, which it
         * unfortunately does during sched clock init when we swap over to TSC.
         */
        if ((s64)delta < 0) {
                sa->last_runnable_update = now;
                return 0;
        }

        /*
         * Use 1024ns as the unit of measurement since it's a reasonable
         * approximation of 1us and fast to compute.
         */
        delta >>= 10;
        if (!delta)
                return 0;
        sa->last_runnable_update = now;

        /* delta_w is the amount already accumulated against our next period */
        delta_w = sa->runnable_avg_period % 1024;
        if (delta + delta_w >= 1024) {
                /* period roll-over */
                decayed = 1;

                /*
                 * Now that we know we're crossing a period boundary, figure
                 * out how much from delta we need to complete the current
                 * period and accrue it.
                 */
                delta_w = 1024 - delta_w;
                if (runnable)
                        sa->runnable_avg_sum += delta_w;
                sa->runnable_avg_period += delta_w;

                delta -= delta_w;

                /* Figure out how many additional periods this update spans */
                periods = delta / 1024;
                delta %= 1024;

                sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
                                                  periods + 1);
                sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
                                                     periods + 1);

                /* Efficiently calculate \sum (1..n_period) 1024*y^i */
                runnable_contrib = __compute_runnable_contrib(periods);
                if (runnable)
                        sa->runnable_avg_sum += runnable_contrib;
                sa->runnable_avg_period += runnable_contrib;
        }

        /* Remainder of delta accrued against u_0` */
        if (runnable)
                sa->runnable_avg_sum += delta;
        sa->runnable_avg_period += delta;

        return decayed;
}
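/*
 * Example: an entity that has been continuously runnable for a long time
 * approaches runnable_avg_sum == runnable_avg_period == LOAD_AVG_MAX, while
 * one that is runnable half of each period converges towards a sum of about
 * LOAD_AVG_MAX / 2; the ratio of the two fields is the entity's tracked
 * runnable fraction, which __update_task_entity_contrib() below scales by
 * the entity's weight.
 */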

/* Synchronize an entity's decay with its parenting cfs_rq.*/
static inline u64 __synchronize_entity_decay(struct sched_entity *se)
{
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        u64 decays = atomic64_read(&cfs_rq->decay_counter);

        decays -= se->avg.decay_count;
        if (!decays)
                return 0;

        se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
        se->avg.decay_count = 0;

        return decays;
}

#ifdef CONFIG_FAIR_GROUP_SCHED
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
                                                   int force_update)
{
        struct task_group *tg = cfs_rq->tg;
        s64 tg_contrib;

        tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
        tg_contrib -= cfs_rq->tg_load_contrib;

        if (force_update || abs64(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
                atomic64_add(tg_contrib, &tg->load_avg);
                cfs_rq->tg_load_contrib += tg_contrib;
        }
}

/*
 * Aggregate cfs_rq runnable averages into an equivalent task_group
 * representation for computing load contributions.
 */
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
                                            struct cfs_rq *cfs_rq)
{
        struct task_group *tg = cfs_rq->tg;
        long contrib;

        /* The fraction of a cpu used by this cfs_rq */
        contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
                          sa->runnable_avg_period + 1);
        contrib -= cfs_rq->tg_runnable_contrib;

        if (abs(contrib) > cfs_rq->tg_runnable_contrib / 64) {
                atomic_add(contrib, &tg->runnable_avg);
                cfs_rq->tg_runnable_contrib += contrib;
        }
}

static inline void __update_group_entity_contrib(struct sched_entity *se)
{
        struct cfs_rq *cfs_rq = group_cfs_rq(se);
        struct task_group *tg = cfs_rq->tg;
        int runnable_avg;

        u64 contrib;

        contrib = cfs_rq->tg_load_contrib * tg->shares;
        se->avg.load_avg_contrib = div64_u64(contrib,
                                             atomic64_read(&tg->load_avg) + 1);

        /*
         * For group entities we need to compute a correction term in the case
         * that they are consuming <1 cpu so that we would contribute the same
         * load as a task of equal weight.
         *
         * Explicitly co-ordinating this measurement would be expensive, but
         * fortunately the sum of each cpus contribution forms a usable
         * lower-bound on the true value.
         *
         * Consider the aggregate of 2 contributions.  Either they are disjoint
         * (and the sum represents true value) or they are disjoint and we are
         * understating by the aggregate of their overlap.
         *
         * Extending this to N cpus, for a given overlap, the maximum amount we
         * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
         * cpus that overlap for this interval and w_i is the interval width.
         *
         * On a small machine; the first term is well-bounded which bounds the
         * total error since w_i is a subset of the period.  Whereas on a
         * larger machine, while this first term can be larger, if w_i is the
         * of consequential size guaranteed to see n_i*w_i quickly converge to
         * our upper bound of 1-cpu.
         */
        runnable_avg = atomic_read(&tg->runnable_avg);
        if (runnable_avg < NICE_0_LOAD) {
                se->avg.load_avg_contrib *= runnable_avg;
                se->avg.load_avg_contrib >>= NICE_0_SHIFT;
        }
}
#else
static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
                                                   int force_update) {}
static inline void __update_tg_runnable_avg(struct sched_avg *sa,
                                            struct cfs_rq *cfs_rq) {}
static inline void __update_group_entity_contrib(struct sched_entity *se) {}
#endif

static inline void __update_task_entity_contrib(struct sched_entity *se)
{
        u32 contrib;

        /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
        contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
        contrib /= (se->avg.runnable_avg_period + 1);
        se->avg.load_avg_contrib = scale_load(contrib);
}
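/*
 * Example: a nice-0 task that has been runnable for half of its tracked
 * history has runnable_avg_sum roughly equal to runnable_avg_period / 2, so
 * its load_avg_contrib works out to about half of its load weight.
 */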

/* Compute the current contribution to load_avg by se, return any delta */
static long __update_entity_load_avg_contrib(struct sched_entity *se)
{
        long old_contrib = se->avg.load_avg_contrib;

        if (entity_is_task(se)) {
                __update_task_entity_contrib(se);
        } else {
                __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
                __update_group_entity_contrib(se);
        }

        return se->avg.load_avg_contrib - old_contrib;
}

static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
                                                 long load_contrib)
{
        if (likely(load_contrib < cfs_rq->blocked_load_avg))
                cfs_rq->blocked_load_avg -= load_contrib;
        else
                cfs_rq->blocked_load_avg = 0;
}

static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);

/* Update a sched_entity's runnable average */
static inline void update_entity_load_avg(struct sched_entity *se,
                                          int update_cfs_rq)
{
        struct cfs_rq *cfs_rq = cfs_rq_of(se);
        long contrib_delta;
        u64 now;

        /*
         * For a group entity we need to use their owned cfs_rq_clock_task()
         * in case they are the parent of a throttled hierarchy.
         */
        if (entity_is_task(se))
                now = cfs_rq_clock_task(cfs_rq);
        else
                now = cfs_rq_clock_task(group_cfs_rq(se));

        if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq))
                return;

        contrib_delta = __update_entity_load_avg_contrib(se);

        if (!update_cfs_rq)
                return;

        if (se->on_rq)
                cfs_rq->runnable_load_avg += contrib_delta;
        else
                subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
}

/*
 * Decay the load contributed by all blocked children and account this so that
 * their contribution may be appropriately discounted when they wake up.
 */
static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
{
        u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
        u64 decays;

        decays = now - cfs_rq->last_decay;
        if (!decays && !force_update)
                return;

        if (atomic64_read(&cfs_rq->removed_load)) {
                u64 removed_load = atomic64_xchg(&cfs_rq->removed_load, 0);
                subtract_blocked_load_contrib(cfs_rq, removed_load);
        }

        if (decays) {
                cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
                                                      decays);
                atomic64_add(decays, &cfs_rq->decay_counter);
                cfs_rq->last_decay = now;
        }

        __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
}

static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
{
        __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable);
        __update_tg_runnable_avg(&rq->avg, &rq->cfs);
}

/* Add the load generated by se into cfs_rq's child load-average */
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
                                           struct sched_entity *se,
                                           int wakeup)
{
        /*
         * We track migrations using entity decay_count <= 0, on a wake-up
         * migration we use a negative decay count to track the remote decays
         * accumulated while sleeping.
         */
        if (unlikely(se->avg.decay_count <= 0)) {
                se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
                if (se->avg.decay_count) {
                        /*
                         * In a wake-up migration we have to approximate the
                         * time sleeping.  This is because we can't synchronize
                         * clock_task between the two cpus, and it is not
                         * guaranteed to be read-safe.  Instead, we can
                         * approximate this using our carried decays, which are
                         * explicitly atomically readable.
                         */
                        se->avg.last_runnable_update -= (-se->avg.decay_count)
                                                        << 20;
                        update_entity_load_avg(se, 0);
                        /* Indicate that we're now synchronized and on-rq */
                        se->avg.decay_count = 0;
                }
                wakeup = 0;
        } else {
                __synchronize_entity_decay(se);
        }

        /* migrated tasks did not contribute to our blocked load */
        if (wakeup) {
                subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
                update_entity_load_avg(se, 0);
        }

        cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
        /* we force update consideration on load-balancer moves */
        update_cfs_rq_blocked_load(cfs_rq, !wakeup);
}

/*
 * Remove se's load from this cfs_rq child load-average, if the entity is
 * transitioning to a blocked state we track its projected decay using
 * blocked_load_avg.
 */
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
                                           struct sched_entity *se,
                                           int sleep)
{
        update_entity_load_avg(se, 1);
        /* we force update consideration on load-balancer moves */
        update_cfs_rq_blocked_load(cfs_rq, !sleep);

        cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
        if (sleep) {
                cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
                se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
        } /* migrations, e.g. sleep=0 leave decay_count == 0 */
}

/*
 * Update the rq's load with the elapsed running time before entering
 * idle. if the last scheduled task is not a CFS task, idle_enter will
 * be the only way to update the runnable statistic.
 */
void idle_enter_fair(struct rq *this_rq)
{
        update_rq_runnable_avg(this_rq, 1);
}

/*
 * Update the rq's load with the elapsed idle time before a task is
 * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
 * be the only way to update the runnable statistic.
 */
void idle_exit_fair(struct rq *this_rq)
{
        update_rq_runnable_avg(this_rq, 0);
}

#else
static inline void update_entity_load_avg(struct sched_entity *se,
                                          int update_cfs_rq) {}
static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
                                           struct sched_entity *se,
                                           int wakeup) {}
static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
                                           struct sched_entity *se,
                                           int sleep) {}
static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
                                              int force_update) {}
#endif

static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHEDSTATS
        struct task_struct *tsk = NULL;

        if (entity_is_task(se))
                tsk = task_of(se);

        if (se->statistics.sleep_start) {
                u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;

                if ((s64)delta < 0)
                        delta = 0;

                if (unlikely(delta > se->statistics.sleep_max))
                        se->statistics.sleep_max = delta;

                se->statistics.sleep_start = 0;
                se->statistics.sum_sleep_runtime += delta;

                if (tsk) {
                        account_scheduler_latency(tsk, delta >> 10, 1);
                        trace_sched_stat_sleep(tsk, delta);
                }
        }
        if (se->statistics.block_start) {
                u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;

                if ((s64)delta < 0)
                        delta = 0;

                if (unlikely(delta > se->statistics.block_max))
                        se->statistics.block_max = delta;

                se->statistics.block_start = 0;
                se->statistics.sum_sleep_runtime += delta;

                if (tsk) {
                        if (tsk->in_iowait) {
                                se->statistics.iowait_sum += delta;
                                se->statistics.iowait_count++;
                                trace_sched_stat_iowait(tsk, delta);
                        }

                        trace_sched_stat_blocked(tsk, delta);

                        /*
                         * Blocking time is in units of nanosecs, so shift by
                         * 20 to get a milliseconds-range estimation of the
                         * amount of time that the task spent sleeping:
                         */
                        if (unlikely(prof_on == SLEEP_PROFILING)) {
                                profile_hits(SLEEP_PROFILING,
                                                (void *)get_wchan(tsk),
                                                delta >> 20);
                        }
                        account_scheduler_latency(tsk, delta >> 10, 0);
                }
        }
#endif
}

static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
#ifdef CONFIG_SCHED_DEBUG
        s64 d = se->vruntime - cfs_rq->min_vruntime;

        if (d < 0)
                d = -d;

        if (d > 3*sysctl_sched_latency)
                schedstat_inc(cfs_rq, nr_spread_over);
#endif
}

static void
place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
{
        u64 vruntime = cfs_rq->min_vruntime;

        /*
         * The 'current' period is already promised to the current tasks,
         * however the extra weight of the new task will slow them down a
         * little, place the new task so that it fits in the slot that
         * stays open at the end.
         */
        if (initial && sched_feat(START_DEBIT))
                vruntime += sched_vslice(cfs_rq, se);

        /* sleeps up to a single latency don't count. */
        if (!initial) {
                unsigned long thresh = sysctl_sched_latency;

                /*
                 * Halve their sleep time's effect, to allow
                 * for a gentler effect of sleepers:
                 */
                if (sched_feat(GENTLE_FAIR_SLEEPERS))
                        thresh >>= 1;

                vruntime -= thresh;
        }

        /* ensure we never gain time by being placed backwards. */
        se->vruntime = max_vruntime(se->vruntime, vruntime);
}
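/*
 * Net effect: a newly forked task (START_DEBIT) starts one vslice behind
 * min_vruntime, while a waking sleeper is credited at most half a latency
 * period (3ms with the unscaled 6ms default and GENTLE_FAIR_SLEEPERS) ahead
 * of it, and the final max_vruntime() ensures placement never moves an
 * entity backwards in virtual time.
 */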

static void check_enqueue_throttle(struct cfs_rq *cfs_rq);

static void
enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
{
        /*
         * Update the normalized vruntime before updating min_vruntime
         * through calling update_curr().
         */
        if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
                se->vruntime += cfs_rq->min_vruntime;

        /*
         * Update run-time statistics of the 'current'.
         */
        update_curr(cfs_rq);
        enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
        account_entity_enqueue(cfs_rq, se);
        update_cfs_shares(cfs_rq);

        if (flags & ENQUEUE_WAKEUP) {
                place_entity(cfs_rq, se, 0);
                enqueue_sleeper(cfs_rq, se);
        }

        update_stats_enqueue(cfs_rq, se);
        check_spread(cfs_rq, se);
        if (se != cfs_rq->curr)
                __enqueue_entity(cfs_rq, se);
        se->on_rq = 1;

        if (cfs_rq->nr_running == 1) {
                list_add_leaf_cfs_rq(cfs_rq);
                check_enqueue_throttle(cfs_rq);
        }
}
1751
2c13c919 1752static void __clear_buddies_last(struct sched_entity *se)
2002c695 1753{
2c13c919
RR
1754 for_each_sched_entity(se) {
1755 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1756 if (cfs_rq->last == se)
1757 cfs_rq->last = NULL;
1758 else
1759 break;
1760 }
1761}
2002c695 1762
2c13c919
RR
1763static void __clear_buddies_next(struct sched_entity *se)
1764{
1765 for_each_sched_entity(se) {
1766 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1767 if (cfs_rq->next == se)
1768 cfs_rq->next = NULL;
1769 else
1770 break;
1771 }
2002c695
PZ
1772}
1773
ac53db59
RR
1774static void __clear_buddies_skip(struct sched_entity *se)
1775{
1776 for_each_sched_entity(se) {
1777 struct cfs_rq *cfs_rq = cfs_rq_of(se);
1778 if (cfs_rq->skip == se)
1779 cfs_rq->skip = NULL;
1780 else
1781 break;
1782 }
1783}
1784
a571bbea
PZ
1785static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
1786{
2c13c919
RR
1787 if (cfs_rq->last == se)
1788 __clear_buddies_last(se);
1789
1790 if (cfs_rq->next == se)
1791 __clear_buddies_next(se);
ac53db59
RR
1792
1793 if (cfs_rq->skip == se)
1794 __clear_buddies_skip(se);
a571bbea
PZ
1795}
1796
6c16a6dc 1797static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
d8b4986d 1798
bf0f6f24 1799static void
371fd7e7 1800dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
bf0f6f24 1801{
a2a2d680
DA
1802 /*
1803 * Update run-time statistics of the 'current'.
1804 */
1805 update_curr(cfs_rq);
17bc14b7 1806 dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
a2a2d680 1807
19b6a2e3 1808 update_stats_dequeue(cfs_rq, se);
371fd7e7 1809 if (flags & DEQUEUE_SLEEP) {
67e9fb2a 1810#ifdef CONFIG_SCHEDSTATS
bf0f6f24
IM
1811 if (entity_is_task(se)) {
1812 struct task_struct *tsk = task_of(se);
1813
1814 if (tsk->state & TASK_INTERRUPTIBLE)
41acab88 1815 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
bf0f6f24 1816 if (tsk->state & TASK_UNINTERRUPTIBLE)
41acab88 1817 se->statistics.block_start = rq_of(cfs_rq)->clock;
bf0f6f24 1818 }
db36cc7d 1819#endif
67e9fb2a
PZ
1820 }
1821
2002c695 1822 clear_buddies(cfs_rq, se);
4793241b 1823
83b699ed 1824 if (se != cfs_rq->curr)
30cfdcfc 1825 __dequeue_entity(cfs_rq, se);
17bc14b7 1826 se->on_rq = 0;
30cfdcfc 1827 account_entity_dequeue(cfs_rq, se);
88ec22d3
PZ
1828
1829 /*
1830 * Normalize the entity after updating the min_vruntime because the
1831 * update can refer to the ->curr item and we need to reflect this
1832 * movement in our normalized position.
1833 */
371fd7e7 1834 if (!(flags & DEQUEUE_SLEEP))
88ec22d3 1835 se->vruntime -= cfs_rq->min_vruntime;
1e876231 1836
d8b4986d
PT
1837 /* return excess runtime on last dequeue */
1838 return_cfs_rq_runtime(cfs_rq);
1839
1e876231 1840 update_min_vruntime(cfs_rq);
17bc14b7 1841 update_cfs_shares(cfs_rq);
bf0f6f24
IM
1842}
1843
1844/*
1845 * Preempt the current task with a newly woken task if needed:
1846 */
7c92e54f 1847static void
2e09bf55 1848check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
bf0f6f24 1849{
11697830 1850 unsigned long ideal_runtime, delta_exec;
f4cfb33e
WX
1851 struct sched_entity *se;
1852 s64 delta;
11697830 1853
6d0f0ebd 1854 ideal_runtime = sched_slice(cfs_rq, curr);
11697830 1855 delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
a9f3e2b5 1856 if (delta_exec > ideal_runtime) {
bf0f6f24 1857 resched_task(rq_of(cfs_rq)->curr);
a9f3e2b5
MG
1858 /*
1859 * The current task ran long enough, ensure it doesn't get
1860 * re-elected due to buddy favours.
1861 */
1862 clear_buddies(cfs_rq, curr);
f685ceac
MG
1863 return;
1864 }
1865
1866 /*
1867 * Ensure that a task that missed wakeup preemption by a
1868 * narrow margin doesn't have to wait for a full slice.
1869 * This also mitigates buddy induced latencies under load.
1870 */
f685ceac
MG
1871 if (delta_exec < sysctl_sched_min_granularity)
1872 return;
1873
f4cfb33e
WX
1874 se = __pick_first_entity(cfs_rq);
1875 delta = curr->vruntime - se->vruntime;
f685ceac 1876
f4cfb33e
WX
1877 if (delta < 0)
1878 return;
d7d82944 1879
f4cfb33e
WX
1880 if (delta > ideal_runtime)
1881 resched_task(rq_of(cfs_rq)->curr);
bf0f6f24
IM
1882}
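/*
 * Concrete illustration (made-up numbers): if sched_slice() gives curr an
 * ideal_runtime of 3ms and it has already run 4ms since being picked, the
 * first test above reschedules it.  If it has run only 1.5ms but its
 * vruntime is more than 3ms ahead of the leftmost entity, the second test
 * fires instead, provided at least sysctl_sched_min_granularity has
 * elapsed so we do not thrash on very short runs.
 */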
1883
83b699ed 1884static void
8494f412 1885set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
bf0f6f24 1886{
83b699ed
SV
1887 /* 'current' is not kept within the tree. */
1888 if (se->on_rq) {
1889 /*
1890 * Any task has to be enqueued before it gets to execute on
1891 * a CPU. So account for the time it spent waiting on the
1892 * runqueue.
1893 */
1894 update_stats_wait_end(cfs_rq, se);
1895 __dequeue_entity(cfs_rq, se);
1896 }
1897
79303e9e 1898 update_stats_curr_start(cfs_rq, se);
429d43bc 1899 cfs_rq->curr = se;
eba1ed4b
IM
1900#ifdef CONFIG_SCHEDSTATS
1901 /*
1902 * Track our maximum slice length, if the CPU's load is at
1903 * least twice that of our own weight (i.e. don't track it
1904 * when there are only lesser-weight tasks around):
1905 */
495eca49 1906 if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
41acab88 1907 se->statistics.slice_max = max(se->statistics.slice_max,
eba1ed4b
IM
1908 se->sum_exec_runtime - se->prev_sum_exec_runtime);
1909 }
1910#endif
4a55b450 1911 se->prev_sum_exec_runtime = se->sum_exec_runtime;
bf0f6f24
IM
1912}
1913
3f3a4904
PZ
1914static int
1915wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
1916
ac53db59
RR
1917/*
1918 * Pick the next process, keeping these things in mind, in this order:
1919 * 1) keep things fair between processes/task groups
1920 * 2) pick the "next" process, since someone really wants that to run
1921 * 3) pick the "last" process, for cache locality
1922 * 4) do not run the "skip" process, if something else is available
1923 */
f4b6755f 1924static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
aa2ac252 1925{
ac53db59 1926 struct sched_entity *se = __pick_first_entity(cfs_rq);
f685ceac 1927 struct sched_entity *left = se;
f4b6755f 1928
ac53db59
RR
1929 /*
1930 * Avoid running the skip buddy, if running something else can
1931 * be done without getting too unfair.
1932 */
1933 if (cfs_rq->skip == se) {
1934 struct sched_entity *second = __pick_next_entity(se);
1935 if (second && wakeup_preempt_entity(second, left) < 1)
1936 se = second;
1937 }
aa2ac252 1938
f685ceac
MG
1939 /*
1940 * Prefer last buddy, try to return the CPU to a preempted task.
1941 */
1942 if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
1943 se = cfs_rq->last;
1944
ac53db59
RR
1945 /*
1946 * Someone really wants this to run. If it's not unfair, run it.
1947 */
1948 if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
1949 se = cfs_rq->next;
1950
f685ceac 1951 clear_buddies(cfs_rq, se);
4793241b
PZ
1952
1953 return se;
aa2ac252
PZ
1954}
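/*
 * In other words (illustrative): if the leftmost entity happens to be the
 * skip buddy, the second-leftmost runs instead as long as that is not too
 * unfair per wakeup_preempt_entity().  The last and next buddies can then
 * override the choice, but only while their vruntime is no more than one
 * scaled wakeup granularity ahead of the leftmost entity.
 */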
1955
d3d9dc33
PT
1956static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
1957
ab6cde26 1958static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
bf0f6f24
IM
1959{
1960 /*
1961 * If still on the runqueue then deactivate_task()
1962 * was not called and update_curr() has to be done:
1963 */
1964 if (prev->on_rq)
b7cc0896 1965 update_curr(cfs_rq);
bf0f6f24 1966
d3d9dc33
PT
1967 /* throttle cfs_rqs exceeding runtime */
1968 check_cfs_rq_runtime(cfs_rq);
1969
ddc97297 1970 check_spread(cfs_rq, prev);
30cfdcfc 1971 if (prev->on_rq) {
5870db5b 1972 update_stats_wait_start(cfs_rq, prev);
30cfdcfc
DA
1973 /* Put 'current' back into the tree. */
1974 __enqueue_entity(cfs_rq, prev);
9d85f21c 1975 /* in !on_rq case, update occurred at dequeue */
9ee474f5 1976 update_entity_load_avg(prev, 1);
30cfdcfc 1977 }
429d43bc 1978 cfs_rq->curr = NULL;
bf0f6f24
IM
1979}
1980
8f4d37ec
PZ
1981static void
1982entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
bf0f6f24 1983{
bf0f6f24 1984 /*
30cfdcfc 1985 * Update run-time statistics of the 'current'.
bf0f6f24 1986 */
30cfdcfc 1987 update_curr(cfs_rq);
bf0f6f24 1988
9d85f21c
PT
1989 /*
1990 * Ensure that runnable average is periodically updated.
1991 */
9ee474f5 1992 update_entity_load_avg(curr, 1);
aff3e498 1993 update_cfs_rq_blocked_load(cfs_rq, 1);
dead45bd 1994 update_cfs_shares(cfs_rq);
9d85f21c 1995
8f4d37ec
PZ
1996#ifdef CONFIG_SCHED_HRTICK
1997 /*
1998 * queued ticks are scheduled to match the slice, so don't bother
1999 * validating it and just reschedule.
2000 */
983ed7a6
HH
2001 if (queued) {
2002 resched_task(rq_of(cfs_rq)->curr);
2003 return;
2004 }
8f4d37ec
PZ
2005 /*
2006 * don't let the period tick interfere with the hrtick preemption
2007 */
2008 if (!sched_feat(DOUBLE_TICK) &&
2009 hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
2010 return;
2011#endif
2012
2c2efaed 2013 if (cfs_rq->nr_running > 1)
2e09bf55 2014 check_preempt_tick(cfs_rq, curr);
bf0f6f24
IM
2015}
2016
ab84d31e
PT
2017
2018/**************************************************
2019 * CFS bandwidth control machinery
2020 */
2021
2022#ifdef CONFIG_CFS_BANDWIDTH
029632fb
PZ
2023
2024#ifdef HAVE_JUMP_LABEL
c5905afb 2025static struct static_key __cfs_bandwidth_used;
029632fb
PZ
2026
2027static inline bool cfs_bandwidth_used(void)
2028{
c5905afb 2029 return static_key_false(&__cfs_bandwidth_used);
029632fb
PZ
2030}
2031
9d80092f 2032void cfs_bandwidth_usage_inc(void)
029632fb 2033{
9d80092f
BS
2034 static_key_slow_inc(&__cfs_bandwidth_used);
2035}
2036
2037void cfs_bandwidth_usage_dec(void)
2038{
2039 static_key_slow_dec(&__cfs_bandwidth_used);
029632fb
PZ
2040}
2041#else /* HAVE_JUMP_LABEL */
2042static bool cfs_bandwidth_used(void)
2043{
2044 return true;
2045}
2046
9d80092f
BS
2047void cfs_bandwidth_usage_inc(void) {}
2048void cfs_bandwidth_usage_dec(void) {}
029632fb
PZ
2049#endif /* HAVE_JUMP_LABEL */
2050
ab84d31e
PT
2051/*
2052 * default period for cfs group bandwidth.
2053 * default: 0.1s, units: nanoseconds
2054 */
2055static inline u64 default_cfs_period(void)
2056{
2057 return 100000000ULL;
2058}
ec12cb7f
PT
2059
2060static inline u64 sched_cfs_bandwidth_slice(void)
2061{
2062 return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2063}
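/*
 * Rough illustration (assuming the usual 5ms default for
 * sysctl_sched_cfs_bandwidth_slice, which is defined elsewhere): a group
 * given quota = 25ms per 100ms period has its runtime handed out by
 * assign_cfs_rq_runtime() in 5ms slices, so at most five slices can be
 * outstanding across cpus before the global pool runs dry and further
 * cfs_rqs are throttled for the remainder of the period.
 */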
2064
a9cf55b2
PT
2065/*
2066 * Replenish runtime according to assigned quota and update expiration time.
2067 * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2068 * additional synchronization around rq->lock.
2069 *
2070 * requires cfs_b->lock
2071 */
029632fb 2072void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
a9cf55b2
PT
2073{
2074 u64 now;
2075
2076 if (cfs_b->quota == RUNTIME_INF)
2077 return;
2078
2079 now = sched_clock_cpu(smp_processor_id());
2080 cfs_b->runtime = cfs_b->quota;
2081 cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2082}
2083
029632fb
PZ
2084static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2085{
2086 return &tg->cfs_bandwidth;
2087}
2088
f1b17280
PT
2089 /* rq->clock_task normalized against any time this cfs_rq has spent throttled */
2090static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2091{
2092 if (unlikely(cfs_rq->throttle_count))
2093 return cfs_rq->throttled_clock_task;
2094
2095 return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2096}
2097
85dac906
PT
2098/* returns 0 on failure to allocate runtime */
2099static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
ec12cb7f
PT
2100{
2101 struct task_group *tg = cfs_rq->tg;
2102 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
a9cf55b2 2103 u64 amount = 0, min_amount, expires;
ec12cb7f
PT
2104
2105 /* note: this is a positive sum as runtime_remaining <= 0 */
2106 min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2107
2108 raw_spin_lock(&cfs_b->lock);
2109 if (cfs_b->quota == RUNTIME_INF)
2110 amount = min_amount;
58088ad0 2111 else {
a9cf55b2
PT
2112 /*
2113 * If the bandwidth pool has become inactive, then at least one
2114 * period must have elapsed since the last consumption.
2115 * Refresh the global state and ensure bandwidth timer becomes
2116 * active.
2117 */
2118 if (!cfs_b->timer_active) {
2119 __refill_cfs_bandwidth_runtime(cfs_b);
58088ad0 2120 __start_cfs_bandwidth(cfs_b);
a9cf55b2 2121 }
58088ad0
PT
2122
2123 if (cfs_b->runtime > 0) {
2124 amount = min(cfs_b->runtime, min_amount);
2125 cfs_b->runtime -= amount;
2126 cfs_b->idle = 0;
2127 }
ec12cb7f 2128 }
a9cf55b2 2129 expires = cfs_b->runtime_expires;
ec12cb7f
PT
2130 raw_spin_unlock(&cfs_b->lock);
2131
2132 cfs_rq->runtime_remaining += amount;
a9cf55b2
PT
2133 /*
2134 * we may have advanced our local expiration to account for allowed
2135 * spread between our sched_clock and the one on which runtime was
2136 * issued.
2137 */
2138 if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2139 cfs_rq->runtime_expires = expires;
85dac906
PT
2140
2141 return cfs_rq->runtime_remaining > 0;
ec12cb7f
PT
2142}
2143
a9cf55b2
PT
2144/*
2145 * Note: This depends on the synchronization provided by sched_clock and the
2146 * fact that rq->clock snapshots this value.
2147 */
2148static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
ec12cb7f 2149{
a9cf55b2
PT
2150 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2151 struct rq *rq = rq_of(cfs_rq);
2152
2153 /* if the deadline is ahead of our clock, nothing to do */
2154 if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
ec12cb7f
PT
2155 return;
2156
a9cf55b2
PT
2157 if (cfs_rq->runtime_remaining < 0)
2158 return;
2159
2160 /*
2161 * If the local deadline has passed we have to consider the
2162 * possibility that our sched_clock is 'fast' and the global deadline
2163 * has not truly expired.
2164 *
2165 * Fortunately we can determine whether this is the case by checking
2166 * whether the global deadline has advanced.
2167 */
2168
2169 if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
2170 /* extend local deadline, drift is bounded above by 2 ticks */
2171 cfs_rq->runtime_expires += TICK_NSEC;
2172 } else {
2173 /* global deadline is ahead, expiration has passed */
2174 cfs_rq->runtime_remaining = 0;
2175 }
2176}
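/*
 * Put differently: when the local deadline has passed but the global
 * cfs_b->runtime_expires has not moved on, the difference is attributed
 * to bounded per-cpu sched_clock drift, so the local deadline is nudged
 * forward by a tick rather than throwing away the remaining runtime.
 */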
2177
2178static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2179 unsigned long delta_exec)
2180{
2181 /* dock delta_exec before expiring quota (as it could span periods) */
ec12cb7f 2182 cfs_rq->runtime_remaining -= delta_exec;
a9cf55b2
PT
2183 expire_cfs_rq_runtime(cfs_rq);
2184
2185 if (likely(cfs_rq->runtime_remaining > 0))
ec12cb7f
PT
2186 return;
2187
85dac906
PT
2188 /*
2189 * if we're unable to extend our runtime we resched so that the active
2190 * hierarchy can be throttled
2191 */
2192 if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
2193 resched_task(rq_of(cfs_rq)->curr);
ec12cb7f
PT
2194}
2195
6c16a6dc
PZ
2196static __always_inline
2197void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
ec12cb7f 2198{
56f570e5 2199 if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
ec12cb7f
PT
2200 return;
2201
2202 __account_cfs_rq_runtime(cfs_rq, delta_exec);
2203}
2204
85dac906
PT
2205static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2206{
56f570e5 2207 return cfs_bandwidth_used() && cfs_rq->throttled;
85dac906
PT
2208}
2209
64660c86
PT
2210/* check whether cfs_rq, or any parent, is throttled */
2211static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2212{
56f570e5 2213 return cfs_bandwidth_used() && cfs_rq->throttle_count;
64660c86
PT
2214}
2215
2216/*
2217 * Ensure that neither of the group entities corresponding to src_cpu or
2218 * dest_cpu are members of a throttled hierarchy when performing group
2219 * load-balance operations.
2220 */
2221static inline int throttled_lb_pair(struct task_group *tg,
2222 int src_cpu, int dest_cpu)
2223{
2224 struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
2225
2226 src_cfs_rq = tg->cfs_rq[src_cpu];
2227 dest_cfs_rq = tg->cfs_rq[dest_cpu];
2228
2229 return throttled_hierarchy(src_cfs_rq) ||
2230 throttled_hierarchy(dest_cfs_rq);
2231}
2232
2233/* updated child weight may affect parent so we have to do this bottom up */
2234static int tg_unthrottle_up(struct task_group *tg, void *data)
2235{
2236 struct rq *rq = data;
2237 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2238
2239 cfs_rq->throttle_count--;
2240#ifdef CONFIG_SMP
2241 if (!cfs_rq->throttle_count) {
f1b17280
PT
2242 /* adjust cfs_rq_clock_task() */
2243 cfs_rq->throttled_clock_task_time += rq->clock_task -
2244 cfs_rq->throttled_clock_task;
64660c86
PT
2245 }
2246#endif
2247
2248 return 0;
2249}
2250
2251static int tg_throttle_down(struct task_group *tg, void *data)
2252{
2253 struct rq *rq = data;
2254 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
2255
82958366
PT
2256 /* group is entering throttled state, stop time */
2257 if (!cfs_rq->throttle_count)
f1b17280 2258 cfs_rq->throttled_clock_task = rq->clock_task;
64660c86
PT
2259 cfs_rq->throttle_count++;
2260
2261 return 0;
2262}
2263
d3d9dc33 2264static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
85dac906
PT
2265{
2266 struct rq *rq = rq_of(cfs_rq);
2267 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2268 struct sched_entity *se;
2269 long task_delta, dequeue = 1;
2270
2271 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2272
f1b17280 2273 /* freeze hierarchy runnable averages while throttled */
64660c86
PT
2274 rcu_read_lock();
2275 walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
2276 rcu_read_unlock();
85dac906
PT
2277
2278 task_delta = cfs_rq->h_nr_running;
2279 for_each_sched_entity(se) {
2280 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
2281 /* throttled entity or throttle-on-deactivate */
2282 if (!se->on_rq)
2283 break;
2284
2285 if (dequeue)
2286 dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
2287 qcfs_rq->h_nr_running -= task_delta;
2288
2289 if (qcfs_rq->load.weight)
2290 dequeue = 0;
2291 }
2292
2293 if (!se)
2294 rq->nr_running -= task_delta;
2295
2296 cfs_rq->throttled = 1;
f1b17280 2297 cfs_rq->throttled_clock = rq->clock;
85dac906
PT
2298 raw_spin_lock(&cfs_b->lock);
2299 list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
5232a719
BS
2300 if (!cfs_b->timer_active)
2301 __start_cfs_bandwidth(cfs_b);
85dac906
PT
2302 raw_spin_unlock(&cfs_b->lock);
2303}
2304
029632fb 2305void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
671fd9da
PT
2306{
2307 struct rq *rq = rq_of(cfs_rq);
2308 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2309 struct sched_entity *se;
2310 int enqueue = 1;
2311 long task_delta;
2312
2313 se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
2314
2315 cfs_rq->throttled = 0;
2316 raw_spin_lock(&cfs_b->lock);
f1b17280 2317 cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
671fd9da
PT
2318 list_del_rcu(&cfs_rq->throttled_list);
2319 raw_spin_unlock(&cfs_b->lock);
2320
64660c86
PT
2321 update_rq_clock(rq);
2322 /* update hierarchical throttle state */
2323 walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
2324
671fd9da
PT
2325 if (!cfs_rq->load.weight)
2326 return;
2327
2328 task_delta = cfs_rq->h_nr_running;
2329 for_each_sched_entity(se) {
2330 if (se->on_rq)
2331 enqueue = 0;
2332
2333 cfs_rq = cfs_rq_of(se);
2334 if (enqueue)
2335 enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
2336 cfs_rq->h_nr_running += task_delta;
2337
2338 if (cfs_rq_throttled(cfs_rq))
2339 break;
2340 }
2341
2342 if (!se)
2343 rq->nr_running += task_delta;
2344
2345 /* determine whether we need to wake up potentially idle cpu */
2346 if (rq->curr == rq->idle && rq->cfs.nr_running)
2347 resched_task(rq->curr);
2348}
2349
2350static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
2351 u64 remaining, u64 expires)
2352{
2353 struct cfs_rq *cfs_rq;
2354 u64 runtime = remaining;
2355
2356 rcu_read_lock();
2357 list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
2358 throttled_list) {
2359 struct rq *rq = rq_of(cfs_rq);
2360
2361 raw_spin_lock(&rq->lock);
2362 if (!cfs_rq_throttled(cfs_rq))
2363 goto next;
2364
2365 runtime = -cfs_rq->runtime_remaining + 1;
2366 if (runtime > remaining)
2367 runtime = remaining;
2368 remaining -= runtime;
2369
2370 cfs_rq->runtime_remaining += runtime;
2371 cfs_rq->runtime_expires = expires;
2372
2373 /* we check whether we're throttled above */
2374 if (cfs_rq->runtime_remaining > 0)
2375 unthrottle_cfs_rq(cfs_rq);
2376
2377next:
2378 raw_spin_unlock(&rq->lock);
2379
2380 if (!remaining)
2381 break;
2382 }
2383 rcu_read_unlock();
2384
2385 return remaining;
2386}
2387
58088ad0
PT
2388/*
2389 * Responsible for refilling a task_group's bandwidth and unthrottling its
2390 * cfs_rqs as appropriate. If there has been no activity within the last
2391 * period the timer is deactivated until scheduling resumes; cfs_b->idle is
2392 * used to track this state.
2393 */
2394static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
2395{
671fd9da
PT
2396 u64 runtime, runtime_expires;
2397 int idle = 1, throttled;
58088ad0
PT
2398
2399 raw_spin_lock(&cfs_b->lock);
2400 /* no need to continue the timer with no bandwidth constraint */
2401 if (cfs_b->quota == RUNTIME_INF)
2402 goto out_unlock;
2403
671fd9da
PT
2404 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2405 /* idle depends on !throttled (for the case of a large deficit) */
2406 idle = cfs_b->idle && !throttled;
e8da1b18 2407 cfs_b->nr_periods += overrun;
671fd9da 2408
a9cf55b2
PT
2409 /* if we're going inactive then everything else can be deferred */
2410 if (idle)
2411 goto out_unlock;
2412
9ca715c4
BS
2413 /*
2414 * if we have relooped after returning idle once, we need to update our
2415 * status as actually running, so that other cpus doing
2416 * __start_cfs_bandwidth will stop trying to cancel us.
2417 */
2418 cfs_b->timer_active = 1;
2419
a9cf55b2
PT
2420 __refill_cfs_bandwidth_runtime(cfs_b);
2421
671fd9da
PT
2422 if (!throttled) {
2423 /* mark as potentially idle for the upcoming period */
2424 cfs_b->idle = 1;
2425 goto out_unlock;
2426 }
2427
e8da1b18
NR
2428 /* account preceding periods in which throttling occurred */
2429 cfs_b->nr_throttled += overrun;
2430
671fd9da
PT
2431 /*
2432 * There are throttled entities so we must first use the new bandwidth
2433 * to unthrottle them before making it generally available. This
2434 * ensures that all existing debts will be paid before a new cfs_rq is
2435 * allowed to run.
2436 */
2437 runtime = cfs_b->runtime;
2438 runtime_expires = cfs_b->runtime_expires;
2439 cfs_b->runtime = 0;
2440
2441 /*
2442 * This check is repeated as we are holding onto the new bandwidth
2443 * while we unthrottle. This can potentially race with an unthrottled
2444 * group trying to acquire new bandwidth from the global pool.
2445 */
2446 while (throttled && runtime > 0) {
2447 raw_spin_unlock(&cfs_b->lock);
2448 /* we can't nest cfs_b->lock while distributing bandwidth */
2449 runtime = distribute_cfs_runtime(cfs_b, runtime,
2450 runtime_expires);
2451 raw_spin_lock(&cfs_b->lock);
2452
2453 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
2454 }
58088ad0 2455
671fd9da
PT
2456 /* return (any) remaining runtime */
2457 cfs_b->runtime = runtime;
2458 /*
2459 * While we are ensured activity in the period following an
2460 * unthrottle, this also covers the case in which the new bandwidth is
2461 * insufficient to cover the existing bandwidth deficit. (Forcing the
2462 * timer to remain active while there are any throttled entities.)
2463 */
2464 cfs_b->idle = 0;
58088ad0
PT
2465out_unlock:
2466 if (idle)
2467 cfs_b->timer_active = 0;
2468 raw_spin_unlock(&cfs_b->lock);
2469
2470 return idle;
2471}
d3d9dc33 2472
d8b4986d
PT
2473/* a cfs_rq won't donate quota below this amount */
2474static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
2475/* minimum remaining period time to redistribute slack quota */
2476static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
2477/* how long we wait to gather additional slack before distributing */
2478static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
2479
373e0a59
BS
2480/*
2481 * Are we near the end of the current quota period?
2482 *
2483 * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
2484 * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
2485 * migrate_hrtimers, base is never cleared, so we are fine.
2486 */
d8b4986d
PT
2487static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
2488{
2489 struct hrtimer *refresh_timer = &cfs_b->period_timer;
2490 u64 remaining;
2491
2492 /* if the call-back is running a quota refresh is already occurring */
2493 if (hrtimer_callback_running(refresh_timer))
2494 return 1;
2495
2496 /* is a quota refresh about to occur? */
2497 remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
2498 if (remaining < min_expire)
2499 return 1;
2500
2501 return 0;
2502}
2503
2504static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
2505{
2506 u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
2507
2508 /* if there's a quota refresh soon don't bother with slack */
2509 if (runtime_refresh_within(cfs_b, min_left))
2510 return;
2511
2512 start_bandwidth_timer(&cfs_b->slack_timer,
2513 ns_to_ktime(cfs_bandwidth_slack_period));
2514}
2515
2516/* we know any runtime found here is valid as update_curr() precedes return */
2517static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2518{
2519 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2520 s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
2521
2522 if (slack_runtime <= 0)
2523 return;
2524
2525 raw_spin_lock(&cfs_b->lock);
2526 if (cfs_b->quota != RUNTIME_INF &&
2527 cfs_rq->runtime_expires == cfs_b->runtime_expires) {
2528 cfs_b->runtime += slack_runtime;
2529
2530 /* we are under rq->lock, defer unthrottling using a timer */
2531 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
2532 !list_empty(&cfs_b->throttled_cfs_rq))
2533 start_cfs_slack_bandwidth(cfs_b);
2534 }
2535 raw_spin_unlock(&cfs_b->lock);
2536
2537 /* even if it's not valid for return we don't want to try again */
2538 cfs_rq->runtime_remaining -= slack_runtime;
2539}
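/*
 * Example of the slack path (illustrative): a cfs_rq that goes idle with
 * 4ms of local runtime keeps min_cfs_rq_runtime (1ms) for itself and
 * returns 3ms to the global pool; if that pushes cfs_b->runtime above one
 * bandwidth slice while other cfs_rqs are still throttled, the slack
 * timer is armed to redistribute the surplus shortly afterwards.
 */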
2540
2541static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2542{
56f570e5
PT
2543 if (!cfs_bandwidth_used())
2544 return;
2545
fccfdc6f 2546 if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
d8b4986d
PT
2547 return;
2548
2549 __return_cfs_rq_runtime(cfs_rq);
2550}
2551
2552/*
2553 * This is done with a timer (instead of inline with bandwidth return) since
2554 * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
2555 */
2556static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
2557{
2558 u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
2559 u64 expires;
2560
2561 /* confirm we're still not at a refresh boundary */
373e0a59
BS
2562 raw_spin_lock(&cfs_b->lock);
2563 if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
2564 raw_spin_unlock(&cfs_b->lock);
d8b4986d 2565 return;
373e0a59 2566 }
d8b4986d 2567
d8b4986d
PT
2568 if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
2569 runtime = cfs_b->runtime;
2570 cfs_b->runtime = 0;
2571 }
2572 expires = cfs_b->runtime_expires;
2573 raw_spin_unlock(&cfs_b->lock);
2574
2575 if (!runtime)
2576 return;
2577
2578 runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
2579
2580 raw_spin_lock(&cfs_b->lock);
2581 if (expires == cfs_b->runtime_expires)
2582 cfs_b->runtime = runtime;
2583 raw_spin_unlock(&cfs_b->lock);
2584}
2585
d3d9dc33
PT
2586/*
2587 * When a group wakes up we want to make sure that its quota is not already
2588 * expired/exceeded, otherwise it may be allowed to steal additional ticks of
2589 * runtime as update_curr() throttling cannot trigger until it's on-rq.
2590 */
2591static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
2592{
56f570e5
PT
2593 if (!cfs_bandwidth_used())
2594 return;
2595
d3d9dc33
PT
2596 /* an active group must be handled by the update_curr()->put() path */
2597 if (!cfs_rq->runtime_enabled || cfs_rq->curr)
2598 return;
2599
2600 /* ensure the group is not already throttled */
2601 if (cfs_rq_throttled(cfs_rq))
2602 return;
2603
2604 /* update runtime allocation */
2605 account_cfs_rq_runtime(cfs_rq, 0);
2606 if (cfs_rq->runtime_remaining <= 0)
2607 throttle_cfs_rq(cfs_rq);
2608}
2609
2610/* conditionally throttle active cfs_rq's from put_prev_entity() */
2611static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2612{
56f570e5
PT
2613 if (!cfs_bandwidth_used())
2614 return;
2615
d3d9dc33
PT
2616 if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
2617 return;
2618
2619 /*
2620 * it's possible for a throttled entity to be forced into a running
2621 * state (e.g. set_curr_task); in this case we're finished.
2622 */
2623 if (cfs_rq_throttled(cfs_rq))
2624 return;
2625
2626 throttle_cfs_rq(cfs_rq);
2627}
029632fb
PZ
2628
2629static inline u64 default_cfs_period(void);
2630static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
2631static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
2632
2633static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
2634{
2635 struct cfs_bandwidth *cfs_b =
2636 container_of(timer, struct cfs_bandwidth, slack_timer);
2637 do_sched_cfs_slack_timer(cfs_b);
2638
2639 return HRTIMER_NORESTART;
2640}
2641
2642static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
2643{
2644 struct cfs_bandwidth *cfs_b =
2645 container_of(timer, struct cfs_bandwidth, period_timer);
2646 ktime_t now;
2647 int overrun;
2648 int idle = 0;
2649
2650 for (;;) {
2651 now = hrtimer_cb_get_time(timer);
2652 overrun = hrtimer_forward(timer, now, cfs_b->period);
2653
2654 if (!overrun)
2655 break;
2656
2657 idle = do_sched_cfs_period_timer(cfs_b, overrun);
2658 }
2659
2660 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
2661}
2662
2663void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2664{
2665 raw_spin_lock_init(&cfs_b->lock);
2666 cfs_b->runtime = 0;
2667 cfs_b->quota = RUNTIME_INF;
2668 cfs_b->period = ns_to_ktime(default_cfs_period());
2669
2670 INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
2671 hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2672 cfs_b->period_timer.function = sched_cfs_period_timer;
2673 hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
2674 cfs_b->slack_timer.function = sched_cfs_slack_timer;
2675}
2676
2677static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2678{
2679 cfs_rq->runtime_enabled = 0;
2680 INIT_LIST_HEAD(&cfs_rq->throttled_list);
2681}
2682
2683/* requires cfs_b->lock, may release to reprogram timer */
2684void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2685{
2686 /*
2687 * The timer may be active because we're trying to set a new bandwidth
2688 * period or because we're racing with the tear-down path
2689 * (timer_active==0 becomes visible before the hrtimer call-back
2690 * terminates). In either case we ensure that it's re-programmed
2691 */
9ca715c4
BS
2692 while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
2693 hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
2694 /* bounce the lock to allow do_sched_cfs_period_timer to run */
029632fb 2695 raw_spin_unlock(&cfs_b->lock);
9ca715c4 2696 cpu_relax();
029632fb
PZ
2697 raw_spin_lock(&cfs_b->lock);
2698 /* if someone else restarted the timer then we're done */
2699 if (cfs_b->timer_active)
2700 return;
2701 }
2702
2703 cfs_b->timer_active = 1;
2704 start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
2705}
2706
2707static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
2708{
2709 hrtimer_cancel(&cfs_b->period_timer);
2710 hrtimer_cancel(&cfs_b->slack_timer);
2711}
2712
38dc3348 2713static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
029632fb
PZ
2714{
2715 struct cfs_rq *cfs_rq;
2716
2717 for_each_leaf_cfs_rq(rq, cfs_rq) {
2718 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
2719
2720 if (!cfs_rq->runtime_enabled)
2721 continue;
2722
2723 /*
2724 * clock_task is not advancing so we just need to make sure
2725 * there's some valid quota amount
2726 */
2727 cfs_rq->runtime_remaining = cfs_b->quota;
2728 if (cfs_rq_throttled(cfs_rq))
2729 unthrottle_cfs_rq(cfs_rq);
2730 }
2731}
2732
2733#else /* CONFIG_CFS_BANDWIDTH */
f1b17280
PT
2734static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2735{
2736 return rq_of(cfs_rq)->clock_task;
2737}
2738
2739static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
2740 unsigned long delta_exec) {}
d3d9dc33
PT
2741static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
2742static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
6c16a6dc 2743static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
85dac906
PT
2744
2745static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
2746{
2747 return 0;
2748}
64660c86
PT
2749
2750static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
2751{
2752 return 0;
2753}
2754
2755static inline int throttled_lb_pair(struct task_group *tg,
2756 int src_cpu, int dest_cpu)
2757{
2758 return 0;
2759}
029632fb
PZ
2760
2761void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
2762
2763#ifdef CONFIG_FAIR_GROUP_SCHED
2764static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
ab84d31e
PT
2765#endif
2766
029632fb
PZ
2767static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2768{
2769 return NULL;
2770}
2771static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
a4c96ae3 2772static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
029632fb
PZ
2773
2774#endif /* CONFIG_CFS_BANDWIDTH */
2775
bf0f6f24
IM
2776/**************************************************
2777 * CFS operations on tasks:
2778 */
2779
8f4d37ec
PZ
2780#ifdef CONFIG_SCHED_HRTICK
2781static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
2782{
8f4d37ec
PZ
2783 struct sched_entity *se = &p->se;
2784 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2785
2786 WARN_ON(task_rq(p) != rq);
2787
b39e66ea 2788 if (cfs_rq->nr_running > 1) {
8f4d37ec
PZ
2789 u64 slice = sched_slice(cfs_rq, se);
2790 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
2791 s64 delta = slice - ran;
2792
2793 if (delta < 0) {
2794 if (rq->curr == p)
2795 resched_task(p);
2796 return;
2797 }
2798
2799 /*
2800 * Don't schedule slices shorter than 10000ns, that just
2801 * doesn't make sense. Rely on vruntime for fairness.
2802 */
31656519 2803 if (rq->curr != p)
157124c1 2804 delta = max_t(s64, 10000LL, delta);
8f4d37ec 2805
31656519 2806 hrtick_start(rq, delta);
8f4d37ec
PZ
2807 }
2808}
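/*
 * E.g. (made-up numbers): with several runnable tasks and a 4ms
 * sched_slice() of which curr has consumed 1ms, the hrtimer is armed to
 * fire 3ms from now; if the slice is already overrun, the currently
 * running task is rescheduled immediately instead.
 */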
a4c2f00f
PZ
2809
2810/*
2811 * called from enqueue/dequeue and updates the hrtick when the
2812 * current task is from our class and nr_running is low enough
2813 * to matter.
2814 */
2815static void hrtick_update(struct rq *rq)
2816{
2817 struct task_struct *curr = rq->curr;
2818
b39e66ea 2819 if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
a4c2f00f
PZ
2820 return;
2821
2822 if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
2823 hrtick_start_fair(rq, curr);
2824}
55e12e5e 2825#else /* !CONFIG_SCHED_HRTICK */
8f4d37ec
PZ
2826static inline void
2827hrtick_start_fair(struct rq *rq, struct task_struct *p)
2828{
2829}
a4c2f00f
PZ
2830
2831static inline void hrtick_update(struct rq *rq)
2832{
2833}
8f4d37ec
PZ
2834#endif
2835
bf0f6f24
IM
2836/*
2837 * The enqueue_task method is called before nr_running is
2838 * increased. Here we update the fair scheduling stats and
2839 * then put the task into the rbtree:
2840 */
ea87bb78 2841static void
371fd7e7 2842enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24
IM
2843{
2844 struct cfs_rq *cfs_rq;
62fb1851 2845 struct sched_entity *se = &p->se;
bf0f6f24
IM
2846
2847 for_each_sched_entity(se) {
62fb1851 2848 if (se->on_rq)
bf0f6f24
IM
2849 break;
2850 cfs_rq = cfs_rq_of(se);
88ec22d3 2851 enqueue_entity(cfs_rq, se, flags);
85dac906
PT
2852
2853 /*
2854 * end evaluation on encountering a throttled cfs_rq
2855 *
2856 * note: in the case of encountering a throttled cfs_rq we will
2857 * post the final h_nr_running increment below.
2858 */
2859 if (cfs_rq_throttled(cfs_rq))
2860 break;
953bfcd1 2861 cfs_rq->h_nr_running++;
85dac906 2862
88ec22d3 2863 flags = ENQUEUE_WAKEUP;
bf0f6f24 2864 }
8f4d37ec 2865
2069dd75 2866 for_each_sched_entity(se) {
0f317143 2867 cfs_rq = cfs_rq_of(se);
953bfcd1 2868 cfs_rq->h_nr_running++;
2069dd75 2869
85dac906
PT
2870 if (cfs_rq_throttled(cfs_rq))
2871 break;
2872
17bc14b7 2873 update_cfs_shares(cfs_rq);
9ee474f5 2874 update_entity_load_avg(se, 1);
2069dd75
PZ
2875 }
2876
18bf2805
BS
2877 if (!se) {
2878 update_rq_runnable_avg(rq, rq->nr_running);
85dac906 2879 inc_nr_running(rq);
18bf2805 2880 }
a4c2f00f 2881 hrtick_update(rq);
bf0f6f24
IM
2882}
2883
2f36825b
VP
2884static void set_next_buddy(struct sched_entity *se);
2885
bf0f6f24
IM
2886/*
2887 * The dequeue_task method is called before nr_running is
2888 * decreased. We remove the task from the rbtree and
2889 * update the fair scheduling stats:
2890 */
371fd7e7 2891static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
bf0f6f24
IM
2892{
2893 struct cfs_rq *cfs_rq;
62fb1851 2894 struct sched_entity *se = &p->se;
2f36825b 2895 int task_sleep = flags & DEQUEUE_SLEEP;
bf0f6f24
IM
2896
2897 for_each_sched_entity(se) {
2898 cfs_rq = cfs_rq_of(se);
371fd7e7 2899 dequeue_entity(cfs_rq, se, flags);
85dac906
PT
2900
2901 /*
2902 * end evaluation on encountering a throttled cfs_rq
2903 *
2904 * note: in the case of encountering a throttled cfs_rq we will
2905 * post the final h_nr_running decrement below.
2906 */
2907 if (cfs_rq_throttled(cfs_rq))
2908 break;
953bfcd1 2909 cfs_rq->h_nr_running--;
2069dd75 2910
bf0f6f24 2911 /* Don't dequeue parent if it has other entities besides us */
2f36825b
VP
2912 if (cfs_rq->load.weight) {
2913 /*
2914 * Bias pick_next to pick a task from this cfs_rq, as
2915 * p is sleeping when it is within its sched_slice.
2916 */
2917 if (task_sleep && parent_entity(se))
2918 set_next_buddy(parent_entity(se));
9598c82d
PT
2919
2920 /* avoid re-evaluating load for this entity */
2921 se = parent_entity(se);
bf0f6f24 2922 break;
2f36825b 2923 }
371fd7e7 2924 flags |= DEQUEUE_SLEEP;
bf0f6f24 2925 }
8f4d37ec 2926
2069dd75 2927 for_each_sched_entity(se) {
0f317143 2928 cfs_rq = cfs_rq_of(se);
953bfcd1 2929 cfs_rq->h_nr_running--;
2069dd75 2930
85dac906
PT
2931 if (cfs_rq_throttled(cfs_rq))
2932 break;
2933
17bc14b7 2934 update_cfs_shares(cfs_rq);
9ee474f5 2935 update_entity_load_avg(se, 1);
2069dd75
PZ
2936 }
2937
18bf2805 2938 if (!se) {
85dac906 2939 dec_nr_running(rq);
18bf2805
BS
2940 update_rq_runnable_avg(rq, 1);
2941 }
a4c2f00f 2942 hrtick_update(rq);
bf0f6f24
IM
2943}
2944
e7693a36 2945#ifdef CONFIG_SMP
029632fb
PZ
2946/* Used instead of source_load when we know the type == 0 */
2947static unsigned long weighted_cpuload(const int cpu)
2948{
2949 return cpu_rq(cpu)->load.weight;
2950}
2951
2952/*
2953 * Return a low guess at the load of a migration-source cpu weighted
2954 * according to the scheduling class and "nice" value.
2955 *
2956 * We want to under-estimate the load of migration sources, to
2957 * balance conservatively.
2958 */
2959static unsigned long source_load(int cpu, int type)
2960{
2961 struct rq *rq = cpu_rq(cpu);
2962 unsigned long total = weighted_cpuload(cpu);
2963
2964 if (type == 0 || !sched_feat(LB_BIAS))
2965 return total;
2966
2967 return min(rq->cpu_load[type-1], total);
2968}
2969
2970/*
2971 * Return a high guess at the load of a migration-target cpu weighted
2972 * according to the scheduling class and "nice" value.
2973 */
2974static unsigned long target_load(int cpu, int type)
2975{
2976 struct rq *rq = cpu_rq(cpu);
2977 unsigned long total = weighted_cpuload(cpu);
2978
2979 if (type == 0 || !sched_feat(LB_BIAS))
2980 return total;
2981
2982 return max(rq->cpu_load[type-1], total);
2983}
2984
2985static unsigned long power_of(int cpu)
2986{
2987 return cpu_rq(cpu)->cpu_power;
2988}
2989
2990static unsigned long cpu_avg_load_per_task(int cpu)
2991{
2992 struct rq *rq = cpu_rq(cpu);
2993 unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
2994
2995 if (nr_running)
2996 return rq->load.weight / nr_running;
2997
2998 return 0;
2999}
3000
098fb9db 3001
74f8e4b2 3002static void task_waking_fair(struct task_struct *p)
88ec22d3
PZ
3003{
3004 struct sched_entity *se = &p->se;
3005 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3fe1698b
PZ
3006 u64 min_vruntime;
3007
3008#ifndef CONFIG_64BIT
3009 u64 min_vruntime_copy;
88ec22d3 3010
3fe1698b
PZ
3011 do {
3012 min_vruntime_copy = cfs_rq->min_vruntime_copy;
3013 smp_rmb();
3014 min_vruntime = cfs_rq->min_vruntime;
3015 } while (min_vruntime != min_vruntime_copy);
3016#else
3017 min_vruntime = cfs_rq->min_vruntime;
3018#endif
88ec22d3 3019
3fe1698b 3020 se->vruntime -= min_vruntime;
88ec22d3
PZ
3021}
3022
bb3469ac 3023#ifdef CONFIG_FAIR_GROUP_SCHED
f5bfb7d9
PZ
3024/*
3025 * effective_load() calculates the load change as seen from the root_task_group
3026 *
3027 * Adding load to a group doesn't make a group heavier, but can cause movement
3028 * of group shares between cpus. Assuming the shares were perfectly aligned one
3029 * can calculate the shift in shares.
cf5f0acf
PZ
3030 *
3031 * Calculate the effective load difference if @wl is added (subtracted) to @tg
3032 * on this @cpu and results in a total addition (subtraction) of @wg to the
3033 * total group weight.
3034 *
3035 * Given a runqueue weight distribution (rw_i) we can compute a shares
3036 * distribution (s_i) using:
3037 *
3038 * s_i = rw_i / \Sum rw_j (1)
3039 *
3040 * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
3041 * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
3042 * shares distribution (s_i):
3043 *
3044 * rw_i = { 2, 4, 1, 0 }
3045 * s_i = { 2/7, 4/7, 1/7, 0 }
3046 *
3047 * As per wake_affine() we're interested in the load of two CPUs (the CPU the
3048 * task used to run on and the CPU the waker is running on), we need to
3049 * compute the effect of waking a task on either CPU and, in case of a sync
3050 * wakeup, compute the effect of the current task going to sleep.
3051 *
3052 * So for a change of @wl to the local @cpu with an overall group weight change
3053 * of @wg we can compute the new shares distribution (s'_i) using:
3054 *
3055 * s'_i = (rw_i + @wl) / (@wg + \Sum rw_j) (2)
3056 *
3057 * Suppose we're interested in CPUs 0 and 1, and want to compute the load
3058 * differences in waking a task to CPU 0. The additional task changes the
3059 * weight and shares distributions like:
3060 *
3061 * rw'_i = { 3, 4, 1, 0 }
3062 * s'_i = { 3/8, 4/8, 1/8, 0 }
3063 *
3064 * We can then compute the difference in effective weight by using:
3065 *
3066 * dw_i = S * (s'_i - s_i) (3)
3067 *
3068 * Where 'S' is the group weight as seen by its parent.
3069 *
3070 * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
3071 * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
3072 * 4/7) times the weight of the group.
f5bfb7d9 3073 */
2069dd75 3074static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
bb3469ac 3075{
4be9daaa 3076 struct sched_entity *se = tg->se[cpu];
f1d239f7 3077
cf5f0acf 3078 if (!tg->parent) /* the trivial, non-cgroup case */
f1d239f7
PZ
3079 return wl;
3080
4be9daaa 3081 for_each_sched_entity(se) {
cf5f0acf 3082 long w, W;
4be9daaa 3083
977dda7c 3084 tg = se->my_q->tg;
bb3469ac 3085
cf5f0acf
PZ
3086 /*
3087 * W = @wg + \Sum rw_j
3088 */
3089 W = wg + calc_tg_weight(tg, se->my_q);
4be9daaa 3090
cf5f0acf
PZ
3091 /*
3092 * w = rw_i + @wl
3093 */
3094 w = se->my_q->load.weight + wl;
940959e9 3095
cf5f0acf
PZ
3096 /*
3097 * wl = S * s'_i; see (2)
3098 */
3099 if (W > 0 && w < W)
3100 wl = (w * tg->shares) / W;
977dda7c
PT
3101 else
3102 wl = tg->shares;
940959e9 3103
cf5f0acf
PZ
3104 /*
3105 * Per the above, wl is the new se->load.weight value; since
3106 * those are clipped to [MIN_SHARES, ...) do so now. See
3107 * calc_cfs_shares().
3108 */
977dda7c
PT
3109 if (wl < MIN_SHARES)
3110 wl = MIN_SHARES;
cf5f0acf
PZ
3111
3112 /*
3113 * wl = dw_i = S * (s'_i - s_i); see (3)
3114 */
977dda7c 3115 wl -= se->load.weight;
cf5f0acf
PZ
3116
3117 /*
3118 * Recursively apply this logic to all parent groups to compute
3119 * the final effective load change on the root group. Since
3120 * only the @tg group gets extra weight, all parent groups can
3121 * only redistribute existing shares. @wl is the shift in shares
3122 * resulting from this level per the above.
3123 */
4be9daaa 3124 wg = 0;
4be9daaa 3125 }
bb3469ac 3126
4be9daaa 3127 return wl;
bb3469ac
PZ
3128}
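/*
 * Tracing the documented rw_i = { 2, 4, 1, 0 } example through the loop
 * above for CPU 0 (one level below the root, group weight S = tg->shares,
 * waking task weight 1 so @wl = @wg = 1, shares assumed perfectly
 * aligned): W = 1 + 7 = 8 and w = 2 + 1 = 3, giving a new share of
 * S * 3/8; subtracting the old se->load.weight of S * 2/7 yields the
 * 5/56 * S change quoted in the comment block above.
 */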
3129#else
4be9daaa 3130
83378269
PZ
3131static inline unsigned long effective_load(struct task_group *tg, int cpu,
3132 unsigned long wl, unsigned long wg)
4be9daaa 3133{
83378269 3134 return wl;
bb3469ac 3135}
4be9daaa 3136
bb3469ac
PZ
3137#endif
3138
c88d5910 3139static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
098fb9db 3140{
e37b6a7b 3141 s64 this_load, load;
c88d5910 3142 int idx, this_cpu, prev_cpu;
098fb9db 3143 unsigned long tl_per_task;
c88d5910 3144 struct task_group *tg;
83378269 3145 unsigned long weight;
b3137bc8 3146 int balanced;
098fb9db 3147
c88d5910
PZ
3148 idx = sd->wake_idx;
3149 this_cpu = smp_processor_id();
3150 prev_cpu = task_cpu(p);
3151 load = source_load(prev_cpu, idx);
3152 this_load = target_load(this_cpu, idx);
098fb9db 3153
b3137bc8
MG
3154 /*
3155 * If sync wakeup then subtract the (maximum possible)
3156 * effect of the currently running task from the load
3157 * of the current CPU:
3158 */
83378269
PZ
3159 if (sync) {
3160 tg = task_group(current);
3161 weight = current->se.load.weight;
3162
c88d5910 3163 this_load += effective_load(tg, this_cpu, -weight, -weight);
83378269
PZ
3164 load += effective_load(tg, prev_cpu, 0, -weight);
3165 }
b3137bc8 3166
83378269
PZ
3167 tg = task_group(p);
3168 weight = p->se.load.weight;
b3137bc8 3169
71a29aa7
PZ
3170 /*
3171 * In low-load situations, where prev_cpu is idle and this_cpu is idle
c88d5910
PZ
3172 * due to the sync cause above having dropped this_load to 0, we'll
3173 * always have an imbalance, but there's really nothing you can do
3174 * about that, so that's good too.
71a29aa7
PZ
3175 *
3176 * Otherwise check if either cpus are near enough in load to allow this
3177 * task to be woken on this_cpu.
3178 */
e37b6a7b
PT
3179 if (this_load > 0) {
3180 s64 this_eff_load, prev_eff_load;
e51fd5e2
PZ
3181
3182 this_eff_load = 100;
3183 this_eff_load *= power_of(prev_cpu);
3184 this_eff_load *= this_load +
3185 effective_load(tg, this_cpu, weight, weight);
3186
3187 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
3188 prev_eff_load *= power_of(this_cpu);
3189 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
3190
3191 balanced = this_eff_load <= prev_eff_load;
3192 } else
3193 balanced = true;
b3137bc8 3194
098fb9db 3195 /*
4ae7d5ce
IM
3196 * If the currently running task will sleep within
3197 * a reasonable amount of time then attract this newly
3198 * woken task:
098fb9db 3199 */
2fb7635c
PZ
3200 if (sync && balanced)
3201 return 1;
098fb9db 3202
41acab88 3203 schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
098fb9db
IM
3204 tl_per_task = cpu_avg_load_per_task(this_cpu);
3205
c88d5910
PZ
3206 if (balanced ||
3207 (this_load <= load &&
3208 this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
098fb9db
IM
3209 /*
3210 * This domain has SD_WAKE_AFFINE and
3211 * p is cache cold in this domain, and
3212 * there is no bad imbalance.
3213 */
c88d5910 3214 schedstat_inc(sd, ttwu_move_affine);
41acab88 3215 schedstat_inc(p, se.statistics.nr_wakeups_affine);
098fb9db
IM
3216
3217 return 1;
3218 }
3219 return 0;
3220}
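/*
 * Illustrative check of the balance condition above (made-up numbers):
 * with imbalance_pct = 125 and equal cpu power on both cpus,
 * prev_eff_load gets a 112/100 handicap, so if the waking cpu would carry
 * an effective load of 1024 and the previous cpu 1100, then
 * 100 * 1024 <= 112 * 1100 holds, the pair counts as balanced, and the
 * wakeup may be treated as affine.
 */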
3221
aaee1203
PZ
3222/*
3223 * find_idlest_group finds and returns the least busy CPU group within the
3224 * domain.
3225 */
3226static struct sched_group *
78e7ed53 3227find_idlest_group(struct sched_domain *sd, struct task_struct *p,
5158f4e4 3228 int this_cpu, int load_idx)
e7693a36 3229{
b3bd3de6 3230 struct sched_group *idlest = NULL, *group = sd->groups;
aaee1203 3231 unsigned long min_load = ULONG_MAX, this_load = 0;
aaee1203 3232 int imbalance = 100 + (sd->imbalance_pct-100)/2;
e7693a36 3233
aaee1203
PZ
3234 do {
3235 unsigned long load, avg_load;
3236 int local_group;
3237 int i;
e7693a36 3238
aaee1203
PZ
3239 /* Skip over this group if it has no CPUs allowed */
3240 if (!cpumask_intersects(sched_group_cpus(group),
fa17b507 3241 tsk_cpus_allowed(p)))
aaee1203
PZ
3242 continue;
3243
3244 local_group = cpumask_test_cpu(this_cpu,
3245 sched_group_cpus(group));
3246
3247 /* Tally up the load of all CPUs in the group */
3248 avg_load = 0;
3249
3250 for_each_cpu(i, sched_group_cpus(group)) {
3251 /* Bias balancing toward cpus of our domain */
3252 if (local_group)
3253 load = source_load(i, load_idx);
3254 else
3255 load = target_load(i, load_idx);
3256
3257 avg_load += load;
3258 }
3259
3260 /* Adjust by relative CPU power of the group */
9c3f75cb 3261 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
aaee1203
PZ
3262
3263 if (local_group) {
3264 this_load = avg_load;
aaee1203
PZ
3265 } else if (avg_load < min_load) {
3266 min_load = avg_load;
3267 idlest = group;
3268 }
3269 } while (group = group->next, group != sd->groups);
3270
3271 if (!idlest || 100*this_load < imbalance*min_load)
3272 return NULL;
3273 return idlest;
3274}
3275
3276/*
3277 * find_idlest_cpu - find the idlest cpu among the cpus in group.
3278 */
3279static int
3280find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
3281{
3282 unsigned long load, min_load = ULONG_MAX;
3283 int idlest = -1;
3284 int i;
3285
3286 /* Traverse only the allowed CPUs */
fa17b507 3287 for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
aaee1203
PZ
3288 load = weighted_cpuload(i);
3289
3290 if (load < min_load || (load == min_load && i == this_cpu)) {
3291 min_load = load;
3292 idlest = i;
e7693a36
GH
3293 }
3294 }
3295
aaee1203
PZ
3296 return idlest;
3297}
e7693a36 3298
a50bde51
PZ
3299/*
3300 * Try and locate an idle CPU in the sched_domain.
3301 */
99bd5e2f 3302static int select_idle_sibling(struct task_struct *p, int target)
a50bde51 3303{
99bd5e2f 3304 struct sched_domain *sd;
37407ea7 3305 struct sched_group *sg;
e0a79f52 3306 int i = task_cpu(p);
a50bde51 3307
e0a79f52
MG
3308 if (idle_cpu(target))
3309 return target;
99bd5e2f
SS
3310
3311 /*
e0a79f52 3312 * If the prevous cpu is cache affine and idle, don't be stupid.
99bd5e2f 3313 */
e0a79f52
MG
3314 if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
3315 return i;
a50bde51
PZ
3316
3317 /*
37407ea7 3318 * Otherwise, iterate the domains and find an elegible idle cpu.
a50bde51 3319 */
518cd623 3320 sd = rcu_dereference(per_cpu(sd_llc, target));
970e1789 3321 for_each_lower_domain(sd) {
37407ea7
LT
3322 sg = sd->groups;
3323 do {
3324 if (!cpumask_intersects(sched_group_cpus(sg),
3325 tsk_cpus_allowed(p)))
3326 goto next;
3327
3328 for_each_cpu(i, sched_group_cpus(sg)) {
e0a79f52 3329 if (i == target || !idle_cpu(i))
37407ea7
LT
3330 goto next;
3331 }
970e1789 3332
37407ea7
LT
3333 target = cpumask_first_and(sched_group_cpus(sg),
3334 tsk_cpus_allowed(p));
3335 goto done;
3336next:
3337 sg = sg->next;
3338 } while (sg != sd->groups);
3339 }
3340done:
a50bde51
PZ
3341 return target;
3342}
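/*
 * Search order, in short: use the wakeup target itself if it is idle,
 * then the task's previous cpu if it shares a cache with the target and
 * is idle, and only then scan the groups of the last-level-cache domain
 * for a fully idle group, whose first allowed cpu becomes the target.
 */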
3343
aaee1203
PZ
3344/*
3345 * select_task_rq_fair: balance the current task (running on cpu) in domains
3346 * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
3347 * SD_BALANCE_EXEC.
3348 *
3349 * Balance, ie. select the least loaded group.
3350 *
3351 * Returns the target CPU number, or the same CPU if no balancing is needed.
3352 *
3353 * preempt must be disabled.
3354 */
0017d735 3355static int
7608dec2 3356select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
aaee1203 3357{
29cd8bae 3358 struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
c88d5910
PZ
3359 int cpu = smp_processor_id();
3360 int prev_cpu = task_cpu(p);
3361 int new_cpu = cpu;
99bd5e2f 3362 int want_affine = 0;
5158f4e4 3363 int sync = wake_flags & WF_SYNC;
c88d5910 3364
29baa747 3365 if (p->nr_cpus_allowed == 1)
76854c7e
MG
3366 return prev_cpu;
3367
0763a660 3368 if (sd_flag & SD_BALANCE_WAKE) {
fa17b507 3369 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
c88d5910
PZ
3370 want_affine = 1;
3371 new_cpu = prev_cpu;
3372 }
aaee1203 3373
dce840a0 3374 rcu_read_lock();
aaee1203 3375 for_each_domain(cpu, tmp) {
e4f42888
PZ
3376 if (!(tmp->flags & SD_LOAD_BALANCE))
3377 continue;
3378
fe3bcfe1 3379 /*
99bd5e2f
SS
3380 * If both cpu and prev_cpu are part of this domain,
3381 * cpu is a valid SD_WAKE_AFFINE target.
fe3bcfe1 3382 */
99bd5e2f
SS
3383 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
3384 cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
3385 affine_sd = tmp;
29cd8bae 3386 break;
f03542a7 3387 }
29cd8bae 3388
f03542a7 3389 if (tmp->flags & sd_flag)
29cd8bae
PZ
3390 sd = tmp;
3391 }
3392
8b911acd 3393 if (affine_sd) {
f03542a7 3394 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
dce840a0
PZ
3395 prev_cpu = cpu;
3396
3397 new_cpu = select_idle_sibling(p, prev_cpu);
3398 goto unlock;
8b911acd 3399 }
e7693a36 3400
aaee1203 3401 while (sd) {
5158f4e4 3402 int load_idx = sd->forkexec_idx;
aaee1203 3403 struct sched_group *group;
c88d5910 3404 int weight;
098fb9db 3405
0763a660 3406 if (!(sd->flags & sd_flag)) {
aaee1203
PZ
3407 sd = sd->child;
3408 continue;
3409 }
098fb9db 3410
5158f4e4
PZ
3411 if (sd_flag & SD_BALANCE_WAKE)
3412 load_idx = sd->wake_idx;
098fb9db 3413
5158f4e4 3414 group = find_idlest_group(sd, p, cpu, load_idx);
aaee1203
PZ
3415 if (!group) {
3416 sd = sd->child;
3417 continue;
3418 }
4ae7d5ce 3419
d7c33c49 3420 new_cpu = find_idlest_cpu(group, p, cpu);
aaee1203
PZ
3421 if (new_cpu == -1 || new_cpu == cpu) {
3422 /* Now try balancing at a lower domain level of cpu */
3423 sd = sd->child;
3424 continue;
e7693a36 3425 }
aaee1203
PZ
3426
3427 /* Now try balancing at a lower domain level of new_cpu */
3428 cpu = new_cpu;
669c55e9 3429 weight = sd->span_weight;
aaee1203
PZ
3430 sd = NULL;
3431 for_each_domain(cpu, tmp) {
669c55e9 3432 if (weight <= tmp->span_weight)
aaee1203 3433 break;
0763a660 3434 if (tmp->flags & sd_flag)
aaee1203
PZ
3435 sd = tmp;
3436 }
3437 /* while loop will break here if sd == NULL */
e7693a36 3438 }
dce840a0
PZ
3439unlock:
3440 rcu_read_unlock();
e7693a36 3441
c88d5910 3442 return new_cpu;
e7693a36 3443}
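/*
 * Selection flow in brief: wake-affine wakeups try to place the task
 * close to the waker and then look for an idle sibling around that cpu,
 * while fork/exec balancing walks down the domain hierarchy through
 * find_idlest_group()/find_idlest_cpu() until no lower domain improves
 * the placement.
 */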
0a74bef8 3444
f4e26b12
PT
3445/*
3446 * Load-tracking only depends on SMP; the FAIR_GROUP_SCHED dependency below may be
3447 * removed when it is useful for applications beyond shares distribution (e.g.
3448 * load-balance).
3449 */
3450#ifdef CONFIG_FAIR_GROUP_SCHED
0a74bef8
PT
3451/*
3452 * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
3453 * cfs_rq_of(p) references at time of call are still valid and identify the
3454 * previous cpu. However, the caller only guarantees p->pi_lock is held; no
3455 * other assumptions, including the state of rq->lock, should be made.
3456 */
3457static void
3458migrate_task_rq_fair(struct task_struct *p, int next_cpu)
3459{
aff3e498
PT
3460 struct sched_entity *se = &p->se;
3461 struct cfs_rq *cfs_rq = cfs_rq_of(se);
3462
3463 /*
3464 * Load tracking: accumulate removed load so that it can be processed
3465 * when we next update owning cfs_rq under rq->lock. Tasks contribute
3466 * to blocked load iff they have a positive decay-count. It can never
3467 * be negative here since on-rq tasks have decay-count == 0.
3468 */
3469 if (se->avg.decay_count) {
3470 se->avg.decay_count = -__synchronize_entity_decay(se);
3471 atomic64_add(se->avg.load_avg_contrib, &cfs_rq->removed_load);
3472 }
0a74bef8 3473}
f4e26b12 3474#endif
e7693a36
GH
3475#endif /* CONFIG_SMP */
3476
e52fb7c0
PZ
3477static unsigned long
3478wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
0bbd3336
PZ
3479{
3480 unsigned long gran = sysctl_sched_wakeup_granularity;
3481
3482 /*
e52fb7c0
PZ
3483 * Since it's curr that is running now, convert the gran from real-time
3484 * to virtual-time in its units.
13814d42
MG
3485 *
3486 * By using 'se' instead of 'curr' we penalize light tasks, so
3487 * they get preempted easier. That is, if 'se' < 'curr' then
3488 * the resulting gran will be larger, therefore penalizing the
3489 * lighter, if otoh 'se' > 'curr' then the resulting gran will
3490 * be smaller, again penalizing the lighter task.
3491 *
3492 * This is especially important for buddies when the leftmost
3493 * task is higher priority than the buddy.
0bbd3336 3494 */
f4ad9bd2 3495 return calc_delta_fair(gran, se);
0bbd3336
PZ
3496}
3497
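To make the real-time to virtual-time conversion above concrete, here is a minimal userspace sketch (not kernel code) of the proportionality calc_delta_fair() applies: the granularity is scaled by the NICE_0 weight over the wakee's weight, so heavier wakees see a smaller virtual granularity. The nice-level weights used are illustrative assumptions, not values read from this file.

#include <stdio.h>

/* virtual gran ~= gran * NICE_0_WEIGHT / se_weight (integer sketch only;
 * the kernel uses a precomputed fixed-point inverse weight instead) */
static unsigned long long virt_gran(unsigned long long gran_ns,
                                    unsigned long se_weight)
{
        return gran_ns * 1024ULL / se_weight;
}

int main(void)
{
        unsigned long long gran = 1000000ULL;   /* 1 ms wakeup granularity */

        printf("nice  0 (weight 1024): %llu ns\n", virt_gran(gran, 1024));
        printf("nice -5 (weight 3121): %llu ns\n", virt_gran(gran, 3121));
        printf("nice +5 (weight  335): %llu ns\n", virt_gran(gran, 335));
        return 0;
}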
464b7527
PZ
3498/*
3499 * Should 'se' preempt 'curr'.
3500 *
3501 * Consider three wakees s1, s2 and s3 at different vruntimes relative
3502 * to curr (c), with g the wakeup granularity (in virtual time):
3503 * s1 is ahead of c in virtual time (vdiff <= 0),
3504 * s2 lags c by less than g,
3505 * s3 lags c by more than g.
3506 *
3507 * w(c, s1) = -1
3508 * w(c, s2) = 0
3509 * w(c, s3) = 1
3510 *
3511 */
3512static int
3513wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
3514{
3515 s64 gran, vdiff = curr->vruntime - se->vruntime;
3516
3517 if (vdiff <= 0)
3518 return -1;
3519
e52fb7c0 3520 gran = wakeup_gran(curr, se);
464b7527
PZ
3521 if (vdiff > gran)
3522 return 1;
3523
3524 return 0;
3525}
3526
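A small userspace sketch of the three-way decision above, with made-up vruntime and granularity values; it mirrors the logic of wakeup_preempt_entity() but is not the kernel code itself.

#include <stdio.h>

typedef long long s64;

static int preempt_decision(s64 curr_vruntime, s64 se_vruntime, s64 gran)
{
        s64 vdiff = curr_vruntime - se_vruntime;

        if (vdiff <= 0)
                return -1;      /* wakee is not behind curr in virtual time */
        if (vdiff > gran)
                return 1;       /* behind by more than one granularity */
        return 0;               /* behind, but within the granularity */
}

int main(void)
{
        s64 gran = 2000000;     /* hypothetical 2 ms, already in virtual time */

        printf("%d\n", preempt_decision(100, 200, gran));          /* -1 */
        printf("%d\n", preempt_decision(3000100, 100, gran));      /*  1 */
        printf("%d\n", preempt_decision(1000100, 100, gran));      /*  0 */
        return 0;
}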
02479099
PZ
3527static void set_last_buddy(struct sched_entity *se)
3528{
69c80f3e
VP
3529 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3530 return;
3531
3532 for_each_sched_entity(se)
3533 cfs_rq_of(se)->last = se;
02479099
PZ
3534}
3535
3536static void set_next_buddy(struct sched_entity *se)
3537{
69c80f3e
VP
3538 if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
3539 return;
3540
3541 for_each_sched_entity(se)
3542 cfs_rq_of(se)->next = se;
02479099
PZ
3543}
3544
ac53db59
RR
3545static void set_skip_buddy(struct sched_entity *se)
3546{
69c80f3e
VP
3547 for_each_sched_entity(se)
3548 cfs_rq_of(se)->skip = se;
ac53db59
RR
3549}
3550
bf0f6f24
IM
3551/*
3552 * Preempt the current task with a newly woken task if needed:
3553 */
5a9b86f6 3554static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
bf0f6f24
IM
3555{
3556 struct task_struct *curr = rq->curr;
8651a86c 3557 struct sched_entity *se = &curr->se, *pse = &p->se;
03e89e45 3558 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
f685ceac 3559 int scale = cfs_rq->nr_running >= sched_nr_latency;
2f36825b 3560 int next_buddy_marked = 0;
bf0f6f24 3561
4ae7d5ce
IM
3562 if (unlikely(se == pse))
3563 return;
3564
5238cdd3 3565 /*
ddcdf6e7 3566 * This is possible from callers such as move_task(), in which we
5238cdd3
PT
3567 * unconditionally check_preempt_curr() after an enqueue (which may have
3568 * led to a throttle). This both saves work and prevents false
3569 * next-buddy nomination below.
3570 */
3571 if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
3572 return;
3573
2f36825b 3574 if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
3cb63d52 3575 set_next_buddy(pse);
2f36825b
VP
3576 next_buddy_marked = 1;
3577 }
57fdc26d 3578
aec0a514
BR
3579 /*
3580 * We can come here with TIF_NEED_RESCHED already set from new task
3581 * wake up path.
5238cdd3
PT
3582 *
3583 * Note: this also catches the edge-case of curr being in a throttled
3584 * group (e.g. via set_curr_task), since update_curr() (in the
3585 * enqueue of curr) will have resulted in resched being set. This
3586 * prevents us from potentially nominating it as a false LAST_BUDDY
3587 * below.
aec0a514
BR
3588 */
3589 if (test_tsk_need_resched(curr))
3590 return;
3591
a2f5c9ab
DH
3592 /* Idle tasks are by definition preempted by non-idle tasks. */
3593 if (unlikely(curr->policy == SCHED_IDLE) &&
3594 likely(p->policy != SCHED_IDLE))
3595 goto preempt;
3596
91c234b4 3597 /*
a2f5c9ab
DH
3598 * Batch and idle tasks do not preempt non-idle tasks (their preemption
3599 * is driven by the tick):
91c234b4 3600 */
8ed92e51 3601 if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
91c234b4 3602 return;
bf0f6f24 3603
464b7527 3604 find_matching_se(&se, &pse);
9bbd7374 3605 update_curr(cfs_rq_of(se));
002f128b 3606 BUG_ON(!pse);
2f36825b
VP
3607 if (wakeup_preempt_entity(se, pse) == 1) {
3608 /*
3609 * Bias pick_next to pick the sched entity that is
3610 * triggering this preemption.
3611 */
3612 if (!next_buddy_marked)
3613 set_next_buddy(pse);
3a7e73a2 3614 goto preempt;
2f36825b 3615 }
464b7527 3616
3a7e73a2 3617 return;
a65ac745 3618
3a7e73a2
PZ
3619preempt:
3620 resched_task(curr);
3621 /*
3622 * Only set the backward buddy when the current task is still
3623 * on the rq. This can happen when a wakeup gets interleaved
3624 * with schedule on the ->pre_schedule() or idle_balance()
3625 * point, either of which can drop the rq lock.
3626 *
3627 * Also, during early boot the idle thread is in the fair class;
3628 * for obvious reasons it's a bad idea to schedule back to it.
3629 */
3630 if (unlikely(!se->on_rq || curr == rq->idle))
3631 return;
3632
3633 if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
3634 set_last_buddy(se);
bf0f6f24
IM
3635}
3636
fb8d4724 3637static struct task_struct *pick_next_task_fair(struct rq *rq)
bf0f6f24 3638{
8f4d37ec 3639 struct task_struct *p;
bf0f6f24
IM
3640 struct cfs_rq *cfs_rq = &rq->cfs;
3641 struct sched_entity *se;
3642
36ace27e 3643 if (!cfs_rq->nr_running)
bf0f6f24
IM
3644 return NULL;
3645
3646 do {
9948f4b2 3647 se = pick_next_entity(cfs_rq);
f4b6755f 3648 set_next_entity(cfs_rq, se);
bf0f6f24
IM
3649 cfs_rq = group_cfs_rq(se);
3650 } while (cfs_rq);
3651
8f4d37ec 3652 p = task_of(se);
b39e66ea
MG
3653 if (hrtick_enabled(rq))
3654 hrtick_start_fair(rq, p);
8f4d37ec
PZ
3655
3656 return p;
bf0f6f24
IM
3657}
3658
3659/*
3660 * Account for a descheduled task:
3661 */
31ee529c 3662static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
bf0f6f24
IM
3663{
3664 struct sched_entity *se = &prev->se;
3665 struct cfs_rq *cfs_rq;
3666
3667 for_each_sched_entity(se) {
3668 cfs_rq = cfs_rq_of(se);
ab6cde26 3669 put_prev_entity(cfs_rq, se);
bf0f6f24
IM
3670 }
3671}
3672
ac53db59
RR
3673/*
3674 * sched_yield() is very simple
3675 *
3676 * The magic of dealing with the ->skip buddy is in pick_next_entity.
3677 */
3678static void yield_task_fair(struct rq *rq)
3679{
3680 struct task_struct *curr = rq->curr;
3681 struct cfs_rq *cfs_rq = task_cfs_rq(curr);
3682 struct sched_entity *se = &curr->se;
3683
3684 /*
3685 * Are we the only task in the tree?
3686 */
3687 if (unlikely(rq->nr_running == 1))
3688 return;
3689
3690 clear_buddies(cfs_rq, se);
3691
3692 if (curr->policy != SCHED_BATCH) {
3693 update_rq_clock(rq);
3694 /*
3695 * Update run-time statistics of the 'current'.
3696 */
3697 update_curr(cfs_rq);
916671c0
MG
3698 /*
3699 * Tell update_rq_clock() that we've just updated,
3700 * so we don't do microscopic update in schedule()
3701 * and double the fastpath cost.
3702 */
3703 rq->skip_clock_update = 1;
ac53db59
RR
3704 }
3705
3706 set_skip_buddy(se);
3707}
3708
d95f4122
MG
3709static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
3710{
3711 struct sched_entity *se = &p->se;
3712
5238cdd3
PT
3713 /* throttled hierarchies are not runnable */
3714 if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
d95f4122
MG
3715 return false;
3716
3717 /* Tell the scheduler that we'd really like pse to run next. */
3718 set_next_buddy(se);
3719
d95f4122
MG
3720 yield_task_fair(rq);
3721
3722 return true;
3723}
3724
681f3e68 3725#ifdef CONFIG_SMP
bf0f6f24 3726/**************************************************
e9c84cb8
PZ
3727 * Fair scheduling class load-balancing methods.
3728 *
3729 * BASICS
3730 *
3731 * The purpose of load-balancing is to achieve the same basic fairness the
3732 * per-cpu scheduler provides, namely provide a proportional amount of compute
3733 * time to each task. This is expressed in the following equation:
3734 *
3735 * W_i,n/P_i == W_j,n/P_j for all i,j (1)
3736 *
3737 * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
3738 * W_i,0 is defined as:
3739 *
3740 * W_i,0 = \Sum_j w_i,j (2)
3741 *
3742 * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
3743 * is derived from the nice value as per prio_to_weight[].
3744 *
3745 * The weight average is an exponential decay average of the instantaneous
3746 * weight:
3747 *
3748 * W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0 (3)
3749 *
3750 * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
3751 * fraction of 'recent' time available for SCHED_OTHER task execution. But it
3752 * can also include other factors [XXX].
3753 *
3754 * To achieve this balance we define a measure of imbalance which follows
3755 * directly from (1):
3756 *
3757 * imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j } (4)
3758 *
3759 * We then move tasks around to minimize the imbalance. In the continuous
3760 * function space it is obvious this converges, in the discrete case we get
3761 * a few fun cases generally called infeasible weight scenarios.
3762 *
3763 * [XXX expand on:
3764 * - infeasible weights;
3765 * - local vs global optima in the discrete case. ]
3766 *
3767 *
3768 * SCHED DOMAINS
3769 *
3770 * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
3771 * for all i,j solution, we create a tree of cpus that follows the hardware
3772 * topology where each level pairs two lower groups (or better). This results
3773 * in O(log n) layers. Furthermore we reduce the number of cpus going up the
3774 * tree to only the first of the previous level and we decrease the frequency
3775 * of load-balance at each level inv. proportional to the number of cpus in
3776 * the groups.
3777 *
3778 * This yields:
3779 *
3780 * \Sum_{i = 0}^{log_2 n} (1/2^i) * (n/2^i) * 2^i = O(n) (5)
3781 *
3782 * where, per level i, 1/2^i is the load-balance frequency, n/2^i is the
3783 * number of cpus doing load-balance, 2^i is the size of each group, and
3784 * the sum runs over all levels.
3787 *
3788 * Coupled with a limit on how many tasks we can migrate every balance pass,
3789 * this makes (5) the runtime complexity of the balancer.
3790 *
3791 * An important property here is that each CPU is still (indirectly) connected
3792 * to every other cpu in at most O(log n) steps:
3793 *
3794 * The adjacency matrix of the resulting graph is given by:
3795 *
3796 * A_i,j = \Union_{k = 0}^{log_2 n} (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1) (6)
3799 *
3800 * And you'll find that:
3801 *
3802 * A^(log_2 n)_i,j != 0 for all i,j (7)
3803 *
3804 * Showing there's indeed a path between every cpu in at most O(log n) steps.
3805 * The task movement gives a factor of O(m), giving a convergence complexity
3806 * of:
3807 *
3808 * O(nm log n), n := nr_cpus, m := nr_tasks (8)
3809 *
3810 *
3811 * WORK CONSERVING
3812 *
3813 * In order to avoid CPUs going idle while there's still work to do, new idle
3814 * balancing is more aggressive and has the newly idle cpu iterate up the domain
3815 * tree itself instead of relying on other CPUs to bring it work.
3816 *
3817 * This adds some complexity to both (5) and (8) but it reduces the total idle
3818 * time.
3819 *
3820 * [XXX more?]
3821 *
3822 *
3823 * CGROUPS
3824 *
3825 * Cgroups make a horror show out of (2), instead of a simple sum we get:
3826 *
3828 * W_i,0 = \Sum_j \Prod_k w_k * (s_k,i / S_k) (9)
3830 *
3831 * Where
3832 *
3833 * s_k,i = \Sum_j w_i,j,k and S_k = \Sum_i s_k,i (10)
3834 *
3835 * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
3836 *
3837 * The big problem is S_k: it's a global sum needed to compute a local (W_i)
3838 * property.
3839 *
3840 * [XXX write more on how we solve this.. _after_ merging pjt's patches that
3841 * rewrite all of this once again.]
3842 */
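As a numeric illustration of the imbalance measure (4), the following userspace sketch computes imb for two cpus with made-up weights and powers; none of the values come from the kernel.

#include <stdio.h>

int main(void)
{
        double W[2] = { 3072.0, 1024.0 };       /* instantaneous weights, eq. (2) */
        double P[2] = { 1024.0, 1024.0 };       /* cpu compute capacities */
        double r0 = W[0] / P[0], r1 = W[1] / P[1];
        double avg = (r0 + r1) / 2.0;

        /* imb_0,1 = max{ avg, W_0/P_0 } - min{ avg, W_1/P_1 }, eq. (4) */
        double imb = (r0 > avg ? r0 : avg) - (r1 < avg ? r1 : avg);

        printf("W/P: cpu0=%.2f cpu1=%.2f avg=%.2f -> imbalance=%.2f\n",
               r0, r1, avg, imb);
        return 0;
}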
bf0f6f24 3843
ed387b78
HS
3844static unsigned long __read_mostly max_load_balance_interval = HZ/10;
3845
ddcdf6e7 3846#define LBF_ALL_PINNED 0x01
367456c7 3847#define LBF_NEED_BREAK 0x02
88b8dac0 3848#define LBF_SOME_PINNED 0x04
ddcdf6e7
PZ
3849
3850struct lb_env {
3851 struct sched_domain *sd;
3852
ddcdf6e7 3853 struct rq *src_rq;
85c1e7da 3854 int src_cpu;
ddcdf6e7
PZ
3855
3856 int dst_cpu;
3857 struct rq *dst_rq;
3858
88b8dac0
SV
3859 struct cpumask *dst_grpmask;
3860 int new_dst_cpu;
ddcdf6e7 3861 enum cpu_idle_type idle;
bd939f45 3862 long imbalance;
b9403130
MW
3863 /* The set of CPUs under consideration for load-balancing */
3864 struct cpumask *cpus;
3865
ddcdf6e7 3866 unsigned int flags;
367456c7
PZ
3867
3868 unsigned int loop;
3869 unsigned int loop_break;
3870 unsigned int loop_max;
ddcdf6e7
PZ
3871};
3872
1e3c88bd 3873/*
ddcdf6e7 3874 * move_task - move a task from one runqueue to another runqueue.
1e3c88bd
PZ
3875 * Both runqueues must be locked.
3876 */
ddcdf6e7 3877static void move_task(struct task_struct *p, struct lb_env *env)
1e3c88bd 3878{
ddcdf6e7
PZ
3879 deactivate_task(env->src_rq, p, 0);
3880 set_task_cpu(p, env->dst_cpu);
3881 activate_task(env->dst_rq, p, 0);
3882 check_preempt_curr(env->dst_rq, p, 0);
1e3c88bd
PZ
3883}
3884
029632fb
PZ
3885/*
3886 * Is this task likely cache-hot:
3887 */
3888static int
3889task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
3890{
3891 s64 delta;
3892
3893 if (p->sched_class != &fair_sched_class)
3894 return 0;
3895
3896 if (unlikely(p->policy == SCHED_IDLE))
3897 return 0;
3898
3899 /*
3900 * Buddy candidates are cache hot:
3901 */
3902 if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
3903 (&p->se == cfs_rq_of(&p->se)->next ||
3904 &p->se == cfs_rq_of(&p->se)->last))
3905 return 1;
3906
3907 if (sysctl_sched_migration_cost == -1)
3908 return 1;
3909 if (sysctl_sched_migration_cost == 0)
3910 return 0;
3911
3912 delta = now - p->se.exec_start;
3913
3914 return delta < (s64)sysctl_sched_migration_cost;
3915}
3916
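The final test above compares the time since the task last ran against sysctl_sched_migration_cost. A userspace sketch with assumed values (the 500000 ns default is an assumption, not read from this file):

#include <stdio.h>

int main(void)
{
        long long migration_cost_ns = 500000;   /* assumed sysctl default */
        long long now_ns = 10000000;            /* made-up rq clock */
        long long exec_start_ns = 9700000;      /* task last ran 0.3 ms ago */
        long long delta = now_ns - exec_start_ns;

        printf("delta=%lld ns -> %s\n", delta,
               delta < migration_cost_ns ? "cache hot, avoid migrating"
                                         : "cache cold, ok to migrate");
        return 0;
}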
1e3c88bd
PZ
3917/*
3918 * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
3919 */
3920static
8e45cb54 3921int can_migrate_task(struct task_struct *p, struct lb_env *env)
1e3c88bd
PZ
3922{
3923 int tsk_cache_hot = 0;
3924 /*
3925 * We do not migrate tasks that are:
d3198084 3926 * 1) throttled_lb_pair, or
1e3c88bd 3927 * 2) cannot be migrated to this CPU due to cpus_allowed, or
d3198084
JK
3928 * 3) running (obviously), or
3929 * 4) are cache-hot on their current CPU.
1e3c88bd 3930 */
d3198084
JK
3931 if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
3932 return 0;
3933
ddcdf6e7 3934 if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
e02e60c1 3935 int cpu;
88b8dac0 3936
41acab88 3937 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
88b8dac0
SV
3938
3939 /*
3940 * Remember if this task can be migrated to any other cpu in
3941 * our sched_group. We may want to revisit it if we couldn't
3942 * meet load balance goals by pulling other tasks on src_cpu.
3943 *
3944 * Also avoid computing new_dst_cpu if we have already computed
3945 * one in current iteration.
3946 */
3947 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
3948 return 0;
3949
e02e60c1
JK
3950 /* Prevent to re-select dst_cpu via env's cpus */
3951 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
3952 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
3953 env->flags |= LBF_SOME_PINNED;
3954 env->new_dst_cpu = cpu;
3955 break;
3956 }
88b8dac0 3957 }
e02e60c1 3958
1e3c88bd
PZ
3959 return 0;
3960 }
88b8dac0
SV
3961
3962 /* Record that we found at least one task that could run on dst_cpu */
8e45cb54 3963 env->flags &= ~LBF_ALL_PINNED;
1e3c88bd 3964
ddcdf6e7 3965 if (task_running(env->src_rq, p)) {
41acab88 3966 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
1e3c88bd
PZ
3967 return 0;
3968 }
3969
3970 /*
3971 * Aggressive migration if:
3972 * 1) task is cache cold, or
3973 * 2) too many balance attempts have failed.
3974 */
3975
ddcdf6e7 3976 tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
1e3c88bd 3977 if (!tsk_cache_hot ||
8e45cb54 3978 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
4e2dcb73 3979
1e3c88bd 3980 if (tsk_cache_hot) {
8e45cb54 3981 schedstat_inc(env->sd, lb_hot_gained[env->idle]);
41acab88 3982 schedstat_inc(p, se.statistics.nr_forced_migrations);
1e3c88bd 3983 }
4e2dcb73 3984
1e3c88bd
PZ
3985 return 1;
3986 }
3987
4e2dcb73
ZH
3988 schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
3989 return 0;
1e3c88bd
PZ
3990}
3991
897c395f
PZ
3992/*
3993 * move_one_task tries to move exactly one task from busiest to this_rq, as
3994 * part of active balancing operations within "domain".
3995 * Returns 1 if successful and 0 otherwise.
3996 *
3997 * Called with both runqueues locked.
3998 */
8e45cb54 3999static int move_one_task(struct lb_env *env)
897c395f
PZ
4000{
4001 struct task_struct *p, *n;
897c395f 4002
367456c7 4003 list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
367456c7
PZ
4004 if (!can_migrate_task(p, env))
4005 continue;
897c395f 4006
367456c7
PZ
4007 move_task(p, env);
4008 /*
4009 * Right now, this is only the second place move_task()
4010 * is called, so we can safely collect move_task()
4011 * stats here rather than inside move_task().
4012 */
4013 schedstat_inc(env->sd, lb_gained[env->idle]);
4014 return 1;
897c395f 4015 }
897c395f
PZ
4016 return 0;
4017}
4018
367456c7
PZ
4019static unsigned long task_h_load(struct task_struct *p);
4020
eb95308e
PZ
4021static const unsigned int sched_nr_migrate_break = 32;
4022
5d6523eb 4023/*
bd939f45 4024 * move_tasks tries to move up to imbalance weighted load from busiest to
5d6523eb
PZ
4025 * this_rq, as part of a balancing operation within domain "sd".
4026 * Returns 1 if successful and 0 otherwise.
4027 *
4028 * Called with both runqueues locked.
4029 */
4030static int move_tasks(struct lb_env *env)
1e3c88bd 4031{
5d6523eb
PZ
4032 struct list_head *tasks = &env->src_rq->cfs_tasks;
4033 struct task_struct *p;
367456c7
PZ
4034 unsigned long load;
4035 int pulled = 0;
1e3c88bd 4036
bd939f45 4037 if (env->imbalance <= 0)
5d6523eb 4038 return 0;
1e3c88bd 4039
5d6523eb
PZ
4040 while (!list_empty(tasks)) {
4041 p = list_first_entry(tasks, struct task_struct, se.group_node);
1e3c88bd 4042
367456c7
PZ
4043 env->loop++;
4044 /* We've more or less seen every task there is, call it quits */
5d6523eb 4045 if (env->loop > env->loop_max)
367456c7 4046 break;
5d6523eb
PZ
4047
4048 /* take a breather every nr_migrate tasks */
367456c7 4049 if (env->loop > env->loop_break) {
eb95308e 4050 env->loop_break += sched_nr_migrate_break;
8e45cb54 4051 env->flags |= LBF_NEED_BREAK;
ee00e66f 4052 break;
a195f004 4053 }
1e3c88bd 4054
d3198084 4055 if (!can_migrate_task(p, env))
367456c7
PZ
4056 goto next;
4057
4058 load = task_h_load(p);
5d6523eb 4059
eb95308e 4060 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
367456c7
PZ
4061 goto next;
4062
bd939f45 4063 if ((load / 2) > env->imbalance)
367456c7 4064 goto next;
1e3c88bd 4065
ddcdf6e7 4066 move_task(p, env);
ee00e66f 4067 pulled++;
bd939f45 4068 env->imbalance -= load;
1e3c88bd
PZ
4069
4070#ifdef CONFIG_PREEMPT
ee00e66f
PZ
4071 /*
4072 * NEWIDLE balancing is a source of latency, so preemptible
4073 * kernels will stop after the first task is pulled to minimize
4074 * the critical section.
4075 */
5d6523eb 4076 if (env->idle == CPU_NEWLY_IDLE)
ee00e66f 4077 break;
1e3c88bd
PZ
4078#endif
4079
ee00e66f
PZ
4080 /*
4081 * We only want to steal up to the prescribed amount of
4082 * weighted load.
4083 */
bd939f45 4084 if (env->imbalance <= 0)
ee00e66f 4085 break;
367456c7
PZ
4086
4087 continue;
4088next:
5d6523eb 4089 list_move_tail(&p->se.group_node, tasks);
1e3c88bd 4090 }
5d6523eb 4091
1e3c88bd 4092 /*
ddcdf6e7
PZ
4093 * Right now, this is one of only two places move_task() is called,
4094 * so we can safely collect move_task() stats here rather than
4095 * inside move_task().
1e3c88bd 4096 */
8e45cb54 4097 schedstat_add(env->sd, lb_gained[env->idle], pulled);
1e3c88bd 4098
5d6523eb 4099 return pulled;
1e3c88bd
PZ
4100}
4101
230059de 4102#ifdef CONFIG_FAIR_GROUP_SCHED
9e3081ca
PZ
4103/*
4104 * update tg->load_weight by folding this cpu's load_avg
4105 */
48a16753 4106static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
9e3081ca 4107{
48a16753
PT
4108 struct sched_entity *se = tg->se[cpu];
4109 struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
9e3081ca 4110
48a16753
PT
4111 /* throttled entities do not contribute to load */
4112 if (throttled_hierarchy(cfs_rq))
4113 return;
9e3081ca 4114
aff3e498 4115 update_cfs_rq_blocked_load(cfs_rq, 1);
9e3081ca 4116
82958366
PT
4117 if (se) {
4118 update_entity_load_avg(se, 1);
4119 /*
4120 * We pivot on our runnable average having decayed to zero for
4121 * list removal. This generally implies that all our children
4122 * have also been removed (modulo rounding error or bandwidth
4123 * control); however, such cases are rare and we can fix these
4124 * at enqueue.
4125 *
4126 * TODO: fix up out-of-order children on enqueue.
4127 */
4128 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
4129 list_del_leaf_cfs_rq(cfs_rq);
4130 } else {
48a16753 4131 struct rq *rq = rq_of(cfs_rq);
82958366
PT
4132 update_rq_runnable_avg(rq, rq->nr_running);
4133 }
9e3081ca
PZ
4134}
4135
48a16753 4136static void update_blocked_averages(int cpu)
9e3081ca 4137{
9e3081ca 4138 struct rq *rq = cpu_rq(cpu);
48a16753
PT
4139 struct cfs_rq *cfs_rq;
4140 unsigned long flags;
9e3081ca 4141
48a16753
PT
4142 raw_spin_lock_irqsave(&rq->lock, flags);
4143 update_rq_clock(rq);
9763b67f
PZ
4144 /*
4145 * Iterates the task_group tree in a bottom up fashion, see
4146 * list_add_leaf_cfs_rq() for details.
4147 */
64660c86 4148 for_each_leaf_cfs_rq(rq, cfs_rq) {
48a16753
PT
4149 /*
4150 * Note: We may want to consider periodically releasing
4151 * rq->lock about these updates so that creating many task
4152 * groups does not result in continually extending hold time.
4153 */
4154 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
64660c86 4155 }
48a16753
PT
4156
4157 raw_spin_unlock_irqrestore(&rq->lock, flags);
9e3081ca
PZ
4158}
4159
9763b67f
PZ
4160/*
4161 * Compute the cpu's hierarchical load factor for each task group.
4162 * This needs to be done in a top-down fashion because the load of a child
4163 * group is a fraction of its parent's load.
4164 */
4165static int tg_load_down(struct task_group *tg, void *data)
4166{
4167 unsigned long load;
4168 long cpu = (long)data;
4169
4170 if (!tg->parent) {
4171 load = cpu_rq(cpu)->load.weight;
4172 } else {
4173 load = tg->parent->cfs_rq[cpu]->h_load;
4174 load *= tg->se[cpu]->load.weight;
4175 load /= tg->parent->cfs_rq[cpu]->load.weight + 1;
4176 }
4177
4178 tg->cfs_rq[cpu]->h_load = load;
4179
4180 return 0;
4181}
4182
4183static void update_h_load(long cpu)
4184{
a35b6466
PZ
4185 struct rq *rq = cpu_rq(cpu);
4186 unsigned long now = jiffies;
4187
4188 if (rq->h_load_throttle == now)
4189 return;
4190
4191 rq->h_load_throttle = now;
4192
367456c7 4193 rcu_read_lock();
9763b67f 4194 walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
367456c7 4195 rcu_read_unlock();
9763b67f
PZ
4196}
4197
367456c7 4198static unsigned long task_h_load(struct task_struct *p)
230059de 4199{
367456c7
PZ
4200 struct cfs_rq *cfs_rq = task_cfs_rq(p);
4201 unsigned long load;
230059de 4202
367456c7
PZ
4203 load = p->se.load.weight;
4204 load = div_u64(load * cfs_rq->h_load, cfs_rq->load.weight + 1);
230059de 4205
367456c7 4206 return load;
230059de
PZ
4207}
4208#else
48a16753 4209static inline void update_blocked_averages(int cpu)
9e3081ca
PZ
4210{
4211}
4212
367456c7 4213static inline void update_h_load(long cpu)
230059de 4214{
230059de 4215}
230059de 4216
367456c7 4217static unsigned long task_h_load(struct task_struct *p)
1e3c88bd 4218{
367456c7 4219 return p->se.load.weight;
1e3c88bd 4220}
230059de 4221#endif
1e3c88bd 4222
1e3c88bd
PZ
4223/********** Helpers for find_busiest_group ************************/
4224/*
4225 * sd_lb_stats - Structure to store the statistics of a sched_domain
4226 * during load balancing.
4227 */
4228struct sd_lb_stats {
4229 struct sched_group *busiest; /* Busiest group in this sd */
4230 struct sched_group *this; /* Local group in this sd */
4231 unsigned long total_load; /* Total load of all groups in sd */
4232 unsigned long total_pwr; /* Total power of all groups in sd */
4233 unsigned long avg_load; /* Average load across all groups in sd */
4234
4235 /** Statistics of this group */
4236 unsigned long this_load;
4237 unsigned long this_load_per_task;
4238 unsigned long this_nr_running;
fab47622 4239 unsigned long this_has_capacity;
aae6d3dd 4240 unsigned int this_idle_cpus;
1e3c88bd
PZ
4241
4242 /* Statistics of the busiest group */
aae6d3dd 4243 unsigned int busiest_idle_cpus;
1e3c88bd
PZ
4244 unsigned long max_load;
4245 unsigned long busiest_load_per_task;
4246 unsigned long busiest_nr_running;
dd5feea1 4247 unsigned long busiest_group_capacity;
fab47622 4248 unsigned long busiest_has_capacity;
aae6d3dd 4249 unsigned int busiest_group_weight;
1e3c88bd
PZ
4250
4251 int group_imb; /* Is there imbalance in this sd */
1e3c88bd
PZ
4252};
4253
4254/*
4255 * sg_lb_stats - stats of a sched_group required for load_balancing
4256 */
4257struct sg_lb_stats {
4258 unsigned long avg_load; /*Avg load across the CPUs of the group */
4259 unsigned long group_load; /* Total load over the CPUs of the group */
4260 unsigned long sum_nr_running; /* Nr tasks running in the group */
4261 unsigned long sum_weighted_load; /* Weighted load of group's tasks */
4262 unsigned long group_capacity;
aae6d3dd
SS
4263 unsigned long idle_cpus;
4264 unsigned long group_weight;
1e3c88bd 4265 int group_imb; /* Is there an imbalance in the group ? */
fab47622 4266 int group_has_capacity; /* Is there extra capacity in the group? */
1e3c88bd
PZ
4267};
4268
1e3c88bd
PZ
4269/**
4270 * get_sd_load_idx - Obtain the load index for a given sched domain.
4271 * @sd: The sched_domain whose load_idx is to be obtained.
4272 * @idle: The idle status of the CPU for whose sd the load_idx is obtained.
4273 */
4274static inline int get_sd_load_idx(struct sched_domain *sd,
4275 enum cpu_idle_type idle)
4276{
4277 int load_idx;
4278
4279 switch (idle) {
4280 case CPU_NOT_IDLE:
4281 load_idx = sd->busy_idx;
4282 break;
4283
4284 case CPU_NEWLY_IDLE:
4285 load_idx = sd->newidle_idx;
4286 break;
4287 default:
4288 load_idx = sd->idle_idx;
4289 break;
4290 }
4291
4292 return load_idx;
4293}
4294
15f803c9 4295static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
1e3c88bd 4296{
1399fa78 4297 return SCHED_POWER_SCALE;
1e3c88bd
PZ
4298}
4299
4300unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
4301{
4302 return default_scale_freq_power(sd, cpu);
4303}
4304
15f803c9 4305static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
1e3c88bd 4306{
669c55e9 4307 unsigned long weight = sd->span_weight;
1e3c88bd
PZ
4308 unsigned long smt_gain = sd->smt_gain;
4309
4310 smt_gain /= weight;
4311
4312 return smt_gain;
4313}
4314
4315unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
4316{
4317 return default_scale_smt_power(sd, cpu);
4318}
4319
15f803c9 4320static unsigned long scale_rt_power(int cpu)
1e3c88bd
PZ
4321{
4322 struct rq *rq = cpu_rq(cpu);
b654f7de 4323 u64 total, available, age_stamp, avg;
1e3c88bd 4324
b654f7de
PZ
4325 /*
4326 * Since we're reading these variables without serialization make sure
4327 * we read them once before doing sanity checks on them.
4328 */
4329 age_stamp = ACCESS_ONCE(rq->age_stamp);
4330 avg = ACCESS_ONCE(rq->rt_avg);
4331
4332 total = sched_avg_period() + (rq->clock - age_stamp);
aa483808 4333
b654f7de 4334 if (unlikely(total < avg)) {
aa483808
VP
4335 /* Ensures that power won't end up being negative */
4336 available = 0;
4337 } else {
b654f7de 4338 available = total - avg;
aa483808 4339 }
1e3c88bd 4340
1399fa78
NR
4341 if (unlikely((s64)total < SCHED_POWER_SCALE))
4342 total = SCHED_POWER_SCALE;
1e3c88bd 4343
1399fa78 4344 total >>= SCHED_POWER_SHIFT;
1e3c88bd
PZ
4345
4346 return div_u64(available, total);
4347}
4348
4349static void update_cpu_power(struct sched_domain *sd, int cpu)
4350{
669c55e9 4351 unsigned long weight = sd->span_weight;
1399fa78 4352 unsigned long power = SCHED_POWER_SCALE;
1e3c88bd
PZ
4353 struct sched_group *sdg = sd->groups;
4354
1e3c88bd
PZ
4355 if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
4356 if (sched_feat(ARCH_POWER))
4357 power *= arch_scale_smt_power(sd, cpu);
4358 else
4359 power *= default_scale_smt_power(sd, cpu);
4360
1399fa78 4361 power >>= SCHED_POWER_SHIFT;
1e3c88bd
PZ
4362 }
4363
9c3f75cb 4364 sdg->sgp->power_orig = power;
9d5efe05
SV
4365
4366 if (sched_feat(ARCH_POWER))
4367 power *= arch_scale_freq_power(sd, cpu);
4368 else
4369 power *= default_scale_freq_power(sd, cpu);
4370
1399fa78 4371 power >>= SCHED_POWER_SHIFT;
9d5efe05 4372
1e3c88bd 4373 power *= scale_rt_power(cpu);
1399fa78 4374 power >>= SCHED_POWER_SHIFT;
1e3c88bd
PZ
4375
4376 if (!power)
4377 power = 1;
4378
e51fd5e2 4379 cpu_rq(cpu)->cpu_power = power;
9c3f75cb 4380 sdg->sgp->power = power;
1e3c88bd
PZ
4381}
4382
029632fb 4383void update_group_power(struct sched_domain *sd, int cpu)
1e3c88bd
PZ
4384{
4385 struct sched_domain *child = sd->child;
4386 struct sched_group *group, *sdg = sd->groups;
4387 unsigned long power;
4ec4412e
VG
4388 unsigned long interval;
4389
4390 interval = msecs_to_jiffies(sd->balance_interval);
4391 interval = clamp(interval, 1UL, max_load_balance_interval);
4392 sdg->sgp->next_update = jiffies + interval;
1e3c88bd
PZ
4393
4394 if (!child) {
4395 update_cpu_power(sd, cpu);
4396 return;
4397 }
4398
4399 power = 0;
4400
74a5ce20
PZ
4401 if (child->flags & SD_OVERLAP) {
4402 /*
4403 * SD_OVERLAP domains cannot assume that child groups
4404 * span the current group.
4405 */
4406
4407 for_each_cpu(cpu, sched_group_cpus(sdg))
4408 power += power_of(cpu);
4409 } else {
4410 /*
4411 * !SD_OVERLAP domains can assume that child groups
4412 * span the current group.
4413 */
4414
4415 group = child->groups;
4416 do {
4417 power += group->sgp->power;
4418 group = group->next;
4419 } while (group != child->groups);
4420 }
1e3c88bd 4421
c3decf0d 4422 sdg->sgp->power_orig = sdg->sgp->power = power;
1e3c88bd
PZ
4423}
4424
9d5efe05
SV
4425/*
4426 * Try and fix up capacity for tiny siblings; this is needed when
4427 * things like SD_ASYM_PACKING need f_b_g to select another sibling
4428 * which on its own isn't powerful enough.
4429 *
4430 * See update_sd_pick_busiest() and check_asym_packing().
4431 */
4432static inline int
4433fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
4434{
4435 /*
1399fa78 4436 * Only siblings can have significantly less than SCHED_POWER_SCALE
9d5efe05 4437 */
a6c75f2f 4438 if (!(sd->flags & SD_SHARE_CPUPOWER))
9d5efe05
SV
4439 return 0;
4440
4441 /*
4442 * If ~90% of the cpu_power is still there, we're good.
4443 */
9c3f75cb 4444 if (group->sgp->power * 32 > group->sgp->power_orig * 29)
9d5efe05
SV
4445 return 1;
4446
4447 return 0;
4448}
4449
1e3c88bd
PZ
4450/**
4451 * update_sg_lb_stats - Update sched_group's statistics for load balancing.
cd96891d 4452 * @env: The load balancing environment.
1e3c88bd 4453 * @group: sched_group whose statistics are to be updated.
1e3c88bd 4454 * @load_idx: Load index of sched_domain of this_cpu for load calc.
1e3c88bd 4455 * @local_group: Does group contain this_cpu.
1e3c88bd
PZ
4456 * @balance: Should we balance.
4457 * @sgs: variable to hold the statistics for this group.
4458 */
bd939f45
PZ
4459static inline void update_sg_lb_stats(struct lb_env *env,
4460 struct sched_group *group, int load_idx,
b9403130 4461 int local_group, int *balance, struct sg_lb_stats *sgs)
1e3c88bd 4462{
e44bc5c5
PZ
4463 unsigned long nr_running, max_nr_running, min_nr_running;
4464 unsigned long load, max_cpu_load, min_cpu_load;
04f733b4 4465 unsigned int balance_cpu = -1, first_idle_cpu = 0;
dd5feea1 4466 unsigned long avg_load_per_task = 0;
bd939f45 4467 int i;
1e3c88bd 4468
871e35bc 4469 if (local_group)
c1174876 4470 balance_cpu = group_balance_cpu(group);
1e3c88bd
PZ
4471
4472 /* Tally up the load of all CPUs in the group */
1e3c88bd
PZ
4473 max_cpu_load = 0;
4474 min_cpu_load = ~0UL;
2582f0eb 4475 max_nr_running = 0;
e44bc5c5 4476 min_nr_running = ~0UL;
1e3c88bd 4477
b9403130 4478 for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
1e3c88bd
PZ
4479 struct rq *rq = cpu_rq(i);
4480
e44bc5c5
PZ
4481 nr_running = rq->nr_running;
4482
1e3c88bd
PZ
4483 /* Bias balancing toward cpus of our domain */
4484 if (local_group) {
c1174876
PZ
4485 if (idle_cpu(i) && !first_idle_cpu &&
4486 cpumask_test_cpu(i, sched_group_mask(group))) {
04f733b4 4487 first_idle_cpu = 1;
1e3c88bd
PZ
4488 balance_cpu = i;
4489 }
04f733b4
PZ
4490
4491 load = target_load(i, load_idx);
1e3c88bd
PZ
4492 } else {
4493 load = source_load(i, load_idx);
e44bc5c5 4494 if (load > max_cpu_load)
1e3c88bd
PZ
4495 max_cpu_load = load;
4496 if (min_cpu_load > load)
4497 min_cpu_load = load;
e44bc5c5
PZ
4498
4499 if (nr_running > max_nr_running)
4500 max_nr_running = nr_running;
4501 if (min_nr_running > nr_running)
4502 min_nr_running = nr_running;
1e3c88bd
PZ
4503 }
4504
4505 sgs->group_load += load;
e44bc5c5 4506 sgs->sum_nr_running += nr_running;
1e3c88bd 4507 sgs->sum_weighted_load += weighted_cpuload(i);
aae6d3dd
SS
4508 if (idle_cpu(i))
4509 sgs->idle_cpus++;
1e3c88bd
PZ
4510 }
4511
4512 /*
4513 * The first idle cpu or the first cpu (busiest) in this sched group
4514 * is eligible for doing load balancing at this and above
4515 * domains. In the newly idle case, we will allow all the cpu's
4516 * to do the newly idle load balance.
4517 */
4ec4412e 4518 if (local_group) {
bd939f45 4519 if (env->idle != CPU_NEWLY_IDLE) {
04f733b4 4520 if (balance_cpu != env->dst_cpu) {
4ec4412e
VG
4521 *balance = 0;
4522 return;
4523 }
bd939f45 4524 update_group_power(env->sd, env->dst_cpu);
4ec4412e 4525 } else if (time_after_eq(jiffies, group->sgp->next_update))
bd939f45 4526 update_group_power(env->sd, env->dst_cpu);
1e3c88bd
PZ
4527 }
4528
4529 /* Adjust by relative CPU power of the group */
9c3f75cb 4530 sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
1e3c88bd 4531
1e3c88bd
PZ
4532 /*
4533 * Consider the group unbalanced when the imbalance is larger
866ab43e 4534 * than the average weight of a task.
1e3c88bd
PZ
4535 *
4536 * APZ: with cgroup the avg task weight can vary wildly and
4537 * might not be a suitable number - should we keep a
4538 * normalized nr_running number somewhere that negates
4539 * the hierarchy?
4540 */
dd5feea1
SS
4541 if (sgs->sum_nr_running)
4542 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
1e3c88bd 4543
e44bc5c5
PZ
4544 if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
4545 (max_nr_running - min_nr_running) > 1)
1e3c88bd
PZ
4546 sgs->group_imb = 1;
4547
9c3f75cb 4548 sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
1399fa78 4549 SCHED_POWER_SCALE);
9d5efe05 4550 if (!sgs->group_capacity)
bd939f45 4551 sgs->group_capacity = fix_small_capacity(env->sd, group);
aae6d3dd 4552 sgs->group_weight = group->group_weight;
fab47622
NR
4553
4554 if (sgs->group_capacity > sgs->sum_nr_running)
4555 sgs->group_has_capacity = 1;
1e3c88bd
PZ
4556}
4557
532cb4c4
MN
4558/**
4559 * update_sd_pick_busiest - return 1 on busiest group
cd96891d 4560 * @env: The load balancing environment.
532cb4c4
MN
4561 * @sds: sched_domain statistics
4562 * @sg: sched_group candidate to be checked for being the busiest
b6b12294 4563 * @sgs: sched_group statistics
532cb4c4
MN
4564 *
4565 * Determine if @sg is a busier group than the previously selected
4566 * busiest group.
4567 */
bd939f45 4568static bool update_sd_pick_busiest(struct lb_env *env,
532cb4c4
MN
4569 struct sd_lb_stats *sds,
4570 struct sched_group *sg,
bd939f45 4571 struct sg_lb_stats *sgs)
532cb4c4
MN
4572{
4573 if (sgs->avg_load <= sds->max_load)
4574 return false;
4575
4576 if (sgs->sum_nr_running > sgs->group_capacity)
4577 return true;
4578
4579 if (sgs->group_imb)
4580 return true;
4581
4582 /*
4583 * ASYM_PACKING needs to move all the work to the lowest
4584 * numbered CPUs in the group, therefore mark all groups
4585 * higher than ourself as busy.
4586 */
bd939f45
PZ
4587 if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
4588 env->dst_cpu < group_first_cpu(sg)) {
532cb4c4
MN
4589 if (!sds->busiest)
4590 return true;
4591
4592 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
4593 return true;
4594 }
4595
4596 return false;
4597}
4598
1e3c88bd 4599/**
461819ac 4600 * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
cd96891d 4601 * @env: The load balancing environment.
1e3c88bd
PZ
4602 * @balance: Should we balance.
4603 * @sds: variable to hold the statistics for this sched_domain.
4604 */
bd939f45 4605static inline void update_sd_lb_stats(struct lb_env *env,
b9403130 4606 int *balance, struct sd_lb_stats *sds)
1e3c88bd 4607{
bd939f45
PZ
4608 struct sched_domain *child = env->sd->child;
4609 struct sched_group *sg = env->sd->groups;
1e3c88bd
PZ
4610 struct sg_lb_stats sgs;
4611 int load_idx, prefer_sibling = 0;
4612
4613 if (child && child->flags & SD_PREFER_SIBLING)
4614 prefer_sibling = 1;
4615
bd939f45 4616 load_idx = get_sd_load_idx(env->sd, env->idle);
1e3c88bd
PZ
4617
4618 do {
4619 int local_group;
4620
bd939f45 4621 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
1e3c88bd 4622 memset(&sgs, 0, sizeof(sgs));
b9403130 4623 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
1e3c88bd 4624
8f190fb3 4625 if (local_group && !(*balance))
1e3c88bd
PZ
4626 return;
4627
4628 sds->total_load += sgs.group_load;
9c3f75cb 4629 sds->total_pwr += sg->sgp->power;
1e3c88bd
PZ
4630
4631 /*
4632 * In case the child domain prefers tasks go to siblings
532cb4c4 4633 * first, lower the sg capacity to one so that we'll try
75dd321d
NR
4634 * and move all the excess tasks away. We lower the capacity
4635 * of a group only if the local group has the capacity to fit
4636 * these excess tasks, i.e. nr_running < group_capacity. The
4637 * extra check prevents the case where you always pull from the
4638 * heaviest group when it is already under-utilized (possible
4639 * with a large weight task outweighs the tasks on the system).
1e3c88bd 4640 */
75dd321d 4641 if (prefer_sibling && !local_group && sds->this_has_capacity)
1e3c88bd
PZ
4642 sgs.group_capacity = min(sgs.group_capacity, 1UL);
4643
4644 if (local_group) {
4645 sds->this_load = sgs.avg_load;
532cb4c4 4646 sds->this = sg;
1e3c88bd
PZ
4647 sds->this_nr_running = sgs.sum_nr_running;
4648 sds->this_load_per_task = sgs.sum_weighted_load;
fab47622 4649 sds->this_has_capacity = sgs.group_has_capacity;
aae6d3dd 4650 sds->this_idle_cpus = sgs.idle_cpus;
bd939f45 4651 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
1e3c88bd 4652 sds->max_load = sgs.avg_load;
532cb4c4 4653 sds->busiest = sg;
1e3c88bd 4654 sds->busiest_nr_running = sgs.sum_nr_running;
aae6d3dd 4655 sds->busiest_idle_cpus = sgs.idle_cpus;
dd5feea1 4656 sds->busiest_group_capacity = sgs.group_capacity;
1e3c88bd 4657 sds->busiest_load_per_task = sgs.sum_weighted_load;
fab47622 4658 sds->busiest_has_capacity = sgs.group_has_capacity;
aae6d3dd 4659 sds->busiest_group_weight = sgs.group_weight;
1e3c88bd
PZ
4660 sds->group_imb = sgs.group_imb;
4661 }
4662
532cb4c4 4663 sg = sg->next;
bd939f45 4664 } while (sg != env->sd->groups);
532cb4c4
MN
4665}
4666
532cb4c4
MN
4667/**
4668 * check_asym_packing - Check to see if the group is packed into the
4669 * sched domain.
4670 *
4671 * This is primarily intended to be used at the sibling level. Some
4672 * cores like POWER7 prefer to use lower numbered SMT threads. In the
4673 * case of POWER7, it can move to lower SMT modes only when higher
4674 * threads are idle. When in lower SMT modes, the threads will
4675 * perform better since they share less core resources. Hence when we
4676 * have idle threads, we want them to be the higher ones.
4677 *
4678 * This packing function is run on idle threads. It checks to see if
4679 * the busiest CPU in this domain (core in the P7 case) has a higher
4680 * CPU number than the packing function is being run on. Here we are
4681 * assuming a lower CPU number will be equivalent to a lower SMT thread
4682 * number.
4683 *
b6b12294
MN
4684 * Returns 1 when packing is required and a task should be moved to
4685 * this CPU. The amount of the imbalance is returned in env->imbalance.
4686 *
cd96891d 4687 * @env: The load balancing environment.
532cb4c4 4688 * @sds: Statistics of the sched_domain which is to be packed
532cb4c4 4689 */
bd939f45 4690static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
532cb4c4
MN
4691{
4692 int busiest_cpu;
4693
bd939f45 4694 if (!(env->sd->flags & SD_ASYM_PACKING))
532cb4c4
MN
4695 return 0;
4696
4697 if (!sds->busiest)
4698 return 0;
4699
4700 busiest_cpu = group_first_cpu(sds->busiest);
bd939f45 4701 if (env->dst_cpu > busiest_cpu)
532cb4c4
MN
4702 return 0;
4703
bd939f45
PZ
4704 env->imbalance = DIV_ROUND_CLOSEST(
4705 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
4706
532cb4c4 4707 return 1;
1e3c88bd
PZ
4708}
4709
4710/**
4711 * fix_small_imbalance - Calculate the minor imbalance that exists
4712 * amongst the groups of a sched_domain, during
4713 * load balancing.
cd96891d 4714 * @env: The load balancing environment.
1e3c88bd 4715 * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bd 4716 */
bd939f45
PZ
4717static inline
4718void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd
PZ
4719{
4720 unsigned long tmp, pwr_now = 0, pwr_move = 0;
4721 unsigned int imbn = 2;
dd5feea1 4722 unsigned long scaled_busy_load_per_task;
1e3c88bd
PZ
4723
4724 if (sds->this_nr_running) {
4725 sds->this_load_per_task /= sds->this_nr_running;
4726 if (sds->busiest_load_per_task >
4727 sds->this_load_per_task)
4728 imbn = 1;
bd939f45 4729 } else {
1e3c88bd 4730 sds->this_load_per_task =
bd939f45
PZ
4731 cpu_avg_load_per_task(env->dst_cpu);
4732 }
1e3c88bd 4733
dd5feea1 4734 scaled_busy_load_per_task = sds->busiest_load_per_task
1399fa78 4735 * SCHED_POWER_SCALE;
9c3f75cb 4736 scaled_busy_load_per_task /= sds->busiest->sgp->power;
dd5feea1
SS
4737
4738 if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
4739 (scaled_busy_load_per_task * imbn)) {
bd939f45 4740 env->imbalance = sds->busiest_load_per_task;
1e3c88bd
PZ
4741 return;
4742 }
4743
4744 /*
4745 * OK, we don't have enough imbalance to justify moving tasks,
4746 * however we may be able to increase total CPU power used by
4747 * moving them.
4748 */
4749
9c3f75cb 4750 pwr_now += sds->busiest->sgp->power *
1e3c88bd 4751 min(sds->busiest_load_per_task, sds->max_load);
9c3f75cb 4752 pwr_now += sds->this->sgp->power *
1e3c88bd 4753 min(sds->this_load_per_task, sds->this_load);
1399fa78 4754 pwr_now /= SCHED_POWER_SCALE;
1e3c88bd
PZ
4755
4756 /* Amount of load we'd subtract */
1399fa78 4757 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
9c3f75cb 4758 sds->busiest->sgp->power;
1e3c88bd 4759 if (sds->max_load > tmp)
9c3f75cb 4760 pwr_move += sds->busiest->sgp->power *
1e3c88bd
PZ
4761 min(sds->busiest_load_per_task, sds->max_load - tmp);
4762
4763 /* Amount of load we'd add */
9c3f75cb 4764 if (sds->max_load * sds->busiest->sgp->power <
1399fa78 4765 sds->busiest_load_per_task * SCHED_POWER_SCALE)
9c3f75cb
PZ
4766 tmp = (sds->max_load * sds->busiest->sgp->power) /
4767 sds->this->sgp->power;
1e3c88bd 4768 else
1399fa78 4769 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
9c3f75cb
PZ
4770 sds->this->sgp->power;
4771 pwr_move += sds->this->sgp->power *
1e3c88bd 4772 min(sds->this_load_per_task, sds->this_load + tmp);
1399fa78 4773 pwr_move /= SCHED_POWER_SCALE;
1e3c88bd
PZ
4774
4775 /* Move if we gain throughput */
4776 if (pwr_move > pwr_now)
bd939f45 4777 env->imbalance = sds->busiest_load_per_task;
1e3c88bd
PZ
4778}
4779
4780/**
4781 * calculate_imbalance - Calculate the amount of imbalance present within the
4782 * groups of a given sched_domain during load balance.
bd939f45 4783 * @env: load balance environment
1e3c88bd 4784 * @sds: statistics of the sched_domain whose imbalance is to be calculated.
1e3c88bd 4785 */
bd939f45 4786static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
1e3c88bd 4787{
dd5feea1
SS
4788 unsigned long max_pull, load_above_capacity = ~0UL;
4789
4790 sds->busiest_load_per_task /= sds->busiest_nr_running;
4791 if (sds->group_imb) {
4792 sds->busiest_load_per_task =
4793 min(sds->busiest_load_per_task, sds->avg_load);
4794 }
4795
1e3c88bd
PZ
4796 /*
4797 * In the presence of smp nice balancing, certain scenarios can have
4798 * max load less than avg load (as we skip the groups at or below
4799 * its cpu_power, while calculating max_load..)
4800 */
4801 if (sds->max_load < sds->avg_load) {
bd939f45
PZ
4802 env->imbalance = 0;
4803 return fix_small_imbalance(env, sds);
1e3c88bd
PZ
4804 }
4805
dd5feea1
SS
4806 if (!sds->group_imb) {
4807 /*
4808 * Don't want to pull so many tasks that a group would go idle.
4809 */
4810 load_above_capacity = (sds->busiest_nr_running -
4811 sds->busiest_group_capacity);
4812
1399fa78 4813 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
dd5feea1 4814
9c3f75cb 4815 load_above_capacity /= sds->busiest->sgp->power;
dd5feea1
SS
4816 }
4817
4818 /*
4819 * We're trying to get all the cpus to the average_load, so we don't
4820 * want to push ourselves above the average load, nor do we wish to
4821 * reduce the max loaded cpu below the average load. At the same time,
4822 * we also don't want to reduce the group load below the group capacity
4823 * (so that we can implement power-savings policies etc). Thus we look
4824 * for the minimum possible imbalance.
4825 * Be careful of negative numbers as they'll appear as very large values
4826 * with unsigned longs.
4827 */
4828 max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
1e3c88bd
PZ
4829
4830 /* How much load to actually move to equalise the imbalance */
bd939f45 4831 env->imbalance = min(max_pull * sds->busiest->sgp->power,
9c3f75cb 4832 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
1399fa78 4833 / SCHED_POWER_SCALE;
1e3c88bd
PZ
4834
4835 /*
4836 * if *imbalance is less than the average load per runnable task
25985edc 4837 * there is no guarantee that any tasks will be moved so we'll have
1e3c88bd
PZ
4838 * a think about bumping its value to force at least one task to be
4839 * moved
4840 */
bd939f45
PZ
4841 if (env->imbalance < sds->busiest_load_per_task)
4842 return fix_small_imbalance(env, sds);
1e3c88bd
PZ
4843
4844}
fab47622 4845
1e3c88bd
PZ
4846/******* find_busiest_group() helpers end here *********************/
4847
4848/**
4849 * find_busiest_group - Returns the busiest group within the sched_domain
4850 * if there is an imbalance. If there isn't an imbalance, and
4851 * the user has opted for power-savings, it returns a group whose
4852 * CPUs can be put to idle by rebalancing those tasks elsewhere, if
4853 * such a group exists.
4854 *
4855 * Also calculates the amount of weighted load which should be moved
4856 * to restore balance.
4857 *
cd96891d 4858 * @env: The load balancing environment.
1e3c88bd
PZ
4859 * @balance: Pointer to a variable indicating if this_cpu
4860 * is the appropriate cpu to perform load balancing at this_level.
4861 *
4862 * Returns: - the busiest group if imbalance exists.
4863 * - If no imbalance and user has opted for power-savings balance,
4864 * return the least loaded group whose CPUs can be
4865 * put to idle by rebalancing its tasks onto our group.
4866 */
4867static struct sched_group *
b9403130 4868find_busiest_group(struct lb_env *env, int *balance)
1e3c88bd
PZ
4869{
4870 struct sd_lb_stats sds;
4871
4872 memset(&sds, 0, sizeof(sds));
4873
4874 /*
4875 * Compute the various statistics relevant for load balancing at
4876 * this level.
4877 */
b9403130 4878 update_sd_lb_stats(env, balance, &sds);
1e3c88bd 4879
cc57aa8f
PZ
4880 /*
4881 * this_cpu is not the appropriate cpu to perform load balancing at
4882 * this level.
1e3c88bd 4883 */
8f190fb3 4884 if (!(*balance))
1e3c88bd
PZ
4885 goto ret;
4886
bd939f45
PZ
4887 if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
4888 check_asym_packing(env, &sds))
532cb4c4
MN
4889 return sds.busiest;
4890
cc57aa8f 4891 /* There is no busy sibling group to pull tasks from */
1e3c88bd
PZ
4892 if (!sds.busiest || sds.busiest_nr_running == 0)
4893 goto out_balanced;
4894
1399fa78 4895 sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
b0432d8f 4896
866ab43e
PZ
4897 /*
4898 * If the busiest group is imbalanced the below checks don't
4899 * work because they assume all things are equal, which typically
4900 * isn't true due to cpus_allowed constraints and the like.
4901 */
4902 if (sds.group_imb)
4903 goto force_balance;
4904
cc57aa8f 4905 /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
bd939f45 4906 if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
fab47622
NR
4907 !sds.busiest_has_capacity)
4908 goto force_balance;
4909
cc57aa8f
PZ
4910 /*
4911 * If the local group is more busy than the selected busiest group
4912 * don't try and pull any tasks.
4913 */
1e3c88bd
PZ
4914 if (sds.this_load >= sds.max_load)
4915 goto out_balanced;
4916
cc57aa8f
PZ
4917 /*
4918 * Don't pull any tasks if this group is already above the domain
4919 * average load.
4920 */
1e3c88bd
PZ
4921 if (sds.this_load >= sds.avg_load)
4922 goto out_balanced;
4923
bd939f45 4924 if (env->idle == CPU_IDLE) {
aae6d3dd
SS
4925 /*
4926 * This cpu is idle. If the busiest group doesn't have
4927 * more tasks than the number of available cpus and there
4928 * is no imbalance between this and the busiest group
4929 * wrt idle cpus, it is balanced.
4930 */
c186fafe 4931 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
aae6d3dd
SS
4932 sds.busiest_nr_running <= sds.busiest_group_weight)
4933 goto out_balanced;
c186fafe
PZ
4934 } else {
4935 /*
4936 * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
4937 * imbalance_pct to be conservative.
4938 */
bd939f45 4939 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load)
c186fafe 4940 goto out_balanced;
aae6d3dd 4941 }
1e3c88bd 4942
fab47622 4943force_balance:
1e3c88bd 4944 /* Looks like there is an imbalance. Compute it */
bd939f45 4945 calculate_imbalance(env, &sds);
1e3c88bd
PZ
4946 return sds.busiest;
4947
4948out_balanced:
1e3c88bd 4949ret:
bd939f45 4950 env->imbalance = 0;
1e3c88bd
PZ
4951 return NULL;
4952}
4953
4954/*
4955 * find_busiest_queue - find the busiest runqueue among the cpus in group.
4956 */
bd939f45 4957static struct rq *find_busiest_queue(struct lb_env *env,
b9403130 4958 struct sched_group *group)
1e3c88bd
PZ
4959{
4960 struct rq *busiest = NULL, *rq;
4961 unsigned long max_load = 0;
4962 int i;
4963
4964 for_each_cpu(i, sched_group_cpus(group)) {
4965 unsigned long power = power_of(i);
1399fa78
NR
4966 unsigned long capacity = DIV_ROUND_CLOSEST(power,
4967 SCHED_POWER_SCALE);
1e3c88bd
PZ
4968 unsigned long wl;
4969
9d5efe05 4970 if (!capacity)
bd939f45 4971 capacity = fix_small_capacity(env->sd, group);
9d5efe05 4972
b9403130 4973 if (!cpumask_test_cpu(i, env->cpus))
1e3c88bd
PZ
4974 continue;
4975
4976 rq = cpu_rq(i);
6e40f5bb 4977 wl = weighted_cpuload(i);
1e3c88bd 4978
6e40f5bb
TG
4979 /*
4980 * When comparing with imbalance, use weighted_cpuload()
4981 * which is not scaled with the cpu power.
4982 */
bd939f45 4983 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
1e3c88bd
PZ
4984 continue;
4985
6e40f5bb
TG
4986 /*
4987 * For the load comparisons with the other cpu's, consider
4988 * the weighted_cpuload() scaled with the cpu power, so that
4989 * the load can be moved away from the cpu that is potentially
4990 * running at a lower capacity.
4991 */
1399fa78 4992 wl = (wl * SCHED_POWER_SCALE) / power;
6e40f5bb 4993
1e3c88bd
PZ
4994 if (wl > max_load) {
4995 max_load = wl;
4996 busiest = rq;
4997 }
4998 }
4999
5000 return busiest;
5001}
5002
5003/*
5004 * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
5005 * so long as it is large enough.
5006 */
5007#define MAX_PINNED_INTERVAL 512
5008
5009/* Working cpumask for load_balance and load_balance_newidle. */
e6252c3e 5010DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
1e3c88bd 5011
bd939f45 5012static int need_active_balance(struct lb_env *env)
1af3ed3d 5013{
bd939f45
PZ
5014 struct sched_domain *sd = env->sd;
5015
5016 if (env->idle == CPU_NEWLY_IDLE) {
532cb4c4
MN
5017
5018 /*
5019 * ASYM_PACKING needs to force migrate tasks from busy but
5020 * higher numbered CPUs in order to pack all tasks in the
5021 * lowest numbered CPUs.
5022 */
bd939f45 5023 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
532cb4c4 5024 return 1;
1af3ed3d
PZ
5025 }
5026
5027 return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
5028}
5029
969c7921
TH
5030static int active_load_balance_cpu_stop(void *data);
5031
1e3c88bd
PZ
5032/*
5033 * Check this_cpu to ensure it is balanced within domain. Attempt to move
5034 * tasks if there is an imbalance.
5035 */
5036static int load_balance(int this_cpu, struct rq *this_rq,
5037 struct sched_domain *sd, enum cpu_idle_type idle,
5038 int *balance)
5039{
88b8dac0 5040 int ld_moved, cur_ld_moved, active_balance = 0;
1e3c88bd 5041 struct sched_group *group;
1e3c88bd
PZ
5042 struct rq *busiest;
5043 unsigned long flags;
e6252c3e 5044 struct cpumask *cpus = __get_cpu_var(load_balance_mask);
1e3c88bd 5045
8e45cb54
PZ
5046 struct lb_env env = {
5047 .sd = sd,
ddcdf6e7
PZ
5048 .dst_cpu = this_cpu,
5049 .dst_rq = this_rq,
88b8dac0 5050 .dst_grpmask = sched_group_cpus(sd->groups),
8e45cb54 5051 .idle = idle,
eb95308e 5052 .loop_break = sched_nr_migrate_break,
b9403130 5053 .cpus = cpus,
8e45cb54
PZ
5054 };
5055
cfc03118
JK
5056 /*
5057 * For NEWLY_IDLE load_balancing, we don't need to consider
5058 * other cpus in our group
5059 */
e02e60c1 5060 if (idle == CPU_NEWLY_IDLE)
cfc03118 5061 env.dst_grpmask = NULL;
cfc03118 5062
1e3c88bd
PZ
5063 cpumask_copy(cpus, cpu_active_mask);
5064
1e3c88bd
PZ
5065 schedstat_inc(sd, lb_count[idle]);
5066
5067redo:
b9403130 5068 group = find_busiest_group(&env, balance);
1e3c88bd
PZ
5069
5070 if (*balance == 0)
5071 goto out_balanced;
5072
5073 if (!group) {
5074 schedstat_inc(sd, lb_nobusyg[idle]);
5075 goto out_balanced;
5076 }
5077
b9403130 5078 busiest = find_busiest_queue(&env, group);
1e3c88bd
PZ
5079 if (!busiest) {
5080 schedstat_inc(sd, lb_nobusyq[idle]);
5081 goto out_balanced;
5082 }
5083
78feefc5 5084 BUG_ON(busiest == env.dst_rq);
1e3c88bd 5085
bd939f45 5086 schedstat_add(sd, lb_imbalance[idle], env.imbalance);
1e3c88bd
PZ
5087
5088 ld_moved = 0;
5089 if (busiest->nr_running > 1) {
5090 /*
5091 * Attempt to move tasks. If find_busiest_group has found
5092 * an imbalance but busiest->nr_running <= 1, the group is
5093 * still unbalanced. ld_moved simply stays zero, so it is
5094 * correctly treated as an imbalance.
5095 */
8e45cb54 5096 env.flags |= LBF_ALL_PINNED;
c82513e5
PZ
5097 env.src_cpu = busiest->cpu;
5098 env.src_rq = busiest;
5099 env.loop_max = min(sysctl_sched_nr_migrate, busiest->nr_running);
8e45cb54 5100
a35b6466 5101 update_h_load(env.src_cpu);
5d6523eb 5102more_balance:
1e3c88bd 5103 local_irq_save(flags);
78feefc5 5104 double_rq_lock(env.dst_rq, busiest);
88b8dac0
SV
5105
5106 /*
5107 * cur_ld_moved - load moved in current iteration
5108 * ld_moved - cumulative load moved across iterations
5109 */
5110 cur_ld_moved = move_tasks(&env);
5111 ld_moved += cur_ld_moved;
78feefc5 5112 double_rq_unlock(env.dst_rq, busiest);
1e3c88bd
PZ
5113 local_irq_restore(flags);
5114
5115 /*
5116 * some other cpu did the load balance for us.
5117 */
88b8dac0
SV
5118 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
5119 resched_cpu(env.dst_cpu);
5120
f1cd0858
JK
5121 if (env.flags & LBF_NEED_BREAK) {
5122 env.flags &= ~LBF_NEED_BREAK;
5123 goto more_balance;
5124 }
5125
88b8dac0
SV
5126 /*
5127 * Revisit (affine) tasks on src_cpu that couldn't be moved to
5128 * us and move them to an alternate dst_cpu in our sched_group
5129 * where they can run. The upper limit on how many times we
5130 * iterate on same src_cpu is dependent on number of cpus in our
5131 * sched_group.
5132 *
5133 * This changes load balance semantics a bit on who can move
5134 * load to a given_cpu. In addition to the given_cpu itself
5135 * (or an ilb_cpu acting on its behalf where given_cpu is
5136 * nohz-idle), we now have balance_cpu in a position to move
5137 * load to given_cpu. In rare situations, this may cause
5138 * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
5139 * _independently_ and at _same_ time to move some load to
5140 * given_cpu) causing excess load to be moved to given_cpu.
5141 * This however should not happen so much in practice and
5142 * moreover subsequent load balance cycles should correct the
5143 * excess load moved.
5144 */
e02e60c1 5145 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
88b8dac0 5146
78feefc5 5147 env.dst_rq = cpu_rq(env.new_dst_cpu);
88b8dac0
SV
5148 env.dst_cpu = env.new_dst_cpu;
5149 env.flags &= ~LBF_SOME_PINNED;
5150 env.loop = 0;
5151 env.loop_break = sched_nr_migrate_break;
e02e60c1
JK
5152
5153 /* Prevent to re-select dst_cpu via env's cpus */
5154 cpumask_clear_cpu(env.dst_cpu, env.cpus);
5155
88b8dac0
SV
5156 /*
5157 * Go back to "more_balance" rather than "redo" since we
5158 * need to continue with the same src_cpu.
5159 */
5160 goto more_balance;
5161 }
1e3c88bd
PZ
5162
5163 /* All tasks on this runqueue were pinned by CPU affinity */
8e45cb54 5164 if (unlikely(env.flags & LBF_ALL_PINNED)) {
1e3c88bd 5165 cpumask_clear_cpu(cpu_of(busiest), cpus);
bbf18b19
PN
5166 if (!cpumask_empty(cpus)) {
5167 env.loop = 0;
5168 env.loop_break = sched_nr_migrate_break;
1e3c88bd 5169 goto redo;
bbf18b19 5170 }
1e3c88bd
PZ
5171 goto out_balanced;
5172 }
5173 }
5174
5175 if (!ld_moved) {
5176 schedstat_inc(sd, lb_failed[idle]);
58b26c4c
VP
5177 /*
5178 * Increment the failure counter only on periodic balance.
5179 * We do not want newidle balance, which can be very
5180 * frequent, to pollute the failure counter, causing
5181 * excessive cache_hot migrations and active balances.
5182 */
5183 if (idle != CPU_NEWLY_IDLE)
5184 sd->nr_balance_failed++;
1e3c88bd 5185
bd939f45 5186 if (need_active_balance(&env)) {
1e3c88bd
PZ
5187 raw_spin_lock_irqsave(&busiest->lock, flags);
5188
969c7921
TH
5189 /* don't kick the active_load_balance_cpu_stop,
5190 * if the curr task on busiest cpu can't be
5191 * moved to this_cpu
1e3c88bd
PZ
5192 */
5193 if (!cpumask_test_cpu(this_cpu,
fa17b507 5194 tsk_cpus_allowed(busiest->curr))) {
1e3c88bd
PZ
5195 raw_spin_unlock_irqrestore(&busiest->lock,
5196 flags);
8e45cb54 5197 env.flags |= LBF_ALL_PINNED;
1e3c88bd
PZ
5198 goto out_one_pinned;
5199 }
5200
969c7921
TH
5201 /*
5202 * ->active_balance synchronizes accesses to
5203 * ->active_balance_work. Once set, it's cleared
5204 * only after active load balance is finished.
5205 */
1e3c88bd
PZ
5206 if (!busiest->active_balance) {
5207 busiest->active_balance = 1;
5208 busiest->push_cpu = this_cpu;
5209 active_balance = 1;
5210 }
5211 raw_spin_unlock_irqrestore(&busiest->lock, flags);
969c7921 5212
bd939f45 5213 if (active_balance) {
969c7921
TH
5214 stop_one_cpu_nowait(cpu_of(busiest),
5215 active_load_balance_cpu_stop, busiest,
5216 &busiest->active_balance_work);
bd939f45 5217 }
1e3c88bd
PZ
5218
5219 /*
5220 * We've kicked active balancing, reset the failure
5221 * counter.
5222 */
5223 sd->nr_balance_failed = sd->cache_nice_tries+1;
5224 }
5225 } else
5226 sd->nr_balance_failed = 0;
5227
5228 if (likely(!active_balance)) {
5229 /* We were unbalanced, so reset the balancing interval */
5230 sd->balance_interval = sd->min_interval;
5231 } else {
5232 /*
5233 * If we've begun active balancing, start to back off. This
5234 * case may not be covered by the all_pinned logic if there
5235 * is only 1 task on the busy runqueue (because we don't call
5236 * move_tasks).
5237 */
5238 if (sd->balance_interval < sd->max_interval)
5239 sd->balance_interval *= 2;
5240 }
5241
1e3c88bd
PZ
5242 goto out;
5243
5244out_balanced:
5245 schedstat_inc(sd, lb_balanced[idle]);
5246
5247 sd->nr_balance_failed = 0;
5248
5249out_one_pinned:
5250 /* tune up the balancing interval */
8e45cb54 5251 if (((env.flags & LBF_ALL_PINNED) &&
5b54b56b 5252 sd->balance_interval < MAX_PINNED_INTERVAL) ||
1e3c88bd
PZ
5253 (sd->balance_interval < sd->max_interval))
5254 sd->balance_interval *= 2;
5255
46e49b38 5256 ld_moved = 0;
1e3c88bd 5257out:
1e3c88bd
PZ
5258 return ld_moved;
5259}
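/*
 * Editor's illustrative sketch -- not part of the original source.  It
 * isolates the balance_interval handling used at the end of load_balance()
 * above: reset to min_interval when we balanced without resorting to active
 * balancing, otherwise double the interval up to max_interval.  The struct
 * and function names here are hypothetical.
 */
struct demo_sd {
	unsigned long balance_interval;	/* in ms, like sched_domain */
	unsigned long min_interval;
	unsigned long max_interval;
};

static void demo_update_balance_interval(struct demo_sd *sd, int active_balance)
{
	if (!active_balance) {
		/* We were unbalanced but made progress: rebalance eagerly again. */
		sd->balance_interval = sd->min_interval;
	} else if (sd->balance_interval < sd->max_interval) {
		/* Active balancing was needed: back off exponentially, capped. */
		sd->balance_interval *= 2;
	}
}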
5260
1e3c88bd
PZ
5261/*
5262 * idle_balance is called by schedule() if this_cpu is about to become
5263 * idle. Attempts to pull tasks from other CPUs.
5264 */
029632fb 5265void idle_balance(int this_cpu, struct rq *this_rq)
1e3c88bd
PZ
5266{
5267 struct sched_domain *sd;
5268 int pulled_task = 0;
5269 unsigned long next_balance = jiffies + HZ;
5270
5271 this_rq->idle_stamp = this_rq->clock;
5272
5273 if (this_rq->avg_idle < sysctl_sched_migration_cost)
5274 return;
5275
f492e12e
PZ
5276 /*
5277 * Drop the rq->lock, but keep IRQ/preempt disabled.
5278 */
5279 raw_spin_unlock(&this_rq->lock);
5280
48a16753 5281 update_blocked_averages(this_cpu);
dce840a0 5282 rcu_read_lock();
1e3c88bd
PZ
5283 for_each_domain(this_cpu, sd) {
5284 unsigned long interval;
f492e12e 5285 int balance = 1;
1e3c88bd
PZ
5286
5287 if (!(sd->flags & SD_LOAD_BALANCE))
5288 continue;
5289
f492e12e 5290 if (sd->flags & SD_BALANCE_NEWIDLE) {
1e3c88bd 5291 /* If we've pulled tasks over stop searching: */
f492e12e
PZ
5292 pulled_task = load_balance(this_cpu, this_rq,
5293 sd, CPU_NEWLY_IDLE, &balance);
5294 }
1e3c88bd
PZ
5295
5296 interval = msecs_to_jiffies(sd->balance_interval);
5297 if (time_after(next_balance, sd->last_balance + interval))
5298 next_balance = sd->last_balance + interval;
d5ad140b
NR
5299 if (pulled_task) {
5300 this_rq->idle_stamp = 0;
1e3c88bd 5301 break;
d5ad140b 5302 }
1e3c88bd 5303 }
dce840a0 5304 rcu_read_unlock();
f492e12e
PZ
5305
5306 raw_spin_lock(&this_rq->lock);
5307
1e3c88bd
PZ
5308 if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
5309 /*
5310 * We are going idle. next_balance may be set based on
5311 * a busy processor. So reset next_balance.
5312 */
5313 this_rq->next_balance = next_balance;
5314 }
5315}
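/*
 * Editor's note with a minimal sketch -- not part of the original source.
 * idle_balance() above bails out when the CPU's average idle period is
 * shorter than sysctl_sched_migration_cost: pulling a task would likely cost
 * more than the idle gap it could fill.  The helper name and the example
 * values in the comment are hypothetical.
 */
static inline int demo_worth_newidle_balance(u64 avg_idle_ns, u64 migration_cost_ns)
{
	/* e.g. avg_idle = 200000 (0.2 ms) vs. a 500000 (0.5 ms) cost -> skip */
	return avg_idle_ns >= migration_cost_ns;
}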
5316
5317/*
969c7921
TH
5318 * active_load_balance_cpu_stop is run by cpu stopper. It pushes
5319 * running tasks off the busiest CPU onto idle CPUs. It requires at
5320 * least 1 task to be running on each physical CPU where possible, and
5321 * avoids physical / logical imbalances.
1e3c88bd 5322 */
969c7921 5323static int active_load_balance_cpu_stop(void *data)
1e3c88bd 5324{
969c7921
TH
5325 struct rq *busiest_rq = data;
5326 int busiest_cpu = cpu_of(busiest_rq);
1e3c88bd 5327 int target_cpu = busiest_rq->push_cpu;
969c7921 5328 struct rq *target_rq = cpu_rq(target_cpu);
1e3c88bd 5329 struct sched_domain *sd;
969c7921
TH
5330
5331 raw_spin_lock_irq(&busiest_rq->lock);
5332
5333 /* make sure the requested cpu hasn't gone down in the meantime */
5334 if (unlikely(busiest_cpu != smp_processor_id() ||
5335 !busiest_rq->active_balance))
5336 goto out_unlock;
1e3c88bd
PZ
5337
5338 /* Is there any task to move? */
5339 if (busiest_rq->nr_running <= 1)
969c7921 5340 goto out_unlock;
1e3c88bd
PZ
5341
5342 /*
5343 * This condition is "impossible"; if it occurs
5344 * we need to fix it. Originally reported by
5345 * Bjorn Helgaas on a 128-cpu setup.
5346 */
5347 BUG_ON(busiest_rq == target_rq);
5348
5349 /* move a task from busiest_rq to target_rq */
5350 double_lock_balance(busiest_rq, target_rq);
1e3c88bd
PZ
5351
5352 /* Search for an sd spanning us and the target CPU. */
dce840a0 5353 rcu_read_lock();
1e3c88bd
PZ
5354 for_each_domain(target_cpu, sd) {
5355 if ((sd->flags & SD_LOAD_BALANCE) &&
5356 cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
5357 break;
5358 }
5359
5360 if (likely(sd)) {
8e45cb54
PZ
5361 struct lb_env env = {
5362 .sd = sd,
ddcdf6e7
PZ
5363 .dst_cpu = target_cpu,
5364 .dst_rq = target_rq,
5365 .src_cpu = busiest_rq->cpu,
5366 .src_rq = busiest_rq,
8e45cb54
PZ
5367 .idle = CPU_IDLE,
5368 };
5369
1e3c88bd
PZ
5370 schedstat_inc(sd, alb_count);
5371
8e45cb54 5372 if (move_one_task(&env))
1e3c88bd
PZ
5373 schedstat_inc(sd, alb_pushed);
5374 else
5375 schedstat_inc(sd, alb_failed);
5376 }
dce840a0 5377 rcu_read_unlock();
1e3c88bd 5378 double_unlock_balance(busiest_rq, target_rq);
969c7921
TH
5379out_unlock:
5380 busiest_rq->active_balance = 0;
5381 raw_spin_unlock_irq(&busiest_rq->lock);
5382 return 0;
1e3c88bd
PZ
5383}
5384
3451d024 5385#ifdef CONFIG_NO_HZ_COMMON
83cd4fe2
VP
5386/*
5387 * idle load balancing details
83cd4fe2
VP
5388 * - When one of the busy CPUs notices that idle rebalancing may be
5389 * needed, it kicks the idle load balancer, which then does idle
5390 * load balancing for all the idle CPUs.
5391 */
1e3c88bd 5392static struct {
83cd4fe2 5393 cpumask_var_t idle_cpus_mask;
0b005cf5 5394 atomic_t nr_cpus;
83cd4fe2
VP
5395 unsigned long next_balance; /* in jiffy units */
5396} nohz ____cacheline_aligned;
1e3c88bd 5397
8e7fbcbc 5398static inline int find_new_ilb(int call_cpu)
1e3c88bd 5399{
0b005cf5 5400 int ilb = cpumask_first(nohz.idle_cpus_mask);
1e3c88bd 5401
786d6dc7
SS
5402 if (ilb < nr_cpu_ids && idle_cpu(ilb))
5403 return ilb;
5404
5405 return nr_cpu_ids;
1e3c88bd 5406}
1e3c88bd 5407
83cd4fe2
VP
5408/*
5409 * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
5410 * nohz_load_balancer CPU (if there is one); otherwise we fall back to any
5411 * idle CPU (if there is one).
5412 */
5413static void nohz_balancer_kick(int cpu)
5414{
5415 int ilb_cpu;
5416
5417 nohz.next_balance++;
5418
0b005cf5 5419 ilb_cpu = find_new_ilb(cpu);
83cd4fe2 5420
0b005cf5
SS
5421 if (ilb_cpu >= nr_cpu_ids)
5422 return;
83cd4fe2 5423
cd490c5b 5424 if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
1c792db7
SS
5425 return;
5426 /*
5427 * Use smp_send_reschedule() instead of resched_cpu().
5428 * This way we generate a sched IPI on the target cpu which
5429 * is idle. The softirq performing the nohz idle load balance
5430 * will then run before returning from the IPI.
5431 */
5432 smp_send_reschedule(ilb_cpu);
83cd4fe2
VP
5433 return;
5434}
5435
c1cc017c 5436static inline void nohz_balance_exit_idle(int cpu)
71325960
SS
5437{
5438 if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
5439 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
5440 atomic_dec(&nohz.nr_cpus);
5441 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
5442 }
5443}
5444
69e1e811
SS
5445static inline void set_cpu_sd_state_busy(void)
5446{
5447 struct sched_domain *sd;
5448 int cpu = smp_processor_id();
5449
69e1e811 5450 rcu_read_lock();
25f55d9d
VG
5451 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5452
5453 if (!sd || !sd->nohz_idle)
5454 goto unlock;
5455 sd->nohz_idle = 0;
5456
5457 for (; sd; sd = sd->parent)
69e1e811 5458 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
25f55d9d 5459unlock:
69e1e811
SS
5460 rcu_read_unlock();
5461}
5462
5463void set_cpu_sd_state_idle(void)
5464{
5465 struct sched_domain *sd;
5466 int cpu = smp_processor_id();
5467
69e1e811 5468 rcu_read_lock();
25f55d9d
VG
5469 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
5470
5471 if (!sd || sd->nohz_idle)
5472 goto unlock;
5473 sd->nohz_idle = 1;
5474
5475 for (; sd; sd = sd->parent)
69e1e811 5476 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
25f55d9d 5477unlock:
69e1e811
SS
5478 rcu_read_unlock();
5479}
5480
1e3c88bd 5481/*
c1cc017c 5482 * This routine will record that the cpu is going idle with tick stopped.
0b005cf5 5483 * This info will be used in performing idle load balancing in the future.
1e3c88bd 5484 */
c1cc017c 5485void nohz_balance_enter_idle(int cpu)
1e3c88bd 5486{
71325960
SS
5487 /*
5488 * If this cpu is going down, then nothing needs to be done.
5489 */
5490 if (!cpu_active(cpu))
5491 return;
5492
c1cc017c
AS
5493 if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
5494 return;
1e3c88bd 5495
c1cc017c
AS
5496 cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
5497 atomic_inc(&nohz.nr_cpus);
5498 set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
1e3c88bd 5499}
71325960
SS
5500
5501static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
5502 unsigned long action, void *hcpu)
5503{
5504 switch (action & ~CPU_TASKS_FROZEN) {
5505 case CPU_DYING:
c1cc017c 5506 nohz_balance_exit_idle(smp_processor_id());
71325960
SS
5507 return NOTIFY_OK;
5508 default:
5509 return NOTIFY_DONE;
5510 }
5511}
1e3c88bd
PZ
5512#endif
5513
5514static DEFINE_SPINLOCK(balancing);
5515
49c022e6
PZ
5516/*
5517 * Scale the max load_balance interval with the number of CPUs in the system.
5518 * This trades load-balance latency on larger machines for less cross talk.
5519 */
029632fb 5520void update_max_interval(void)
49c022e6
PZ
5521{
5522 max_load_balance_interval = HZ*num_online_cpus()/10;
5523}
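/*
 * Editor's worked example -- not part of the original source, values
 * hypothetical: with HZ=250 and 8 online CPUs the line above yields
 * 250 * 8 / 10 = 200 jiffies (~800 ms); with HZ=1000 and 4 CPUs it yields
 * 1000 * 4 / 10 = 400 jiffies (400 ms).
 */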
5524
1e3c88bd
PZ
5525/*
5526 * It checks each scheduling domain to see if it is due to be balanced,
5527 * and initiates a balancing operation if so.
5528 *
b9b0853a 5529 * Balancing parameters are set up in init_sched_domains.
1e3c88bd
PZ
5530 */
5531static void rebalance_domains(int cpu, enum cpu_idle_type idle)
5532{
5533 int balance = 1;
5534 struct rq *rq = cpu_rq(cpu);
5535 unsigned long interval;
04f733b4 5536 struct sched_domain *sd;
1e3c88bd
PZ
5537 /* Earliest time when we have to do rebalance again */
5538 unsigned long next_balance = jiffies + 60*HZ;
5539 int update_next_balance = 0;
5540 int need_serialize;
5541
48a16753 5542 update_blocked_averages(cpu);
2069dd75 5543
dce840a0 5544 rcu_read_lock();
1e3c88bd
PZ
5545 for_each_domain(cpu, sd) {
5546 if (!(sd->flags & SD_LOAD_BALANCE))
5547 continue;
5548
5549 interval = sd->balance_interval;
5550 if (idle != CPU_IDLE)
5551 interval *= sd->busy_factor;
5552
5553 /* scale ms to jiffies */
5554 interval = msecs_to_jiffies(interval);
49c022e6 5555 interval = clamp(interval, 1UL, max_load_balance_interval);
1e3c88bd
PZ
5556
5557 need_serialize = sd->flags & SD_SERIALIZE;
5558
5559 if (need_serialize) {
5560 if (!spin_trylock(&balancing))
5561 goto out;
5562 }
5563
5564 if (time_after_eq(jiffies, sd->last_balance + interval)) {
5565 if (load_balance(cpu, rq, sd, idle, &balance)) {
5566 /*
de5eb2dd
JK
5567 * The LBF_SOME_PINNED logic could have changed
5568 * env->dst_cpu, so we can't know our idle
5569 * state even if we migrated tasks. Update it.
1e3c88bd 5570 */
de5eb2dd 5571 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
1e3c88bd
PZ
5572 }
5573 sd->last_balance = jiffies;
5574 }
5575 if (need_serialize)
5576 spin_unlock(&balancing);
5577out:
5578 if (time_after(next_balance, sd->last_balance + interval)) {
5579 next_balance = sd->last_balance + interval;
5580 update_next_balance = 1;
5581 }
5582
5583 /*
5584 * Stop the load balance at this level. There is another
5585 * CPU in our sched group which is doing load balancing more
5586 * actively.
5587 */
5588 if (!balance)
5589 break;
5590 }
dce840a0 5591 rcu_read_unlock();
1e3c88bd
PZ
5592
5593 /*
5594 * next_balance will be updated only when there is a need.
5595 * When the cpu is attached to a null domain, for example, it will not be
5596 * updated.
5597 */
5598 if (likely(update_next_balance))
5599 rq->next_balance = next_balance;
5600}
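/*
 * Editor's illustrative sketch -- not part of the original source.  It pulls
 * the per-domain interval computation from rebalance_domains() above into a
 * standalone helper to show the order of operations: scale by busy_factor
 * when the CPU is not idle, convert ms to jiffies, then clamp.  Only the
 * helper name is hypothetical; the symbols it uses appear in the code above.
 */
static unsigned long demo_effective_interval(struct sched_domain *sd,
					     enum cpu_idle_type idle)
{
	unsigned long interval = sd->balance_interval;	/* in ms */

	/* Busy CPUs rebalance less frequently than idle ones. */
	if (idle != CPU_IDLE)
		interval *= sd->busy_factor;

	interval = msecs_to_jiffies(interval);
	return clamp(interval, 1UL, max_load_balance_interval);
}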
5601
3451d024 5602#ifdef CONFIG_NO_HZ_COMMON
1e3c88bd 5603/*
3451d024 5604 * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
1e3c88bd
PZ
5605 * rebalancing for all the cpus for whom scheduler ticks are stopped.
5606 */
83cd4fe2
VP
5607static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
5608{
5609 struct rq *this_rq = cpu_rq(this_cpu);
5610 struct rq *rq;
5611 int balance_cpu;
5612
1c792db7
SS
5613 if (idle != CPU_IDLE ||
5614 !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
5615 goto end;
83cd4fe2
VP
5616
5617 for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
8a6d42d1 5618 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
83cd4fe2
VP
5619 continue;
5620
5621 /*
5622 * If this cpu gets work to do, stop the load balancing
5623 * work being done for other cpus. The next load
5624 * balancing owner will pick it up.
5625 */
1c792db7 5626 if (need_resched())
83cd4fe2 5627 break;
83cd4fe2 5628
5ed4f1d9
VG
5629 rq = cpu_rq(balance_cpu);
5630
5631 raw_spin_lock_irq(&rq->lock);
5632 update_rq_clock(rq);
5633 update_idle_cpu_load(rq);
5634 raw_spin_unlock_irq(&rq->lock);
83cd4fe2
VP
5635
5636 rebalance_domains(balance_cpu, CPU_IDLE);
5637
83cd4fe2
VP
5638 if (time_after(this_rq->next_balance, rq->next_balance))
5639 this_rq->next_balance = rq->next_balance;
5640 }
5641 nohz.next_balance = this_rq->next_balance;
1c792db7
SS
5642end:
5643 clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
83cd4fe2
VP
5644}
5645
5646/*
0b005cf5
SS
5647 * Current heuristic for kicking the idle load balancer in the presence
5648 * of an idle cpu in the system. A kick is needed when:
5649 * - This rq has more than one task.
5650 * - At any scheduler domain level, this cpu's scheduler group has multiple
5651 * busy cpus exceeding the group's power.
5652 * - For SD_ASYM_PACKING, if the lower-numbered cpus in the scheduler
5653 * domain span are idle.
83cd4fe2
VP
5654 */
5655static inline int nohz_kick_needed(struct rq *rq, int cpu)
5656{
5657 unsigned long now = jiffies;
0b005cf5 5658 struct sched_domain *sd;
83cd4fe2 5659
1c792db7 5660 if (unlikely(idle_cpu(cpu)))
83cd4fe2
VP
5661 return 0;
5662
1c792db7
SS
5663 /*
5664 * We may have recently been in ticked or tickless idle mode. At the first
5665 * busy tick after returning from idle, we will update the busy stats.
5666 */
69e1e811 5667 set_cpu_sd_state_busy();
c1cc017c 5668 nohz_balance_exit_idle(cpu);
0b005cf5
SS
5669
5670 /*
5671 * None are in tickless mode and hence no need for NOHZ idle load
5672 * balancing.
5673 */
5674 if (likely(!atomic_read(&nohz.nr_cpus)))
5675 return 0;
1c792db7
SS
5676
5677 if (time_before(now, nohz.next_balance))
83cd4fe2
VP
5678 return 0;
5679
0b005cf5
SS
5680 if (rq->nr_running >= 2)
5681 goto need_kick;
83cd4fe2 5682
067491b7 5683 rcu_read_lock();
0b005cf5
SS
5684 for_each_domain(cpu, sd) {
5685 struct sched_group *sg = sd->groups;
5686 struct sched_group_power *sgp = sg->sgp;
5687 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
83cd4fe2 5688
0b005cf5 5689 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
067491b7 5690 goto need_kick_unlock;
0b005cf5
SS
5691
5692 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
5693 && (cpumask_first_and(nohz.idle_cpus_mask,
5694 sched_domain_span(sd)) < cpu))
067491b7 5695 goto need_kick_unlock;
0b005cf5
SS
5696
5697 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
5698 break;
83cd4fe2 5699 }
067491b7 5700 rcu_read_unlock();
83cd4fe2 5701 return 0;
067491b7
PZ
5702
5703need_kick_unlock:
5704 rcu_read_unlock();
0b005cf5
SS
5705need_kick:
5706 return 1;
83cd4fe2
VP
5707}
5708#else
5709static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
5710#endif
5711
5712/*
5713 * run_rebalance_domains is triggered when needed from the scheduler tick.
5714 * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
5715 */
1e3c88bd
PZ
5716static void run_rebalance_domains(struct softirq_action *h)
5717{
5718 int this_cpu = smp_processor_id();
5719 struct rq *this_rq = cpu_rq(this_cpu);
6eb57e0d 5720 enum cpu_idle_type idle = this_rq->idle_balance ?
1e3c88bd
PZ
5721 CPU_IDLE : CPU_NOT_IDLE;
5722
5723 rebalance_domains(this_cpu, idle);
5724
1e3c88bd 5725 /*
83cd4fe2 5726 * If this cpu has a pending nohz_balance_kick, then do the
1e3c88bd
PZ
5727 * balancing on behalf of the other idle cpus whose ticks are
5728 * stopped.
5729 */
83cd4fe2 5730 nohz_idle_balance(this_cpu, idle);
1e3c88bd
PZ
5731}
5732
5733static inline int on_null_domain(int cpu)
5734{
90a6501f 5735 return !rcu_dereference_sched(cpu_rq(cpu)->sd);
1e3c88bd
PZ
5736}
5737
5738/*
5739 * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
1e3c88bd 5740 */
029632fb 5741void trigger_load_balance(struct rq *rq, int cpu)
1e3c88bd 5742{
1e3c88bd
PZ
5743 /* Don't need to rebalance while attached to NULL domain */
5744 if (time_after_eq(jiffies, rq->next_balance) &&
5745 likely(!on_null_domain(cpu)))
5746 raise_softirq(SCHED_SOFTIRQ);
3451d024 5747#ifdef CONFIG_NO_HZ_COMMON
1c792db7 5748 if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
83cd4fe2
VP
5749 nohz_balancer_kick(cpu);
5750#endif
1e3c88bd
PZ
5751}
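/*
 * Editor's summary -- not part of the original source.  The nohz idle
 * balancing pieces above fit together roughly as follows:
 *
 *   busy tick -> trigger_load_balance() -> nohz_kick_needed()
 *     -> nohz_balancer_kick(): set NOHZ_BALANCE_KICK, IPI the chosen idle cpu
 *     -> that cpu runs the SCHED_SOFTIRQ -> run_rebalance_domains()
 *     -> nohz_idle_balance(): rebalance on behalf of all tickless idle cpus.
 */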
5752
0bcdcf28
CE
5753static void rq_online_fair(struct rq *rq)
5754{
5755 update_sysctl();
5756}
5757
5758static void rq_offline_fair(struct rq *rq)
5759{
5760 update_sysctl();
a4c96ae3
PB
5761
5762 /* Ensure any throttled groups are reachable by pick_next_task */
5763 unthrottle_offline_cfs_rqs(rq);
0bcdcf28
CE
5764}
5765
55e12e5e 5766#endif /* CONFIG_SMP */
e1d1484f 5767
bf0f6f24
IM
5768/*
5769 * scheduler tick hitting a task of our scheduling class:
5770 */
8f4d37ec 5771static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
bf0f6f24
IM
5772{
5773 struct cfs_rq *cfs_rq;
5774 struct sched_entity *se = &curr->se;
5775
5776 for_each_sched_entity(se) {
5777 cfs_rq = cfs_rq_of(se);
8f4d37ec 5778 entity_tick(cfs_rq, se, queued);
bf0f6f24 5779 }
18bf2805 5780
cbee9f88
PZ
5781 if (sched_feat_numa(NUMA))
5782 task_tick_numa(rq, curr);
3d59eebc 5783
18bf2805 5784 update_rq_runnable_avg(rq, 1);
bf0f6f24
IM
5785}
5786
5787/*
cd29fe6f
PZ
5788 * called on fork with the child task as argument from the parent's context
5789 * - child not yet on the tasklist
5790 * - preemption disabled
bf0f6f24 5791 */
cd29fe6f 5792static void task_fork_fair(struct task_struct *p)
bf0f6f24 5793{
4fc420c9
DN
5794 struct cfs_rq *cfs_rq;
5795 struct sched_entity *se = &p->se, *curr;
00bf7bfc 5796 int this_cpu = smp_processor_id();
cd29fe6f
PZ
5797 struct rq *rq = this_rq();
5798 unsigned long flags;
5799
05fa785c 5800 raw_spin_lock_irqsave(&rq->lock, flags);
bf0f6f24 5801
861d034e
PZ
5802 update_rq_clock(rq);
5803
4fc420c9
DN
5804 cfs_rq = task_cfs_rq(current);
5805 curr = cfs_rq->curr;
5806
51f52947
DN
5807 /*
5808 * Not only the cpu but also the task_group of the parent might have
5809 * been changed after parent->se.parent,cfs_rq were copied to
5810 * child->se.parent,cfs_rq. So call __set_task_cpu() to make the
5811 * child's copies point to valid ones.
5812 */
5813 rcu_read_lock();
5814 __set_task_cpu(p, this_cpu);
5815 rcu_read_unlock();
bf0f6f24 5816
7109c442 5817 update_curr(cfs_rq);
cd29fe6f 5818
b5d9d734
MG
5819 if (curr)
5820 se->vruntime = curr->vruntime;
aeb73b04 5821 place_entity(cfs_rq, se, 1);
4d78e7b6 5822
cd29fe6f 5823 if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
87fefa38 5824 /*
edcb60a3
IM
5825 * Upon rescheduling, sched_class::put_prev_task() will place
5826 * 'current' within the tree based on its new key value.
5827 */
4d78e7b6 5828 swap(curr->vruntime, se->vruntime);
aec0a514 5829 resched_task(rq->curr);
4d78e7b6 5830 }
bf0f6f24 5831
88ec22d3
PZ
5832 se->vruntime -= cfs_rq->min_vruntime;
5833
05fa785c 5834 raw_spin_unlock_irqrestore(&rq->lock, flags);
bf0f6f24
IM
5835}
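/*
 * Editor's illustrative sketch -- not part of the original source.  It shows
 * the effect of the sysctl_sched_child_runs_first swap above with
 * hypothetical numbers: if the parent sits at vruntime 100 and place_entity()
 * put the child at 103, swapping leaves the child at 100, so it becomes the
 * leftmost entity and runs first after the resched.
 */
static inline void demo_child_runs_first(u64 *parent_vruntime, u64 *child_vruntime)
{
	if (*parent_vruntime < *child_vruntime)
		swap(*parent_vruntime, *child_vruntime);
}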
5836
cb469845
SR
5837/*
5838 * Priority of the task has changed. Check to see if we preempt
5839 * the current task.
5840 */
da7a735e
PZ
5841static void
5842prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
cb469845 5843{
da7a735e
PZ
5844 if (!p->se.on_rq)
5845 return;
5846
cb469845
SR
5847 /*
5848 * Reschedule if we are currently running on this runqueue and
5849 * our priority decreased, or if we are not currently running on
5850 * this runqueue and our priority is higher than the current's
5851 */
da7a735e 5852 if (rq->curr == p) {
cb469845
SR
5853 if (p->prio > oldprio)
5854 resched_task(rq->curr);
5855 } else
15afe09b 5856 check_preempt_curr(rq, p, 0);
cb469845
SR
5857}
5858
da7a735e
PZ
5859static void switched_from_fair(struct rq *rq, struct task_struct *p)
5860{
5861 struct sched_entity *se = &p->se;
5862 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5863
5864 /*
84bb5b64 5865 * Ensure the task's vruntime is normalized, so that when it's
da7a735e
PZ
5866 * switched back to the fair class the enqueue_entity(.flags=0) will
5867 * do the right thing.
5868 *
84bb5b64
GM
5869 * If it's on_rq, then the dequeue_entity(.flags=0) will already
5870 * have normalized the vruntime; if it's !on_rq, then only when
da7a735e
PZ
5871 * the task is sleeping will it still have non-normalized vruntime.
5872 */
84bb5b64 5873 if (!p->on_rq && p->state != TASK_RUNNING) {
da7a735e
PZ
5874 /*
5875 * Fix up our vruntime so that the current sleep doesn't
5876 * cause 'unlimited' sleep bonus.
5877 */
5878 place_entity(cfs_rq, se, 0);
5879 se->vruntime -= cfs_rq->min_vruntime;
5880 }
9ee474f5
PT
5881
5882#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5883 /*
5884 * Remove our load from contribution when we leave sched_fair
5885 * and ensure we don't carry in an old decay_count if we
5886 * switch back.
5887 */
5888 if (p->se.avg.decay_count) {
5889 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
5890 __synchronize_entity_decay(&p->se);
5891 subtract_blocked_load_contrib(cfs_rq,
5892 p->se.avg.load_avg_contrib);
5893 }
5894#endif
da7a735e
PZ
5895}
5896
cb469845
SR
5897/*
5898 * We switched to the sched_fair class.
5899 */
da7a735e 5900static void switched_to_fair(struct rq *rq, struct task_struct *p)
cb469845 5901{
da7a735e
PZ
5902 if (!p->se.on_rq)
5903 return;
5904
cb469845
SR
5905 /*
5906 * We were most likely switched from sched_rt, so
5907 * kick off the schedule if running, otherwise just see
5908 * if we can still preempt the current task.
5909 */
da7a735e 5910 if (rq->curr == p)
cb469845
SR
5911 resched_task(rq->curr);
5912 else
15afe09b 5913 check_preempt_curr(rq, p, 0);
cb469845
SR
5914}
5915
83b699ed
SV
5916/* Account for a task changing its policy or group.
5917 *
5918 * This routine is mostly called to set the cfs_rq->curr field when a task
5919 * migrates between groups/classes.
5920 */
5921static void set_curr_task_fair(struct rq *rq)
5922{
5923 struct sched_entity *se = &rq->curr->se;
5924
ec12cb7f
PT
5925 for_each_sched_entity(se) {
5926 struct cfs_rq *cfs_rq = cfs_rq_of(se);
5927
5928 set_next_entity(cfs_rq, se);
5929 /* ensure bandwidth has been allocated on our new cfs_rq */
5930 account_cfs_rq_runtime(cfs_rq, 0);
5931 }
83b699ed
SV
5932}
5933
029632fb
PZ
5934void init_cfs_rq(struct cfs_rq *cfs_rq)
5935{
5936 cfs_rq->tasks_timeline = RB_ROOT;
029632fb
PZ
5937 cfs_rq->min_vruntime = (u64)(-(1LL << 20));
5938#ifndef CONFIG_64BIT
5939 cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
5940#endif
9ee474f5
PT
5941#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
5942 atomic64_set(&cfs_rq->decay_counter, 1);
aff3e498 5943 atomic64_set(&cfs_rq->removed_load, 0);
9ee474f5 5944#endif
029632fb
PZ
5945}
5946
810b3817 5947#ifdef CONFIG_FAIR_GROUP_SCHED
b2b5ce02 5948static void task_move_group_fair(struct task_struct *p, int on_rq)
810b3817 5949{
aff3e498 5950 struct cfs_rq *cfs_rq;
b2b5ce02
PZ
5951 /*
5952 * If the task was not on the rq at the time of this cgroup movement,
5953 * it must have been asleep; sleeping tasks keep their ->vruntime
5954 * absolute on their old rq until wakeup (needed for the fair sleeper
5955 * bonus in place_entity()).
5956 *
5957 * If it was on the rq, we've just 'preempted' it, which does convert
5958 * ->vruntime to a relative base.
5959 *
5960 * Make sure both cases convert their relative position when migrating
5961 * to another cgroup's rq. This does somewhat interfere with the
5962 * fair sleeper stuff for the first placement, but who cares.
5963 */
7ceff013
DN
5964 /*
5965 * When !on_rq, vruntime of the task has usually NOT been normalized.
5966 * But there are some cases where it has already been normalized:
5967 *
5968 * - Moving a forked child which is waiting for being woken up by
5969 * wake_up_new_task().
62af3783
DN
5970 * - Moving a task which has been woken up by try_to_wake_up() and
5971 * waiting for actually being woken up by sched_ttwu_pending().
7ceff013
DN
5972 *
5973 * To prevent boost or penalty in the new cfs_rq caused by delta
5974 * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
5975 */
62af3783 5976 if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
7ceff013
DN
5977 on_rq = 1;
5978
b2b5ce02
PZ
5979 if (!on_rq)
5980 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
5981 set_task_rq(p, task_cpu(p));
aff3e498
PT
5982 if (!on_rq) {
5983 cfs_rq = cfs_rq_of(&p->se);
5984 p->se.vruntime += cfs_rq->min_vruntime;
5985#ifdef CONFIG_SMP
5986 /*
5987 * migrate_task_rq_fair() will have removed our previous
5988 * contribution, but we must synchronize for ongoing future
5989 * decay.
5990 */
5991 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
5992 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
5993#endif
5994 }
810b3817 5995}
029632fb
PZ
5996
5997void free_fair_sched_group(struct task_group *tg)
5998{
5999 int i;
6000
6001 destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
6002
6003 for_each_possible_cpu(i) {
6004 if (tg->cfs_rq)
6005 kfree(tg->cfs_rq[i]);
6006 if (tg->se)
6007 kfree(tg->se[i]);
6008 }
6009
6010 kfree(tg->cfs_rq);
6011 kfree(tg->se);
6012}
6013
6014int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
6015{
6016 struct cfs_rq *cfs_rq;
6017 struct sched_entity *se;
6018 int i;
6019
6020 tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
6021 if (!tg->cfs_rq)
6022 goto err;
6023 tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
6024 if (!tg->se)
6025 goto err;
6026
6027 tg->shares = NICE_0_LOAD;
6028
6029 init_cfs_bandwidth(tg_cfs_bandwidth(tg));
6030
6031 for_each_possible_cpu(i) {
6032 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
6033 GFP_KERNEL, cpu_to_node(i));
6034 if (!cfs_rq)
6035 goto err;
6036
6037 se = kzalloc_node(sizeof(struct sched_entity),
6038 GFP_KERNEL, cpu_to_node(i));
6039 if (!se)
6040 goto err_free_rq;
6041
6042 init_cfs_rq(cfs_rq);
6043 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
6044 }
6045
6046 return 1;
6047
6048err_free_rq:
6049 kfree(cfs_rq);
6050err:
6051 return 0;
6052}
6053
6054void unregister_fair_sched_group(struct task_group *tg, int cpu)
6055{
6056 struct rq *rq = cpu_rq(cpu);
6057 unsigned long flags;
6058
6059 /*
6060 * Only empty task groups can be destroyed; so we can speculatively
6061 * check on_list without danger of it being re-added.
6062 */
6063 if (!tg->cfs_rq[cpu]->on_list)
6064 return;
6065
6066 raw_spin_lock_irqsave(&rq->lock, flags);
6067 list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
6068 raw_spin_unlock_irqrestore(&rq->lock, flags);
6069}
6070
6071void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
6072 struct sched_entity *se, int cpu,
6073 struct sched_entity *parent)
6074{
6075 struct rq *rq = cpu_rq(cpu);
6076
6077 cfs_rq->tg = tg;
6078 cfs_rq->rq = rq;
029632fb
PZ
6079 init_cfs_rq_runtime(cfs_rq);
6080
6081 tg->cfs_rq[cpu] = cfs_rq;
6082 tg->se[cpu] = se;
6083
6084 /* se could be NULL for root_task_group */
6085 if (!se)
6086 return;
6087
6088 if (!parent)
6089 se->cfs_rq = &rq->cfs;
6090 else
6091 se->cfs_rq = parent->my_q;
6092
6093 se->my_q = cfs_rq;
5ba45423
PT
6094 /* guarantee group entities always have weight */
6095 update_load_set(&se->load, NICE_0_LOAD);
029632fb
PZ
6096 se->parent = parent;
6097}
6098
6099static DEFINE_MUTEX(shares_mutex);
6100
6101int sched_group_set_shares(struct task_group *tg, unsigned long shares)
6102{
6103 int i;
6104 unsigned long flags;
6105
6106 /*
6107 * We can't change the weight of the root cgroup.
6108 */
6109 if (!tg->se[0])
6110 return -EINVAL;
6111
6112 shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
6113
6114 mutex_lock(&shares_mutex);
6115 if (tg->shares == shares)
6116 goto done;
6117
6118 tg->shares = shares;
6119 for_each_possible_cpu(i) {
6120 struct rq *rq = cpu_rq(i);
6121 struct sched_entity *se;
6122
6123 se = tg->se[i];
6124 /* Propagate contribution to hierarchy */
6125 raw_spin_lock_irqsave(&rq->lock, flags);
17bc14b7 6126 for_each_sched_entity(se)
029632fb
PZ
6127 update_cfs_shares(group_cfs_rq(se));
6128 raw_spin_unlock_irqrestore(&rq->lock, flags);
6129 }
6130
6131done:
6132 mutex_unlock(&shares_mutex);
6133 return 0;
6134}
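/*
 * Editor's usage note -- not part of the original source.  With
 * CONFIG_FAIR_GROUP_SCHED, writes to a cpu cgroup's cpu.shares file end up
 * here: the value is scaled and clamped to [MIN_SHARES, MAX_SHARES], and the
 * new weight is propagated up the group hierarchy.  For example (values
 * hypothetical), writing 2048 gives the group roughly twice the CPU share of
 * a sibling group left at the default 1024.
 */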
6135#else /* CONFIG_FAIR_GROUP_SCHED */
6136
6137void free_fair_sched_group(struct task_group *tg) { }
6138
6139int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
6140{
6141 return 1;
6142}
6143
6144void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
6145
6146#endif /* CONFIG_FAIR_GROUP_SCHED */
6147
810b3817 6148
6d686f45 6149static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
0d721cea
PW
6150{
6151 struct sched_entity *se = &task->se;
0d721cea
PW
6152 unsigned int rr_interval = 0;
6153
6154 /*
6155 * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
6156 * idle runqueue:
6157 */
0d721cea 6158 if (rq->cfs.load.weight)
a59f4e07 6159 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
0d721cea
PW
6160
6161 return rr_interval;
6162}
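/*
 * Editor's worked example -- not part of the original source, numbers
 * hypothetical: with an effective latency target of 6 ms and two runnable
 * tasks of equal weight, sched_slice() is about 3 ms, so the reported
 * round-robin interval is NS_TO_JIFFIES(3000000), i.e. 3 jiffies at HZ=1000
 * but 0 at HZ=250 (where one jiffy is 4 ms).
 */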
6163
bf0f6f24
IM
6164/*
6165 * All the scheduling class methods:
6166 */
029632fb 6167const struct sched_class fair_sched_class = {
5522d5d5 6168 .next = &idle_sched_class,
bf0f6f24
IM
6169 .enqueue_task = enqueue_task_fair,
6170 .dequeue_task = dequeue_task_fair,
6171 .yield_task = yield_task_fair,
d95f4122 6172 .yield_to_task = yield_to_task_fair,
bf0f6f24 6173
2e09bf55 6174 .check_preempt_curr = check_preempt_wakeup,
bf0f6f24
IM
6175
6176 .pick_next_task = pick_next_task_fair,
6177 .put_prev_task = put_prev_task_fair,
6178
681f3e68 6179#ifdef CONFIG_SMP
4ce72a2c 6180 .select_task_rq = select_task_rq_fair,
f4e26b12 6181#ifdef CONFIG_FAIR_GROUP_SCHED
0a74bef8 6182 .migrate_task_rq = migrate_task_rq_fair,
f4e26b12 6183#endif
0bcdcf28
CE
6184 .rq_online = rq_online_fair,
6185 .rq_offline = rq_offline_fair,
88ec22d3
PZ
6186
6187 .task_waking = task_waking_fair,
681f3e68 6188#endif
bf0f6f24 6189
83b699ed 6190 .set_curr_task = set_curr_task_fair,
bf0f6f24 6191 .task_tick = task_tick_fair,
cd29fe6f 6192 .task_fork = task_fork_fair,
cb469845
SR
6193
6194 .prio_changed = prio_changed_fair,
da7a735e 6195 .switched_from = switched_from_fair,
cb469845 6196 .switched_to = switched_to_fair,
810b3817 6197
0d721cea
PW
6198 .get_rr_interval = get_rr_interval_fair,
6199
810b3817 6200#ifdef CONFIG_FAIR_GROUP_SCHED
b2b5ce02 6201 .task_move_group = task_move_group_fair,
810b3817 6202#endif
bf0f6f24
IM
6203};
6204
6205#ifdef CONFIG_SCHED_DEBUG
029632fb 6206void print_cfs_stats(struct seq_file *m, int cpu)
bf0f6f24 6207{
bf0f6f24
IM
6208 struct cfs_rq *cfs_rq;
6209
5973e5b9 6210 rcu_read_lock();
c3b64f1e 6211 for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
5cef9eca 6212 print_cfs_rq(m, cpu, cfs_rq);
5973e5b9 6213 rcu_read_unlock();
bf0f6f24
IM
6214}
6215#endif
029632fb
PZ
6216
6217__init void init_sched_fair_class(void)
6218{
6219#ifdef CONFIG_SMP
6220 open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
6221
3451d024 6222#ifdef CONFIG_NO_HZ_COMMON
554cecaf 6223 nohz.next_balance = jiffies;
029632fb 6224 zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
71325960 6225 cpu_notifier(sched_ilb_notifier, 0);
029632fb
PZ
6226#endif
6227#endif /* SMP */
6228
6229}