kernel/sched/rt.c (MotorolaMobilityLLC/kernel-slsi, android-4.14, merge of 4.14.20)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 * policies)
5 */
6
7 #include "sched.h"
8
9 #include <linux/slab.h>
10 #include <linux/irq_work.h>
11
12 #include "walt.h"
13
14 int sched_rr_timeslice = RR_TIMESLICE;
15 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
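/*
 * Illustrative arithmetic (a sketch, assuming HZ=250 and the usual
 * RR_TIMESLICE of 100 ms worth of jiffies from <linux/sched/rt.h>):
 *   RR_TIMESLICE              = 100 * HZ / 1000 = 25 jiffies
 *   sysctl_sched_rr_timeslice = (1000 / 250) * 25 = 100 ms
 * which is the default reported via the sched_rr_timeslice_ms sysctl.
 */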
16
17 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
18
19 struct rt_bandwidth def_rt_bandwidth;
20
21 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
22 {
23 struct rt_bandwidth *rt_b =
24 container_of(timer, struct rt_bandwidth, rt_period_timer);
25 int idle = 0;
26 int overrun;
27
28 raw_spin_lock(&rt_b->rt_runtime_lock);
29 for (;;) {
30 overrun = hrtimer_forward_now(timer, rt_b->rt_period);
31 if (!overrun)
32 break;
33
34 raw_spin_unlock(&rt_b->rt_runtime_lock);
35 idle = do_sched_rt_period_timer(rt_b, overrun);
36 raw_spin_lock(&rt_b->rt_runtime_lock);
37 }
38 if (idle)
39 rt_b->rt_period_active = 0;
40 raw_spin_unlock(&rt_b->rt_runtime_lock);
41
42 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
43 }
44
45 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
46 {
47 rt_b->rt_period = ns_to_ktime(period);
48 rt_b->rt_runtime = runtime;
49
50 raw_spin_lock_init(&rt_b->rt_runtime_lock);
51
52 hrtimer_init(&rt_b->rt_period_timer,
53 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
54 rt_b->rt_period_timer.function = sched_rt_period_timer;
55 }
56
57 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
58 {
59 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
60 return;
61
62 raw_spin_lock(&rt_b->rt_runtime_lock);
63 if (!rt_b->rt_period_active) {
64 rt_b->rt_period_active = 1;
65 /*
66 * SCHED_DEADLINE updates the bandwidth, as a runaway
67 * RT task with a DL task could hog a CPU. But DL does
68 * not reset the period. If a deadline task was running
69 * without an RT task running, it can cause RT tasks to
70 * throttle when they start up. Kick the timer right away
71 * to update the period.
72 */
73 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
74 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
75 }
76 raw_spin_unlock(&rt_b->rt_runtime_lock);
77 }
78
79 void init_rt_rq(struct rt_rq *rt_rq)
80 {
81 struct rt_prio_array *array;
82 int i;
83
84 array = &rt_rq->active;
85 for (i = 0; i < MAX_RT_PRIO; i++) {
86 INIT_LIST_HEAD(array->queue + i);
87 __clear_bit(i, array->bitmap);
88 }
89 /* delimiter for bitsearch: */
90 __set_bit(MAX_RT_PRIO, array->bitmap);
91
92 #if defined CONFIG_SMP
93 rt_rq->highest_prio.curr = MAX_RT_PRIO;
94 rt_rq->highest_prio.next = MAX_RT_PRIO;
95 rt_rq->rt_nr_migratory = 0;
96 rt_rq->overloaded = 0;
97 plist_head_init(&rt_rq->pushable_tasks);
98 #endif /* CONFIG_SMP */
99 /* We start in dequeued state, because no RT tasks are queued */
100 rt_rq->rt_queued = 0;
101
102 rt_rq->rt_time = 0;
103 rt_rq->rt_throttled = 0;
104 rt_rq->rt_runtime = 0;
105 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
106 }
107
108 #ifdef CONFIG_RT_GROUP_SCHED
109 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
110 {
111 hrtimer_cancel(&rt_b->rt_period_timer);
112 }
113
114 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
115
116 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
117 {
118 #ifdef CONFIG_SCHED_DEBUG
119 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
120 #endif
121 return container_of(rt_se, struct task_struct, rt);
122 }
123
124 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
125 {
126 return rt_rq->rq;
127 }
128
129 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
130 {
131 return rt_se->rt_rq;
132 }
133
134 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
135 {
136 struct rt_rq *rt_rq = rt_se->rt_rq;
137
138 return rt_rq->rq;
139 }
140
141 void free_rt_sched_group(struct task_group *tg)
142 {
143 int i;
144
145 if (tg->rt_se)
146 destroy_rt_bandwidth(&tg->rt_bandwidth);
147
148 for_each_possible_cpu(i) {
149 if (tg->rt_rq)
150 kfree(tg->rt_rq[i]);
151 if (tg->rt_se)
152 kfree(tg->rt_se[i]);
153 }
154
155 kfree(tg->rt_rq);
156 kfree(tg->rt_se);
157 }
158
159 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
160 struct sched_rt_entity *rt_se, int cpu,
161 struct sched_rt_entity *parent)
162 {
163 struct rq *rq = cpu_rq(cpu);
164
165 rt_rq->highest_prio.curr = MAX_RT_PRIO;
166 rt_rq->rt_nr_boosted = 0;
167 rt_rq->rq = rq;
168 rt_rq->tg = tg;
169
170 tg->rt_rq[cpu] = rt_rq;
171 tg->rt_se[cpu] = rt_se;
172
173 if (!rt_se)
174 return;
175
176 if (!parent)
177 rt_se->rt_rq = &rq->rt;
178 else
179 rt_se->rt_rq = parent->my_q;
180
181 rt_se->my_q = rt_rq;
182 rt_se->parent = parent;
183 INIT_LIST_HEAD(&rt_se->run_list);
184 }
185
186 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
187 {
188 struct rt_rq *rt_rq;
189 struct sched_rt_entity *rt_se;
190 int i;
191
192 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
193 if (!tg->rt_rq)
194 goto err;
195 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
196 if (!tg->rt_se)
197 goto err;
198
199 init_rt_bandwidth(&tg->rt_bandwidth,
200 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
201
202 for_each_possible_cpu(i) {
203 rt_rq = kzalloc_node(sizeof(struct rt_rq),
204 GFP_KERNEL, cpu_to_node(i));
205 if (!rt_rq)
206 goto err;
207
208 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
209 GFP_KERNEL, cpu_to_node(i));
210 if (!rt_se)
211 goto err_free_rq;
212
213 init_rt_rq(rt_rq);
214 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
215 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
216 }
217
218 return 1;
219
220 err_free_rq:
221 kfree(rt_rq);
222 err:
223 return 0;
224 }
225
226 #else /* CONFIG_RT_GROUP_SCHED */
227
228 #define rt_entity_is_task(rt_se) (1)
229
230 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
231 {
232 return container_of(rt_se, struct task_struct, rt);
233 }
234
235 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
236 {
237 return container_of(rt_rq, struct rq, rt);
238 }
239
240 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
241 {
242 struct task_struct *p = rt_task_of(rt_se);
243
244 return task_rq(p);
245 }
246
247 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
248 {
249 struct rq *rq = rq_of_rt_se(rt_se);
250
251 return &rq->rt;
252 }
253
254 void free_rt_sched_group(struct task_group *tg) { }
255
256 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
257 {
258 return 1;
259 }
260 #endif /* CONFIG_RT_GROUP_SCHED */
261
262 #ifdef CONFIG_SMP
263
264 static void pull_rt_task(struct rq *this_rq);
265
266 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
267 {
268 /* Try to pull RT tasks here if we lower this rq's prio */
269 return rq->rt.highest_prio.curr > prev->prio;
270 }
271
272 static inline int rt_overloaded(struct rq *rq)
273 {
274 return atomic_read(&rq->rd->rto_count);
275 }
276
277 static inline void rt_set_overload(struct rq *rq)
278 {
279 if (!rq->online)
280 return;
281
282 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
283 /*
284 * Make sure the mask is visible before we set
285 * the overload count. That is checked to determine
286 * if we should look at the mask. It would be a shame
287 * if we looked at the mask, but the mask was not
288 * updated yet.
289 *
290 * Matched by the barrier in pull_rt_task().
291 */
292 smp_wmb();
293 atomic_inc(&rq->rd->rto_count);
294 }
295
296 static inline void rt_clear_overload(struct rq *rq)
297 {
298 if (!rq->online)
299 return;
300
301 /* the order here really doesn't matter */
302 atomic_dec(&rq->rd->rto_count);
303 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
304 }
305
306 static void update_rt_migration(struct rt_rq *rt_rq)
307 {
308 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
309 if (!rt_rq->overloaded) {
310 rt_set_overload(rq_of_rt_rq(rt_rq));
311 rt_rq->overloaded = 1;
312 }
313 } else if (rt_rq->overloaded) {
314 rt_clear_overload(rq_of_rt_rq(rt_rq));
315 rt_rq->overloaded = 0;
316 }
317 }
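
/*
 * Illustrative scenario (a sketch of the overload rule above): a runqueue
 * holding two queued RT tasks (rt_nr_total == 2) where at least one of them
 * may run elsewhere (rt_nr_migratory >= 1) is marked overloaded, setting its
 * bit in rd->rto_mask so other CPUs can pull from it; once it drops back to
 * a single task, or all remaining tasks are pinned, the flag is cleared.
 */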
318
319 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
320 {
321 struct task_struct *p;
322
323 if (!rt_entity_is_task(rt_se))
324 return;
325
326 p = rt_task_of(rt_se);
327 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
328
329 rt_rq->rt_nr_total++;
330 if (p->nr_cpus_allowed > 1)
331 rt_rq->rt_nr_migratory++;
332
333 update_rt_migration(rt_rq);
334 }
335
336 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
337 {
338 struct task_struct *p;
339
340 if (!rt_entity_is_task(rt_se))
341 return;
342
343 p = rt_task_of(rt_se);
344 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
345
346 rt_rq->rt_nr_total--;
347 if (p->nr_cpus_allowed > 1)
348 rt_rq->rt_nr_migratory--;
349
350 update_rt_migration(rt_rq);
351 }
352
353 static inline int has_pushable_tasks(struct rq *rq)
354 {
355 return !plist_head_empty(&rq->rt.pushable_tasks);
356 }
357
358 static DEFINE_PER_CPU(struct callback_head, rt_push_head);
359 static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
360
361 static void push_rt_tasks(struct rq *);
362 static void pull_rt_task(struct rq *);
363
364 static inline void queue_push_tasks(struct rq *rq)
365 {
366 if (!has_pushable_tasks(rq))
367 return;
368
369 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
370 }
371
372 static inline void queue_pull_task(struct rq *rq)
373 {
374 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
375 }
376
377 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
378 {
379 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
380 plist_node_init(&p->pushable_tasks, p->prio);
381 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
382
383 /* Update the highest prio pushable task */
384 if (p->prio < rq->rt.highest_prio.next)
385 rq->rt.highest_prio.next = p->prio;
386 }
387
388 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
389 {
390 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
391
392 /* Update the new highest prio pushable task */
393 if (has_pushable_tasks(rq)) {
394 p = plist_first_entry(&rq->rt.pushable_tasks,
395 struct task_struct, pushable_tasks);
396 rq->rt.highest_prio.next = p->prio;
397 } else
398 rq->rt.highest_prio.next = MAX_RT_PRIO;
399 }
400
401 #else
402
403 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
404 {
405 }
406
407 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
408 {
409 }
410
411 static inline
412 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
413 {
414 }
415
416 static inline
417 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
418 {
419 }
420
421 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
422 {
423 return false;
424 }
425
426 static inline void pull_rt_task(struct rq *this_rq)
427 {
428 }
429
430 static inline void queue_push_tasks(struct rq *rq)
431 {
432 }
433 #endif /* CONFIG_SMP */
434
435 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
436 static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
437
438 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
439 {
440 return rt_se->on_rq;
441 }
442
443 #ifdef CONFIG_RT_GROUP_SCHED
444
445 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
446 {
447 if (!rt_rq->tg)
448 return RUNTIME_INF;
449
450 return rt_rq->rt_runtime;
451 }
452
453 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
454 {
455 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
456 }
457
458 typedef struct task_group *rt_rq_iter_t;
459
460 static inline struct task_group *next_task_group(struct task_group *tg)
461 {
462 do {
463 tg = list_entry_rcu(tg->list.next,
464 typeof(struct task_group), list);
465 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
466
467 if (&tg->list == &task_groups)
468 tg = NULL;
469
470 return tg;
471 }
472
473 #define for_each_rt_rq(rt_rq, iter, rq) \
474 for (iter = container_of(&task_groups, typeof(*iter), list); \
475 (iter = next_task_group(iter)) && \
476 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
477
478 #define for_each_sched_rt_entity(rt_se) \
479 for (; rt_se; rt_se = rt_se->parent)
480
481 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
482 {
483 return rt_se->my_q;
484 }
485
486 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
487 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
488
489 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
490 {
491 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
492 struct rq *rq = rq_of_rt_rq(rt_rq);
493 struct sched_rt_entity *rt_se;
494
495 int cpu = cpu_of(rq);
496
497 rt_se = rt_rq->tg->rt_se[cpu];
498
499 if (rt_rq->rt_nr_running) {
500 if (!rt_se)
501 enqueue_top_rt_rq(rt_rq);
502 else if (!on_rt_rq(rt_se))
503 enqueue_rt_entity(rt_se, 0);
504
505 if (rt_rq->highest_prio.curr < curr->prio)
506 resched_curr(rq);
507 }
508 }
509
510 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
511 {
512 struct sched_rt_entity *rt_se;
513 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
514
515 rt_se = rt_rq->tg->rt_se[cpu];
516
517 if (!rt_se)
518 dequeue_top_rt_rq(rt_rq);
519 else if (on_rt_rq(rt_se))
520 dequeue_rt_entity(rt_se, 0);
521 }
522
523 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
524 {
525 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
526 }
527
528 static int rt_se_boosted(struct sched_rt_entity *rt_se)
529 {
530 struct rt_rq *rt_rq = group_rt_rq(rt_se);
531 struct task_struct *p;
532
533 if (rt_rq)
534 return !!rt_rq->rt_nr_boosted;
535
536 p = rt_task_of(rt_se);
537 return p->prio != p->normal_prio;
538 }
539
540 #ifdef CONFIG_SMP
541 static inline const struct cpumask *sched_rt_period_mask(void)
542 {
543 return this_rq()->rd->span;
544 }
545 #else
546 static inline const struct cpumask *sched_rt_period_mask(void)
547 {
548 return cpu_online_mask;
549 }
550 #endif
551
552 static inline
553 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
554 {
555 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
556 }
557
558 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
559 {
560 return &rt_rq->tg->rt_bandwidth;
561 }
562
563 #else /* !CONFIG_RT_GROUP_SCHED */
564
565 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
566 {
567 return rt_rq->rt_runtime;
568 }
569
570 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
571 {
572 return ktime_to_ns(def_rt_bandwidth.rt_period);
573 }
574
575 typedef struct rt_rq *rt_rq_iter_t;
576
577 #define for_each_rt_rq(rt_rq, iter, rq) \
578 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
579
580 #define for_each_sched_rt_entity(rt_se) \
581 for (; rt_se; rt_se = NULL)
582
583 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
584 {
585 return NULL;
586 }
587
588 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
589 {
590 struct rq *rq = rq_of_rt_rq(rt_rq);
591
592 if (!rt_rq->rt_nr_running)
593 return;
594
595 enqueue_top_rt_rq(rt_rq);
596 resched_curr(rq);
597 }
598
599 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
600 {
601 dequeue_top_rt_rq(rt_rq);
602 }
603
604 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
605 {
606 return rt_rq->rt_throttled;
607 }
608
609 static inline const struct cpumask *sched_rt_period_mask(void)
610 {
611 return cpu_online_mask;
612 }
613
614 static inline
615 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
616 {
617 return &cpu_rq(cpu)->rt;
618 }
619
620 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
621 {
622 return &def_rt_bandwidth;
623 }
624
625 #endif /* CONFIG_RT_GROUP_SCHED */
626
627 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
628 {
629 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
630
631 return (hrtimer_active(&rt_b->rt_period_timer) ||
632 rt_rq->rt_time < rt_b->rt_runtime);
633 }
634
635 #ifdef CONFIG_SMP
636 /*
637 * We ran out of runtime, see if we can borrow some from our neighbours.
638 */
639 static void do_balance_runtime(struct rt_rq *rt_rq)
640 {
641 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
642 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
643 int i, weight;
644 u64 rt_period;
645
646 weight = cpumask_weight(rd->span);
647
648 raw_spin_lock(&rt_b->rt_runtime_lock);
649 rt_period = ktime_to_ns(rt_b->rt_period);
650 for_each_cpu(i, rd->span) {
651 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
652 s64 diff;
653
654 if (iter == rt_rq)
655 continue;
656
657 raw_spin_lock(&iter->rt_runtime_lock);
658 /*
659 * Either all rqs have inf runtime and there's nothing to steal
660 * or __disable_runtime() below sets a specific rq to inf to
661 * indicate it's been disabled and disallow stealing.
662 */
663 if (iter->rt_runtime == RUNTIME_INF)
664 goto next;
665
666 /*
667 * From runqueues with spare time, take 1/n part of their
668 * spare time, but no more than our period.
669 */
670 diff = iter->rt_runtime - iter->rt_time;
671 if (diff > 0) {
672 diff = div_u64((u64)diff, weight);
673 if (rt_rq->rt_runtime + diff > rt_period)
674 diff = rt_period - rt_rq->rt_runtime;
675 iter->rt_runtime -= diff;
676 rt_rq->rt_runtime += diff;
677 if (rt_rq->rt_runtime == rt_period) {
678 raw_spin_unlock(&iter->rt_runtime_lock);
679 break;
680 }
681 }
682 next:
683 raw_spin_unlock(&iter->rt_runtime_lock);
684 }
685 raw_spin_unlock(&rt_b->rt_runtime_lock);
686 }
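
/*
 * Worked example (illustrative numbers only): rd->span covers 4 CPUs,
 * rt_period is 1000 ms and this rt_rq has used up its 950 ms of runtime.
 * A neighbour with 950 ms of runtime and 150 ms of rt_time has 800 ms of
 * spare time; we would take diff / weight = 200 ms of it, but the transfer
 * is capped at rt_period - rt_rq->rt_runtime = 50 ms so that no runqueue
 * ever holds more than one full period of runtime.
 */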
687
688 /*
689 * Ensure this RQ takes back all the runtime it lent to its neighbours.
690 */
691 static void __disable_runtime(struct rq *rq)
692 {
693 struct root_domain *rd = rq->rd;
694 rt_rq_iter_t iter;
695 struct rt_rq *rt_rq;
696
697 if (unlikely(!scheduler_running))
698 return;
699
700 for_each_rt_rq(rt_rq, iter, rq) {
701 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
702 s64 want;
703 int i;
704
705 raw_spin_lock(&rt_b->rt_runtime_lock);
706 raw_spin_lock(&rt_rq->rt_runtime_lock);
707 /*
708 * Either we're all inf and nobody needs to borrow, or we're
709 * already disabled and thus have nothing to do, or we have
710 * exactly the right amount of runtime to take out.
711 */
712 if (rt_rq->rt_runtime == RUNTIME_INF ||
713 rt_rq->rt_runtime == rt_b->rt_runtime)
714 goto balanced;
715 raw_spin_unlock(&rt_rq->rt_runtime_lock);
716
717 /*
718 * Calculate the difference between what we started out with
719 * and what we currently have; that's the amount of runtime
720 * we lent out and now have to reclaim.
721 */
722 want = rt_b->rt_runtime - rt_rq->rt_runtime;
723
724 /*
725 * Greedy reclaim, take back as much as we can.
726 */
727 for_each_cpu(i, rd->span) {
728 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
729 s64 diff;
730
731 /*
732 * Can't reclaim from ourselves or disabled runqueues.
733 */
734 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
735 continue;
736
737 raw_spin_lock(&iter->rt_runtime_lock);
738 if (want > 0) {
739 diff = min_t(s64, iter->rt_runtime, want);
740 iter->rt_runtime -= diff;
741 want -= diff;
742 } else {
743 iter->rt_runtime -= want;
744 want -= want;
745 }
746 raw_spin_unlock(&iter->rt_runtime_lock);
747
748 if (!want)
749 break;
750 }
751
752 raw_spin_lock(&rt_rq->rt_runtime_lock);
753 /*
754 * We cannot be left wanting - that would mean some runtime
755 * leaked out of the system.
756 */
757 BUG_ON(want);
758 balanced:
759 /*
760 * Disable all the borrow logic by pretending we have inf
761 * runtime - in which case borrowing doesn't make sense.
762 */
763 rt_rq->rt_runtime = RUNTIME_INF;
764 rt_rq->rt_throttled = 0;
765 raw_spin_unlock(&rt_rq->rt_runtime_lock);
766 raw_spin_unlock(&rt_b->rt_runtime_lock);
767
768 /* Make rt_rq available for pick_next_task() */
769 sched_rt_rq_enqueue(rt_rq);
770 }
771 }
772
773 static void __enable_runtime(struct rq *rq)
774 {
775 rt_rq_iter_t iter;
776 struct rt_rq *rt_rq;
777
778 if (unlikely(!scheduler_running))
779 return;
780
781 /*
782 * Reset each runqueue's bandwidth settings
783 */
784 for_each_rt_rq(rt_rq, iter, rq) {
785 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
786
787 raw_spin_lock(&rt_b->rt_runtime_lock);
788 raw_spin_lock(&rt_rq->rt_runtime_lock);
789 rt_rq->rt_runtime = rt_b->rt_runtime;
790 rt_rq->rt_time = 0;
791 rt_rq->rt_throttled = 0;
792 raw_spin_unlock(&rt_rq->rt_runtime_lock);
793 raw_spin_unlock(&rt_b->rt_runtime_lock);
794 }
795 }
796
797 static void balance_runtime(struct rt_rq *rt_rq)
798 {
799 if (!sched_feat(RT_RUNTIME_SHARE))
800 return;
801
802 if (rt_rq->rt_time > rt_rq->rt_runtime) {
803 raw_spin_unlock(&rt_rq->rt_runtime_lock);
804 do_balance_runtime(rt_rq);
805 raw_spin_lock(&rt_rq->rt_runtime_lock);
806 }
807 }
808 #else /* !CONFIG_SMP */
809 static inline void balance_runtime(struct rt_rq *rt_rq) {}
810 #endif /* CONFIG_SMP */
811
812 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
813 {
814 int i, idle = 1, throttled = 0;
815 const struct cpumask *span;
816
817 span = sched_rt_period_mask();
818 #ifdef CONFIG_RT_GROUP_SCHED
819 /*
820 * FIXME: isolated CPUs should really leave the root task group,
821 * whether they are isolcpus or were isolated via cpusets, lest
822 * the timer run on a CPU which does not service all runqueues,
823 * potentially leaving other CPUs indefinitely throttled. If
824 * isolation is really required, the user will turn the throttle
825 * off to kill the perturbations it causes anyway. Meanwhile,
826 * this maintains functionality for boot and/or troubleshooting.
827 */
828 if (rt_b == &root_task_group.rt_bandwidth)
829 span = cpu_online_mask;
830 #endif
831 for_each_cpu(i, span) {
832 int enqueue = 0;
833 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
834 struct rq *rq = rq_of_rt_rq(rt_rq);
835 int skip;
836
837 /*
838 * When span == cpu_online_mask, taking each rq->lock
839 * can be time-consuming. Try to avoid it when possible.
840 */
841 raw_spin_lock(&rt_rq->rt_runtime_lock);
842 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
843 raw_spin_unlock(&rt_rq->rt_runtime_lock);
844 if (skip)
845 continue;
846
847 raw_spin_lock(&rq->lock);
848 if (rt_rq->rt_time) {
849 u64 runtime;
850
851 raw_spin_lock(&rt_rq->rt_runtime_lock);
852 if (rt_rq->rt_throttled)
853 balance_runtime(rt_rq);
854 runtime = rt_rq->rt_runtime;
855 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
856 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
857 rt_rq->rt_throttled = 0;
858 enqueue = 1;
859
860 /*
861 * When we're idle and a woken (rt) task is
862 * throttled, check_preempt_curr() will set
863 * skip_update, and the time between the wakeup
864 * and this unthrottle will get accounted as
865 * 'runtime'.
866 */
867 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
868 rq_clock_skip_update(rq, false);
869 }
870 if (rt_rq->rt_time || rt_rq->rt_nr_running)
871 idle = 0;
872 raw_spin_unlock(&rt_rq->rt_runtime_lock);
873 } else if (rt_rq->rt_nr_running) {
874 idle = 0;
875 if (!rt_rq_throttled(rt_rq))
876 enqueue = 1;
877 }
878 if (rt_rq->rt_throttled)
879 throttled = 1;
880
881 if (enqueue)
882 sched_rt_rq_enqueue(rt_rq);
883 raw_spin_unlock(&rq->lock);
884 }
885
886 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
887 return 1;
888
889 return idle;
890 }
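
/*
 * Worked example (illustrative): if the period timer fired late with
 * overrun == 2 and runtime == 950 ms, a throttled rt_rq carrying
 * rt_time == 1000 ms is decreased by min(1000 ms, 2 * 950 ms) down to 0,
 * which is below runtime, so it is unthrottled and re-enqueued.
 */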
891
892 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
893 {
894 #ifdef CONFIG_RT_GROUP_SCHED
895 struct rt_rq *rt_rq = group_rt_rq(rt_se);
896
897 if (rt_rq)
898 return rt_rq->highest_prio.curr;
899 #endif
900
901 return rt_task_of(rt_se)->prio;
902 }
903
904 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
905 {
906 u64 runtime = sched_rt_runtime(rt_rq);
907
908 if (rt_rq->rt_throttled)
909 return rt_rq_throttled(rt_rq);
910
911 if (runtime >= sched_rt_period(rt_rq))
912 return 0;
913
914 balance_runtime(rt_rq);
915 runtime = sched_rt_runtime(rt_rq);
916 if (runtime == RUNTIME_INF)
917 return 0;
918
919 if (rt_rq->rt_time > runtime) {
920 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
921
922 /*
923 * Don't actually throttle groups that have no runtime assigned
924 * but accrue some time due to boosting.
925 */
926 if (likely(rt_b->rt_runtime)) {
927 rt_rq->rt_throttled = 1;
928 printk_deferred_once("sched: RT throttling activated\n");
929 } else {
930 /*
931 * In case we did anyway, make it go away,
932 * replenishment is a joke, since it will replenish us
933 * with exactly 0 ns.
934 */
935 rt_rq->rt_time = 0;
936 }
937
938 if (rt_rq_throttled(rt_rq)) {
939 sched_rt_rq_dequeue(rt_rq);
940 return 1;
941 }
942 }
943
944 return 0;
945 }
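
/*
 * With the common defaults (a sketch, assuming sched_rt_runtime_us=950000
 * and sched_rt_period_us=1000000), a CPU-bound SCHED_FIFO task accumulates
 * rt_time past 950 ms within each 1 s period, gets rt_throttled set and is
 * dequeued until the period timer replenishes it, leaving roughly 50 ms per
 * second for non-RT work.
 */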
946
947 /*
948 * Update the current task's runtime statistics. Skip current tasks that
949 * are not in our scheduling class.
950 */
951 static void update_curr_rt(struct rq *rq)
952 {
953 struct task_struct *curr = rq->curr;
954 struct sched_rt_entity *rt_se = &curr->rt;
955 u64 delta_exec;
956
957 if (curr->sched_class != &rt_sched_class)
958 return;
959
960 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
961 if (unlikely((s64)delta_exec <= 0))
962 return;
963
964 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
965 cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
966
967 schedstat_set(curr->se.statistics.exec_max,
968 max(curr->se.statistics.exec_max, delta_exec));
969
970 curr->se.sum_exec_runtime += delta_exec;
971 account_group_exec_runtime(curr, delta_exec);
972
973 curr->se.exec_start = rq_clock_task(rq);
974 cpuacct_charge(curr, delta_exec);
975
976 sched_rt_avg_update(rq, delta_exec);
977
978 if (!rt_bandwidth_enabled())
979 return;
980
981 for_each_sched_rt_entity(rt_se) {
982 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
983
984 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
985 raw_spin_lock(&rt_rq->rt_runtime_lock);
986 rt_rq->rt_time += delta_exec;
987 if (sched_rt_runtime_exceeded(rt_rq))
988 resched_curr(rq);
989 raw_spin_unlock(&rt_rq->rt_runtime_lock);
990 }
991 }
992 }
993
994 static void
995 dequeue_top_rt_rq(struct rt_rq *rt_rq)
996 {
997 struct rq *rq = rq_of_rt_rq(rt_rq);
998
999 BUG_ON(&rq->rt != rt_rq);
1000
1001 if (!rt_rq->rt_queued)
1002 return;
1003
1004 BUG_ON(!rq->nr_running);
1005
1006 sub_nr_running(rq, rt_rq->rt_nr_running);
1007 rt_rq->rt_queued = 0;
1008 }
1009
1010 static void
1011 enqueue_top_rt_rq(struct rt_rq *rt_rq)
1012 {
1013 struct rq *rq = rq_of_rt_rq(rt_rq);
1014
1015 BUG_ON(&rq->rt != rt_rq);
1016
1017 if (rt_rq->rt_queued)
1018 return;
1019 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
1020 return;
1021
1022 add_nr_running(rq, rt_rq->rt_nr_running);
1023 rt_rq->rt_queued = 1;
1024 }
1025
1026 #if defined CONFIG_SMP
1027
1028 static void
1029 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1030 {
1031 struct rq *rq = rq_of_rt_rq(rt_rq);
1032
1033 #ifdef CONFIG_RT_GROUP_SCHED
1034 /*
1035 * Change rq's cpupri only if rt_rq is the top queue.
1036 */
1037 if (&rq->rt != rt_rq)
1038 return;
1039 #endif
1040 if (rq->online && prio < prev_prio)
1041 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1042 }
1043
1044 static void
1045 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1046 {
1047 struct rq *rq = rq_of_rt_rq(rt_rq);
1048
1049 #ifdef CONFIG_RT_GROUP_SCHED
1050 /*
1051 * Change rq's cpupri only if rt_rq is the top queue.
1052 */
1053 if (&rq->rt != rt_rq)
1054 return;
1055 #endif
1056 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1057 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1058 }
1059
1060 #else /* CONFIG_SMP */
1061
1062 static inline
1063 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1064 static inline
1065 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1066
1067 #endif /* CONFIG_SMP */
1068
1069 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1070 static void
1071 inc_rt_prio(struct rt_rq *rt_rq, int prio)
1072 {
1073 int prev_prio = rt_rq->highest_prio.curr;
1074
1075 if (prio < prev_prio)
1076 rt_rq->highest_prio.curr = prio;
1077
1078 inc_rt_prio_smp(rt_rq, prio, prev_prio);
1079 }
1080
1081 static void
1082 dec_rt_prio(struct rt_rq *rt_rq, int prio)
1083 {
1084 int prev_prio = rt_rq->highest_prio.curr;
1085
1086 if (rt_rq->rt_nr_running) {
1087
1088 WARN_ON(prio < prev_prio);
1089
1090 /*
1091 * This may have been our highest task, and therefore
1092 * we may have some recomputation to do
1093 */
1094 if (prio == prev_prio) {
1095 struct rt_prio_array *array = &rt_rq->active;
1096
1097 rt_rq->highest_prio.curr =
1098 sched_find_first_bit(array->bitmap);
1099 }
1100
1101 } else
1102 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1103
1104 dec_rt_prio_smp(rt_rq, prio, prev_prio);
1105 }
1106
1107 #else
1108
1109 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1110 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1111
1112 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1113
1114 #ifdef CONFIG_RT_GROUP_SCHED
1115
1116 static void
1117 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1118 {
1119 if (rt_se_boosted(rt_se))
1120 rt_rq->rt_nr_boosted++;
1121
1122 if (rt_rq->tg)
1123 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1124 }
1125
1126 static void
1127 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1128 {
1129 if (rt_se_boosted(rt_se))
1130 rt_rq->rt_nr_boosted--;
1131
1132 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1133 }
1134
1135 #else /* CONFIG_RT_GROUP_SCHED */
1136
1137 static void
1138 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1139 {
1140 start_rt_bandwidth(&def_rt_bandwidth);
1141 }
1142
1143 static inline
1144 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1145
1146 #endif /* CONFIG_RT_GROUP_SCHED */
1147
1148 static inline
1149 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1150 {
1151 struct rt_rq *group_rq = group_rt_rq(rt_se);
1152
1153 if (group_rq)
1154 return group_rq->rt_nr_running;
1155 else
1156 return 1;
1157 }
1158
1159 static inline
1160 unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1161 {
1162 struct rt_rq *group_rq = group_rt_rq(rt_se);
1163 struct task_struct *tsk;
1164
1165 if (group_rq)
1166 return group_rq->rr_nr_running;
1167
1168 tsk = rt_task_of(rt_se);
1169
1170 return (tsk->policy == SCHED_RR) ? 1 : 0;
1171 }
1172
1173 static inline
1174 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1175 {
1176 int prio = rt_se_prio(rt_se);
1177
1178 WARN_ON(!rt_prio(prio));
1179 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1180 rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1181
1182 inc_rt_prio(rt_rq, prio);
1183 inc_rt_migration(rt_se, rt_rq);
1184 inc_rt_group(rt_se, rt_rq);
1185 }
1186
1187 static inline
1188 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1189 {
1190 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1191 WARN_ON(!rt_rq->rt_nr_running);
1192 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1193 rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1194
1195 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1196 dec_rt_migration(rt_se, rt_rq);
1197 dec_rt_group(rt_se, rt_rq);
1198 }
1199
1200 /*
1201 * Change rt_se->run_list location unless SAVE && !MOVE
1202 *
1203 * assumes ENQUEUE/DEQUEUE flags match
1204 */
1205 static inline bool move_entity(unsigned int flags)
1206 {
1207 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1208 return false;
1209
1210 return true;
1211 }
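
/*
 * Sketch of the flag combinations (my reading, stated as an assumption):
 * a plain enqueue/dequeue passes neither SAVE nor MOVE, so the entity is
 * (re)linked on its priority queue; a dequeue/enqueue pair that merely
 * saves and restores state across an attribute change passes SAVE alone,
 * so the list position is preserved; SAVE together with MOVE relinks the
 * entity, e.g. when its priority actually changes.
 */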
1212
1213 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1214 {
1215 list_del_init(&rt_se->run_list);
1216
1217 if (list_empty(array->queue + rt_se_prio(rt_se)))
1218 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1219
1220 rt_se->on_list = 0;
1221 }
1222
1223 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1224 {
1225 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1226 struct rt_prio_array *array = &rt_rq->active;
1227 struct rt_rq *group_rq = group_rt_rq(rt_se);
1228 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1229
1230 /*
1231 * Don't enqueue the group if it's throttled, or when empty.
1232 * The latter is a consequence of the former when a child group
1233 * gets throttled and the current group doesn't have any other
1234 * active members.
1235 */
1236 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1237 if (rt_se->on_list)
1238 __delist_rt_entity(rt_se, array);
1239 return;
1240 }
1241
1242 if (move_entity(flags)) {
1243 WARN_ON_ONCE(rt_se->on_list);
1244 if (flags & ENQUEUE_HEAD)
1245 list_add(&rt_se->run_list, queue);
1246 else
1247 list_add_tail(&rt_se->run_list, queue);
1248
1249 __set_bit(rt_se_prio(rt_se), array->bitmap);
1250 rt_se->on_list = 1;
1251 }
1252 rt_se->on_rq = 1;
1253
1254 inc_rt_tasks(rt_se, rt_rq);
1255 }
1256
1257 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1258 {
1259 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1260 struct rt_prio_array *array = &rt_rq->active;
1261
1262 if (move_entity(flags)) {
1263 WARN_ON_ONCE(!rt_se->on_list);
1264 __delist_rt_entity(rt_se, array);
1265 }
1266 rt_se->on_rq = 0;
1267
1268 dec_rt_tasks(rt_se, rt_rq);
1269 }
1270
1271 /*
1272 * Because the prio of an upper entry depends on the lower
1273 * entries, we must remove entries top-down.
1274 */
1275 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1276 {
1277 struct sched_rt_entity *back = NULL;
1278
1279 for_each_sched_rt_entity(rt_se) {
1280 rt_se->back = back;
1281 back = rt_se;
1282 }
1283
1284 dequeue_top_rt_rq(rt_rq_of_se(back));
1285
1286 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1287 if (on_rt_rq(rt_se))
1288 __dequeue_rt_entity(rt_se, flags);
1289 }
1290 }
1291
1292 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1293 {
1294 struct rq *rq = rq_of_rt_se(rt_se);
1295
1296 dequeue_rt_stack(rt_se, flags);
1297 for_each_sched_rt_entity(rt_se)
1298 __enqueue_rt_entity(rt_se, flags);
1299 enqueue_top_rt_rq(&rq->rt);
1300 }
1301
1302 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1303 {
1304 struct rq *rq = rq_of_rt_se(rt_se);
1305
1306 dequeue_rt_stack(rt_se, flags);
1307
1308 for_each_sched_rt_entity(rt_se) {
1309 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1310
1311 if (rt_rq && rt_rq->rt_nr_running)
1312 __enqueue_rt_entity(rt_se, flags);
1313 }
1314 enqueue_top_rt_rq(&rq->rt);
1315 }
1316
1317 /*
1318 * Adding/removing a task to/from a priority array:
1319 */
1320 static void
1321 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1322 {
1323 struct sched_rt_entity *rt_se = &p->rt;
1324
1325 if (flags & ENQUEUE_WAKEUP)
1326 rt_se->timeout = 0;
1327
1328 enqueue_rt_entity(rt_se, flags);
1329 walt_inc_cumulative_runnable_avg(rq, p);
1330
1331 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1332 enqueue_pushable_task(rq, p);
1333 }
1334
1335 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1336 {
1337 struct sched_rt_entity *rt_se = &p->rt;
1338
1339 update_curr_rt(rq);
1340 dequeue_rt_entity(rt_se, flags);
1341 walt_dec_cumulative_runnable_avg(rq, p);
1342
1343 dequeue_pushable_task(rq, p);
1344 }
1345
1346 /*
1347 * Put the task at the head or the end of the run list without the overhead of
1348 * dequeue followed by enqueue.
1349 */
1350 static void
1351 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1352 {
1353 if (on_rt_rq(rt_se)) {
1354 struct rt_prio_array *array = &rt_rq->active;
1355 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1356
1357 if (head)
1358 list_move(&rt_se->run_list, queue);
1359 else
1360 list_move_tail(&rt_se->run_list, queue);
1361 }
1362 }
1363
1364 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1365 {
1366 struct sched_rt_entity *rt_se = &p->rt;
1367 struct rt_rq *rt_rq;
1368
1369 for_each_sched_rt_entity(rt_se) {
1370 rt_rq = rt_rq_of_se(rt_se);
1371 requeue_rt_entity(rt_rq, rt_se, head);
1372 }
1373 }
1374
1375 static void yield_task_rt(struct rq *rq)
1376 {
1377 requeue_task_rt(rq, rq->curr, 0);
1378 }
1379
1380 #ifdef CONFIG_SMP
1381 static int find_lowest_rq(struct task_struct *task);
1382
1383 static int
1384 select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
1385 int sibling_count_hint)
1386 {
1387 struct task_struct *curr;
1388 struct rq *rq;
1389
1390 /* For anything but wake ups, just return the task_cpu */
1391 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1392 goto out;
1393
1394 rq = cpu_rq(cpu);
1395
1396 rcu_read_lock();
1397 curr = READ_ONCE(rq->curr); /* unlocked access */
1398
1399 /*
1400 * If the current task on @p's runqueue is an RT task, then
1401 * try to see if we can wake this RT task up on another
1402 * runqueue. Otherwise simply start this RT task
1403 * on its current runqueue.
1404 *
1405 * We want to avoid overloading runqueues. If the woken
1406 * task is a higher priority, then it will stay on this CPU
1407 * and the lower prio task should be moved to another CPU.
1408 * Even though this will probably make the lower prio task
1409 * lose its cache, we do not want to bounce a higher task
1410 * around just because it gave up its CPU, perhaps for a
1411 * lock?
1412 *
1413 * For equal prio tasks, we just let the scheduler sort it out.
1414 *
1415 * Otherwise, just let it ride on the affined RQ and the
1416 * post-schedule router will push the preempted task away
1417 *
1418 * This test is optimistic; if we get it wrong, the load-balancer
1419 * will have to sort it out.
1420 */
1421 if (curr && unlikely(rt_task(curr)) &&
1422 (curr->nr_cpus_allowed < 2 ||
1423 curr->prio <= p->prio)) {
1424 int target = find_lowest_rq(p);
1425
1426 /*
1427 * Don't bother moving it if the destination CPU is
1428 * not running a lower priority task.
1429 */
1430 if (target != -1 &&
1431 p->prio < cpu_rq(target)->rt.highest_prio.curr)
1432 cpu = target;
1433 }
1434 rcu_read_unlock();
1435
1436 out:
1437 return cpu;
1438 }
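
/*
 * Example walk-through (illustrative): a prio-80 RT task wakes on CPU 0
 * whose current task is a pinned prio-70 RT task (lower prio value means
 * higher priority). The condition above holds, so find_lowest_rq() is
 * consulted; if it reports CPU 3 running only CFS work (highest_prio.curr
 * == MAX_RT_PRIO), the waking task is placed there instead of waiting
 * behind the higher-priority task on CPU 0.
 */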
1439
1440 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1441 {
1442 /*
1443 * Current can't be migrated, useless to reschedule,
1444 * let's hope p can move out.
1445 */
1446 if (rq->curr->nr_cpus_allowed == 1 ||
1447 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1448 return;
1449
1450 /*
1451 * p is migratable, so let's not schedule it and
1452 * see if it is pushed or pulled somewhere else.
1453 */
1454 if (p->nr_cpus_allowed != 1
1455 && cpupri_find(&rq->rd->cpupri, p, NULL))
1456 return;
1457
1458 /*
1459 * There appear to be other cpus that can accept
1460 * current and none to run 'p', so let's reschedule
1461 * to try to push current away:
1462 */
1463 requeue_task_rt(rq, p, 1);
1464 resched_curr(rq);
1465 }
1466
1467 #endif /* CONFIG_SMP */
1468
1469 /*
1470 * Preempt the current task with a newly woken task if needed:
1471 */
1472 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1473 {
1474 if (p->prio < rq->curr->prio) {
1475 resched_curr(rq);
1476 return;
1477 }
1478
1479 #ifdef CONFIG_SMP
1480 /*
1481 * If:
1482 *
1483 * - the newly woken task is of equal priority to the current task
1484 * - the newly woken task is non-migratable while current is migratable
1485 * - current will be preempted on the next reschedule
1486 *
1487 * we should check to see if current can readily move to a different
1488 * cpu. If so, we will reschedule to allow the push logic to try
1489 * to move current somewhere else, making room for our non-migratable
1490 * task.
1491 */
1492 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1493 check_preempt_equal_prio(rq, p);
1494 #endif
1495 }
1496
1497 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1498 struct rt_rq *rt_rq)
1499 {
1500 struct rt_prio_array *array = &rt_rq->active;
1501 struct sched_rt_entity *next = NULL;
1502 struct list_head *queue;
1503 int idx;
1504
1505 idx = sched_find_first_bit(array->bitmap);
1506 BUG_ON(idx >= MAX_RT_PRIO);
1507
1508 queue = array->queue + idx;
1509 next = list_entry(queue->next, struct sched_rt_entity, run_list);
1510
1511 return next;
1512 }
1513
1514 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1515 {
1516 struct sched_rt_entity *rt_se;
1517 struct task_struct *p;
1518 struct rt_rq *rt_rq = &rq->rt;
1519
1520 do {
1521 rt_se = pick_next_rt_entity(rq, rt_rq);
1522 BUG_ON(!rt_se);
1523 rt_rq = group_rt_rq(rt_se);
1524 } while (rt_rq);
1525
1526 p = rt_task_of(rt_se);
1527 p->se.exec_start = rq_clock_task(rq);
1528
1529 return p;
1530 }
1531
1532 extern int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running);
1533
1534 static struct task_struct *
1535 pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1536 {
1537 struct task_struct *p;
1538 struct rt_rq *rt_rq = &rq->rt;
1539
1540 if (need_pull_rt_task(rq, prev)) {
1541 /*
1542 * This is OK, because current is on_cpu, which avoids it being
1543 * picked for load-balance and preemption/IRQs are still
1544 * disabled avoiding further scheduler activity on it and we're
1545 * being very careful to re-start the picking loop.
1546 */
1547 rq_unpin_lock(rq, rf);
1548 pull_rt_task(rq);
1549 rq_repin_lock(rq, rf);
1550 /*
1551 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1552 * means a dl or stop task can slip in, in which case we need
1553 * to re-start task selection.
1554 */
1555 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1556 rq->dl.dl_nr_running))
1557 return RETRY_TASK;
1558 }
1559
1560 /*
1561 * We may dequeue prev's rt_rq in put_prev_task().
1562 * So, we update the time before the rt_nr_running check.
1563 */
1564 if (prev->sched_class == &rt_sched_class)
1565 update_curr_rt(rq);
1566
1567 if (!rt_rq->rt_queued)
1568 return NULL;
1569
1570 put_prev_task(rq, prev);
1571
1572 p = _pick_next_task_rt(rq);
1573
1574 /* The running task is never eligible for pushing */
1575 dequeue_pushable_task(rq, p);
1576
1577 queue_push_tasks(rq);
1578
1579 if (p)
1580 update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), rt_rq,
1581 rq->curr->sched_class == &rt_sched_class);
1582
1583 return p;
1584 }
1585
1586 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1587 {
1588 update_curr_rt(rq);
1589
1590 update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
1591
1592 /*
1593 * The previous task needs to be made eligible for pushing
1594 * if it is still active
1595 */
1596 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1597 enqueue_pushable_task(rq, p);
1598 }
1599
1600 #ifdef CONFIG_SMP
1601
1602 /* Only try algorithms three times */
1603 #define RT_MAX_TRIES 3
1604
1605 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1606 {
1607 if (!task_running(rq, p) &&
1608 cpumask_test_cpu(cpu, &p->cpus_allowed))
1609 return 1;
1610 return 0;
1611 }
1612
1613 /*
1614 * Return the rq's highest-priority pushable task that is suitable to be
1615 * executed on the given cpu, or NULL otherwise.
1616 */
1617 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1618 {
1619 struct plist_head *head = &rq->rt.pushable_tasks;
1620 struct task_struct *p;
1621
1622 if (!has_pushable_tasks(rq))
1623 return NULL;
1624
1625 plist_for_each_entry(p, head, pushable_tasks) {
1626 if (pick_rt_task(rq, p, cpu))
1627 return p;
1628 }
1629
1630 return NULL;
1631 }
1632
1633 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1634
1635 static int find_lowest_rq(struct task_struct *task)
1636 {
1637 struct sched_domain *sd;
1638 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1639 int this_cpu = smp_processor_id();
1640 int cpu = task_cpu(task);
1641
1642 /* Make sure the mask is initialized first */
1643 if (unlikely(!lowest_mask))
1644 return -1;
1645
1646 if (task->nr_cpus_allowed == 1)
1647 return -1; /* No other targets possible */
1648
1649 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1650 return -1; /* No targets found */
1651
1652 /*
1653 * At this point we have built a mask of cpus representing the
1654 * lowest priority tasks in the system. Now we want to elect
1655 * the best one based on our affinity and topology.
1656 *
1657 * We prioritize the last cpu that the task executed on since
1658 * it is most likely cache-hot in that location.
1659 */
1660 if (cpumask_test_cpu(cpu, lowest_mask))
1661 return cpu;
1662
1663 /*
1664 * Otherwise, we consult the sched_domains span maps to figure
1665 * out which cpu is logically closest to our hot cache data.
1666 */
1667 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1668 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1669
1670 rcu_read_lock();
1671 for_each_domain(cpu, sd) {
1672 if (sd->flags & SD_WAKE_AFFINE) {
1673 int best_cpu;
1674
1675 /*
1676 * "this_cpu" is cheaper to preempt than a
1677 * remote processor.
1678 */
1679 if (this_cpu != -1 &&
1680 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1681 rcu_read_unlock();
1682 return this_cpu;
1683 }
1684
1685 best_cpu = cpumask_first_and(lowest_mask,
1686 sched_domain_span(sd));
1687 if (best_cpu < nr_cpu_ids) {
1688 rcu_read_unlock();
1689 return best_cpu;
1690 }
1691 }
1692 }
1693 rcu_read_unlock();
1694
1695 /*
1696 * And finally, if there were no matches within the domains
1697 * just give the caller *something* to work with from the compatible
1698 * locations.
1699 */
1700 if (this_cpu != -1)
1701 return this_cpu;
1702
1703 cpu = cpumask_any(lowest_mask);
1704 if (cpu < nr_cpu_ids)
1705 return cpu;
1706 return -1;
1707 }
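
/*
 * Example (illustrative): the task last ran on CPU 2 and cpupri reports
 * lowest_mask = { CPU 4, CPU 5 }. CPU 2 is not in the mask, so the
 * sched_domain levels around CPU 2 are walked outward and the first mask
 * member sharing an SD_WAKE_AFFINE span with CPU 2 (say CPU 4) is chosen,
 * preferring topological closeness to the task's cached data.
 */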
1708
1709 /* Will lock the rq it finds */
1710 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1711 {
1712 struct rq *lowest_rq = NULL;
1713 int tries;
1714 int cpu;
1715
1716 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1717 cpu = find_lowest_rq(task);
1718
1719 if ((cpu == -1) || (cpu == rq->cpu))
1720 break;
1721
1722 lowest_rq = cpu_rq(cpu);
1723
1724 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1725 /*
1726 * Target rq has tasks of equal or higher priority,
1727 * retrying does not release any lock and is unlikely
1728 * to yield a different result.
1729 */
1730 lowest_rq = NULL;
1731 break;
1732 }
1733
1734 /* if the prio of this runqueue changed, try again */
1735 if (double_lock_balance(rq, lowest_rq)) {
1736 /*
1737 * We had to unlock the run queue. In
1738 * the meantime, the task could have
1739 * migrated already or had its affinity changed.
1740 * Also make sure that it wasn't scheduled on its rq.
1741 */
1742 if (unlikely(task_rq(task) != rq ||
1743 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
1744 task_running(rq, task) ||
1745 !rt_task(task) ||
1746 !task_on_rq_queued(task))) {
1747
1748 double_unlock_balance(rq, lowest_rq);
1749 lowest_rq = NULL;
1750 break;
1751 }
1752 }
1753
1754 /* If this rq is still suitable use it. */
1755 if (lowest_rq->rt.highest_prio.curr > task->prio)
1756 break;
1757
1758 /* try again */
1759 double_unlock_balance(rq, lowest_rq);
1760 lowest_rq = NULL;
1761 }
1762
1763 return lowest_rq;
1764 }
1765
1766 static struct task_struct *pick_next_pushable_task(struct rq *rq)
1767 {
1768 struct task_struct *p;
1769
1770 if (!has_pushable_tasks(rq))
1771 return NULL;
1772
1773 p = plist_first_entry(&rq->rt.pushable_tasks,
1774 struct task_struct, pushable_tasks);
1775
1776 BUG_ON(rq->cpu != task_cpu(p));
1777 BUG_ON(task_current(rq, p));
1778 BUG_ON(p->nr_cpus_allowed <= 1);
1779
1780 BUG_ON(!task_on_rq_queued(p));
1781 BUG_ON(!rt_task(p));
1782
1783 return p;
1784 }
1785
1786 /*
1787 * If the current CPU has more than one RT task, see if the
1788 * non-running task can migrate over to a CPU that is running a task
1789 * of lesser priority.
1790 */
1791 static int push_rt_task(struct rq *rq)
1792 {
1793 struct task_struct *next_task;
1794 struct rq *lowest_rq;
1795 int ret = 0;
1796
1797 if (!rq->rt.overloaded)
1798 return 0;
1799
1800 next_task = pick_next_pushable_task(rq);
1801 if (!next_task)
1802 return 0;
1803
1804 retry:
1805 if (unlikely(next_task == rq->curr)) {
1806 WARN_ON(1);
1807 return 0;
1808 }
1809
1810 /*
1811 * It's possible that the next_task that slipped in is of
1812 * higher priority than current. If that's the case,
1813 * just reschedule current.
1814 */
1815 if (unlikely(next_task->prio < rq->curr->prio)) {
1816 resched_curr(rq);
1817 return 0;
1818 }
1819
1820 /* We might release rq lock */
1821 get_task_struct(next_task);
1822
1823 /* find_lock_lowest_rq locks the rq if found */
1824 lowest_rq = find_lock_lowest_rq(next_task, rq);
1825 if (!lowest_rq) {
1826 struct task_struct *task;
1827 /*
1828 * find_lock_lowest_rq releases rq->lock
1829 * so it is possible that next_task has migrated.
1830 *
1831 * We need to make sure that the task is still on the same
1832 * run-queue and is also still the next task eligible for
1833 * pushing.
1834 */
1835 task = pick_next_pushable_task(rq);
1836 if (task == next_task) {
1837 /*
1838 * The task hasn't migrated, and is still the next
1839 * eligible task, but we failed to find a run-queue
1840 * to push it to. Do not retry in this case, since
1841 * other cpus will pull from us when ready.
1842 */
1843 goto out;
1844 }
1845
1846 if (!task)
1847 /* No more tasks, just exit */
1848 goto out;
1849
1850 /*
1851 * Something has shifted, try again.
1852 */
1853 put_task_struct(next_task);
1854 next_task = task;
1855 goto retry;
1856 }
1857
1858 deactivate_task(rq, next_task, 0);
1859 next_task->on_rq = TASK_ON_RQ_MIGRATING;
1860 set_task_cpu(next_task, lowest_rq->cpu);
1861 next_task->on_rq = TASK_ON_RQ_QUEUED;
1862 activate_task(lowest_rq, next_task, 0);
1863 ret = 1;
1864
1865 resched_curr(lowest_rq);
1866
1867 double_unlock_balance(rq, lowest_rq);
1868
1869 out:
1870 put_task_struct(next_task);
1871
1872 return ret;
1873 }
1874
1875 static void push_rt_tasks(struct rq *rq)
1876 {
1877 /* push_rt_task will return true if it moved an RT */
1878 while (push_rt_task(rq))
1879 ;
1880 }
1881
1882 #ifdef HAVE_RT_PUSH_IPI
1883
1884 /*
1885 * When a high priority task schedules out from a CPU and a lower priority
1886 * task is scheduled in, a check is made to see if there's any RT tasks
1887 * on other CPUs that are waiting to run because a higher priority RT task
1888 * is currently running on its CPU. In this case, the CPU with multiple RT
1889 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1890 * up that may be able to run one of its non-running queued RT tasks.
1891 *
1892 * All CPUs with overloaded RT tasks need to be notified as there is currently
1893 * no way to know which of these CPUs have the highest priority task waiting
1894 * to run. Instead of trying to take a spinlock on each of these CPUs,
1895 * which has been shown to cause large latency when done on machines with
1896 * many CPUs, an IPI is sent to the CPUs to have them push off the
1897 * overloaded RT tasks waiting to run.
1898 *
1899 * Just sending an IPI to each of the CPUs is also an issue, as on large
1900 * count CPU machines, this can cause an IPI storm on a CPU, especially
1901 * if it's the only CPU with multiple RT tasks queued, and a large number
1902 * of CPUs scheduling a lower priority task at the same time.
1903 *
1904 * Each root domain has its own irq work function that can iterate over
1905 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
1906 * tasks must be checked if there's one or many CPUs that are lowering
1907 * their priority, there's a single irq work iterator that will try to
1908 * push off RT tasks that are waiting to run.
1909 *
1910 * When a CPU schedules a lower priority task, it will kick off the
1911 * irq work iterator that will jump to each CPU with overloaded RT tasks.
1912 * As it only takes the first CPU that schedules a lower priority task
1913 * to start the process, the rto_loop_start variable is incremented and if
1914 * the atomic result is one, then that CPU will try to take the rto_lock.
1915 * This prevents high contention on the lock as the process handles all
1916 * CPUs scheduling lower priority tasks.
1917 *
1918 * All CPUs that are scheduling a lower priority task will increment the
1919 * rto_loop_next variable. This will make sure that the irq work iterator
1920 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
1921 * priority task, even if the iterator is in the middle of a scan. Incrementing
1922 * the rto_loop_next will cause the iterator to perform another scan.
1923 *
1924 */
1925 static int rto_next_cpu(struct root_domain *rd)
1926 {
1927 int next;
1928 int cpu;
1929
1930 /*
1931 * When starting the IPI RT pushing, the rto_cpu is set to -1,
1932 * so rto_next_cpu() will simply return the first CPU found in
1933 * the rto_mask.
1934 *
1935 * If rto_next_cpu() is called while rto_cpu is a valid cpu, it
1936 * will return the next CPU found in the rto_mask.
1937 *
1938 * If there are no more CPUs left in the rto_mask, then a check is made
1939 * against rto_loop and rto_loop_next. rto_loop is only updated with
1940 * the rto_lock held, but any CPU may increment the rto_loop_next
1941 * without any locking.
1942 */
1943 for (;;) {
1944
1945 /* When rto_cpu is -1 this acts like cpumask_first() */
1946 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
1947
1948 rd->rto_cpu = cpu;
1949
1950 if (cpu < nr_cpu_ids)
1951 return cpu;
1952
1953 rd->rto_cpu = -1;
1954
1955 /*
1956 * ACQUIRE ensures we see the @rto_mask changes
1957 * made prior to the @next value observed.
1958 *
1959 * Matches WMB in rt_set_overload().
1960 */
1961 next = atomic_read_acquire(&rd->rto_loop_next);
1962
1963 if (rd->rto_loop == next)
1964 break;
1965
1966 rd->rto_loop = next;
1967 }
1968
1969 return -1;
1970 }
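
/*
 * Sketch of one IPI chain (illustrative): CPU 0 drops to a CFS task and
 * calls tell_cpu_to_push(); rto_loop_next is bumped and, with rto_cpu == -1,
 * rto_next_cpu() hands back the first bit in rto_mask, say CPU 2. CPU 2's
 * irq work pushes what it can and forwards the IPI to the next overloaded
 * CPU; when the mask is exhausted and rto_loop has caught up with
 * rto_loop_next, the chain stops and rto_cpu is parked at -1 again.
 */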
1971
1972 static inline bool rto_start_trylock(atomic_t *v)
1973 {
1974 return !atomic_cmpxchg_acquire(v, 0, 1);
1975 }
1976
1977 static inline void rto_start_unlock(atomic_t *v)
1978 {
1979 atomic_set_release(v, 0);
1980 }
1981
1982 static void tell_cpu_to_push(struct rq *rq)
1983 {
1984 int cpu = -1;
1985
1986 /* Keep the loop going if the IPI is currently active */
1987 atomic_inc(&rq->rd->rto_loop_next);
1988
1989 /* Only one CPU can initiate a loop at a time */
1990 if (!rto_start_trylock(&rq->rd->rto_loop_start))
1991 return;
1992
1993 raw_spin_lock(&rq->rd->rto_lock);
1994
1995 /*
1996 * The rto_cpu is updated under the lock; if it has a valid cpu
1997 * then the IPI is still running and will continue due to the
1998 * update to loop_next, and nothing needs to be done here.
1999 * Otherwise it is finishing up and an IPI needs to be sent.
2000 */
2001 if (rq->rd->rto_cpu < 0)
2002 cpu = rto_next_cpu(rq->rd);
2003
2004 raw_spin_unlock(&rq->rd->rto_lock);
2005
2006 rto_start_unlock(&rq->rd->rto_loop_start);
2007
2008 if (cpu >= 0) {
2009 /* Make sure the rd does not get freed while pushing */
2010 sched_get_rd(rq->rd);
2011 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2012 }
2013 }
2014
2015 /* Called from hardirq context */
2016 void rto_push_irq_work_func(struct irq_work *work)
2017 {
2018 struct root_domain *rd =
2019 container_of(work, struct root_domain, rto_push_work);
2020 struct rq *rq;
2021 int cpu;
2022
2023 rq = this_rq();
2024
2025 /*
2026 * We do not need to grab the lock to check for has_pushable_tasks.
2027 * When it gets updated, a check is made if a push is possible.
2028 */
2029 if (has_pushable_tasks(rq)) {
2030 raw_spin_lock(&rq->lock);
2031 push_rt_tasks(rq);
2032 raw_spin_unlock(&rq->lock);
2033 }
2034
2035 raw_spin_lock(&rd->rto_lock);
2036
2037 /* Pass the IPI to the next rt overloaded queue */
2038 cpu = rto_next_cpu(rd);
2039
2040 raw_spin_unlock(&rd->rto_lock);
2041
2042 if (cpu < 0) {
2043 sched_put_rd(rd);
2044 return;
2045 }
2046
2047 /* Try the next RT overloaded CPU */
2048 irq_work_queue_on(&rd->rto_push_work, cpu);
2049 }
2050 #endif /* HAVE_RT_PUSH_IPI */
2051
2052 static void pull_rt_task(struct rq *this_rq)
2053 {
2054 int this_cpu = this_rq->cpu, cpu;
2055 bool resched = false;
2056 struct task_struct *p;
2057 struct rq *src_rq;
2058 int rt_overload_count = rt_overloaded(this_rq);
2059
2060 if (likely(!rt_overload_count))
2061 return;
2062
2063 /*
2064 * Match the barrier from rt_set_overload(); this guarantees that if we
2065 * see overloaded we must also see the rto_mask bit.
2066 */
2067 smp_rmb();
2068
2069 /* If we are the only overloaded CPU do nothing */
2070 if (rt_overload_count == 1 &&
2071 cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2072 return;
2073
2074 #ifdef HAVE_RT_PUSH_IPI
2075 if (sched_feat(RT_PUSH_IPI)) {
2076 tell_cpu_to_push(this_rq);
2077 return;
2078 }
2079 #endif
2080
2081 for_each_cpu(cpu, this_rq->rd->rto_mask) {
2082 if (this_cpu == cpu)
2083 continue;
2084
2085 src_rq = cpu_rq(cpu);
2086
2087 /*
2088 * Don't bother taking the src_rq->lock if the next highest
2089 * task is known to be lower-priority than our current task.
2090 * This may look racy, but if this value is about to go
2091 * logically higher, the src_rq will push this task away.
2092 * And if it's going logically lower, we do not care.
2093 */
2094 if (src_rq->rt.highest_prio.next >=
2095 this_rq->rt.highest_prio.curr)
2096 continue;
2097
2098 /*
2099 * We can potentially drop this_rq's lock in
2100 * double_lock_balance, and another CPU could
2101 * alter this_rq
2102 */
2103 double_lock_balance(this_rq, src_rq);
2104
2105 /*
2106 * We can only pull a task that is pushable
2107 * on its rq, and no others.
2108 */
2109 p = pick_highest_pushable_task(src_rq, this_cpu);
2110
2111 /*
2112 * Do we have an RT task that preempts
2113 * the to-be-scheduled task?
2114 */
2115 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2116 WARN_ON(p == src_rq->curr);
2117 WARN_ON(!task_on_rq_queued(p));
2118
2119 /*
2120 * There's a chance that p is higher in priority
2121 * than what's currently running on its CPU.
2122 * This is just that p is waking up and hasn't
2123 * had a chance to schedule. We only pull
2124 * p if it is lower in priority than the
2125 * current task on the run queue.
2126 */
2127 if (p->prio < src_rq->curr->prio)
2128 goto skip;
2129
2130 resched = true;
2131
2132 deactivate_task(src_rq, p, 0);
2133 p->on_rq = TASK_ON_RQ_MIGRATING;
2134 set_task_cpu(p, this_cpu);
2135 p->on_rq = TASK_ON_RQ_QUEUED;
2136 activate_task(this_rq, p, 0);
2137 /*
2138 * We continue with the search, just in
2139 * case there's an even higher prio task
2140 * in another runqueue. (low likelihood
2141 * but possible)
2142 */
2143 }
2144 skip:
2145 double_unlock_balance(this_rq, src_rq);
2146 }
2147
2148 if (resched)
2149 resched_curr(this_rq);
2150 }
2151
2152 /*
2153 * If we are not running and we are not going to reschedule soon, we should
2154 * try to push tasks away now
2155 */
2156 static void task_woken_rt(struct rq *rq, struct task_struct *p)
2157 {
2158 if (!task_running(rq, p) &&
2159 !test_tsk_need_resched(rq->curr) &&
2160 p->nr_cpus_allowed > 1 &&
2161 (dl_task(rq->curr) || rt_task(rq->curr)) &&
2162 (rq->curr->nr_cpus_allowed < 2 ||
2163 rq->curr->prio <= p->prio))
2164 push_rt_tasks(rq);
2165 }
2166
2167 /* Assumes rq->lock is held */
2168 static void rq_online_rt(struct rq *rq)
2169 {
2170 if (rq->rt.overloaded)
2171 rt_set_overload(rq);
2172
2173 __enable_runtime(rq);
2174
2175 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2176 }
2177
2178 /* Assumes rq->lock is held */
2179 static void rq_offline_rt(struct rq *rq)
2180 {
2181 if (rq->rt.overloaded)
2182 rt_clear_overload(rq);
2183
2184 __disable_runtime(rq);
2185
2186 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2187 }
2188
2189 /*
2190 * When switching from the RT queue, we bring ourselves to a position
2191 * where we might want to pull RT tasks from other runqueues.
2192 */
2193 static void switched_from_rt(struct rq *rq, struct task_struct *p)
2194 {
2195 /*
2196 * If there are other RT tasks then we will reschedule
2197 * and the scheduling of the other RT tasks will handle
2198 * the balancing. But if we are the last RT task
2199 * we may need to handle the pulling of RT tasks
2200 * now.
2201 */
2202 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2203 return;
2204
2205 queue_pull_task(rq);
2206 }
2207
2208 void __init init_sched_rt_class(void)
2209 {
2210 unsigned int i;
2211
2212 for_each_possible_cpu(i) {
2213 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2214 GFP_KERNEL, cpu_to_node(i));
2215 }
2216 }
2217 #endif /* CONFIG_SMP */
2218
2219 /*
2220 * When switching a task to RT, we may overload the runqueue
2221 * with RT tasks. In this case we try to push them off to
2222 * other runqueues.
2223 */
2224 static void switched_to_rt(struct rq *rq, struct task_struct *p)
2225 {
2226 /*
2227 * If we are already running, then there's nothing
2228 * that needs to be done. But if we are not running,
2229 * we may need to preempt the currently running task.
2230 * If that currently running task is also an RT task,
2231 * then see if we can move to another run queue.
2232 */
2233 if (task_on_rq_queued(p) && rq->curr != p) {
2234 #ifdef CONFIG_SMP
2235 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2236 queue_push_tasks(rq);
2237 #endif /* CONFIG_SMP */
2238 if (p->prio < rq->curr->prio)
2239 resched_curr(rq);
2240 }
2241 }
2242
2243 /*
2244 * Priority of the task has changed. This may cause
2245 * us to initiate a push or pull.
2246 */
2247 static void
2248 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2249 {
2250 if (!task_on_rq_queued(p))
2251 return;
2252
2253 if (rq->curr == p) {
2254 #ifdef CONFIG_SMP
2255 /*
2256 * If our priority decreases while running, we
2257 * may need to pull tasks to this runqueue.
2258 */
2259 if (oldprio < p->prio)
2260 queue_pull_task(rq);
2261
2262 /*
2263 * If there's a higher priority task waiting to run
2264 * then reschedule.
2265 */
2266 if (p->prio > rq->rt.highest_prio.curr)
2267 resched_curr(rq);
2268 #else
2269 /* For UP simply resched on drop of prio */
2270 if (oldprio < p->prio)
2271 resched_curr(rq);
2272 #endif /* CONFIG_SMP */
2273 } else {
2274 /*
2275 * This task is not running, but if it is
2276 * higher in priority than the currently running task,
2277 * then reschedule.
2278 */
2279 if (p->prio < rq->curr->prio)
2280 resched_curr(rq);
2281 }
2282 }
2283
2284 #ifdef CONFIG_POSIX_TIMERS
2285 static void watchdog(struct rq *rq, struct task_struct *p)
2286 {
2287 unsigned long soft, hard;
2288
2289 /* max may change after cur was read; this will be fixed next tick */
2290 soft = task_rlimit(p, RLIMIT_RTTIME);
2291 hard = task_rlimit_max(p, RLIMIT_RTTIME);
2292
2293 if (soft != RLIM_INFINITY) {
2294 unsigned long next;
2295
2296 if (p->rt.watchdog_stamp != jiffies) {
2297 p->rt.timeout++;
2298 p->rt.watchdog_stamp = jiffies;
2299 }
2300
2301 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2302 if (p->rt.timeout > next)
2303 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
2304 }
2305 }
2306 #else
2307 static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2308 #endif
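/*
 * Illustrative (not part of this file): the timeout accounted above is
 * driven by RLIMIT_RTTIME, which userspace sets in microseconds of CPU
 * time consumed without blocking. A sketch of how a task might opt in:
 *
 *	#include <sys/resource.h>
 *
 *	struct rlimit rl = {
 *		.rlim_cur = 5000,	// soft limit: 5 ms -> SIGXCPU
 *		.rlim_max = 10000,	// hard limit: 10 ms -> SIGKILL
 *	};
 *	setrlimit(RLIMIT_RTTIME, &rl);
 */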
2309
2310 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2311 {
2312 struct sched_rt_entity *rt_se = &p->rt;
2313
2314 update_curr_rt(rq);
2315 update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
2316
2317 watchdog(rq, p);
2318
2319 /*
2320 * RR tasks need a special form of timeslice management.
2321 * FIFO tasks have no timeslices.
2322 */
2323 if (p->policy != SCHED_RR)
2324 return;
2325
2326 if (--p->rt.time_slice)
2327 return;
2328
2329 p->rt.time_slice = sched_rr_timeslice;
2330
2331 /*
2332 * Requeue to the end of the queue if we (and all of our ancestors) are not
2333 * the only element on the queue.
2334 */
2335 for_each_sched_rt_entity(rt_se) {
2336 if (rt_se->run_list.prev != rt_se->run_list.next) {
2337 requeue_task_rt(rq, p, 0);
2338 resched_curr(rq);
2339 return;
2340 }
2341 }
2342 }
2343
2344 static void set_curr_task_rt(struct rq *rq)
2345 {
2346 struct task_struct *p = rq->curr;
2347
2348 p->se.exec_start = rq_clock_task(rq);
2349
2350 /* The running task is never eligible for pushing */
2351 dequeue_pushable_task(rq, p);
2352 }
2353
2354 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2355 {
2356 /*
2357 * Time slice is 0 for SCHED_FIFO tasks
2358 */
2359 if (task->policy == SCHED_RR)
2360 return sched_rr_timeslice;
2361 else
2362 return 0;
2363 }
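/*
 * Illustrative (not part of this file): this hook backs the
 * sched_rr_get_interval(2) system call, e.g.:
 *
 *	#include <sched.h>
 *	#include <time.h>
 *
 *	struct timespec ts;
 *	sched_rr_get_interval(0, &ts);
 *	// ts is typically ~100 ms (the default RR_TIMESLICE) for SCHED_RR
 *	// tasks, and zero for SCHED_FIFO tasks.
 */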
2364
2365 const struct sched_class rt_sched_class = {
2366 .next = &fair_sched_class,
2367 .enqueue_task = enqueue_task_rt,
2368 .dequeue_task = dequeue_task_rt,
2369 .yield_task = yield_task_rt,
2370
2371 .check_preempt_curr = check_preempt_curr_rt,
2372
2373 .pick_next_task = pick_next_task_rt,
2374 .put_prev_task = put_prev_task_rt,
2375
2376 #ifdef CONFIG_SMP
2377 .select_task_rq = select_task_rq_rt,
2378
2379 .set_cpus_allowed = set_cpus_allowed_common,
2380 .rq_online = rq_online_rt,
2381 .rq_offline = rq_offline_rt,
2382 .task_woken = task_woken_rt,
2383 .switched_from = switched_from_rt,
2384 #endif
2385
2386 .set_curr_task = set_curr_task_rt,
2387 .task_tick = task_tick_rt,
2388
2389 .get_rr_interval = get_rr_interval_rt,
2390
2391 .prio_changed = prio_changed_rt,
2392 .switched_to = switched_to_rt,
2393
2394 .update_curr = update_curr_rt,
2395 };
2396
2397 #ifdef CONFIG_RT_GROUP_SCHED
2398 /*
2399 * Ensure that the real-time constraints are schedulable.
2400 */
2401 static DEFINE_MUTEX(rt_constraints_mutex);
2402
2403 /* Must be called with tasklist_lock held */
2404 static inline int tg_has_rt_tasks(struct task_group *tg)
2405 {
2406 struct task_struct *g, *p;
2407
2408 /*
2409 * Autogroups do not have RT tasks; see autogroup_create().
2410 */
2411 if (task_group_is_autogroup(tg))
2412 return 0;
2413
2414 for_each_process_thread(g, p) {
2415 if (rt_task(p) && task_group(p) == tg)
2416 return 1;
2417 }
2418
2419 return 0;
2420 }
2421
2422 struct rt_schedulable_data {
2423 struct task_group *tg;
2424 u64 rt_period;
2425 u64 rt_runtime;
2426 };
2427
2428 static int tg_rt_schedulable(struct task_group *tg, void *data)
2429 {
2430 struct rt_schedulable_data *d = data;
2431 struct task_group *child;
2432 unsigned long total, sum = 0;
2433 u64 period, runtime;
2434
2435 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2436 runtime = tg->rt_bandwidth.rt_runtime;
2437
2438 if (tg == d->tg) {
2439 period = d->rt_period;
2440 runtime = d->rt_runtime;
2441 }
2442
2443 /*
2444 * Cannot have more runtime than the period.
2445 */
2446 if (runtime > period && runtime != RUNTIME_INF)
2447 return -EINVAL;
2448
2449 /*
2450 * Ensure we don't starve existing RT tasks.
2451 */
2452 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
2453 return -EBUSY;
2454
2455 total = to_ratio(period, runtime);
2456
2457 /*
2458 * Nobody can have more than the global setting allows.
2459 */
2460 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2461 return -EINVAL;
2462
2463 /*
2464 * The sum of our children's runtime should not exceed our own.
2465 */
2466 list_for_each_entry_rcu(child, &tg->children, siblings) {
2467 period = ktime_to_ns(child->rt_bandwidth.rt_period);
2468 runtime = child->rt_bandwidth.rt_runtime;
2469
2470 if (child == d->tg) {
2471 period = d->rt_period;
2472 runtime = d->rt_runtime;
2473 }
2474
2475 sum += to_ratio(period, runtime);
2476 }
2477
2478 if (sum > total)
2479 return -EINVAL;
2480
2481 return 0;
2482 }
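/*
 * Worked example (assuming the default global limits of
 * sched_rt_period_us = 1000000 and sched_rt_runtime_us = 950000):
 * to_ratio() expresses runtime/period as a fixed-point fraction, so the
 * global cap is ~0.95. A group requesting runtime = 600000 with
 * period = 1000000 yields 0.6 <= 0.95 and is accepted, but two sibling
 * groups at 0.6 each would sum to 1.2 at their parent and fail with
 * -EINVAL in the sum check above.
 */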
2483
2484 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2485 {
2486 int ret;
2487
2488 struct rt_schedulable_data data = {
2489 .tg = tg,
2490 .rt_period = period,
2491 .rt_runtime = runtime,
2492 };
2493
2494 rcu_read_lock();
2495 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2496 rcu_read_unlock();
2497
2498 return ret;
2499 }
2500
2501 static int tg_set_rt_bandwidth(struct task_group *tg,
2502 u64 rt_period, u64 rt_runtime)
2503 {
2504 int i, err = 0;
2505
2506 /*
2507 * Disallowing RT runtime for the root group is BAD; it would prevent the
2508 * kernel from creating (and/or operating) RT threads.
2509 */
2510 if (tg == &root_task_group && rt_runtime == 0)
2511 return -EINVAL;
2512
2513 /* A zero period doesn't make any sense. */
2514 if (rt_period == 0)
2515 return -EINVAL;
2516
2517 mutex_lock(&rt_constraints_mutex);
2518 read_lock(&tasklist_lock);
2519 err = __rt_schedulable(tg, rt_period, rt_runtime);
2520 if (err)
2521 goto unlock;
2522
2523 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2524 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2525 tg->rt_bandwidth.rt_runtime = rt_runtime;
2526
2527 for_each_possible_cpu(i) {
2528 struct rt_rq *rt_rq = tg->rt_rq[i];
2529
2530 raw_spin_lock(&rt_rq->rt_runtime_lock);
2531 rt_rq->rt_runtime = rt_runtime;
2532 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2533 }
2534 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2535 unlock:
2536 read_unlock(&tasklist_lock);
2537 mutex_unlock(&rt_constraints_mutex);
2538
2539 return err;
2540 }
2541
2542 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2543 {
2544 u64 rt_runtime, rt_period;
2545
2546 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2547 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2548 if (rt_runtime_us < 0)
2549 rt_runtime = RUNTIME_INF;
2550
2551 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2552 }
2553
2554 long sched_group_rt_runtime(struct task_group *tg)
2555 {
2556 u64 rt_runtime_us;
2557
2558 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2559 return -1;
2560
2561 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2562 do_div(rt_runtime_us, NSEC_PER_USEC);
2563 return rt_runtime_us;
2564 }
2565
2566 int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2567 {
2568 u64 rt_runtime, rt_period;
2569
2570 rt_period = rt_period_us * NSEC_PER_USEC;
2571 rt_runtime = tg->rt_bandwidth.rt_runtime;
2572
2573 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2574 }
2575
2576 long sched_group_rt_period(struct task_group *tg)
2577 {
2578 u64 rt_period_us;
2579
2580 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2581 do_div(rt_period_us, NSEC_PER_USEC);
2582 return rt_period_us;
2583 }
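/*
 * Illustrative (cgroup v1 "cpu" controller): the setters above back the
 * cpu.rt_runtime_us and cpu.rt_period_us files. Writing 500000 to a
 * group's cpu.rt_runtime_us (with the default 1000000 period) allows its
 * RT tasks up to 0.5 s of CPU per 1 s period; non-root groups start at
 * zero runtime and must be given some before they can run RT tasks.
 */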
2584
2585 static int sched_rt_global_constraints(void)
2586 {
2587 int ret = 0;
2588
2589 mutex_lock(&rt_constraints_mutex);
2590 read_lock(&tasklist_lock);
2591 ret = __rt_schedulable(NULL, 0, 0);
2592 read_unlock(&tasklist_lock);
2593 mutex_unlock(&rt_constraints_mutex);
2594
2595 return ret;
2596 }
2597
2598 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2599 {
2600 /* Don't accept real-time tasks when there is no way for them to run */
2601 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2602 return 0;
2603
2604 return 1;
2605 }
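/*
 * Illustrative: this is why moving a SCHED_FIFO/SCHED_RR task into a
 * cgroup whose cpu.rt_runtime_us is still 0 is rejected by the cgroup
 * core (cpu_cgroup_can_attach() returns -EINVAL in that case).
 */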
2606
2607 #else /* !CONFIG_RT_GROUP_SCHED */
2608 static int sched_rt_global_constraints(void)
2609 {
2610 unsigned long flags;
2611 int i;
2612
2613 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2614 for_each_possible_cpu(i) {
2615 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2616
2617 raw_spin_lock(&rt_rq->rt_runtime_lock);
2618 rt_rq->rt_runtime = global_rt_runtime();
2619 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2620 }
2621 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2622
2623 return 0;
2624 }
2625 #endif /* CONFIG_RT_GROUP_SCHED */
2626
2627 static int sched_rt_global_validate(void)
2628 {
2629 if (sysctl_sched_rt_period <= 0)
2630 return -EINVAL;
2631
2632 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2633 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
2634 return -EINVAL;
2635
2636 return 0;
2637 }
2638
2639 static void sched_rt_do_global(void)
2640 {
2641 def_rt_bandwidth.rt_runtime = global_rt_runtime();
2642 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2643 }
2644
2645 int sched_rt_handler(struct ctl_table *table, int write,
2646 void __user *buffer, size_t *lenp,
2647 loff_t *ppos)
2648 {
2649 int old_period, old_runtime;
2650 static DEFINE_MUTEX(mutex);
2651 int ret;
2652
2653 mutex_lock(&mutex);
2654 old_period = sysctl_sched_rt_period;
2655 old_runtime = sysctl_sched_rt_runtime;
2656
2657 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2658
2659 if (!ret && write) {
2660 ret = sched_rt_global_validate();
2661 if (ret)
2662 goto undo;
2663
2664 ret = sched_dl_global_validate();
2665 if (ret)
2666 goto undo;
2667
2668 ret = sched_rt_global_constraints();
2669 if (ret)
2670 goto undo;
2671
2672 sched_rt_do_global();
2673 sched_dl_do_global();
2674 }
2675 if (0) {
2676 undo:
2677 sysctl_sched_rt_period = old_period;
2678 sysctl_sched_rt_runtime = old_runtime;
2679 }
2680 mutex_unlock(&mutex);
2681
2682 return ret;
2683 }
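/*
 * Illustrative: this handler sits behind /proc/sys/kernel/sched_rt_period_us
 * and /proc/sys/kernel/sched_rt_runtime_us. With the defaults (1000000 and
 * 950000) RT tasks may consume at most 95% of each period; writing -1 to
 * sched_rt_runtime_us disables the throttling (RUNTIME_INF).
 */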
2684
2685 int sched_rr_handler(struct ctl_table *table, int write,
2686 void __user *buffer, size_t *lenp,
2687 loff_t *ppos)
2688 {
2689 int ret;
2690 static DEFINE_MUTEX(mutex);
2691
2692 mutex_lock(&mutex);
2693 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2694 /*
2695 * Make sure that internally we keep jiffies.
2696 * Also, writing zero resets the timeslice to default:
2697 */
2698 if (!ret && write) {
2699 sched_rr_timeslice =
2700 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2701 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2702 }
2703 mutex_unlock(&mutex);
2704 return ret;
2705 }
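/*
 * Illustrative: this handler sits behind
 * /proc/sys/kernel/sched_rr_timeslice_ms. Writing e.g. 25 sets a 25 ms
 * SCHED_RR quantum (stored internally in jiffies); writing zero or a
 * negative value restores the default RR_TIMESLICE, as noted above.
 */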
2706
2707 #ifdef CONFIG_SCHED_DEBUG
2708 extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq);
2709
2710 void print_rt_stats(struct seq_file *m, int cpu)
2711 {
2712 rt_rq_iter_t iter;
2713 struct rt_rq *rt_rq;
2714
2715 rcu_read_lock();
2716 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2717 print_rt_rq(m, cpu, rt_rq);
2718 rcu_read_unlock();
2719 }
2720 #endif /* CONFIG_SCHED_DEBUG */