kernel/sched/rt.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 * policies)
5 */
6
7 #include "sched.h"
8
9 #include <linux/slab.h>
10 #include <linux/irq_work.h>
11 #include "tune.h"
12
13 #include "walt.h"
14
15 int sched_rr_timeslice = RR_TIMESLICE;
16 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
17
18 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
19
20 struct rt_bandwidth def_rt_bandwidth;
21
22 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
23 {
24 struct rt_bandwidth *rt_b =
25 container_of(timer, struct rt_bandwidth, rt_period_timer);
26 int idle = 0;
27 int overrun;
28
29 raw_spin_lock(&rt_b->rt_runtime_lock);
30 for (;;) {
31 overrun = hrtimer_forward_now(timer, rt_b->rt_period);
32 if (!overrun)
33 break;
34
35 raw_spin_unlock(&rt_b->rt_runtime_lock);
36 idle = do_sched_rt_period_timer(rt_b, overrun);
37 raw_spin_lock(&rt_b->rt_runtime_lock);
38 }
39 if (idle)
40 rt_b->rt_period_active = 0;
41 raw_spin_unlock(&rt_b->rt_runtime_lock);
42
43 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
44 }
45
46 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
47 {
48 rt_b->rt_period = ns_to_ktime(period);
49 rt_b->rt_runtime = runtime;
50
51 raw_spin_lock_init(&rt_b->rt_runtime_lock);
52
53 hrtimer_init(&rt_b->rt_period_timer,
54 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
55 rt_b->rt_period_timer.function = sched_rt_period_timer;
56 }
57
58 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
59 {
60 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
61 return;
62
63 raw_spin_lock(&rt_b->rt_runtime_lock);
64 if (!rt_b->rt_period_active) {
65 rt_b->rt_period_active = 1;
66 /*
67 * SCHED_DEADLINE updates the bandwidth, as a runaway
68 * RT task with a DL task could hog a CPU. But DL does
69 * not reset the period. If a deadline task was running
70 * without an RT task running, it can cause RT tasks to
71 * throttle when they start up. Kick the timer right away
72 * to update the period.
73 */
74 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
75 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
76 }
77 raw_spin_unlock(&rt_b->rt_runtime_lock);
78 }
79
80 void init_rt_rq(struct rt_rq *rt_rq)
81 {
82 struct rt_prio_array *array;
83 int i;
84
85 array = &rt_rq->active;
86 for (i = 0; i < MAX_RT_PRIO; i++) {
87 INIT_LIST_HEAD(array->queue + i);
88 __clear_bit(i, array->bitmap);
89 }
90 /* delimiter for bitsearch: */
91 __set_bit(MAX_RT_PRIO, array->bitmap);
92
93 #if defined CONFIG_SMP
94 rt_rq->highest_prio.curr = MAX_RT_PRIO;
95 rt_rq->highest_prio.next = MAX_RT_PRIO;
96 rt_rq->rt_nr_migratory = 0;
97 rt_rq->overloaded = 0;
98 plist_head_init(&rt_rq->pushable_tasks);
99 #endif /* CONFIG_SMP */
100 /* We start in dequeued state, because no RT tasks are queued */
101 rt_rq->rt_queued = 0;
102
103 rt_rq->rt_time = 0;
104 rt_rq->rt_throttled = 0;
105 rt_rq->rt_runtime = 0;
106 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
107 }
108
109 #ifdef CONFIG_RT_GROUP_SCHED
110 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
111 {
112 hrtimer_cancel(&rt_b->rt_period_timer);
113 }
114
115 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
116
117 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
118 {
119 #ifdef CONFIG_SCHED_DEBUG
120 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
121 #endif
122 return container_of(rt_se, struct task_struct, rt);
123 }
124
125 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
126 {
127 return rt_rq->rq;
128 }
129
130 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
131 {
132 return rt_se->rt_rq;
133 }
134
135 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
136 {
137 struct rt_rq *rt_rq = rt_se->rt_rq;
138
139 return rt_rq->rq;
140 }
141
142 void free_rt_sched_group(struct task_group *tg)
143 {
144 int i;
145
146 if (tg->rt_se)
147 destroy_rt_bandwidth(&tg->rt_bandwidth);
148
149 for_each_possible_cpu(i) {
150 if (tg->rt_rq)
151 kfree(tg->rt_rq[i]);
152 if (tg->rt_se)
153 kfree(tg->rt_se[i]);
154 }
155
156 kfree(tg->rt_rq);
157 kfree(tg->rt_se);
158 }
159
160 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
161 struct sched_rt_entity *rt_se, int cpu,
162 struct sched_rt_entity *parent)
163 {
164 struct rq *rq = cpu_rq(cpu);
165
166 rt_rq->highest_prio.curr = MAX_RT_PRIO;
167 rt_rq->rt_nr_boosted = 0;
168 rt_rq->rq = rq;
169 rt_rq->tg = tg;
170
171 tg->rt_rq[cpu] = rt_rq;
172 tg->rt_se[cpu] = rt_se;
173
174 if (!rt_se)
175 return;
176
177 if (!parent)
178 rt_se->rt_rq = &rq->rt;
179 else
180 rt_se->rt_rq = parent->my_q;
181
182 rt_se->my_q = rt_rq;
183 rt_se->parent = parent;
184 INIT_LIST_HEAD(&rt_se->run_list);
185 }
186
187 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
188 {
189 struct rt_rq *rt_rq;
190 struct sched_rt_entity *rt_se;
191 int i;
192
193 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
194 if (!tg->rt_rq)
195 goto err;
196 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
197 if (!tg->rt_se)
198 goto err;
199
200 init_rt_bandwidth(&tg->rt_bandwidth,
201 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
202
203 for_each_possible_cpu(i) {
204 rt_rq = kzalloc_node(sizeof(struct rt_rq),
205 GFP_KERNEL, cpu_to_node(i));
206 if (!rt_rq)
207 goto err;
208
209 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
210 GFP_KERNEL, cpu_to_node(i));
211 if (!rt_se)
212 goto err_free_rq;
213
214 init_rt_rq(rt_rq);
215 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
216 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
217 }
218
219 return 1;
220
221 err_free_rq:
222 kfree(rt_rq);
223 err:
224 return 0;
225 }
226
227 #else /* CONFIG_RT_GROUP_SCHED */
228
229 #define rt_entity_is_task(rt_se) (1)
230
231 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
232 {
233 return container_of(rt_se, struct task_struct, rt);
234 }
235
236 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
237 {
238 return container_of(rt_rq, struct rq, rt);
239 }
240
241 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
242 {
243 struct task_struct *p = rt_task_of(rt_se);
244
245 return task_rq(p);
246 }
247
248 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
249 {
250 struct rq *rq = rq_of_rt_se(rt_se);
251
252 return &rq->rt;
253 }
254
255 void free_rt_sched_group(struct task_group *tg) { }
256
257 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
258 {
259 return 1;
260 }
261 #endif /* CONFIG_RT_GROUP_SCHED */
262
263 #ifdef CONFIG_SMP
264
265 static void pull_rt_task(struct rq *this_rq);
266
267 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
268 {
269 /* Try to pull RT tasks here if we lower this rq's prio */
270 return rq->rt.highest_prio.curr > prev->prio;
271 }
272
273 static inline int rt_overloaded(struct rq *rq)
274 {
275 return atomic_read(&rq->rd->rto_count);
276 }
277
278 static inline void rt_set_overload(struct rq *rq)
279 {
280 if (!rq->online)
281 return;
282
283 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
284 /*
285 * Make sure the mask is visible before we set
286 * the overload count. That is checked to determine
287 * if we should look at the mask. It would be a shame
288 * if we looked at the mask, but the mask was not
289 * updated yet.
290 *
291 * Matched by the barrier in pull_rt_task().
292 */
293 smp_wmb();
294 atomic_inc(&rq->rd->rto_count);
295 }
296
297 static inline void rt_clear_overload(struct rq *rq)
298 {
299 if (!rq->online)
300 return;
301
302 /* the order here really doesn't matter */
303 atomic_dec(&rq->rd->rto_count);
304 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
305 }
306
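/*
 * An rq is "RT overloaded" when it has more than one queued RT task and
 * at least one of them can migrate.  Set or clear the root domain's
 * overload state for this rq accordingly, so other CPUs know whether
 * there is anything worth pulling from here.
 */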
307 static void update_rt_migration(struct rt_rq *rt_rq)
308 {
309 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
310 if (!rt_rq->overloaded) {
311 rt_set_overload(rq_of_rt_rq(rt_rq));
312 rt_rq->overloaded = 1;
313 }
314 } else if (rt_rq->overloaded) {
315 rt_clear_overload(rq_of_rt_rq(rt_rq));
316 rt_rq->overloaded = 0;
317 }
318 }
319
320 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
321 {
322 struct task_struct *p;
323
324 if (!rt_entity_is_task(rt_se))
325 return;
326
327 p = rt_task_of(rt_se);
328 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
329
330 rt_rq->rt_nr_total++;
331 if (p->nr_cpus_allowed > 1)
332 rt_rq->rt_nr_migratory++;
333
334 update_rt_migration(rt_rq);
335 }
336
337 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
338 {
339 struct task_struct *p;
340
341 if (!rt_entity_is_task(rt_se))
342 return;
343
344 p = rt_task_of(rt_se);
345 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
346
347 rt_rq->rt_nr_total--;
348 if (p->nr_cpus_allowed > 1)
349 rt_rq->rt_nr_migratory--;
350
351 update_rt_migration(rt_rq);
352 }
353
354 static inline int has_pushable_tasks(struct rq *rq)
355 {
356 return !plist_head_empty(&rq->rt.pushable_tasks);
357 }
358
359 static DEFINE_PER_CPU(struct callback_head, rt_push_head);
360 static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
361
362 static void push_rt_tasks(struct rq *);
363 static void pull_rt_task(struct rq *);
364
365 static inline void queue_push_tasks(struct rq *rq)
366 {
367 if (!has_pushable_tasks(rq))
368 return;
369
370 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
371 }
372
373 static inline void queue_pull_task(struct rq *rq)
374 {
375 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
376 }
377
378 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
379 {
380 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
381 plist_node_init(&p->pushable_tasks, p->prio);
382 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
383
384 /* Update the highest prio pushable task */
385 if (p->prio < rq->rt.highest_prio.next)
386 rq->rt.highest_prio.next = p->prio;
387 }
388
389 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
390 {
391 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
392
393 /* Update the new highest prio pushable task */
394 if (has_pushable_tasks(rq)) {
395 p = plist_first_entry(&rq->rt.pushable_tasks,
396 struct task_struct, pushable_tasks);
397 rq->rt.highest_prio.next = p->prio;
398 } else
399 rq->rt.highest_prio.next = MAX_RT_PRIO;
400 }
401
402 #else
403
404 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
405 {
406 }
407
408 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
409 {
410 }
411
412 static inline
413 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
414 {
415 }
416
417 static inline
418 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
419 {
420 }
421
422 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
423 {
424 return false;
425 }
426
427 static inline void pull_rt_task(struct rq *this_rq)
428 {
429 }
430
431 static inline void queue_push_tasks(struct rq *rq)
432 {
433 }
434 #endif /* CONFIG_SMP */
435
436 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
437 static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
438
439 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
440 {
441 return rt_se->on_rq;
442 }
443
444 #ifdef CONFIG_RT_GROUP_SCHED
445
446 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
447 {
448 if (!rt_rq->tg)
449 return RUNTIME_INF;
450
451 return rt_rq->rt_runtime;
452 }
453
454 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
455 {
456 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
457 }
458
459 typedef struct task_group *rt_rq_iter_t;
460
461 static inline struct task_group *next_task_group(struct task_group *tg)
462 {
463 do {
464 tg = list_entry_rcu(tg->list.next,
465 typeof(struct task_group), list);
466 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
467
468 if (&tg->list == &task_groups)
469 tg = NULL;
470
471 return tg;
472 }
473
474 #define for_each_rt_rq(rt_rq, iter, rq) \
475 for (iter = container_of(&task_groups, typeof(*iter), list); \
476 (iter = next_task_group(iter)) && \
477 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
478
479 #define for_each_sched_rt_entity(rt_se) \
480 for (; rt_se; rt_se = rt_se->parent)
481
482 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
483 {
484 return rt_se->my_q;
485 }
486
487 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
488 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
489
490 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
491 {
492 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
493 struct rq *rq = rq_of_rt_rq(rt_rq);
494 struct sched_rt_entity *rt_se;
495
496 int cpu = cpu_of(rq);
497
498 rt_se = rt_rq->tg->rt_se[cpu];
499
500 if (rt_rq->rt_nr_running) {
501 if (!rt_se)
502 enqueue_top_rt_rq(rt_rq);
503 else if (!on_rt_rq(rt_se))
504 enqueue_rt_entity(rt_se, 0);
505
506 if (rt_rq->highest_prio.curr < curr->prio)
507 resched_curr(rq);
508 }
509 }
510
511 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
512 {
513 struct sched_rt_entity *rt_se;
514 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
515
516 rt_se = rt_rq->tg->rt_se[cpu];
517
518 if (!rt_se)
519 dequeue_top_rt_rq(rt_rq);
520 else if (on_rt_rq(rt_se))
521 dequeue_rt_entity(rt_se, 0);
522 }
523
524 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
525 {
526 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
527 }
528
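/*
 * An entity is "boosted" when priority inheritance has raised its priority
 * above its normal priority; for a group entity this means any of its
 * members is boosted.
 */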
529 static int rt_se_boosted(struct sched_rt_entity *rt_se)
530 {
531 struct rt_rq *rt_rq = group_rt_rq(rt_se);
532 struct task_struct *p;
533
534 if (rt_rq)
535 return !!rt_rq->rt_nr_boosted;
536
537 p = rt_task_of(rt_se);
538 return p->prio != p->normal_prio;
539 }
540
541 #ifdef CONFIG_SMP
542 static inline const struct cpumask *sched_rt_period_mask(void)
543 {
544 return this_rq()->rd->span;
545 }
546 #else
547 static inline const struct cpumask *sched_rt_period_mask(void)
548 {
549 return cpu_online_mask;
550 }
551 #endif
552
553 static inline
554 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
555 {
556 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
557 }
558
559 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
560 {
561 return &rt_rq->tg->rt_bandwidth;
562 }
563
564 #else /* !CONFIG_RT_GROUP_SCHED */
565
566 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
567 {
568 return rt_rq->rt_runtime;
569 }
570
571 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
572 {
573 return ktime_to_ns(def_rt_bandwidth.rt_period);
574 }
575
576 typedef struct rt_rq *rt_rq_iter_t;
577
578 #define for_each_rt_rq(rt_rq, iter, rq) \
579 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
580
581 #define for_each_sched_rt_entity(rt_se) \
582 for (; rt_se; rt_se = NULL)
583
584 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
585 {
586 return NULL;
587 }
588
589 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
590 {
591 struct rq *rq = rq_of_rt_rq(rt_rq);
592
593 if (!rt_rq->rt_nr_running)
594 return;
595
596 enqueue_top_rt_rq(rt_rq);
597 resched_curr(rq);
598 }
599
600 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
601 {
602 dequeue_top_rt_rq(rt_rq);
603 }
604
605 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
606 {
607 return rt_rq->rt_throttled;
608 }
609
610 static inline const struct cpumask *sched_rt_period_mask(void)
611 {
612 return cpu_online_mask;
613 }
614
615 static inline
616 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
617 {
618 return &cpu_rq(cpu)->rt;
619 }
620
621 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
622 {
623 return &def_rt_bandwidth;
624 }
625
626 #endif /* CONFIG_RT_GROUP_SCHED */
627
628 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
629 {
630 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
631
632 return (hrtimer_active(&rt_b->rt_period_timer) ||
633 rt_rq->rt_time < rt_b->rt_runtime);
634 }
635
636 #ifdef CONFIG_SMP
637 /*
638 * We ran out of runtime, see if we can borrow some from our neighbours.
639 */
640 static void do_balance_runtime(struct rt_rq *rt_rq)
641 {
642 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
643 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
644 int i, weight;
645 u64 rt_period;
646
647 weight = cpumask_weight(rd->span);
648
649 raw_spin_lock(&rt_b->rt_runtime_lock);
650 rt_period = ktime_to_ns(rt_b->rt_period);
651 for_each_cpu(i, rd->span) {
652 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
653 s64 diff;
654
655 if (iter == rt_rq)
656 continue;
657
658 raw_spin_lock(&iter->rt_runtime_lock);
659 /*
660 * Either all rqs have inf runtime and there's nothing to steal
661 * or __disable_runtime() below sets a specific rq to inf to
662 * indicate its been disabled and disalow stealing.
663 */
664 if (iter->rt_runtime == RUNTIME_INF)
665 goto next;
666
667 /*
668 * From runqueues with spare time, take 1/n part of their
669 * spare time, but no more than our period.
670 */
671 diff = iter->rt_runtime - iter->rt_time;
672 if (diff > 0) {
673 diff = div_u64((u64)diff, weight);
674 if (rt_rq->rt_runtime + diff > rt_period)
675 diff = rt_period - rt_rq->rt_runtime;
676 iter->rt_runtime -= diff;
677 rt_rq->rt_runtime += diff;
678 if (rt_rq->rt_runtime == rt_period) {
679 raw_spin_unlock(&iter->rt_runtime_lock);
680 break;
681 }
682 }
683 next:
684 raw_spin_unlock(&iter->rt_runtime_lock);
685 }
686 raw_spin_unlock(&rt_b->rt_runtime_lock);
687 }
688
689 /*
690 * Ensure this RQ takes back all the runtime it lent to its neighbours.
691 */
692 static void __disable_runtime(struct rq *rq)
693 {
694 struct root_domain *rd = rq->rd;
695 rt_rq_iter_t iter;
696 struct rt_rq *rt_rq;
697
698 if (unlikely(!scheduler_running))
699 return;
700
701 for_each_rt_rq(rt_rq, iter, rq) {
702 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
703 s64 want;
704 int i;
705
706 raw_spin_lock(&rt_b->rt_runtime_lock);
707 raw_spin_lock(&rt_rq->rt_runtime_lock);
708 /*
709 * Either we're all inf and nobody needs to borrow, or we're
710 * already disabled and thus have nothing to do, or we have
711 * exactly the right amount of runtime to take out.
712 */
713 if (rt_rq->rt_runtime == RUNTIME_INF ||
714 rt_rq->rt_runtime == rt_b->rt_runtime)
715 goto balanced;
716 raw_spin_unlock(&rt_rq->rt_runtime_lock);
717
718 /*
719 * Calculate the difference between what we started out with
720 * and what we currently have; that's the amount of runtime
721 * we lent out and now have to reclaim.
722 */
723 want = rt_b->rt_runtime - rt_rq->rt_runtime;
724
725 /*
726 * Greedy reclaim, take back as much as we can.
727 */
728 for_each_cpu(i, rd->span) {
729 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
730 s64 diff;
731
732 /*
733 * Can't reclaim from ourselves or disabled runqueues.
734 */
735 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
736 continue;
737
738 raw_spin_lock(&iter->rt_runtime_lock);
739 if (want > 0) {
740 diff = min_t(s64, iter->rt_runtime, want);
741 iter->rt_runtime -= diff;
742 want -= diff;
743 } else {
744 iter->rt_runtime -= want;
745 want -= want;
746 }
747 raw_spin_unlock(&iter->rt_runtime_lock);
748
749 if (!want)
750 break;
751 }
752
753 raw_spin_lock(&rt_rq->rt_runtime_lock);
754 /*
755 * We cannot be left wanting - that would mean some runtime
756 * leaked out of the system.
757 */
758 BUG_ON(want);
759 balanced:
760 /*
761 * Disable all the borrow logic by pretending we have inf
762 * runtime - in which case borrowing doesn't make sense.
763 */
764 rt_rq->rt_runtime = RUNTIME_INF;
765 rt_rq->rt_throttled = 0;
766 raw_spin_unlock(&rt_rq->rt_runtime_lock);
767 raw_spin_unlock(&rt_b->rt_runtime_lock);
768
769 /* Make rt_rq available for pick_next_task() */
770 sched_rt_rq_enqueue(rt_rq);
771 }
772 }
773
774 static void __enable_runtime(struct rq *rq)
775 {
776 rt_rq_iter_t iter;
777 struct rt_rq *rt_rq;
778
779 if (unlikely(!scheduler_running))
780 return;
781
782 /*
783 * Reset each runqueue's bandwidth settings
784 */
785 for_each_rt_rq(rt_rq, iter, rq) {
786 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
787
788 raw_spin_lock(&rt_b->rt_runtime_lock);
789 raw_spin_lock(&rt_rq->rt_runtime_lock);
790 rt_rq->rt_runtime = rt_b->rt_runtime;
791 rt_rq->rt_time = 0;
792 rt_rq->rt_throttled = 0;
793 raw_spin_unlock(&rt_rq->rt_runtime_lock);
794 raw_spin_unlock(&rt_b->rt_runtime_lock);
795 }
796 }
797
798 static void balance_runtime(struct rt_rq *rt_rq)
799 {
800 if (!sched_feat(RT_RUNTIME_SHARE))
801 return;
802
803 if (rt_rq->rt_time > rt_rq->rt_runtime) {
804 raw_spin_unlock(&rt_rq->rt_runtime_lock);
805 do_balance_runtime(rt_rq);
806 raw_spin_lock(&rt_rq->rt_runtime_lock);
807 }
808 }
809 #else /* !CONFIG_SMP */
810 static inline void balance_runtime(struct rt_rq *rt_rq) {}
811 #endif /* CONFIG_SMP */
812
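/*
 * Handler for the bandwidth period timer: on every CPU in the period mask,
 * pay back up to "overrun" periods worth of runtime from the accrued
 * rt_time and unthrottle runqueues that fit within their budget again.
 * Returns 1 when nothing is left to do, so the timer can stop.
 */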
813 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
814 {
815 int i, idle = 1, throttled = 0;
816 const struct cpumask *span;
817
818 span = sched_rt_period_mask();
819 #ifdef CONFIG_RT_GROUP_SCHED
820 /*
821 * FIXME: isolated CPUs should really leave the root task group,
822 * whether they are isolcpus or were isolated via cpusets, lest
823 * the timer run on a CPU which does not service all runqueues,
824 * potentially leaving other CPUs indefinitely throttled. If
825 * isolation is really required, the user will turn the throttle
826 * off to kill the perturbations it causes anyway. Meanwhile,
827 * this maintains functionality for boot and/or troubleshooting.
828 */
829 if (rt_b == &root_task_group.rt_bandwidth)
830 span = cpu_online_mask;
831 #endif
832 for_each_cpu(i, span) {
833 int enqueue = 0;
834 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
835 struct rq *rq = rq_of_rt_rq(rt_rq);
836 int skip;
837
838 /*
839 * When span == cpu_online_mask, taking each rq->lock
840 * can be time-consuming. Try to avoid it when possible.
841 */
842 raw_spin_lock(&rt_rq->rt_runtime_lock);
843 if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
844 rt_rq->rt_runtime = rt_b->rt_runtime;
845 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
846 raw_spin_unlock(&rt_rq->rt_runtime_lock);
847 if (skip)
848 continue;
849
850 raw_spin_lock(&rq->lock);
851 update_rq_clock(rq);
852
853 if (rt_rq->rt_time) {
854 u64 runtime;
855
856 raw_spin_lock(&rt_rq->rt_runtime_lock);
857 if (rt_rq->rt_throttled)
858 balance_runtime(rt_rq);
859 runtime = rt_rq->rt_runtime;
860 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
861 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
862 rt_rq->rt_throttled = 0;
863 enqueue = 1;
864
865 /*
866 * When we're idle and a woken (rt) task is
867 * throttled, check_preempt_curr() will set
868 * skip_update and the time between the wakeup
869 * and this unthrottle will get accounted as
870 * 'runtime'.
871 */
872 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
873 rq_clock_skip_update(rq, false);
874 }
875 if (rt_rq->rt_time || rt_rq->rt_nr_running)
876 idle = 0;
877 raw_spin_unlock(&rt_rq->rt_runtime_lock);
878 } else if (rt_rq->rt_nr_running) {
879 idle = 0;
880 if (!rt_rq_throttled(rt_rq))
881 enqueue = 1;
882 }
883 if (rt_rq->rt_throttled)
884 throttled = 1;
885
886 if (enqueue)
887 sched_rt_rq_enqueue(rt_rq);
888 raw_spin_unlock(&rq->lock);
889 }
890
891 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
892 return 1;
893
894 return idle;
895 }
896
897 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
898 {
899 #ifdef CONFIG_RT_GROUP_SCHED
900 struct rt_rq *rt_rq = group_rt_rq(rt_se);
901
902 if (rt_rq)
903 return rt_rq->highest_prio.curr;
904 #endif
905
906 return rt_task_of(rt_se)->prio;
907 }
908
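/*
 * Check whether this rt_rq has used up its runtime budget.  If so, mark it
 * throttled and dequeue it; returns 1 when the caller should reschedule.
 */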
909 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
910 {
911 u64 runtime = sched_rt_runtime(rt_rq);
912
913 if (rt_rq->rt_throttled)
914 return rt_rq_throttled(rt_rq);
915
916 if (runtime >= sched_rt_period(rt_rq))
917 return 0;
918
919 balance_runtime(rt_rq);
920 runtime = sched_rt_runtime(rt_rq);
921 if (runtime == RUNTIME_INF)
922 return 0;
923
924 if (rt_rq->rt_time > runtime) {
925 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
926
927 /*
928 * Don't actually throttle groups that have no runtime assigned
929 * but accrue some time due to boosting.
930 */
931 if (likely(rt_b->rt_runtime)) {
932 rt_rq->rt_throttled = 1;
933 printk_deferred_once("sched: RT throttling activated\n");
934 } else {
935 /*
936 * In case we did anyway, make it go away;
937 * replenishment is a joke, since it will replenish us
938 * with exactly 0 ns.
939 */
940 rt_rq->rt_time = 0;
941 }
942
943 if (rt_rq_throttled(rt_rq)) {
944 sched_rt_rq_dequeue(rt_rq);
945 return 1;
946 }
947 }
948
949 return 0;
950 }
951
952 /*
953 * Update the current task's runtime statistics. Skip current tasks that
954 * are not in our scheduling class.
955 */
956 static void update_curr_rt(struct rq *rq)
957 {
958 struct task_struct *curr = rq->curr;
959 struct sched_rt_entity *rt_se = &curr->rt;
960 u64 delta_exec;
961
962 if (curr->sched_class != &rt_sched_class)
963 return;
964
965 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
966 if (unlikely((s64)delta_exec <= 0))
967 return;
968
969 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
970 cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
971
972 schedstat_set(curr->se.statistics.exec_max,
973 max(curr->se.statistics.exec_max, delta_exec));
974
975 curr->se.sum_exec_runtime += delta_exec;
976 account_group_exec_runtime(curr, delta_exec);
977
978 curr->se.exec_start = rq_clock_task(rq);
979 cpuacct_charge(curr, delta_exec);
980
981 sched_rt_avg_update(rq, delta_exec);
982
983 if (!rt_bandwidth_enabled())
984 return;
985
986 for_each_sched_rt_entity(rt_se) {
987 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
988
989 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
990 raw_spin_lock(&rt_rq->rt_runtime_lock);
991 rt_rq->rt_time += delta_exec;
992 if (sched_rt_runtime_exceeded(rt_rq))
993 resched_curr(rq);
994 raw_spin_unlock(&rt_rq->rt_runtime_lock);
995 }
996 }
997 }
998
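/*
 * Remove the root rt_rq's tasks from this rq's nr_running accounting,
 * e.g. when the whole RT class gets throttled.
 */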
999 static void
1000 dequeue_top_rt_rq(struct rt_rq *rt_rq)
1001 {
1002 struct rq *rq = rq_of_rt_rq(rt_rq);
1003
1004 BUG_ON(&rq->rt != rt_rq);
1005
1006 if (!rt_rq->rt_queued)
1007 return;
1008
1009 BUG_ON(!rq->nr_running);
1010
1011 sub_nr_running(rq, rt_rq->rt_nr_running);
1012 rt_rq->rt_queued = 0;
1013 }
1014
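/*
 * Add the root rt_rq's tasks back into this rq's nr_running accounting,
 * provided it is not throttled and actually has runnable tasks.
 */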
1015 static void
1016 enqueue_top_rt_rq(struct rt_rq *rt_rq)
1017 {
1018 struct rq *rq = rq_of_rt_rq(rt_rq);
1019
1020 BUG_ON(&rq->rt != rt_rq);
1021
1022 if (rt_rq->rt_queued)
1023 return;
1024 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
1025 return;
1026
1027 add_nr_running(rq, rt_rq->rt_nr_running);
1028 rt_rq->rt_queued = 1;
1029 }
1030
1031 #if defined CONFIG_SMP
1032
1033 static void
1034 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1035 {
1036 struct rq *rq = rq_of_rt_rq(rt_rq);
1037
1038 #ifdef CONFIG_RT_GROUP_SCHED
1039 /*
1040 * Change rq's cpupri only if rt_rq is the top queue.
1041 */
1042 if (&rq->rt != rt_rq)
1043 return;
1044 #endif
1045 if (rq->online && prio < prev_prio)
1046 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1047 }
1048
1049 static void
1050 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1051 {
1052 struct rq *rq = rq_of_rt_rq(rt_rq);
1053
1054 #ifdef CONFIG_RT_GROUP_SCHED
1055 /*
1056 * Change rq's cpupri only if rt_rq is the top queue.
1057 */
1058 if (&rq->rt != rt_rq)
1059 return;
1060 #endif
1061 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1062 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1063 }
1064
1065 #else /* CONFIG_SMP */
1066
1067 static inline
1068 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1069 static inline
1070 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1071
1072 #endif /* CONFIG_SMP */
1073
1074 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1075 static void
1076 inc_rt_prio(struct rt_rq *rt_rq, int prio)
1077 {
1078 int prev_prio = rt_rq->highest_prio.curr;
1079
1080 if (prio < prev_prio)
1081 rt_rq->highest_prio.curr = prio;
1082
1083 inc_rt_prio_smp(rt_rq, prio, prev_prio);
1084 }
1085
1086 static void
1087 dec_rt_prio(struct rt_rq *rt_rq, int prio)
1088 {
1089 int prev_prio = rt_rq->highest_prio.curr;
1090
1091 if (rt_rq->rt_nr_running) {
1092
1093 WARN_ON(prio < prev_prio);
1094
1095 /*
1096 * This may have been our highest task, and therefore
1097 * we may have some recomputation to do
1098 */
1099 if (prio == prev_prio) {
1100 struct rt_prio_array *array = &rt_rq->active;
1101
1102 rt_rq->highest_prio.curr =
1103 sched_find_first_bit(array->bitmap);
1104 }
1105
1106 } else
1107 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1108
1109 dec_rt_prio_smp(rt_rq, prio, prev_prio);
1110 }
1111
1112 #else
1113
1114 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1115 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1116
1117 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1118
1119 #ifdef CONFIG_RT_GROUP_SCHED
1120
1121 static void
1122 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1123 {
1124 if (rt_se_boosted(rt_se))
1125 rt_rq->rt_nr_boosted++;
1126
1127 if (rt_rq->tg)
1128 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1129 }
1130
1131 static void
1132 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1133 {
1134 if (rt_se_boosted(rt_se))
1135 rt_rq->rt_nr_boosted--;
1136
1137 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1138 }
1139
1140 #else /* CONFIG_RT_GROUP_SCHED */
1141
1142 static void
1143 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1144 {
1145 start_rt_bandwidth(&def_rt_bandwidth);
1146 }
1147
1148 static inline
1149 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1150
1151 #endif /* CONFIG_RT_GROUP_SCHED */
1152
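/*
 * Number of runnable tasks an entity contributes: the whole group's count
 * for a group entity, or 1 for a plain task.
 */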
1153 static inline
1154 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1155 {
1156 struct rt_rq *group_rq = group_rt_rq(rt_se);
1157
1158 if (group_rq)
1159 return group_rq->rt_nr_running;
1160 else
1161 return 1;
1162 }
1163
1164 static inline
1165 unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1166 {
1167 struct rt_rq *group_rq = group_rt_rq(rt_se);
1168 struct task_struct *tsk;
1169
1170 if (group_rq)
1171 return group_rq->rr_nr_running;
1172
1173 tsk = rt_task_of(rt_se);
1174
1175 return (tsk->policy == SCHED_RR) ? 1 : 0;
1176 }
1177
1178 static inline
1179 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1180 {
1181 int prio = rt_se_prio(rt_se);
1182
1183 WARN_ON(!rt_prio(prio));
1184 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1185 rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1186
1187 inc_rt_prio(rt_rq, prio);
1188 inc_rt_migration(rt_se, rt_rq);
1189 inc_rt_group(rt_se, rt_rq);
1190 }
1191
1192 static inline
1193 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1194 {
1195 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1196 WARN_ON(!rt_rq->rt_nr_running);
1197 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1198 rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1199
1200 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1201 dec_rt_migration(rt_se, rt_rq);
1202 dec_rt_group(rt_se, rt_rq);
1203 }
1204
1205 /*
1206 * Change rt_se->run_list location unless SAVE && !MOVE
1207 *
1208 * assumes ENQUEUE/DEQUEUE flags match
1209 */
1210 static inline bool move_entity(unsigned int flags)
1211 {
1212 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1213 return false;
1214
1215 return true;
1216 }
1217
1218 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1219 {
1220 list_del_init(&rt_se->run_list);
1221
1222 if (list_empty(array->queue + rt_se_prio(rt_se)))
1223 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1224
1225 rt_se->on_list = 0;
1226 }
1227
1228 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1229 {
1230 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1231 struct rt_prio_array *array = &rt_rq->active;
1232 struct rt_rq *group_rq = group_rt_rq(rt_se);
1233 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1234
1235 /*
1236 * Don't enqueue the group if it's throttled, or when empty.
1237 * The latter is a consequence of the former when a child group
1238 * gets throttled and the current group doesn't have any other
1239 * active members.
1240 */
1241 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1242 if (rt_se->on_list)
1243 __delist_rt_entity(rt_se, array);
1244 return;
1245 }
1246
1247 if (move_entity(flags)) {
1248 WARN_ON_ONCE(rt_se->on_list);
1249 if (flags & ENQUEUE_HEAD)
1250 list_add(&rt_se->run_list, queue);
1251 else
1252 list_add_tail(&rt_se->run_list, queue);
1253
1254 __set_bit(rt_se_prio(rt_se), array->bitmap);
1255 rt_se->on_list = 1;
1256 }
1257 rt_se->on_rq = 1;
1258
1259 inc_rt_tasks(rt_se, rt_rq);
1260 }
1261
1262 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1263 {
1264 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1265 struct rt_prio_array *array = &rt_rq->active;
1266
1267 if (move_entity(flags)) {
1268 WARN_ON_ONCE(!rt_se->on_list);
1269 __delist_rt_entity(rt_se, array);
1270 }
1271 rt_se->on_rq = 0;
1272
1273 dec_rt_tasks(rt_se, rt_rq);
1274 }
1275
1276 /*
1277 * Because the prio of an upper entry depends on the lower
1278 * entries, we must remove entries top-down.
1279 */
1280 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1281 {
1282 struct sched_rt_entity *back = NULL;
1283
1284 for_each_sched_rt_entity(rt_se) {
1285 rt_se->back = back;
1286 back = rt_se;
1287 }
1288
1289 dequeue_top_rt_rq(rt_rq_of_se(back));
1290
1291 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1292 if (on_rt_rq(rt_se))
1293 __dequeue_rt_entity(rt_se, flags);
1294 }
1295 }
1296
1297 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1298 {
1299 struct rq *rq = rq_of_rt_se(rt_se);
1300
1301 dequeue_rt_stack(rt_se, flags);
1302 for_each_sched_rt_entity(rt_se)
1303 __enqueue_rt_entity(rt_se, flags);
1304 enqueue_top_rt_rq(&rq->rt);
1305 }
1306
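/*
 * Dequeue the whole hierarchy top-down, then re-enqueue any ancestor
 * group that still has other runnable entities.
 */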
1307 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1308 {
1309 struct rq *rq = rq_of_rt_se(rt_se);
1310
1311 dequeue_rt_stack(rt_se, flags);
1312
1313 for_each_sched_rt_entity(rt_se) {
1314 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1315
1316 if (rt_rq && rt_rq->rt_nr_running)
1317 __enqueue_rt_entity(rt_se, flags);
1318 }
1319 enqueue_top_rt_rq(&rq->rt);
1320 }
1321
1322 /*
1323 * Adding/removing a task to/from a priority array:
1324 */
1325 static void
1326 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1327 {
1328 struct sched_rt_entity *rt_se = &p->rt;
1329
1330 schedtune_enqueue_task(p, cpu_of(rq));
1331
1332 if (flags & ENQUEUE_WAKEUP)
1333 rt_se->timeout = 0;
1334
1335 enqueue_rt_entity(rt_se, flags);
1336 walt_inc_cumulative_runnable_avg(rq, p);
1337
1338 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1339 enqueue_pushable_task(rq, p);
1340 }
1341
1342 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1343 {
1344 struct sched_rt_entity *rt_se = &p->rt;
1345
1346 schedtune_dequeue_task(p, cpu_of(rq));
1347
1348 update_curr_rt(rq);
1349 dequeue_rt_entity(rt_se, flags);
1350 walt_dec_cumulative_runnable_avg(rq, p);
1351
1352 dequeue_pushable_task(rq, p);
1353 }
1354
1355 /*
1356 * Put a task at the head or the end of the run list without the overhead of
1357 * dequeue followed by enqueue.
1358 */
1359 static void
1360 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1361 {
1362 if (on_rt_rq(rt_se)) {
1363 struct rt_prio_array *array = &rt_rq->active;
1364 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1365
1366 if (head)
1367 list_move(&rt_se->run_list, queue);
1368 else
1369 list_move_tail(&rt_se->run_list, queue);
1370 }
1371 }
1372
1373 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1374 {
1375 struct sched_rt_entity *rt_se = &p->rt;
1376 struct rt_rq *rt_rq;
1377
1378 for_each_sched_rt_entity(rt_se) {
1379 rt_rq = rt_rq_of_se(rt_se);
1380 requeue_rt_entity(rt_rq, rt_se, head);
1381 }
1382 }
1383
1384 static void yield_task_rt(struct rq *rq)
1385 {
1386 requeue_task_rt(rq, rq->curr, 0);
1387 }
1388
1389 #ifdef CONFIG_SMP
1390 static int find_lowest_rq(struct task_struct *task);
1391
1392 static int
1393 select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
1394 int sibling_count_hint)
1395 {
1396 struct task_struct *curr;
1397 struct rq *rq;
1398
1399 /* For anything but wake ups, just return the task_cpu */
1400 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1401 goto out;
1402
1403 rq = cpu_rq(cpu);
1404
1405 rcu_read_lock();
1406 curr = READ_ONCE(rq->curr); /* unlocked access */
1407
1408 /*
1409 * If the current task on @p's runqueue is an RT task, then
1410 * try to see if we can wake this RT task up on another
1411 * runqueue. Otherwise simply start this RT task
1412 * on its current runqueue.
1413 *
1414 * We want to avoid overloading runqueues. If the woken
1415 * task is a higher priority, then it will stay on this CPU
1416 * and the lower prio task should be moved to another CPU.
1417 * Even though this will probably make the lower prio task
1418 * lose its cache, we do not want to bounce a higher task
1419 * around just because it gave up its CPU, perhaps for a
1420 * lock?
1421 *
1422 * For equal prio tasks, we just let the scheduler sort it out.
1423 *
1424 * Otherwise, just let it ride on the affined RQ and the
1425 * post-schedule router will push the preempted task away
1426 *
1427 * This test is optimistic; if we get it wrong, the load-balancer
1428 * will have to sort it out.
1429 */
1430 if (curr && unlikely(rt_task(curr)) &&
1431 (curr->nr_cpus_allowed < 2 ||
1432 curr->prio <= p->prio)) {
1433 int target = find_lowest_rq(p);
1434
1435 /*
1436 * Don't bother moving it if the destination CPU is
1437 * not running a lower priority task.
1438 */
1439 if (target != -1 &&
1440 p->prio < cpu_rq(target)->rt.highest_prio.curr)
1441 cpu = target;
1442 }
1443 rcu_read_unlock();
1444
1445 out:
1446 return cpu;
1447 }
1448
1449 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1450 {
1451 /*
1452 * Current can't be migrated, useless to reschedule,
1453 * let's hope p can move out.
1454 */
1455 if (rq->curr->nr_cpus_allowed == 1 ||
1456 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1457 return;
1458
1459 /*
1460 * p is migratable, so let's not schedule it and
1461 * see if it is pushed or pulled somewhere else.
1462 */
1463 if (p->nr_cpus_allowed != 1
1464 && cpupri_find(&rq->rd->cpupri, p, NULL))
1465 return;
1466
1467 /*
1468 * There appear to be other CPUs that can accept
1469 * current and none to run 'p', so let's reschedule
1470 * to try and push current away:
1471 */
1472 requeue_task_rt(rq, p, 1);
1473 resched_curr(rq);
1474 }
1475
1476 #endif /* CONFIG_SMP */
1477
1478 /*
1479 * Preempt the current task with a newly woken task if needed:
1480 */
1481 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1482 {
1483 if (p->prio < rq->curr->prio) {
1484 resched_curr(rq);
1485 return;
1486 }
1487
1488 #ifdef CONFIG_SMP
1489 /*
1490 * If:
1491 *
1492 * - the newly woken task is of equal priority to the current task
1493 * - the newly woken task is non-migratable while current is migratable
1494 * - current will be preempted on the next reschedule
1495 *
1496 * we should check to see if current can readily move to a different
1497 * cpu. If so, we will reschedule to allow the push logic to try
1498 * to move current somewhere else, making room for our non-migratable
1499 * task.
1500 */
1501 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1502 check_preempt_equal_prio(rq, p);
1503 #endif
1504 }
1505
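/*
 * Return the first entity on the highest-priority non-empty queue of @rt_rq.
 */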
1506 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1507 struct rt_rq *rt_rq)
1508 {
1509 struct rt_prio_array *array = &rt_rq->active;
1510 struct sched_rt_entity *next = NULL;
1511 struct list_head *queue;
1512 int idx;
1513
1514 idx = sched_find_first_bit(array->bitmap);
1515 BUG_ON(idx >= MAX_RT_PRIO);
1516
1517 queue = array->queue + idx;
1518 next = list_entry(queue->next, struct sched_rt_entity, run_list);
1519
1520 return next;
1521 }
1522
1523 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1524 {
1525 struct sched_rt_entity *rt_se;
1526 struct task_struct *p;
1527 struct rt_rq *rt_rq = &rq->rt;
1528
1529 do {
1530 rt_se = pick_next_rt_entity(rq, rt_rq);
1531 BUG_ON(!rt_se);
1532 rt_rq = group_rt_rq(rt_se);
1533 } while (rt_rq);
1534
1535 p = rt_task_of(rt_se);
1536 p->se.exec_start = rq_clock_task(rq);
1537
1538 return p;
1539 }
1540
1541 extern int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running);
1542
1543 static struct task_struct *
1544 pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1545 {
1546 struct task_struct *p;
1547 struct rt_rq *rt_rq = &rq->rt;
1548
1549 if (need_pull_rt_task(rq, prev)) {
1550 /*
1551 * This is OK, because current is on_cpu, which avoids it being
1552 * picked for load-balance and preemption/IRQs are still
1553 * disabled avoiding further scheduler activity on it and we're
1554 * being very careful to re-start the picking loop.
1555 */
1556 rq_unpin_lock(rq, rf);
1557 pull_rt_task(rq);
1558 rq_repin_lock(rq, rf);
1559 /*
1560 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1561 * means a dl or stop task can slip in, in which case we need
1562 * to re-start task selection.
1563 */
1564 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1565 rq->dl.dl_nr_running))
1566 return RETRY_TASK;
1567 }
1568
1569 /*
1570 * We may dequeue prev's rt_rq in put_prev_task().
1571 * So, we update time before rt_nr_running check.
1572 */
1573 if (prev->sched_class == &rt_sched_class)
1574 update_curr_rt(rq);
1575
1576 if (!rt_rq->rt_queued)
1577 return NULL;
1578
1579 put_prev_task(rq, prev);
1580
1581 p = _pick_next_task_rt(rq);
1582
1583 /* The running task is never eligible for pushing */
1584 dequeue_pushable_task(rq, p);
1585
1586 queue_push_tasks(rq);
1587
1588 if (p)
1589 update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), rt_rq,
1590 rq->curr->sched_class == &rt_sched_class);
1591
1592 return p;
1593 }
1594
1595 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1596 {
1597 update_curr_rt(rq);
1598
1599 update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
1600
1601 /*
1602 * The previous task needs to be made eligible for pushing
1603 * if it is still active
1604 */
1605 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1606 enqueue_pushable_task(rq, p);
1607 }
1608
1609 #ifdef CONFIG_SMP
1610
1611 /* Only try algorithms three times */
1612 #define RT_MAX_TRIES 3
1613
1614 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1615 {
1616 if (!task_running(rq, p) &&
1617 cpumask_test_cpu(cpu, &p->cpus_allowed))
1618 return 1;
1619 return 0;
1620 }
1621
1622 /*
1623 * Return the highest-priority pushable task on this rq that is suitable to be
1624 * executed on the given cpu, or NULL otherwise
1625 */
1626 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1627 {
1628 struct plist_head *head = &rq->rt.pushable_tasks;
1629 struct task_struct *p;
1630
1631 if (!has_pushable_tasks(rq))
1632 return NULL;
1633
1634 plist_for_each_entry(p, head, pushable_tasks) {
1635 if (pick_rt_task(rq, p, cpu))
1636 return p;
1637 }
1638
1639 return NULL;
1640 }
1641
1642 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1643
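/*
 * Find a CPU running lower-priority work on which @task could run,
 * preferring the task's last CPU and topologically close CPUs.
 * Returns -1 when there is no suitable target.
 */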
1644 static int find_lowest_rq(struct task_struct *task)
1645 {
1646 struct sched_domain *sd;
1647 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
1648 int this_cpu = smp_processor_id();
1649 int cpu = task_cpu(task);
1650
1651 /* Make sure the mask is initialized first */
1652 if (unlikely(!lowest_mask))
1653 return -1;
1654
1655 if (task->nr_cpus_allowed == 1)
1656 return -1; /* No other targets possible */
1657
1658 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
1659 return -1; /* No targets found */
1660
1661 /*
1662 * At this point we have built a mask of cpus representing the
1663 * lowest priority tasks in the system. Now we want to elect
1664 * the best one based on our affinity and topology.
1665 *
1666 * We prioritize the last cpu that the task executed on since
1667 * it is most likely cache-hot in that location.
1668 */
1669 if (cpumask_test_cpu(cpu, lowest_mask))
1670 return cpu;
1671
1672 /*
1673 * Otherwise, we consult the sched_domains span maps to figure
1674 * out which cpu is logically closest to our hot cache data.
1675 */
1676 if (!cpumask_test_cpu(this_cpu, lowest_mask))
1677 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
1678
1679 rcu_read_lock();
1680 for_each_domain(cpu, sd) {
1681 if (sd->flags & SD_WAKE_AFFINE) {
1682 int best_cpu;
1683
1684 /*
1685 * "this_cpu" is cheaper to preempt than a
1686 * remote processor.
1687 */
1688 if (this_cpu != -1 &&
1689 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
1690 rcu_read_unlock();
1691 return this_cpu;
1692 }
1693
1694 best_cpu = cpumask_first_and(lowest_mask,
1695 sched_domain_span(sd));
1696 if (best_cpu < nr_cpu_ids) {
1697 rcu_read_unlock();
1698 return best_cpu;
1699 }
1700 }
1701 }
1702 rcu_read_unlock();
1703
1704 /*
1705 * And finally, if there were no matches within the domains
1706 * just give the caller *something* to work with from the compatible
1707 * locations.
1708 */
1709 if (this_cpu != -1)
1710 return this_cpu;
1711
1712 cpu = cpumask_any(lowest_mask);
1713 if (cpu < nr_cpu_ids)
1714 return cpu;
1715 return -1;
1716 }
1717
1718 /* Will lock the rq it finds */
1719 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
1720 {
1721 struct rq *lowest_rq = NULL;
1722 int tries;
1723 int cpu;
1724
1725 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
1726 cpu = find_lowest_rq(task);
1727
1728 if ((cpu == -1) || (cpu == rq->cpu))
1729 break;
1730
1731 lowest_rq = cpu_rq(cpu);
1732
1733 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
1734 /*
1735 * Target rq has tasks of equal or higher priority,
1736 * retrying does not release any lock and is unlikely
1737 * to yield a different result.
1738 */
1739 lowest_rq = NULL;
1740 break;
1741 }
1742
1743 /* if the prio of this runqueue changed, try again */
1744 if (double_lock_balance(rq, lowest_rq)) {
1745 /*
1746 * We had to unlock the run queue. In
1747 * the meantime, the task could have
1748 * migrated already or had its affinity changed.
1749 * Also make sure that it wasn't scheduled on its rq.
1750 */
1751 if (unlikely(task_rq(task) != rq ||
1752 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
1753 task_running(rq, task) ||
1754 !rt_task(task) ||
1755 !task_on_rq_queued(task))) {
1756
1757 double_unlock_balance(rq, lowest_rq);
1758 lowest_rq = NULL;
1759 break;
1760 }
1761 }
1762
1763 /* If this rq is still suitable use it. */
1764 if (lowest_rq->rt.highest_prio.curr > task->prio)
1765 break;
1766
1767 /* try again */
1768 double_unlock_balance(rq, lowest_rq);
1769 lowest_rq = NULL;
1770 }
1771
1772 return lowest_rq;
1773 }
1774
1775 static struct task_struct *pick_next_pushable_task(struct rq *rq)
1776 {
1777 struct task_struct *p;
1778
1779 if (!has_pushable_tasks(rq))
1780 return NULL;
1781
1782 p = plist_first_entry(&rq->rt.pushable_tasks,
1783 struct task_struct, pushable_tasks);
1784
1785 BUG_ON(rq->cpu != task_cpu(p));
1786 BUG_ON(task_current(rq, p));
1787 BUG_ON(p->nr_cpus_allowed <= 1);
1788
1789 BUG_ON(!task_on_rq_queued(p));
1790 BUG_ON(!rt_task(p));
1791
1792 return p;
1793 }
1794
1795 /*
1796 * If the current CPU has more than one RT task, see if the non-running
1797 * task can migrate over to a CPU that is running a task
1798 * of lesser priority.
1799 */
1800 static int push_rt_task(struct rq *rq)
1801 {
1802 struct task_struct *next_task;
1803 struct rq *lowest_rq;
1804 int ret = 0;
1805
1806 if (!rq->rt.overloaded)
1807 return 0;
1808
1809 next_task = pick_next_pushable_task(rq);
1810 if (!next_task)
1811 return 0;
1812
1813 retry:
1814 if (unlikely(next_task == rq->curr)) {
1815 WARN_ON(1);
1816 return 0;
1817 }
1818
1819 /*
1820 * It's possible that the next_task slipped in with
1821 * higher priority than current. If that's the case
1822 * just reschedule current.
1823 */
1824 if (unlikely(next_task->prio < rq->curr->prio)) {
1825 resched_curr(rq);
1826 return 0;
1827 }
1828
1829 /* We might release rq lock */
1830 get_task_struct(next_task);
1831
1832 /* find_lock_lowest_rq locks the rq if found */
1833 lowest_rq = find_lock_lowest_rq(next_task, rq);
1834 if (!lowest_rq) {
1835 struct task_struct *task;
1836 /*
1837 * find_lock_lowest_rq releases rq->lock
1838 * so it is possible that next_task has migrated.
1839 *
1840 * We need to make sure that the task is still on the same
1841 * run-queue and is also still the next task eligible for
1842 * pushing.
1843 */
1844 task = pick_next_pushable_task(rq);
1845 if (task == next_task) {
1846 /*
1847 * The task hasn't migrated, and is still the next
1848 * eligible task, but we failed to find a run-queue
1849 * to push it to. Do not retry in this case, since
1850 * other cpus will pull from us when ready.
1851 */
1852 goto out;
1853 }
1854
1855 if (!task)
1856 /* No more tasks, just exit */
1857 goto out;
1858
1859 /*
1860 * Something has shifted, try again.
1861 */
1862 put_task_struct(next_task);
1863 next_task = task;
1864 goto retry;
1865 }
1866
1867 deactivate_task(rq, next_task, 0);
1868 next_task->on_rq = TASK_ON_RQ_MIGRATING;
1869 set_task_cpu(next_task, lowest_rq->cpu);
1870 next_task->on_rq = TASK_ON_RQ_QUEUED;
1871 activate_task(lowest_rq, next_task, 0);
1872 ret = 1;
1873
1874 resched_curr(lowest_rq);
1875
1876 double_unlock_balance(rq, lowest_rq);
1877
1878 out:
1879 put_task_struct(next_task);
1880
1881 return ret;
1882 }
1883
1884 static void push_rt_tasks(struct rq *rq)
1885 {
1886 /* push_rt_task() will return true if it moved an RT task */
1887 while (push_rt_task(rq))
1888 ;
1889 }
1890
1891 #ifdef HAVE_RT_PUSH_IPI
1892
1893 /*
1894 * When a high priority task schedules out from a CPU and a lower priority
1895 * task is scheduled in, a check is made to see if there's any RT tasks
1896 * on other CPUs that are waiting to run because a higher priority RT task
1897 * is currently running on its CPU. In this case, the CPU with multiple RT
1898 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1899 * up that may be able to run one of its non-running queued RT tasks.
1900 *
1901 * All CPUs with overloaded RT tasks need to be notified as there is currently
1902 * no way to know which of these CPUs have the highest priority task waiting
1903 * to run. Instead of trying to take a spinlock on each of these CPUs,
1904 * which has been shown to cause large latency when done on machines with many
1905 * CPUs, an IPI is sent to the CPUs to have them push off the overloaded
1906 * RT tasks waiting to run.
1907 *
1908 * Just sending an IPI to each of the CPUs is also an issue, as on machines
1909 * with a large CPU count this can cause an IPI storm on a CPU, especially
1910 * if it's the only CPU with multiple RT tasks queued, and a large number
1911 * of CPUs scheduling a lower priority task at the same time.
1912 *
1913 * Each root domain has its own irq work function that can iterate over
1914 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
1915 * tasks must be checked whenever one or many CPUs are lowering
1916 * their priority, there's a single irq work iterator that will try to
1917 * push off RT tasks that are waiting to run.
1918 *
1919 * When a CPU schedules a lower priority task, it will kick off the
1920 * irq work iterator that will jump to each CPU with overloaded RT tasks.
1921 * As it only takes the first CPU that schedules a lower priority task
1922 * to start the process, the rto_start variable is incremented and if
1923 * the atomic result is one, then that CPU will try to take the rto_lock.
1924 * This prevents high contention on the lock as the process handles all
1925 * CPUs scheduling lower priority tasks.
1926 *
1927 * All CPUs that are scheduling a lower priority task will increment the
1928 * rto_loop_next variable. This will make sure that the irq work iterator
1929 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
1930 * priority task, even if the iterator is in the middle of a scan. Incrementing
1931 * the rto_loop_next will cause the iterator to perform another scan.
1932 *
1933 */
1934 static int rto_next_cpu(struct root_domain *rd)
1935 {
1936 int next;
1937 int cpu;
1938
1939 /*
1940 * When starting the IPI RT pushing, the rto_cpu is set to -1,
1941 * and rto_next_cpu() will simply return the first CPU found in
1942 * the rto_mask.
1943 *
1944 * If rto_next_cpu() is called with rto_cpu set to a valid CPU, it
1945 * will return the next CPU found in the rto_mask.
1946 *
1947 * If there are no more CPUs left in the rto_mask, then a check is made
1948 * against rto_loop and rto_loop_next. rto_loop is only updated with
1949 * the rto_lock held, but any CPU may increment the rto_loop_next
1950 * without any locking.
1951 */
1952 for (;;) {
1953
1954 /* When rto_cpu is -1 this acts like cpumask_first() */
1955 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
1956
1957 rd->rto_cpu = cpu;
1958
1959 if (cpu < nr_cpu_ids)
1960 return cpu;
1961
1962 rd->rto_cpu = -1;
1963
1964 /*
1965 * ACQUIRE ensures we see the @rto_mask changes
1966 * made prior to the @next value observed.
1967 *
1968 * Matches WMB in rt_set_overload().
1969 */
1970 next = atomic_read_acquire(&rd->rto_loop_next);
1971
1972 if (rd->rto_loop == next)
1973 break;
1974
1975 rd->rto_loop = next;
1976 }
1977
1978 return -1;
1979 }
1980
1981 static inline bool rto_start_trylock(atomic_t *v)
1982 {
1983 return !atomic_cmpxchg_acquire(v, 0, 1);
1984 }
1985
1986 static inline void rto_start_unlock(atomic_t *v)
1987 {
1988 atomic_set_release(v, 0);
1989 }
1990
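/*
 * Kick off (or extend) the RT push IPI chain for this rq's root domain, so
 * that CPUs with overloaded RT runqueues get a chance to push their queued
 * RT tasks towards this newly lower-priority CPU.
 */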
1991 static void tell_cpu_to_push(struct rq *rq)
1992 {
1993 int cpu = -1;
1994
1995 /* Keep the loop going if the IPI is currently active */
1996 atomic_inc(&rq->rd->rto_loop_next);
1997
1998 /* Only one CPU can initiate a loop at a time */
1999 if (!rto_start_trylock(&rq->rd->rto_loop_start))
2000 return;
2001
2002 raw_spin_lock(&rq->rd->rto_lock);
2003
2004 /*
2005 * The rto_cpu is updated under the lock; if it has a valid cpu
2006 * then the IPI is still running and will continue due to the
2007 * update to loop_next, and nothing needs to be done here.
2008 * Otherwise it is finishing up and an IPI needs to be sent.
2009 */
2010 if (rq->rd->rto_cpu < 0)
2011 cpu = rto_next_cpu(rq->rd);
2012
2013 raw_spin_unlock(&rq->rd->rto_lock);
2014
2015 rto_start_unlock(&rq->rd->rto_loop_start);
2016
2017 if (cpu >= 0) {
2018 /* Make sure the rd does not get freed while pushing */
2019 sched_get_rd(rq->rd);
2020 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2021 }
2022 }
2023
2024 /* Called from hardirq context */
2025 void rto_push_irq_work_func(struct irq_work *work)
2026 {
2027 struct root_domain *rd =
2028 container_of(work, struct root_domain, rto_push_work);
2029 struct rq *rq;
2030 int cpu;
2031
2032 rq = this_rq();
2033
2034 /*
2035 * We do not need to grab the lock to check for has_pushable_tasks.
2036 * When it gets updated, a check is made if a push is possible.
2037 */
2038 if (has_pushable_tasks(rq)) {
2039 raw_spin_lock(&rq->lock);
2040 push_rt_tasks(rq);
2041 raw_spin_unlock(&rq->lock);
2042 }
2043
2044 raw_spin_lock(&rd->rto_lock);
2045
2046 /* Pass the IPI to the next rt overloaded queue */
2047 cpu = rto_next_cpu(rd);
2048
2049 raw_spin_unlock(&rd->rto_lock);
2050
2051 if (cpu < 0) {
2052 sched_put_rd(rd);
2053 return;
2054 }
2055
2056 /* Try the next RT overloaded CPU */
2057 irq_work_queue_on(&rd->rto_push_work, cpu);
2058 }
2059 #endif /* HAVE_RT_PUSH_IPI */
2060
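/*
 * Try to pull a runnable RT task that is higher priority than anything on
 * this_rq from one of the overloaded CPUs in the root domain.
 */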
2061 static void pull_rt_task(struct rq *this_rq)
2062 {
2063 int this_cpu = this_rq->cpu, cpu;
2064 bool resched = false;
2065 struct task_struct *p;
2066 struct rq *src_rq;
2067 int rt_overload_count = rt_overloaded(this_rq);
2068
2069 if (likely(!rt_overload_count))
2070 return;
2071
2072 /*
2073 * Match the barrier from rt_set_overload(); this guarantees that if we
2074 * see overloaded we must also see the rto_mask bit.
2075 */
2076 smp_rmb();
2077
2078 /* If we are the only overloaded CPU do nothing */
2079 if (rt_overload_count == 1 &&
2080 cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2081 return;
2082
2083 #ifdef HAVE_RT_PUSH_IPI
2084 if (sched_feat(RT_PUSH_IPI)) {
2085 tell_cpu_to_push(this_rq);
2086 return;
2087 }
2088 #endif
2089
2090 for_each_cpu(cpu, this_rq->rd->rto_mask) {
2091 if (this_cpu == cpu)
2092 continue;
2093
2094 src_rq = cpu_rq(cpu);
2095
2096 /*
2097 * Don't bother taking the src_rq->lock if the next highest
2098 * task is known to be lower-priority than our current task.
2099 * This may look racy, but if this value is about to go
2100 * logically higher, the src_rq will push this task away.
2101 * And if it's going logically lower, we do not care.
2102 */
2103 if (src_rq->rt.highest_prio.next >=
2104 this_rq->rt.highest_prio.curr)
2105 continue;
2106
2107 /*
2108 * We can potentially drop this_rq's lock in
2109 * double_lock_balance, and another CPU could
2110 * alter this_rq
2111 */
2112 double_lock_balance(this_rq, src_rq);
2113
2114 /*
2115 * We can only pull a task that is pushable
2116 * on its rq, and no others.
2117 */
2118 p = pick_highest_pushable_task(src_rq, this_cpu);
2119
2120 /*
2121 * Do we have an RT task that preempts
2122 * the to-be-scheduled task?
2123 */
2124 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2125 WARN_ON(p == src_rq->curr);
2126 WARN_ON(!task_on_rq_queued(p));
2127
2128 /*
2129 * There's a chance that p is higher in priority
2130 * than what's currently running on its cpu.
2131 * This is just that p is waking up and hasn't
2132 * had a chance to schedule. We only pull
2133 * p if it is lower in priority than the
2134 * current task on the run queue
2135 */
2136 if (p->prio < src_rq->curr->prio)
2137 goto skip;
2138
2139 resched = true;
2140
2141 deactivate_task(src_rq, p, 0);
2142 p->on_rq = TASK_ON_RQ_MIGRATING;
2143 set_task_cpu(p, this_cpu);
2144 p->on_rq = TASK_ON_RQ_QUEUED;
2145 activate_task(this_rq, p, 0);
2146 /*
2147 * We continue with the search, just in
2148 * case there's an even higher prio task
2149 * in another runqueue. (low likelihood
2150 * but possible)
2151 */
2152 }
2153 skip:
2154 double_unlock_balance(this_rq, src_rq);
2155 }
2156
2157 if (resched)
2158 resched_curr(this_rq);
2159 }
2160
2161 /*
2162 * If we are not running and we are not going to reschedule soon, we should
2163 * try to push tasks away now.
2164 */
2165 static void task_woken_rt(struct rq *rq, struct task_struct *p)
2166 {
2167 if (!task_running(rq, p) &&
2168 !test_tsk_need_resched(rq->curr) &&
2169 p->nr_cpus_allowed > 1 &&
2170 (dl_task(rq->curr) || rt_task(rq->curr)) &&
2171 (rq->curr->nr_cpus_allowed < 2 ||
2172 rq->curr->prio <= p->prio))
2173 push_rt_tasks(rq);
2174 }
2175
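/*
 * Re-advertise this runqueue's RT state (overload mask and cpupri entry)
 * when it is attached to a root domain.
 */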
2176 /* Assumes rq->lock is held */
2177 static void rq_online_rt(struct rq *rq)
2178 {
2179 if (rq->rt.overloaded)
2180 rt_set_overload(rq);
2181
2182 __enable_runtime(rq);
2183
2184 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2185 }
2186
2187 /* Assumes rq->lock is held */
2188 static void rq_offline_rt(struct rq *rq)
2189 {
2190 if (rq->rt.overloaded)
2191 rt_clear_overload(rq);
2192
2193 __disable_runtime(rq);
2194
2195 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2196 }
2197
2198 /*
2199 * When switching from the rt queue, we bring ourselves to a position
2200 * where we might want to pull RT tasks from other runqueues.
2201 */
2202 static void switched_from_rt(struct rq *rq, struct task_struct *p)
2203 {
2204 /*
2205 * If there are other RT tasks then we will reschedule
2206 * and the scheduling of the other RT tasks will handle
2207 * the balancing. But if we are the last RT task
2208 * we may need to handle the pulling of RT tasks
2209 * now.
2210 */
2211 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2212 return;
2213
2214 queue_pull_task(rq);
2215 }
2216
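/*
 * Allocate the per-CPU local_cpu_mask scratch cpumasks used by the RT
 * push/pull code when searching for a suitable target CPU.
 */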
2217 void __init init_sched_rt_class(void)
2218 {
2219 unsigned int i;
2220
2221 for_each_possible_cpu(i) {
2222 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2223 GFP_KERNEL, cpu_to_node(i));
2224 }
2225 }
2226 #endif /* CONFIG_SMP */
2227
2228 /*
2229 * When switching a task to RT, we may overload the runqueue
2230 * with RT tasks. In this case we try to push them off to
2231 * other runqueues.
2232 */
2233 static void switched_to_rt(struct rq *rq, struct task_struct *p)
2234 {
2235 /*
2236 * If we are already running, then there's nothing
2237 * that needs to be done. But if we are not running
2238 * we may need to preempt the currently running task.
2239 * If that currently running task is also an RT task
2240 * then see if we can move to another run queue.
2241 */
2242 if (task_on_rq_queued(p) && rq->curr != p) {
2243 #ifdef CONFIG_SMP
2244 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2245 queue_push_tasks(rq);
2246 #endif /* CONFIG_SMP */
2247 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2248 resched_curr(rq);
2249 }
2250 }
2251
2252 /*
2253 * Priority of the task has changed. This may cause
2254 * us to initiate a push or pull.
2255 */
2256 static void
2257 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2258 {
2259 if (!task_on_rq_queued(p))
2260 return;
2261
2262 if (rq->curr == p) {
2263 #ifdef CONFIG_SMP
2264 /*
2265 * If our priority decreases while running, we
2266 * may need to pull tasks to this runqueue.
2267 */
2268 if (oldprio < p->prio)
2269 queue_pull_task(rq);
2270
2271 /*
2272 * If there's a higher priority task waiting to run
2273 * then reschedule.
2274 */
2275 if (p->prio > rq->rt.highest_prio.curr)
2276 resched_curr(rq);
2277 #else
2278 /* For UP simply resched on drop of prio */
2279 if (oldprio < p->prio)
2280 resched_curr(rq);
2281 #endif /* CONFIG_SMP */
2282 } else {
2283 /*
2284 * This task is not running, but if its priority is
2285 * higher than that of the currently running task
2286 * then reschedule.
2287 */
2288 if (p->prio < rq->curr->prio)
2289 resched_curr(rq);
2290 }
2291 }
2292
2293 #ifdef CONFIG_POSIX_TIMERS
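/*
 * Police RLIMIT_RTTIME: count (in ticks) how long this RT task has been
 * running and, once the limit is exceeded, arm the task's cputime expiry
 * (sched_exp) so that the limit can be enforced.
 */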
2294 static void watchdog(struct rq *rq, struct task_struct *p)
2295 {
2296 unsigned long soft, hard;
2297
2298 /* The hard limit may change after the soft limit was read; this gets fixed up on the next tick */
2299 soft = task_rlimit(p, RLIMIT_RTTIME);
2300 hard = task_rlimit_max(p, RLIMIT_RTTIME);
2301
2302 if (soft != RLIM_INFINITY) {
2303 unsigned long next;
2304
2305 if (p->rt.watchdog_stamp != jiffies) {
2306 p->rt.timeout++;
2307 p->rt.watchdog_stamp = jiffies;
2308 }
2309
2310 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2311 if (p->rt.timeout > next)
2312 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
2313 }
2314 }
2315 #else
2316 static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2317 #endif
2318
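/*
 * Per-tick work for an RT task: update runtime accounting and RT load
 * tracking, run the RLIMIT_RTTIME watchdog and, for SCHED_RR, handle
 * round-robin timeslice expiry (SCHED_FIFO has no timeslice).
 */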
2319 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2320 {
2321 struct sched_rt_entity *rt_se = &p->rt;
2322
2323 update_curr_rt(rq);
2324 update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
2325
2326 watchdog(rq, p);
2327
2328 /*
2329 * RR tasks need a special form of timeslice management.
2330 * FIFO tasks have no timeslices.
2331 */
2332 if (p->policy != SCHED_RR)
2333 return;
2334
2335 if (--p->rt.time_slice)
2336 return;
2337
2338 p->rt.time_slice = sched_rr_timeslice;
2339
2340 /*
2341 * Requeue to the end of the queue if we (and all of our ancestors) are
2342 * not the only element on the queue.
2343 */
2344 for_each_sched_rt_entity(rt_se) {
2345 if (rt_se->run_list.prev != rt_se->run_list.next) {
2346 requeue_task_rt(rq, p, 0);
2347 resched_curr(rq);
2348 return;
2349 }
2350 }
2351 }
2352
2353 static void set_curr_task_rt(struct rq *rq)
2354 {
2355 struct task_struct *p = rq->curr;
2356
2357 p->se.exec_start = rq_clock_task(rq);
2358
2359 /* The running task is never eligible for pushing */
2360 dequeue_pushable_task(rq, p);
2361 }
2362
2363 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2364 {
2365 /*
2366 * Time slice is 0 for SCHED_FIFO tasks
2367 */
2368 if (task->policy == SCHED_RR)
2369 return sched_rr_timeslice;
2370 else
2371 return 0;
2372 }
2373
2374 const struct sched_class rt_sched_class = {
2375 .next = &fair_sched_class,
2376 .enqueue_task = enqueue_task_rt,
2377 .dequeue_task = dequeue_task_rt,
2378 .yield_task = yield_task_rt,
2379
2380 .check_preempt_curr = check_preempt_curr_rt,
2381
2382 .pick_next_task = pick_next_task_rt,
2383 .put_prev_task = put_prev_task_rt,
2384
2385 #ifdef CONFIG_SMP
2386 .select_task_rq = select_task_rq_rt,
2387
2388 .set_cpus_allowed = set_cpus_allowed_common,
2389 .rq_online = rq_online_rt,
2390 .rq_offline = rq_offline_rt,
2391 .task_woken = task_woken_rt,
2392 .switched_from = switched_from_rt,
2393 #endif
2394
2395 .set_curr_task = set_curr_task_rt,
2396 .task_tick = task_tick_rt,
2397
2398 .get_rr_interval = get_rr_interval_rt,
2399
2400 .prio_changed = prio_changed_rt,
2401 .switched_to = switched_to_rt,
2402
2403 .update_curr = update_curr_rt,
2404 };
2405
2406 #ifdef CONFIG_RT_GROUP_SCHED
2407 /*
2408 * Ensure that the real time constraints are schedulable.
2409 */
2410 static DEFINE_MUTEX(rt_constraints_mutex);
2411
2412 /* Must be called with tasklist_lock held */
2413 static inline int tg_has_rt_tasks(struct task_group *tg)
2414 {
2415 struct task_struct *g, *p;
2416
2417 /*
2418 * Autogroups do not have RT tasks; see autogroup_create().
2419 */
2420 if (task_group_is_autogroup(tg))
2421 return 0;
2422
2423 for_each_process_thread(g, p) {
2424 if (rt_task(p) && task_group(p) == tg)
2425 return 1;
2426 }
2427
2428 return 0;
2429 }
2430
2431 struct rt_schedulable_data {
2432 struct task_group *tg;
2433 u64 rt_period;
2434 u64 rt_runtime;
2435 };
2436
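/*
 * walk_tg_tree() callback: verify that @tg's (proposed) bandwidth stays
 * within the global limit and that the bandwidth of its children does not
 * add up to more than its own.
 */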
2437 static int tg_rt_schedulable(struct task_group *tg, void *data)
2438 {
2439 struct rt_schedulable_data *d = data;
2440 struct task_group *child;
2441 unsigned long total, sum = 0;
2442 u64 period, runtime;
2443
2444 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2445 runtime = tg->rt_bandwidth.rt_runtime;
2446
2447 if (tg == d->tg) {
2448 period = d->rt_period;
2449 runtime = d->rt_runtime;
2450 }
2451
2452 /*
2453 * Cannot have more runtime than the period.
2454 */
2455 if (runtime > period && runtime != RUNTIME_INF)
2456 return -EINVAL;
2457
2458 /*
2459 * Ensure we don't starve existing RT tasks.
2460 */
2461 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
2462 return -EBUSY;
2463
2464 total = to_ratio(period, runtime);
2465
2466 /*
2467 * Nobody can have more than the global setting allows.
2468 */
2469 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2470 return -EINVAL;
2471
2472 /*
2473 * The sum of our children's runtime should not exceed our own.
2474 */
2475 list_for_each_entry_rcu(child, &tg->children, siblings) {
2476 period = ktime_to_ns(child->rt_bandwidth.rt_period);
2477 runtime = child->rt_bandwidth.rt_runtime;
2478
2479 if (child == d->tg) {
2480 period = d->rt_period;
2481 runtime = d->rt_runtime;
2482 }
2483
2484 sum += to_ratio(period, runtime);
2485 }
2486
2487 if (sum > total)
2488 return -EINVAL;
2489
2490 return 0;
2491 }
2492
2493 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2494 {
2495 int ret;
2496
2497 struct rt_schedulable_data data = {
2498 .tg = tg,
2499 .rt_period = period,
2500 .rt_runtime = runtime,
2501 };
2502
2503 rcu_read_lock();
2504 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2505 rcu_read_unlock();
2506
2507 return ret;
2508 }
2509
2510 static int tg_set_rt_bandwidth(struct task_group *tg,
2511 u64 rt_period, u64 rt_runtime)
2512 {
2513 int i, err = 0;
2514
2515 /*
2516 * Disallowing the root group RT runtime is BAD; it would prevent the
2517 * kernel from creating (and/or operating) RT threads.
2518 */
2519 if (tg == &root_task_group && rt_runtime == 0)
2520 return -EINVAL;
2521
2522 /* A zero period doesn't make any sense. */
2523 if (rt_period == 0)
2524 return -EINVAL;
2525
2526 mutex_lock(&rt_constraints_mutex);
2527 read_lock(&tasklist_lock);
2528 err = __rt_schedulable(tg, rt_period, rt_runtime);
2529 if (err)
2530 goto unlock;
2531
2532 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2533 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2534 tg->rt_bandwidth.rt_runtime = rt_runtime;
2535
2536 for_each_possible_cpu(i) {
2537 struct rt_rq *rt_rq = tg->rt_rq[i];
2538
2539 raw_spin_lock(&rt_rq->rt_runtime_lock);
2540 rt_rq->rt_runtime = rt_runtime;
2541 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2542 }
2543 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2544 unlock:
2545 read_unlock(&tasklist_lock);
2546 mutex_unlock(&rt_constraints_mutex);
2547
2548 return err;
2549 }
2550
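/*
 * Set a group's RT runtime from a value in microseconds; a negative value
 * means "unlimited" (RUNTIME_INF).
 */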
2551 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2552 {
2553 u64 rt_runtime, rt_period;
2554
2555 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2556 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2557 if (rt_runtime_us < 0)
2558 rt_runtime = RUNTIME_INF;
2559 else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2560 return -EINVAL;
2561
2562 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2563 }
2564
2565 long sched_group_rt_runtime(struct task_group *tg)
2566 {
2567 u64 rt_runtime_us;
2568
2569 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2570 return -1;
2571
2572 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2573 do_div(rt_runtime_us, NSEC_PER_USEC);
2574 return rt_runtime_us;
2575 }
2576
2577 int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2578 {
2579 u64 rt_runtime, rt_period;
2580
2581 if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2582 return -EINVAL;
2583
2584 rt_period = rt_period_us * NSEC_PER_USEC;
2585 rt_runtime = tg->rt_bandwidth.rt_runtime;
2586
2587 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2588 }
2589
2590 long sched_group_rt_period(struct task_group *tg)
2591 {
2592 u64 rt_period_us;
2593
2594 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2595 do_div(rt_period_us, NSEC_PER_USEC);
2596 return rt_period_us;
2597 }
2598
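/*
 * Re-validate the whole task_group hierarchy against the current global
 * rt_period/rt_runtime settings.
 */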
2599 static int sched_rt_global_constraints(void)
2600 {
2601 int ret = 0;
2602
2603 mutex_lock(&rt_constraints_mutex);
2604 read_lock(&tasklist_lock);
2605 ret = __rt_schedulable(NULL, 0, 0);
2606 read_unlock(&tasklist_lock);
2607 mutex_unlock(&rt_constraints_mutex);
2608
2609 return ret;
2610 }
2611
2612 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2613 {
2614 /* Don't accept realtime tasks when there is no way for them to run */
2615 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2616 return 0;
2617
2618 return 1;
2619 }
2620
2621 #else /* !CONFIG_RT_GROUP_SCHED */
2622 static int sched_rt_global_constraints(void)
2623 {
2624 unsigned long flags;
2625 int i;
2626
2627 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2628 for_each_possible_cpu(i) {
2629 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2630
2631 raw_spin_lock(&rt_rq->rt_runtime_lock);
2632 rt_rq->rt_runtime = global_rt_runtime();
2633 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2634 }
2635 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2636
2637 return 0;
2638 }
2639 #endif /* CONFIG_RT_GROUP_SCHED */
2640
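/*
 * Sanity-check the global sysctl values: the period must be positive and
 * the runtime may not exceed the period (unless it is RUNTIME_INF).
 */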
2641 static int sched_rt_global_validate(void)
2642 {
2643 if (sysctl_sched_rt_period <= 0)
2644 return -EINVAL;
2645
2646 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2647 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
2648 return -EINVAL;
2649
2650 return 0;
2651 }
2652
2653 static void sched_rt_do_global(void)
2654 {
2655 def_rt_bandwidth.rt_runtime = global_rt_runtime();
2656 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2657 }
2658
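/*
 * sysctl handler for the global RT period/runtime knobs: validate the new
 * values against both the RT and deadline classes, apply them, and restore
 * the old values if any check fails.
 */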
2659 int sched_rt_handler(struct ctl_table *table, int write,
2660 void __user *buffer, size_t *lenp,
2661 loff_t *ppos)
2662 {
2663 int old_period, old_runtime;
2664 static DEFINE_MUTEX(mutex);
2665 int ret;
2666
2667 mutex_lock(&mutex);
2668 old_period = sysctl_sched_rt_period;
2669 old_runtime = sysctl_sched_rt_runtime;
2670
2671 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2672
2673 if (!ret && write) {
2674 ret = sched_rt_global_validate();
2675 if (ret)
2676 goto undo;
2677
2678 ret = sched_dl_global_validate();
2679 if (ret)
2680 goto undo;
2681
2682 ret = sched_rt_global_constraints();
2683 if (ret)
2684 goto undo;
2685
2686 sched_rt_do_global();
2687 sched_dl_do_global();
2688 }
2689 if (0) {
2690 undo:
2691 sysctl_sched_rt_period = old_period;
2692 sysctl_sched_rt_runtime = old_runtime;
2693 }
2694 mutex_unlock(&mutex);
2695
2696 return ret;
2697 }
2698
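/*
 * sysctl handler for the RR timeslice: the value is supplied in
 * milliseconds and converted to jiffies; zero (or a negative value)
 * resets it to the default RR_TIMESLICE.
 */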
2699 int sched_rr_handler(struct ctl_table *table, int write,
2700 void __user *buffer, size_t *lenp,
2701 loff_t *ppos)
2702 {
2703 int ret;
2704 static DEFINE_MUTEX(mutex);
2705
2706 mutex_lock(&mutex);
2707 ret = proc_dointvec(table, write, buffer, lenp, ppos);
2708 /*
2709 * Make sure that internally we keep the timeslice in jiffies.
2710 * Also, writing zero resets the timeslice to the default:
2711 */
2712 if (!ret && write) {
2713 sched_rr_timeslice =
2714 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2715 msecs_to_jiffies(sysctl_sched_rr_timeslice);
2716 }
2717 mutex_unlock(&mutex);
2718 return ret;
2719 }
2720
2721 #ifdef CONFIG_SCHED_DEBUG
2722 void print_rt_stats(struct seq_file *m, int cpu)
2723 {
2724 rt_rq_iter_t iter;
2725 struct rt_rq *rt_rq;
2726
2727 rcu_read_lock();
2728 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2729 print_rt_rq(m, cpu, rt_rq);
2730 rcu_read_unlock();
2731 }
2732 #endif /* CONFIG_SCHED_DEBUG */