kernel/sched/rt.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   4  * policies)
   5  */
   6
   7 #include "sched.h"
   8
   9 #include <linux/slab.h>
  10 #include <linux/irq_work.h>
  11 #include "tune.h"
  12
  13 #include "walt.h"
  14
  15 int sched_rr_timeslice = RR_TIMESLICE;
  16 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
  17
  18 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
  19
  20 struct rt_bandwidth def_rt_bandwidth;
  21
  22 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
  23 {
  24         struct rt_bandwidth *rt_b =
  25                 container_of(timer, struct rt_bandwidth, rt_period_timer);
  26         int idle = 0;
  27         int overrun;
  28
  29         raw_spin_lock(&rt_b->rt_runtime_lock);
  30         for (;;) {
  31                 overrun = hrtimer_forward_now(timer, rt_b->rt_period);
  32                 if (!overrun)
  33                         break;
  34
  35                 raw_spin_unlock(&rt_b->rt_runtime_lock);
  36                 idle = do_sched_rt_period_timer(rt_b, overrun);
  37                 raw_spin_lock(&rt_b->rt_runtime_lock);
  38         }
  39         if (idle)
  40                 rt_b->rt_period_active = 0;
  41         raw_spin_unlock(&rt_b->rt_runtime_lock);
  42
  43         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  44 }
  45
  46 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
  47 {
  48         rt_b->rt_period = ns_to_ktime(period);
  49         rt_b->rt_runtime = runtime;
  50
  51         raw_spin_lock_init(&rt_b->rt_runtime_lock);
  52
  53         hrtimer_init(&rt_b->rt_period_timer,
  54                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  55         rt_b->rt_period_timer.function = sched_rt_period_timer;
  56 }
  57
  58 static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
  59 {
  60         raw_spin_lock(&rt_b->rt_runtime_lock);
  61         if (!rt_b->rt_period_active) {
  62                 rt_b->rt_period_active = 1;
  63                 /*
  64                  * SCHED_DEADLINE updates the bandwidth, as a run away
  65                  * RT task with a DL task could hog a CPU. But DL does
  66                  * not reset the period. If a deadline task was running
  67                  * without an RT task running, it can cause RT tasks to
  68                  * throttle when they start up. Kick the timer right away
  69                  * to update the period.
  70                  */
  71                 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
  72                 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
  73         }
  74         raw_spin_unlock(&rt_b->rt_runtime_lock);
  75 }
  76
  77 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  78 {
  79         if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
  80                 return;
  81
  82         do_start_rt_bandwidth(rt_b);
  83 }
  84
  85 void init_rt_rq(struct rt_rq *rt_rq)
  86 {
  87         struct rt_prio_array *array;
  88         int i;
  89
  90         array = &rt_rq->active;
  91         for (i = 0; i < MAX_RT_PRIO; i++) {
  92                 INIT_LIST_HEAD(array->queue + i);
  93                 __clear_bit(i, array->bitmap);
  94         }
  95         /* delimiter for bitsearch: */
  96         __set_bit(MAX_RT_PRIO, array->bitmap);
  97
  98 #if defined CONFIG_SMP
  99         rt_rq->highest_prio.curr = MAX_RT_PRIO;
 100         rt_rq->highest_prio.next = MAX_RT_PRIO;
 101         rt_rq->rt_nr_migratory = 0;
 102         rt_rq->overloaded = 0;
 103         plist_head_init(&rt_rq->pushable_tasks);
 104 #endif /* CONFIG_SMP */
 105         /* We start is dequeued state, because no RT tasks are queued */
 106         rt_rq->rt_queued = 0;
 107
 108         rt_rq->rt_time = 0;
 109         rt_rq->rt_throttled = 0;
 110         rt_rq->rt_runtime = 0;
 111         raw_spin_lock_init(&rt_rq->rt_runtime_lock);
 112 }
 113
 114 #ifdef CONFIG_RT_GROUP_SCHED
 115 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 116 {
 117         hrtimer_cancel(&rt_b->rt_period_timer);
 118 }
 119
 120 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
 121
 122 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 123 {
 124 #ifdef CONFIG_SCHED_DEBUG
 125         WARN_ON_ONCE(!rt_entity_is_task(rt_se));
 126 #endif
 127         return container_of(rt_se, struct task_struct, rt);
 128 }
 129
 130 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 131 {
 132         return rt_rq->rq;
 133 }
 134
 135 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 136 {
 137         return rt_se->rt_rq;
 138 }
 139
 140 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 141 {
 142         struct rt_rq *rt_rq = rt_se->rt_rq;
 143
 144         return rt_rq->rq;
 145 }
 146
 147 void free_rt_sched_group(struct task_group *tg)
 148 {
 149         int i;
 150
 151         if (tg->rt_se)
 152                 destroy_rt_bandwidth(&tg->rt_bandwidth);
 153
 154         for_each_possible_cpu(i) {
 155                 if (tg->rt_rq)
 156                         kfree(tg->rt_rq[i]);
 157                 if (tg->rt_se)
 158                         kfree(tg->rt_se[i]);
 159         }
 160
 161         kfree(tg->rt_rq);
 162         kfree(tg->rt_se);
 163 }
 164
 165 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 166                 struct sched_rt_entity *rt_se, int cpu,
 167                 struct sched_rt_entity *parent)
 168 {
 169         struct rq *rq = cpu_rq(cpu);
 170
 171         rt_rq->highest_prio.curr = MAX_RT_PRIO;
 172         rt_rq->rt_nr_boosted = 0;
 173         rt_rq->rq = rq;
 174         rt_rq->tg = tg;
 175
 176         tg->rt_rq[cpu] = rt_rq;
 177         tg->rt_se[cpu] = rt_se;
 178
 179         if (!rt_se)
 180                 return;
 181
 182         if (!parent)
 183                 rt_se->rt_rq = &rq->rt;
 184         else
 185                 rt_se->rt_rq = parent->my_q;
 186
 187         rt_se->my_q = rt_rq;
 188         rt_se->parent = parent;
 189         INIT_LIST_HEAD(&rt_se->run_list);
 190 }
 191
 192 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 193 {
 194         struct rt_rq *rt_rq;
 195         struct sched_rt_entity *rt_se;
 196         int i;
 197
 198         tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
 199         if (!tg->rt_rq)
 200                 goto err;
 201         tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
 202         if (!tg->rt_se)
 203                 goto err;
 204
 205         init_rt_bandwidth(&tg->rt_bandwidth,
 206                         ktime_to_ns(def_rt_bandwidth.rt_period), 0);
 207
 208         for_each_possible_cpu(i) {
 209                 rt_rq = kzalloc_node(sizeof(struct rt_rq),
 210                                      GFP_KERNEL, cpu_to_node(i));
 211                 if (!rt_rq)
 212                         goto err;
 213
 214                 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
 215                                      GFP_KERNEL, cpu_to_node(i));
 216                 if (!rt_se)
 217                         goto err_free_rq;
 218
 219                 init_rt_rq(rt_rq);
 220                 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 221                 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 222         }
 223
 224         return 1;
 225
 226 err_free_rq:
 227         kfree(rt_rq);
 228 err:
 229         return 0;
 230 }
 231
 232 #else /* CONFIG_RT_GROUP_SCHED */
 233
 234 #define rt_entity_is_task(rt_se) (1)
 235
 236 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 237 {
 238         return container_of(rt_se, struct task_struct, rt);
 239 }
 240
 241 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 242 {
 243         return container_of(rt_rq, struct rq, rt);
 244 }
 245
 246 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 247 {
 248         struct task_struct *p = rt_task_of(rt_se);
 249
 250         return task_rq(p);
 251 }
 252
 253 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 254 {
 255         struct rq *rq = rq_of_rt_se(rt_se);
 256
 257         return &rq->rt;
 258 }
 259
 260 void free_rt_sched_group(struct task_group *tg) { }
 261
 262 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 263 {
 264         return 1;
 265 }
 266 #endif /* CONFIG_RT_GROUP_SCHED */
 267
 268 #ifdef CONFIG_SMP
 269
 270 static void pull_rt_task(struct rq *this_rq);
 271
 272 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 273 {
 274         /* Try to pull RT tasks here if we lower this rq's prio */
 275         return rq->rt.highest_prio.curr > prev->prio;
 276 }
 277
 278 static inline int rt_overloaded(struct rq *rq)
 279 {
 280         return atomic_read(&rq->rd->rto_count);
 281 }
 282
 283 static inline void rt_set_overload(struct rq *rq)
 284 {
 285         if (!rq->online)
 286                 return;
 287
 288         cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
 289         /*
 290          * Make sure the mask is visible before we set
 291          * the overload count. That is checked to determine
 292          * if we should look at the mask. It would be a shame
 293          * if we looked at the mask, but the mask was not
 294          * updated yet.
 295          *
 296          * Matched by the barrier in pull_rt_task().
 297          */
 298         smp_wmb();
 299         atomic_inc(&rq->rd->rto_count);
 300 }
 301
 302 static inline void rt_clear_overload(struct rq *rq)
 303 {
 304         if (!rq->online)
 305                 return;
 306
 307         /* the order here really doesn't matter */
 308         atomic_dec(&rq->rd->rto_count);
 309         cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 310 }
 311
 312 static void update_rt_migration(struct rt_rq *rt_rq)
 313 {
 314         if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
 315                 if (!rt_rq->overloaded) {
 316                         rt_set_overload(rq_of_rt_rq(rt_rq));
 317                         rt_rq->overloaded = 1;
 318                 }
 319         } else if (rt_rq->overloaded) {
 320                 rt_clear_overload(rq_of_rt_rq(rt_rq));
 321                 rt_rq->overloaded = 0;
 322         }
 323 }
 324
 325 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 326 {
 327         struct task_struct *p;
 328
 329         if (!rt_entity_is_task(rt_se))
 330                 return;
 331
 332         p = rt_task_of(rt_se);
 333         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 334
 335         rt_rq->rt_nr_total++;
 336         if (p->nr_cpus_allowed > 1)
 337                 rt_rq->rt_nr_migratory++;
 338
 339         update_rt_migration(rt_rq);
 340 }
 341
 342 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 343 {
 344         struct task_struct *p;
 345
 346         if (!rt_entity_is_task(rt_se))
 347                 return;
 348
 349         p = rt_task_of(rt_se);
 350         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 351
 352         rt_rq->rt_nr_total--;
 353         if (p->nr_cpus_allowed > 1)
 354                 rt_rq->rt_nr_migratory--;
 355
 356         update_rt_migration(rt_rq);
 357 }
 358
 359 static inline int has_pushable_tasks(struct rq *rq)
 360 {
 361         return !plist_head_empty(&rq->rt.pushable_tasks);
 362 }
 363
 364 static DEFINE_PER_CPU(struct callback_head, rt_push_head);
 365 static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 366
 367 static void push_rt_tasks(struct rq *);
 368 static void pull_rt_task(struct rq *);
 369
 370 static inline void queue_push_tasks(struct rq *rq)
 371 {
 372         if (!has_pushable_tasks(rq))
 373                 return;
 374
 375         queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
 376 }
 377
 378 static inline void queue_pull_task(struct rq *rq)
 379 {
 380         queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 381 }
 382
 383 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 384 {
 385         plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 386         plist_node_init(&p->pushable_tasks, p->prio);
 387         plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 388
 389         /* Update the highest prio pushable task */
 390         if (p->prio < rq->rt.highest_prio.next)
 391                 rq->rt.highest_prio.next = p->prio;
 392 }
 393
 394 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 395 {
 396         plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 397
 398         /* Update the new highest prio pushable task */
 399         if (has_pushable_tasks(rq)) {
 400                 p = plist_first_entry(&rq->rt.pushable_tasks,
 401                                       struct task_struct, pushable_tasks);
 402                 rq->rt.highest_prio.next = p->prio;
 403         } else
 404                 rq->rt.highest_prio.next = MAX_RT_PRIO;
 405 }
 406
 407 #else
 408
 409 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 410 {
 411 }
 412
 413 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 414 {
 415 }
 416
 417 static inline
 418 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 419 {
 420 }
 421
 422 static inline
 423 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 424 {
 425 }
 426
 427 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 428 {
 429         return false;
 430 }
 431
 432 static inline void pull_rt_task(struct rq *this_rq)
 433 {
 434 }
 435
 436 static inline void queue_push_tasks(struct rq *rq)
 437 {
 438 }
 439 #endif /* CONFIG_SMP */
 440
 441 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
 442 static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
 443
 444 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 445 {
 446         return rt_se->on_rq;
 447 }
 448
 449 #ifdef CONFIG_RT_GROUP_SCHED
 450
 451 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 452 {
 453         if (!rt_rq->tg)
 454                 return RUNTIME_INF;
 455
 456         return rt_rq->rt_runtime;
 457 }
 458
 459 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 460 {
 461         return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 462 }
 463
 464 typedef struct task_group *rt_rq_iter_t;
 465
 466 static inline struct task_group *next_task_group(struct task_group *tg)
 467 {
 468         do {
 469                 tg = list_entry_rcu(tg->list.next,
 470                         typeof(struct task_group), list);
 471         } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
 472
 473         if (&tg->list == &task_groups)
 474                 tg = NULL;
 475
 476         return tg;
 477 }
 478
 479 #define for_each_rt_rq(rt_rq, iter, rq)                                 \
 480         for (iter = container_of(&task_groups, typeof(*iter), list);    \
 481                 (iter = next_task_group(iter)) &&                       \
 482                 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
 483
 484 #define for_each_sched_rt_entity(rt_se) \
 485         for (; rt_se; rt_se = rt_se->parent)
 486
 487 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 488 {
 489         return rt_se->my_q;
 490 }
 491
 492 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 493 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 494
 495 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 496 {
 497         struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 498         struct rq *rq = rq_of_rt_rq(rt_rq);
 499         struct sched_rt_entity *rt_se;
 500
 501         int cpu = cpu_of(rq);
 502
 503         rt_se = rt_rq->tg->rt_se[cpu];
 504
 505         if (rt_rq->rt_nr_running) {
 506                 if (!rt_se)
 507                         enqueue_top_rt_rq(rt_rq);
 508                 else if (!on_rt_rq(rt_se))
 509                         enqueue_rt_entity(rt_se, 0);
 510
 511                 if (rt_rq->highest_prio.curr < curr->prio)
 512                         resched_curr(rq);
 513         }
 514 }
 515
 516 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 517 {
 518         struct sched_rt_entity *rt_se;
 519         int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 520
 521         rt_se = rt_rq->tg->rt_se[cpu];
 522
 523         if (!rt_se)
 524                 dequeue_top_rt_rq(rt_rq);
 525         else if (on_rt_rq(rt_se))
 526                 dequeue_rt_entity(rt_se, 0);
 527 }
 528
 529 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 530 {
 531         return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 532 }
 533
 534 static int rt_se_boosted(struct sched_rt_entity *rt_se)
 535 {
 536         struct rt_rq *rt_rq = group_rt_rq(rt_se);
 537         struct task_struct *p;
 538
 539         if (rt_rq)
 540                 return !!rt_rq->rt_nr_boosted;
 541
 542         p = rt_task_of(rt_se);
 543         return p->prio != p->normal_prio;
 544 }
 545
 546 #ifdef CONFIG_SMP
 547 static inline const struct cpumask *sched_rt_period_mask(void)
 548 {
 549         return this_rq()->rd->span;
 550 }
 551 #else
 552 static inline const struct cpumask *sched_rt_period_mask(void)
 553 {
 554         return cpu_online_mask;
 555 }
 556 #endif
 557
 558 static inline
 559 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 560 {
 561         return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 562 }
 563
 564 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 565 {
 566         return &rt_rq->tg->rt_bandwidth;
 567 }
 568
 569 #else /* !CONFIG_RT_GROUP_SCHED */
 570
 571 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 572 {
 573         return rt_rq->rt_runtime;
 574 }
 575
 576 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 577 {
 578         return ktime_to_ns(def_rt_bandwidth.rt_period);
 579 }
 580
 581 typedef struct rt_rq *rt_rq_iter_t;
 582
 583 #define for_each_rt_rq(rt_rq, iter, rq) \
 584         for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 585
 586 #define for_each_sched_rt_entity(rt_se) \
 587         for (; rt_se; rt_se = NULL)
 588
 589 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 590 {
 591         return NULL;
 592 }
 593
 594 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 595 {
 596         struct rq *rq = rq_of_rt_rq(rt_rq);
 597
 598         if (!rt_rq->rt_nr_running)
 599                 return;
 600
 601         enqueue_top_rt_rq(rt_rq);
 602         resched_curr(rq);
 603 }
 604
 605 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 606 {
 607         dequeue_top_rt_rq(rt_rq);
 608 }
 609
 610 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 611 {
 612         return rt_rq->rt_throttled;
 613 }
 614
 615 static inline const struct cpumask *sched_rt_period_mask(void)
 616 {
 617         return cpu_online_mask;
 618 }
 619
 620 static inline
 621 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 622 {
 623         return &cpu_rq(cpu)->rt;
 624 }
 625
 626 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 627 {
 628         return &def_rt_bandwidth;
 629 }
 630
 631 #endif /* CONFIG_RT_GROUP_SCHED */
 632
 633 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
 634 {
 635         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 636
 637         return (hrtimer_active(&rt_b->rt_period_timer) ||
 638                 rt_rq->rt_time < rt_b->rt_runtime);
 639 }
 640
 641 #ifdef CONFIG_SMP
 642 /*
 643  * We ran out of runtime, see if we can borrow some from our neighbours.
 644  */
 645 static void do_balance_runtime(struct rt_rq *rt_rq)
 646 {
 647         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 648         struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
 649         int i, weight;
 650         u64 rt_period;
 651
 652         weight = cpumask_weight(rd->span);
 653
 654         raw_spin_lock(&rt_b->rt_runtime_lock);
 655         rt_period = ktime_to_ns(rt_b->rt_period);
 656         for_each_cpu(i, rd->span) {
 657                 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 658                 s64 diff;
 659
 660                 if (iter == rt_rq)
 661                         continue;
 662
 663                 raw_spin_lock(&iter->rt_runtime_lock);
 664                 /*
 665                  * Either all rqs have inf runtime and there's nothing to steal
 666                  * or __disable_runtime() below sets a specific rq to inf to
 667                  * indicate its been disabled and disalow stealing.
 668                  */
 669                 if (iter->rt_runtime == RUNTIME_INF)
 670                         goto next;
 671
 672                 /*
 673                  * From runqueues with spare time, take 1/n part of their
 674                  * spare time, but no more than our period.
 675                  */
 676                 diff = iter->rt_runtime - iter->rt_time;
 677                 if (diff > 0) {
 678                         diff = div_u64((u64)diff, weight);
 679                         if (rt_rq->rt_runtime + diff > rt_period)
 680                                 diff = rt_period - rt_rq->rt_runtime;
 681                         iter->rt_runtime -= diff;
 682                         rt_rq->rt_runtime += diff;
 683                         if (rt_rq->rt_runtime == rt_period) {
 684                                 raw_spin_unlock(&iter->rt_runtime_lock);
 685                                 break;
 686                         }
 687                 }
 688 next:
 689                 raw_spin_unlock(&iter->rt_runtime_lock);
 690         }
 691         raw_spin_unlock(&rt_b->rt_runtime_lock);
 692 }
 693
 694 /*
 695  * Ensure this RQ takes back all the runtime it lend to its neighbours.
 696  */
 697 static void __disable_runtime(struct rq *rq)
 698 {
 699         struct root_domain *rd = rq->rd;
 700         rt_rq_iter_t iter;
 701         struct rt_rq *rt_rq;
 702
 703         if (unlikely(!scheduler_running))
 704                 return;
 705
 706         for_each_rt_rq(rt_rq, iter, rq) {
 707                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 708                 s64 want;
 709                 int i;
 710
 711                 raw_spin_lock(&rt_b->rt_runtime_lock);
 712                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 713                 /*
 714                  * Either we're all inf and nobody needs to borrow, or we're
 715                  * already disabled and thus have nothing to do, or we have
 716                  * exactly the right amount of runtime to take out.
 717                  */
 718                 if (rt_rq->rt_runtime == RUNTIME_INF ||
 719                                 rt_rq->rt_runtime == rt_b->rt_runtime)
 720                         goto balanced;
 721                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 722
 723                 /*
 724                  * Calculate the difference between what we started out with
 725                  * and what we current have, that's the amount of runtime
 726                  * we lend and now have to reclaim.
 727                  */
 728                 want = rt_b->rt_runtime - rt_rq->rt_runtime;
 729
 730                 /*
 731                  * Greedy reclaim, take back as much as we can.
 732                  */
 733                 for_each_cpu(i, rd->span) {
 734                         struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 735                         s64 diff;
 736
 737                         /*
 738                          * Can't reclaim from ourselves or disabled runqueues.
 739                          */
 740                         if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 741                                 continue;
 742
 743                         raw_spin_lock(&iter->rt_runtime_lock);
 744                         if (want > 0) {
 745                                 diff = min_t(s64, iter->rt_runtime, want);
 746                                 iter->rt_runtime -= diff;
 747                                 want -= diff;
 748                         } else {
 749                                 iter->rt_runtime -= want;
 750                                 want -= want;
 751                         }
 752                         raw_spin_unlock(&iter->rt_runtime_lock);
 753
 754                         if (!want)
 755                                 break;
 756                 }
 757
 758                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 759                 /*
 760                  * We cannot be left wanting - that would mean some runtime
 761                  * leaked out of the system.
 762                  */
 763                 BUG_ON(want);
 764 balanced:
 765                 /*
 766                  * Disable all the borrow logic by pretending we have inf
 767                  * runtime - in which case borrowing doesn't make sense.
 768                  */
 769                 rt_rq->rt_runtime = RUNTIME_INF;
 770                 rt_rq->rt_throttled = 0;
 771                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 772                 raw_spin_unlock(&rt_b->rt_runtime_lock);
 773
 774                 /* Make rt_rq available for pick_next_task() */
 775                 sched_rt_rq_enqueue(rt_rq);
 776         }
 777 }
 778
 779 static void __enable_runtime(struct rq *rq)
 780 {
 781         rt_rq_iter_t iter;
 782         struct rt_rq *rt_rq;
 783
 784         if (unlikely(!scheduler_running))
 785                 return;
 786
 787         /*
 788          * Reset each runqueue's bandwidth settings
 789          */
 790         for_each_rt_rq(rt_rq, iter, rq) {
 791                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 792
 793                 raw_spin_lock(&rt_b->rt_runtime_lock);
 794                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 795                 rt_rq->rt_runtime = rt_b->rt_runtime;
 796                 rt_rq->rt_time = 0;
 797                 rt_rq->rt_throttled = 0;
 798                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 799                 raw_spin_unlock(&rt_b->rt_runtime_lock);
 800         }
 801 }
 802
 803 static void balance_runtime(struct rt_rq *rt_rq)
 804 {
 805         if (!sched_feat(RT_RUNTIME_SHARE))
 806                 return;
 807
 808         if (rt_rq->rt_time > rt_rq->rt_runtime) {
 809                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 810                 do_balance_runtime(rt_rq);
 811                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 812         }
 813 }
 814 #else /* !CONFIG_SMP */
 815 static inline void balance_runtime(struct rt_rq *rt_rq) {}
 816 #endif /* CONFIG_SMP */
 817
 818 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 819 {
 820         int i, idle = 1, throttled = 0;
 821         const struct cpumask *span;
 822
 823         span = sched_rt_period_mask();
 824 #ifdef CONFIG_RT_GROUP_SCHED
 825         /*
 826          * FIXME: isolated CPUs should really leave the root task group,
 827          * whether they are isolcpus or were isolated via cpusets, lest
 828          * the timer run on a CPU which does not service all runqueues,
 829          * potentially leaving other CPUs indefinitely throttled.  If
 830          * isolation is really required, the user will turn the throttle
 831          * off to kill the perturbations it causes anyway.  Meanwhile,
 832          * this maintains functionality for boot and/or troubleshooting.
 833          */
 834         if (rt_b == &root_task_group.rt_bandwidth)
 835                 span = cpu_online_mask;
 836 #endif
 837         for_each_cpu(i, span) {
 838                 int enqueue = 0;
 839                 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 840                 struct rq *rq = rq_of_rt_rq(rt_rq);
 841                 int skip;
 842
 843                 /*
 844                  * When span == cpu_online_mask, taking each rq->lock
 845                  * can be time-consuming. Try to avoid it when possible.
 846                  */
 847                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 848                 if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
 849                         rt_rq->rt_runtime = rt_b->rt_runtime;
 850                 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
 851                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 852                 if (skip)
 853                         continue;
 854
 855                 raw_spin_lock(&rq->lock);
 856                 update_rq_clock(rq);
 857
 858                 if (rt_rq->rt_time) {
 859                         u64 runtime;
 860
 861                         raw_spin_lock(&rt_rq->rt_runtime_lock);
 862                         if (rt_rq->rt_throttled)
 863                                 balance_runtime(rt_rq);
 864                         runtime = rt_rq->rt_runtime;
 865                         rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 866                         if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 867                                 rt_rq->rt_throttled = 0;
 868                                 enqueue = 1;
 869
 870                                 /*
 871                                  * When we're idle and a woken (rt) task is
 872                                  * throttled check_preempt_curr() will set
 873                                  * skip_update and the time between the wakeup
 874                                  * and this unthrottle will get accounted as
 875                                  * 'runtime'.
 876                                  */
 877                                 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
 878                                         rq_clock_skip_update(rq, false);
 879                         }
 880                         if (rt_rq->rt_time || rt_rq->rt_nr_running)
 881                                 idle = 0;
 882                         raw_spin_unlock(&rt_rq->rt_runtime_lock);
 883                 } else if (rt_rq->rt_nr_running) {
 884                         idle = 0;
 885                         if (!rt_rq_throttled(rt_rq))
 886                                 enqueue = 1;
 887                 }
 888                 if (rt_rq->rt_throttled)
 889                         throttled = 1;
 890
 891                 if (enqueue)
 892                         sched_rt_rq_enqueue(rt_rq);
 893                 raw_spin_unlock(&rq->lock);
 894         }
 895
 896         if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
 897                 return 1;
 898
 899         return idle;
 900 }
 901
 902 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 903 {
 904 #ifdef CONFIG_RT_GROUP_SCHED
 905         struct rt_rq *rt_rq = group_rt_rq(rt_se);
 906
 907         if (rt_rq)
 908                 return rt_rq->highest_prio.curr;
 909 #endif
 910
 911         return rt_task_of(rt_se)->prio;
 912 }
 913
/*
 * Decide whether @rt_rq has used more than its runtime budget and, if so,
 * throttle it.
 *
 * update_curr_rt() calls this with rt_rq->rt_runtime_lock held, right after
 * adding the newly consumed time to rt_rq->rt_time.
 *
 * Returns:
 *   - the result of rt_rq_throttled() if @rt_rq was already flagged
 *     rt_throttled on entry;
 *   - 1 if this call throttled @rt_rq (sched_rt_rq_dequeue() has then been
 *     called on it);
 *   - 0 otherwise.
 */
static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
{
        u64 runtime = sched_rt_runtime(rt_rq);

        /* Already flagged: only report the effective throttled state. */
        if (rt_rq->rt_throttled)
                return rt_rq_throttled(rt_rq);

        /* No throttling when the budget is at least a full period. */
        if (runtime >= sched_rt_period(rt_rq))
                return 0;

        /*
         * The budget is re-read after balance_runtime().
         * NOTE(review): presumably balance_runtime() can change the value
         * returned by sched_rt_runtime(); it is defined outside this part
         * of the file - confirm.
         */
        balance_runtime(rt_rq);
        runtime = sched_rt_runtime(rt_rq);
        if (runtime == RUNTIME_INF)
                return 0;

        if (rt_rq->rt_time > runtime) {
                struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);

                /*
                 * Don't actually throttle groups that have no runtime assigned
                 * but accrue some time due to boosting.
                 */
                if (likely(rt_b->rt_runtime)) {
                        rt_rq->rt_throttled = 1;
                        printk_deferred_once("sched: RT throttling activated\n");
                } else {
                        /*
                         * In case we did anyway, make it go away,
                         * replenishment is a joke, since it will replenish us
                         * with exactly 0 ns.
                         */
                        rt_rq->rt_time = 0;
                }

                /*
                 * rt_rq_throttled() is consulted rather than rt_throttled
                 * directly, so the dequeue only happens when that helper
                 * reports the queue as throttled.
                 */
                if (rt_rq_throttled(rt_rq)) {
                        sched_rt_rq_dequeue(rt_rq);
                        return 1;
                }
        }

        return 0;
}
 956
/*
 * Update the current task's runtime statistics. Skip current tasks that
 * are not in our scheduling class.
 *
 * The time elapsed since curr->se.exec_start (measured with
 * rq_clock_task()) is charged to the task and, when RT bandwidth control is
 * enabled, to every rt_rq in the entity hierarchy whose runtime is not
 * RUNTIME_INF.  A hierarchy level that exceeds its budget triggers a
 * reschedule of @rq and a call to do_start_rt_bandwidth().
 */
static void update_curr_rt(struct rq *rq)
{
        struct task_struct *curr = rq->curr;
        struct sched_rt_entity *rt_se = &curr->rt;
        u64 delta_exec;

        if (curr->sched_class != &rt_sched_class)
                return;

        /* Nothing to account if the clock has not moved forward. */
        delta_exec = rq_clock_task(rq) - curr->se.exec_start;
        if (unlikely((s64)delta_exec <= 0))
                return;

        /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
        cpufreq_update_util(rq, SCHED_CPUFREQ_RT);

        schedstat_set(curr->se.statistics.exec_max,
                      max(curr->se.statistics.exec_max, delta_exec));

        curr->se.sum_exec_runtime += delta_exec;
        account_group_exec_runtime(curr, delta_exec);

        /* Start the next accounting interval from now. */
        curr->se.exec_start = rq_clock_task(rq);
        cpuacct_charge(curr, delta_exec);

        sched_rt_avg_update(rq, delta_exec);

        if (!rt_bandwidth_enabled())
                return;

        /* Charge the same delta to every level of the entity hierarchy. */
        for_each_sched_rt_entity(rt_se) {
                struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
                int exceeded;

                if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
                        raw_spin_lock(&rt_rq->rt_runtime_lock);
                        rt_rq->rt_time += delta_exec;
                        exceeded = sched_rt_runtime_exceeded(rt_rq);
                        if (exceeded)
                                resched_curr(rq);
                        raw_spin_unlock(&rt_rq->rt_runtime_lock);
                        /*
                         * Called only after rt_rq->rt_runtime_lock has been
                         * dropped.
                         * NOTE(review): presumably because the bandwidth
                         * timer path takes its own rt_runtime_lock - confirm
                         * against do_start_rt_bandwidth().
                         */
                        if (exceeded)
                                do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
                }
        }
}
1007
1008 static void
1009 dequeue_top_rt_rq(struct rt_rq *rt_rq)
1010 {
1011         struct rq *rq = rq_of_rt_rq(rt_rq);
1012
1013         BUG_ON(&rq->rt != rt_rq);
1014
1015         if (!rt_rq->rt_queued)
1016                 return;
1017
1018         BUG_ON(!rq->nr_running);
1019
1020         sub_nr_running(rq, rt_rq->rt_nr_running);
1021         rt_rq->rt_queued = 0;
1022 }
1023
1024 static void
1025 enqueue_top_rt_rq(struct rt_rq *rt_rq)
1026 {
1027         struct rq *rq = rq_of_rt_rq(rt_rq);
1028
1029         BUG_ON(&rq->rt != rt_rq);
1030
1031         if (rt_rq->rt_queued)
1032                 return;
1033         if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
1034                 return;
1035
1036         add_nr_running(rq, rt_rq->rt_nr_running);
1037         rt_rq->rt_queued = 1;
1038 }
1039
1040 #if defined CONFIG_SMP
1041
1042 static void
1043 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1044 {
1045         struct rq *rq = rq_of_rt_rq(rt_rq);
1046
1047 #ifdef CONFIG_RT_GROUP_SCHED
1048         /*
1049          * Change rq's cpupri only if rt_rq is the top queue.
1050          */
1051         if (&rq->rt != rt_rq)
1052                 return;
1053 #endif
1054         if (rq->online && prio < prev_prio)
1055                 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1056 }
1057
1058 static void
1059 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1060 {
1061         struct rq *rq = rq_of_rt_rq(rt_rq);
1062
1063 #ifdef CONFIG_RT_GROUP_SCHED
1064         /*
1065          * Change rq's cpupri only if rt_rq is the top queue.
1066          */
1067         if (&rq->rt != rt_rq)
1068                 return;
1069 #endif
1070         if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1071                 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1072 }
1073
1074 #else /* CONFIG_SMP */
1075
/* !CONFIG_SMP: empty stand-ins for the cpupri update hooks. */
static inline void
inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
}

static inline void
dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
{
}
1080
1081 #endif /* CONFIG_SMP */
1082
1083 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1084 static void
1085 inc_rt_prio(struct rt_rq *rt_rq, int prio)
1086 {
1087         int prev_prio = rt_rq->highest_prio.curr;
1088
1089         if (prio < prev_prio)
1090                 rt_rq->highest_prio.curr = prio;
1091
1092         inc_rt_prio_smp(rt_rq, prio, prev_prio);
1093 }
1094
1095 static void
1096 dec_rt_prio(struct rt_rq *rt_rq, int prio)
1097 {
1098         int prev_prio = rt_rq->highest_prio.curr;
1099
1100         if (rt_rq->rt_nr_running) {
1101
1102                 WARN_ON(prio < prev_prio);
1103
1104                 /*
1105                  * This may have been our highest task, and therefore
1106                  * we may have some recomputation to do
1107                  */
1108                 if (prio == prev_prio) {
1109                         struct rt_prio_array *array = &rt_rq->active;
1110
1111                         rt_rq->highest_prio.curr =
1112                                 sched_find_first_bit(array->bitmap);
1113                 }
1114
1115         } else
1116                 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1117
1118         dec_rt_prio_smp(rt_rq, prio, prev_prio);
1119 }
1120
1121 #else
1122
/* Neither CONFIG_SMP nor CONFIG_RT_GROUP_SCHED: priority tracking is a no-op. */
static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio)
{
}

static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio)
{
}
1125
1126 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1127
1128 #ifdef CONFIG_RT_GROUP_SCHED
1129
1130 static void
1131 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1132 {
1133         if (rt_se_boosted(rt_se))
1134                 rt_rq->rt_nr_boosted++;
1135
1136         if (rt_rq->tg)
1137                 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1138 }
1139
1140 static void
1141 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1142 {
1143         if (rt_se_boosted(rt_se))
1144                 rt_rq->rt_nr_boosted--;
1145
1146         WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1147 }
1148
1149 #else /* CONFIG_RT_GROUP_SCHED */
1150
1151 static void
1152 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1153 {
1154         start_rt_bandwidth(&def_rt_bandwidth);
1155 }
1156
/* !CONFIG_RT_GROUP_SCHED: nothing to undo when an entity leaves. */
static inline void
dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
{
}
1159
1160 #endif /* CONFIG_RT_GROUP_SCHED */
1161
1162 static inline
1163 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1164 {
1165         struct rt_rq *group_rq = group_rt_rq(rt_se);
1166
1167         if (group_rq)
1168                 return group_rq->rt_nr_running;
1169         else
1170                 return 1;
1171 }
1172
1173 static inline
1174 unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1175 {
1176         struct rt_rq *group_rq = group_rt_rq(rt_se);
1177         struct task_struct *tsk;
1178
1179         if (group_rq)
1180                 return group_rq->rr_nr_running;
1181
1182         tsk = rt_task_of(rt_se);
1183
1184         return (tsk->policy == SCHED_RR) ? 1 : 0;
1185 }
1186
1187 static inline
1188 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1189 {
1190         int prio = rt_se_prio(rt_se);
1191
1192         WARN_ON(!rt_prio(prio));
1193         rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1194         rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1195
1196         inc_rt_prio(rt_rq, prio);
1197         inc_rt_migration(rt_se, rt_rq);
1198         inc_rt_group(rt_se, rt_rq);
1199 }
1200
1201 static inline
1202 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1203 {
1204         WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1205         WARN_ON(!rt_rq->rt_nr_running);
1206         rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1207         rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1208
1209         dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1210         dec_rt_migration(rt_se, rt_rq);
1211         dec_rt_group(rt_se, rt_rq);
1212 }
1213
1214 /*
1215  * Change rt_se->run_list location unless SAVE && !MOVE
1216  *
1217  * assumes ENQUEUE/DEQUEUE flags match
1218  */
1219 static inline bool move_entity(unsigned int flags)
1220 {
1221         if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1222                 return false;
1223
1224         return true;
1225 }
1226
1227 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1228 {
1229         list_del_init(&rt_se->run_list);
1230
1231         if (list_empty(array->queue + rt_se_prio(rt_se)))
1232                 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1233
1234         rt_se->on_list = 0;
1235 }
1236
1237 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1238 {
1239         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1240         struct rt_prio_array *array = &rt_rq->active;
1241         struct rt_rq *group_rq = group_rt_rq(rt_se);
1242         struct list_head *queue = array->queue + rt_se_prio(rt_se);
1243
1244         /*
1245          * Don't enqueue the group if its throttled, or when empty.
1246          * The latter is a consequence of the former when a child group
1247          * get throttled and the current group doesn't have any other
1248          * active members.
1249          */
1250         if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1251                 if (rt_se->on_list)
1252                         __delist_rt_entity(rt_se, array);
1253                 return;
1254         }
1255
1256         if (move_entity(flags)) {
1257                 WARN_ON_ONCE(rt_se->on_list);
1258                 if (flags & ENQUEUE_HEAD)
1259                         list_add(&rt_se->run_list, queue);
1260                 else
1261                         list_add_tail(&rt_se->run_list, queue);
1262
1263                 __set_bit(rt_se_prio(rt_se), array->bitmap);
1264                 rt_se->on_list = 1;
1265         }
1266         rt_se->on_rq = 1;
1267
1268         inc_rt_tasks(rt_se, rt_rq);
1269 }
1270
1271 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1272 {
1273         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1274         struct rt_prio_array *array = &rt_rq->active;
1275
1276         if (move_entity(flags)) {
1277                 WARN_ON_ONCE(!rt_se->on_list);
1278                 __delist_rt_entity(rt_se, array);
1279         }
1280         rt_se->on_rq = 0;
1281
1282         dec_rt_tasks(rt_se, rt_rq);
1283 }
1284
1285 /*
1286  * Because the prio of an upper entry depends on the lower
1287  * entries, we must remove entries top - down.
1288  */
1289 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1290 {
1291         struct sched_rt_entity *back = NULL;
1292
1293         for_each_sched_rt_entity(rt_se) {
1294                 rt_se->back = back;
1295                 back = rt_se;
1296         }
1297
1298         dequeue_top_rt_rq(rt_rq_of_se(back));
1299
1300         for (rt_se = back; rt_se; rt_se = rt_se->back) {
1301                 if (on_rt_rq(rt_se))
1302                         __dequeue_rt_entity(rt_se, flags);
1303         }
1304 }
1305
1306 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1307 {
1308         struct rq *rq = rq_of_rt_se(rt_se);
1309
1310         dequeue_rt_stack(rt_se, flags);
1311         for_each_sched_rt_entity(rt_se)
1312                 __enqueue_rt_entity(rt_se, flags);
1313         enqueue_top_rt_rq(&rq->rt);
1314 }
1315
1316 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1317 {
1318         struct rq *rq = rq_of_rt_se(rt_se);
1319
1320         dequeue_rt_stack(rt_se, flags);
1321
1322         for_each_sched_rt_entity(rt_se) {
1323                 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1324
1325                 if (rt_rq && rt_rq->rt_nr_running)
1326                         __enqueue_rt_entity(rt_se, flags);
1327         }
1328         enqueue_top_rt_rq(&rq->rt);
1329 }
1330
1331 /*
1332  * Adding/removing a task to/from a priority array:
1333  */
1334 static void
1335 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1336 {
1337         struct sched_rt_entity *rt_se = &p->rt;
1338
1339         schedtune_enqueue_task(p, cpu_of(rq));
1340
1341         if (flags & ENQUEUE_WAKEUP)
1342                 rt_se->timeout = 0;
1343
1344         enqueue_rt_entity(rt_se, flags);
1345         walt_inc_cumulative_runnable_avg(rq, p);
1346
1347         if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1348                 enqueue_pushable_task(rq, p);
1349 }
1350
1351 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1352 {
1353         struct sched_rt_entity *rt_se = &p->rt;
1354
1355         schedtune_dequeue_task(p, cpu_of(rq));
1356
1357         update_curr_rt(rq);
1358         dequeue_rt_entity(rt_se, flags);
1359         walt_dec_cumulative_runnable_avg(rq, p);
1360
1361         dequeue_pushable_task(rq, p);
1362 }
1363
1364 /*
1365  * Put task to the head or the end of the run list without the overhead of
1366  * dequeue followed by enqueue.
1367  */
1368 static void
1369 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1370 {
1371         if (on_rt_rq(rt_se)) {
1372                 struct rt_prio_array *array = &rt_rq->active;
1373                 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1374
1375                 if (head)
1376                         list_move(&rt_se->run_list, queue);
1377                 else
1378                         list_move_tail(&rt_se->run_list, queue);
1379         }
1380 }
1381
1382 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1383 {
1384         struct sched_rt_entity *rt_se = &p->rt;
1385         struct rt_rq *rt_rq;
1386
1387         for_each_sched_rt_entity(rt_se) {
1388                 rt_rq = rt_rq_of_se(rt_se);
1389                 requeue_rt_entity(rt_rq, rt_se, head);
1390         }
1391 }
1392
1393 static void yield_task_rt(struct rq *rq)
1394 {
1395         requeue_task_rt(rq, rq->curr, 0);
1396 }
1397
1398 #ifdef CONFIG_SMP
1399 static int find_lowest_rq(struct task_struct *task);
1400
/*
 * Choose the CPU on which RT task @p should be queued.
 *
 * @p:       task being placed
 * @cpu:     candidate CPU; returned unchanged unless a better one is found
 * @sd_flag: only SD_BALANCE_WAKE and SD_BALANCE_FORK trigger a search
 * @flags, @sibling_count_hint: not used by this implementation
 *
 * Returns @cpu, or the CPU picked by find_lowest_rq() when the task
 * currently running on @cpu is an RT task that either cannot migrate or has
 * a priority value <= @p's, and the picked CPU's top RT priority value is
 * greater than @p's.
 */
static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
                  int sibling_count_hint)
{
        struct task_struct *curr;
        struct rq *rq;

        /* For anything but wake ups, just return the task_cpu */
        if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
                goto out;

        rq = cpu_rq(cpu);

        /* The RCU read section covers the lockless use of rq->curr below. */
        rcu_read_lock();
        curr = READ_ONCE(rq->curr); /* unlocked access */

        /*
         * If the current task on @p's runqueue is an RT task, then
         * try to see if we can wake this RT task up on another
         * runqueue. Otherwise simply start this RT task
         * on its current runqueue.
         *
         * We want to avoid overloading runqueues. If the woken
         * task is a higher priority, then it will stay on this CPU
         * and the lower prio task should be moved to another CPU.
         * Even though this will probably make the lower prio task
         * lose its cache, we do not want to bounce a higher task
         * around just because it gave up its CPU, perhaps for a
         * lock?
         *
         * For equal prio tasks, we just let the scheduler sort it out.
         *
         * Otherwise, just let it ride on the affined RQ and the
         * post-schedule router will push the preempted task away
         *
         * This test is optimistic, if we get it wrong the load-balancer
         * will have to sort it out.
         */
        if (curr && unlikely(rt_task(curr)) &&
            (curr->nr_cpus_allowed < 2 ||
             curr->prio <= p->prio)) {
                int target = find_lowest_rq(p);

                /*
                 * Don't bother moving it if the destination CPU is
                 * not running a lower priority task.
                 */
                if (target != -1 &&
                    p->prio < cpu_rq(target)->rt.highest_prio.curr)
                        cpu = target;
        }
        rcu_read_unlock();

out:
        return cpu;
}
1457
1458 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1459 {
1460         /*
1461          * Current can't be migrated, useless to reschedule,
1462          * let's hope p can move out.
1463          */
1464         if (rq->curr->nr_cpus_allowed == 1 ||
1465             !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1466                 return;
1467
1468         /*
1469          * p is migratable, so let's not schedule it and
1470          * see if it is pushed or pulled somewhere else.
1471          */
1472         if (p->nr_cpus_allowed != 1
1473             && cpupri_find(&rq->rd->cpupri, p, NULL))
1474                 return;
1475
1476         /*
1477          * There appears to be other cpus that can accept
1478          * current and none to run 'p', so lets reschedule
1479          * to try and push current away:
1480          */
1481         requeue_task_rt(rq, p, 1);
1482         resched_curr(rq);
1483 }
1484
1485 #endif /* CONFIG_SMP */
1486
1487 /*
1488  * Preempt the current task with a newly woken task if needed:
1489  */
1490 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1491 {
1492         if (p->prio < rq->curr->prio) {
1493                 resched_curr(rq);
1494                 return;
1495         }
1496
1497 #ifdef CONFIG_SMP
1498         /*
1499          * If:
1500          *
1501          * - the newly woken task is of equal priority to the current task
1502          * - the newly woken task is non-migratable while current is migratable
1503          * - current will be preempted on the next reschedule
1504          *
1505          * we should check to see if current can readily move to a different
1506          * cpu.  If so, we will reschedule to allow the push logic to try
1507          * to move current somewhere else, making room for our non-migratable
1508          * task.
1509          */
1510         if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1511                 check_preempt_equal_prio(rq, p);
1512 #endif
1513 }
1514
1515 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1516                                                    struct rt_rq *rt_rq)
1517 {
1518         struct rt_prio_array *array = &rt_rq->active;
1519         struct sched_rt_entity *next = NULL;
1520         struct list_head *queue;
1521         int idx;
1522
1523         idx = sched_find_first_bit(array->bitmap);
1524         BUG_ON(idx >= MAX_RT_PRIO);
1525
1526         queue = array->queue + idx;
1527         next = list_entry(queue->next, struct sched_rt_entity, run_list);
1528
1529         return next;
1530 }
1531
1532 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1533 {
1534         struct sched_rt_entity *rt_se;
1535         struct task_struct *p;
1536         struct rt_rq *rt_rq  = &rq->rt;
1537
1538         do {
1539                 rt_se = pick_next_rt_entity(rq, rt_rq);
1540                 BUG_ON(!rt_se);
1541                 rt_rq = group_rt_rq(rt_se);
1542         } while (rt_rq);
1543
1544         p = rt_task_of(rt_se);
1545         p->se.exec_start = rq_clock_task(rq);
1546
1547         return p;
1548 }
1549
1550 extern int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running);
1551
1552 static struct task_struct *
1553 pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1554 {
1555         struct task_struct *p;
1556         struct rt_rq *rt_rq = &rq->rt;
1557
1558         if (need_pull_rt_task(rq, prev)) {
1559                 /*
1560                  * This is OK, because current is on_cpu, which avoids it being
1561                  * picked for load-balance and preemption/IRQs are still
1562                  * disabled avoiding further scheduler activity on it and we're
1563                  * being very careful to re-start the picking loop.
1564                  */
1565                 rq_unpin_lock(rq, rf);
1566                 pull_rt_task(rq);
1567                 rq_repin_lock(rq, rf);
1568                 /*
1569                  * pull_rt_task() can drop (and re-acquire) rq->lock; this
1570                  * means a dl or stop task can slip in, in which case we need
1571                  * to re-start task selection.
1572                  */
1573                 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1574                              rq->dl.dl_nr_running))
1575                         return RETRY_TASK;
1576         }
1577
1578         /*
1579          * We may dequeue prev's rt_rq in put_prev_task().
1580          * So, we update time before rt_nr_running check.
1581          */
1582         if (prev->sched_class == &rt_sched_class)
1583                 update_curr_rt(rq);
1584
1585         if (!rt_rq->rt_queued)
1586                 return NULL;
1587
1588         put_prev_task(rq, prev);
1589
1590         p = _pick_next_task_rt(rq);
1591
1592         /* The running task is never eligible for pushing */
1593         dequeue_pushable_task(rq, p);
1594
1595         queue_push_tasks(rq);
1596
1597         if (p)
1598                 update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), rt_rq,
1599                                         rq->curr->sched_class == &rt_sched_class);
1600
1601         return p;
1602 }
1603
1604 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1605 {
1606         update_curr_rt(rq);
1607
1608         update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
1609
1610         /*
1611          * The previous task needs to be made eligible for pushing
1612          * if it is still active
1613          */
1614         if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1615                 enqueue_pushable_task(rq, p);
1616 }
1617
1618 #ifdef CONFIG_SMP
1619
1620 /* Only try algorithms three times */
1621 #define RT_MAX_TRIES 3
1622
1623 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
1624 {
1625         if (!task_running(rq, p) &&
1626             cpumask_test_cpu(cpu, &p->cpus_allowed))
1627                 return 1;
1628         return 0;
1629 }
1630
1631 /*
1632  * Return the highest pushable rq's task, which is suitable to be executed
1633  * on the cpu, NULL otherwise
1634  */
1635 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
1636 {
1637         struct plist_head *head = &rq->rt.pushable_tasks;
1638         struct task_struct *p;
1639
1640         if (!has_pushable_tasks(rq))
1641                 return NULL;
1642
1643         plist_for_each_entry(p, head, pushable_tasks) {
1644                 if (pick_rt_task(rq, p, cpu))
1645                         return p;
1646         }
1647
1648         return NULL;
1649 }
1650
1651 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
1652
/*
 * Find a CPU that is a good target for RT task @task.
 *
 * cpupri_find() fills this CPU's local_cpu_mask with candidates; the result
 * is then chosen from that mask in this order of preference:
 *   1. the CPU @task last ran on (task_cpu());
 *   2. walking the sched domains of that CPU that have SD_WAKE_AFFINE set:
 *      the calling CPU if it is a candidate inside the domain, else the
 *      first candidate inside the domain;
 *   3. the calling CPU if it is a candidate;
 *   4. any candidate.
 *
 * Returns the chosen CPU number, or -1 when the per-CPU mask is not set up,
 * @task may run on only one CPU, or no candidate exists.
 *
 * NOTE(review): uses smp_processor_id() and a per-CPU mask, so callers
 * presumably run with preemption disabled - confirm.
 */
static int find_lowest_rq(struct task_struct *task)
{
        struct sched_domain *sd;
        struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
        int this_cpu = smp_processor_id();
        int cpu      = task_cpu(task);

        /* Make sure the mask is initialized first */
        if (unlikely(!lowest_mask))
                return -1;

        if (task->nr_cpus_allowed == 1)
                return -1; /* No other targets possible */

        if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
                return -1; /* No targets found */

        /*
         * At this point we have built a mask of cpus representing the
         * lowest priority tasks in the system.  Now we want to elect
         * the best one based on our affinity and topology.
         *
         * We prioritize the last cpu that the task executed on since
         * it is most likely cache-hot in that location.
         */
        if (cpumask_test_cpu(cpu, lowest_mask))
                return cpu;

        /*
         * Otherwise, we consult the sched_domains span maps to figure
         * out which cpu is logically closest to our hot cache data.
         */
        if (!cpumask_test_cpu(this_cpu, lowest_mask))
                this_cpu = -1; /* Skip this_cpu opt if not among lowest */

        /* Every return inside the domain walk drops the RCU read lock first. */
        rcu_read_lock();
        for_each_domain(cpu, sd) {
                if (sd->flags & SD_WAKE_AFFINE) {
                        int best_cpu;

                        /*
                         * "this_cpu" is cheaper to preempt than a
                         * remote processor.
                         */
                        if (this_cpu != -1 &&
                            cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
                                rcu_read_unlock();
                                return this_cpu;
                        }

                        best_cpu = cpumask_first_and(lowest_mask,
                                                     sched_domain_span(sd));
                        if (best_cpu < nr_cpu_ids) {
                                rcu_read_unlock();
                                return best_cpu;
                        }
                }
        }
        rcu_read_unlock();

        /*
         * And finally, if there were no matches within the domains
         * just give the caller *something* to work with from the compatible
         * locations.
         */
        if (this_cpu != -1)
                return this_cpu;

        cpu = cpumask_any(lowest_mask);
        if (cpu < nr_cpu_ids)
                return cpu;
        return -1;
}
1726
/*
 * Find a runqueue that @task can be pushed to and return it locked.
 *
 * @task: the task to be pushed
 * @rq:   the runqueue @task is currently on
 *
 * Makes up to RT_MAX_TRIES attempts.  Returns the target runqueue, locked
 * through double_lock_balance() in addition to @rq, or NULL if no suitable
 * runqueue was found.  On a NULL return no extra lock is held, but
 * double_lock_balance() may have dropped and re-taken @rq's lock in the
 * meantime, so the caller must revalidate its state.
 */
/* Will lock the rq it finds */
static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
{
        struct rq *lowest_rq = NULL;
        int tries;
        int cpu;

        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
                cpu = find_lowest_rq(task);

                /* No candidate, or the candidate is the CPU we are already on. */
                if ((cpu == -1) || (cpu == rq->cpu))
                        break;

                lowest_rq = cpu_rq(cpu);

                /* Checked without lowest_rq's lock; re-checked below once locked. */
                if (lowest_rq->rt.highest_prio.curr <= task->prio) {
                        /*
                         * Target rq has tasks of equal or higher priority,
                         * retrying does not release any lock and is unlikely
                         * to yield a different result.
                         */
                        lowest_rq = NULL;
                        break;
                }

                /* if the prio of this runqueue changed, try again */
                if (double_lock_balance(rq, lowest_rq)) {
                        /*
                         * We had to unlock the run queue. In
                         * the mean time, task could have
                         * migrated already or had its affinity changed.
                         * Also make sure that it wasn't scheduled on its rq.
                         */
                        if (unlikely(task_rq(task) != rq ||
                                     !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
                                     task_running(rq, task) ||
                                     !rt_task(task) ||
                                     !task_on_rq_queued(task))) {

                                double_unlock_balance(rq, lowest_rq);
                                lowest_rq = NULL;
                                break;
                        }
                }

                /* If this rq is still suitable use it. */
                if (lowest_rq->rt.highest_prio.curr > task->prio)
                        break;

                /* try again */
                double_unlock_balance(rq, lowest_rq);
                lowest_rq = NULL;
        }

        return lowest_rq;
}
1783
1784 static struct task_struct *pick_next_pushable_task(struct rq *rq)
1785 {
1786         struct task_struct *p;
1787
1788         if (!has_pushable_tasks(rq))
1789                 return NULL;
1790
1791         p = plist_first_entry(&rq->rt.pushable_tasks,
1792                               struct task_struct, pushable_tasks);
1793
1794         BUG_ON(rq->cpu != task_cpu(p));
1795         BUG_ON(task_current(rq, p));
1796         BUG_ON(p->nr_cpus_allowed <= 1);
1797
1798         BUG_ON(!task_on_rq_queued(p));
1799         BUG_ON(!rt_task(p));
1800
1801         return p;
1802 }
1803
/*
 * If the current CPU has more than one RT task, see if the non
 * running task can migrate over to a CPU that is running a task
 * of lesser priority.
 *
 * Returns 1 if a task was moved to another runqueue, 0 otherwise.
 *
 * A reference on the candidate task is held across
 * find_lock_lowest_rq(), which may release rq->lock, and is dropped on
 * every exit path taken after it was acquired.
 */
static int push_rt_task(struct rq *rq)
{
        struct task_struct *next_task;
        struct rq *lowest_rq;
        int ret = 0;

        if (!rq->rt.overloaded)
                return 0;

        next_task = pick_next_pushable_task(rq);
        if (!next_task)
                return 0;

retry:
        /* No reference is held on next_task at this point. */
        if (unlikely(next_task == rq->curr)) {
                WARN_ON(1);
                return 0;
        }

        /*
         * It's possible that the next_task slipped in of
         * higher priority than current. If that's the case
         * just reschedule current.
         */
        if (unlikely(next_task->prio < rq->curr->prio)) {
                resched_curr(rq);
                return 0;
        }

        /* We might release rq lock */
        get_task_struct(next_task);

        /* find_lock_lowest_rq locks the rq if found */
        lowest_rq = find_lock_lowest_rq(next_task, rq);
        if (!lowest_rq) {
                struct task_struct *task;
                /*
                 * find_lock_lowest_rq releases rq->lock
                 * so it is possible that next_task has migrated.
                 *
                 * We need to make sure that the task is still on the same
                 * run-queue and is also still the next task eligible for
                 * pushing.
                 */
                task = pick_next_pushable_task(rq);
                if (task == next_task) {
                        /*
                         * The task hasn't migrated, and is still the next
                         * eligible task, but we failed to find a run-queue
                         * to push it to.  Do not retry in this case, since
                         * other cpus will pull from us when ready.
                         */
                        goto out;
                }

                if (!task)
                        /* No more tasks, just exit */
                        goto out;

                /*
                 * Something has shifted, try again.
                 */
                /* Drop the reference on the old candidate before retrying. */
                put_task_struct(next_task);
                next_task = task;
                goto retry;
        }

        /*
         * Both rq and lowest_rq are locked here.  The task is marked
         * TASK_ON_RQ_MIGRATING while its CPU is being switched.
         */
        deactivate_task(rq, next_task, 0);
        next_task->on_rq = TASK_ON_RQ_MIGRATING;
        set_task_cpu(next_task, lowest_rq->cpu);
        next_task->on_rq = TASK_ON_RQ_QUEUED;
        activate_task(lowest_rq, next_task, 0);
        ret = 1;

        resched_curr(lowest_rq);

        double_unlock_balance(rq, lowest_rq);

out:
        put_task_struct(next_task);

        return ret;
}
1892
/* Keep pushing until push_rt_task() reports that it moved nothing. */
static void push_rt_tasks(struct rq *rq)
{
	for (;;) {
		/* push_rt_task() returns non-zero if it moved an RT task */
		if (!push_rt_task(rq))
			break;
	}
}
1899
1900 #ifdef HAVE_RT_PUSH_IPI
1901
1902 /*
1903  * When a high priority task schedules out from a CPU and a lower priority
1904  * task is scheduled in, a check is made to see if there's any RT tasks
1905  * on other CPUs that are waiting to run because a higher priority RT task
1906  * is currently running on its CPU. In this case, the CPU with multiple RT
1907  * tasks queued on it (overloaded) needs to be notified that a CPU has opened
1908  * up that may be able to run one of its non-running queued RT tasks.
1909  *
1910  * All CPUs with overloaded RT tasks need to be notified as there is currently
1911  * no way to know which of these CPUs have the highest priority task waiting
1912  * to run. Instead of trying to take a spinlock on each of these CPUs,
1913  * which has shown to cause large latency when done on machines with many
1914  * CPUs, sending an IPI to the CPUs to have them push off the overloaded
1915  * RT tasks waiting to run.
1916  *
1917  * Just sending an IPI to each of the CPUs is also an issue, as on large
1918  * count CPU machines, this can cause an IPI storm on a CPU, especially
1919  * if its the only CPU with multiple RT tasks queued, and a large number
1920  * of CPUs scheduling a lower priority task at the same time.
1921  *
1922  * Each root domain has its own irq work function that can iterate over
1923  * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
1924  * tassk must be checked if there's one or many CPUs that are lowering
1925  * their priority, there's a single irq work iterator that will try to
1926  * push off RT tasks that are waiting to run.
1927  *
1928  * When a CPU schedules a lower priority task, it will kick off the
1929  * irq work iterator that will jump to each CPU with overloaded RT tasks.
1930  * As it only takes the first CPU that schedules a lower priority task
1931  * to start the process, the rto_start variable is incremented and if
1932  * the atomic result is one, then that CPU will try to take the rto_lock.
1933  * This prevents high contention on the lock as the process handles all
1934  * CPUs scheduling lower priority tasks.
1935  *
1936  * All CPUs that are scheduling a lower priority task will increment the
1937  * rt_loop_next variable. This will make sure that the irq work iterator
1938  * checks all RT overloaded CPUs whenever a CPU schedules a new lower
1939  * priority task, even if the iterator is in the middle of a scan. Incrementing
1940  * the rt_loop_next will cause the iterator to perform another scan.
1941  *
1942  */
1943 static int rto_next_cpu(struct root_domain *rd)
1944 {
1945         int next;
1946         int cpu;
1947
1948         /*
1949          * When starting the IPI RT pushing, the rto_cpu is set to -1,
1950          * rt_next_cpu() will simply return the first CPU found in
1951          * the rto_mask.
1952          *
1953          * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
1954          * will return the next CPU found in the rto_mask.
1955          *
1956          * If there are no more CPUs left in the rto_mask, then a check is made
1957          * against rto_loop and rto_loop_next. rto_loop is only updated with
1958          * the rto_lock held, but any CPU may increment the rto_loop_next
1959          * without any locking.
1960          */
1961         for (;;) {
1962
1963                 /* When rto_cpu is -1 this acts like cpumask_first() */
1964                 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
1965
1966                 rd->rto_cpu = cpu;
1967
1968                 if (cpu < nr_cpu_ids)
1969                         return cpu;
1970
1971                 rd->rto_cpu = -1;
1972
1973                 /*
1974                  * ACQUIRE ensures we see the @rto_mask changes
1975                  * made prior to the @next value observed.
1976                  *
1977                  * Matches WMB in rt_set_overload().
1978                  */
1979                 next = atomic_read_acquire(&rd->rto_loop_next);
1980
1981                 if (rd->rto_loop == next)
1982                         break;
1983
1984                 rd->rto_loop = next;
1985         }
1986
1987         return -1;
1988 }
1989
1990 static inline bool rto_start_trylock(atomic_t *v)
1991 {
1992         return !atomic_cmpxchg_acquire(v, 0, 1);
1993 }
1994
1995 static inline void rto_start_unlock(atomic_t *v)
1996 {
1997         atomic_set_release(v, 0);
1998 }
1999
2000 static void tell_cpu_to_push(struct rq *rq)
2001 {
2002         int cpu = -1;
2003
2004         /* Keep the loop going if the IPI is currently active */
2005         atomic_inc(&rq->rd->rto_loop_next);
2006
2007         /* Only one CPU can initiate a loop at a time */
2008         if (!rto_start_trylock(&rq->rd->rto_loop_start))
2009                 return;
2010
2011         raw_spin_lock(&rq->rd->rto_lock);
2012
2013         /*
2014          * The rto_cpu is updated under the lock, if it has a valid cpu
2015          * then the IPI is still running and will continue due to the
2016          * update to loop_next, and nothing needs to be done here.
2017          * Otherwise it is finishing up and an ipi needs to be sent.
2018          */
2019         if (rq->rd->rto_cpu < 0)
2020                 cpu = rto_next_cpu(rq->rd);
2021
2022         raw_spin_unlock(&rq->rd->rto_lock);
2023
2024         rto_start_unlock(&rq->rd->rto_loop_start);
2025
2026         if (cpu >= 0) {
2027                 /* Make sure the rd does not get freed while pushing */
2028                 sched_get_rd(rq->rd);
2029                 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2030         }
2031 }
2032
2033 /* Called from hardirq context */
2034 void rto_push_irq_work_func(struct irq_work *work)
2035 {
2036         struct root_domain *rd =
2037                 container_of(work, struct root_domain, rto_push_work);
2038         struct rq *rq;
2039         int cpu;
2040
2041         rq = this_rq();
2042
2043         /*
2044          * We do not need to grab the lock to check for has_pushable_tasks.
2045          * When it gets updated, a check is made if a push is possible.
2046          */
2047         if (has_pushable_tasks(rq)) {
2048                 raw_spin_lock(&rq->lock);
2049                 push_rt_tasks(rq);
2050                 raw_spin_unlock(&rq->lock);
2051         }
2052
2053         raw_spin_lock(&rd->rto_lock);
2054
2055         /* Pass the IPI to the next rt overloaded queue */
2056         cpu = rto_next_cpu(rd);
2057
2058         raw_spin_unlock(&rd->rto_lock);
2059
2060         if (cpu < 0) {
2061                 sched_put_rd(rd);
2062                 return;
2063         }
2064
2065         /* Try the next RT overloaded CPU */
2066         irq_work_queue_on(&rd->rto_push_work, cpu);
2067 }
2068 #endif /* HAVE_RT_PUSH_IPI */
2069
2070 static void pull_rt_task(struct rq *this_rq)
2071 {
2072         int this_cpu = this_rq->cpu, cpu;
2073         bool resched = false;
2074         struct task_struct *p;
2075         struct rq *src_rq;
2076         int rt_overload_count = rt_overloaded(this_rq);
2077
2078         if (likely(!rt_overload_count))
2079                 return;
2080
2081         /*
2082          * Match the barrier from rt_set_overloaded; this guarantees that if we
2083          * see overloaded we must also see the rto_mask bit.
2084          */
2085         smp_rmb();
2086
2087         /* If we are the only overloaded CPU do nothing */
2088         if (rt_overload_count == 1 &&
2089             cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2090                 return;
2091
2092 #ifdef HAVE_RT_PUSH_IPI
2093         if (sched_feat(RT_PUSH_IPI)) {
2094                 tell_cpu_to_push(this_rq);
2095                 return;
2096         }
2097 #endif
2098
2099         for_each_cpu(cpu, this_rq->rd->rto_mask) {
2100                 if (this_cpu == cpu)
2101                         continue;
2102
2103                 src_rq = cpu_rq(cpu);
2104
2105                 /*
2106                  * Don't bother taking the src_rq->lock if the next highest
2107                  * task is known to be lower-priority than our current task.
2108                  * This may look racy, but if this value is about to go
2109                  * logically higher, the src_rq will push this task away.
2110                  * And if its going logically lower, we do not care
2111                  */
2112                 if (src_rq->rt.highest_prio.next >=
2113                     this_rq->rt.highest_prio.curr)
2114                         continue;
2115
2116                 /*
2117                  * We can potentially drop this_rq's lock in
2118                  * double_lock_balance, and another CPU could
2119                  * alter this_rq
2120                  */
2121                 double_lock_balance(this_rq, src_rq);
2122
2123                 /*
2124                  * We can pull only a task, which is pushable
2125                  * on its rq, and no others.
2126                  */
2127                 p = pick_highest_pushable_task(src_rq, this_cpu);
2128
2129                 /*
2130                  * Do we have an RT task that preempts
2131                  * the to-be-scheduled task?
2132                  */
2133                 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2134                         WARN_ON(p == src_rq->curr);
2135                         WARN_ON(!task_on_rq_queued(p));
2136
2137                         /*
2138                          * There's a chance that p is higher in priority
2139                          * than what's currently running on its cpu.
2140                          * This is just that p is wakeing up and hasn't
2141                          * had a chance to schedule. We only pull
2142                          * p if it is lower in priority than the
2143                          * current task on the run queue
2144                          */
2145                         if (p->prio < src_rq->curr->prio)
2146                                 goto skip;
2147
2148                         resched = true;
2149
2150                         deactivate_task(src_rq, p, 0);
2151                         p->on_rq = TASK_ON_RQ_MIGRATING;
2152                         set_task_cpu(p, this_cpu);
2153                         p->on_rq = TASK_ON_RQ_QUEUED;
2154                         activate_task(this_rq, p, 0);
2155                         /*
2156                          * We continue with the search, just in
2157                          * case there's an even higher prio task
2158                          * in another runqueue. (low likelihood
2159                          * but possible)
2160                          */
2161                 }
2162 skip:
2163                 double_unlock_balance(this_rq, src_rq);
2164         }
2165
2166         if (resched)
2167                 resched_curr(this_rq);
2168 }
2169
2170 /*
2171  * If we are not running and we are not going to reschedule soon, we should
2172  * try to push tasks away now
2173  */
2174 static void task_woken_rt(struct rq *rq, struct task_struct *p)
2175 {
2176         if (!task_running(rq, p) &&
2177             !test_tsk_need_resched(rq->curr) &&
2178             p->nr_cpus_allowed > 1 &&
2179             (dl_task(rq->curr) || rt_task(rq->curr)) &&
2180             (rq->curr->nr_cpus_allowed < 2 ||
2181              rq->curr->prio <= p->prio))
2182                 push_rt_tasks(rq);
2183 }
2184
2185 /* Assumes rq->lock is held */
2186 static void rq_online_rt(struct rq *rq)
2187 {
2188         if (rq->rt.overloaded)
2189                 rt_set_overload(rq);
2190
2191         __enable_runtime(rq);
2192
2193         cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2194 }
2195
2196 /* Assumes rq->lock is held */
2197 static void rq_offline_rt(struct rq *rq)
2198 {
2199         if (rq->rt.overloaded)
2200                 rt_clear_overload(rq);
2201
2202         __disable_runtime(rq);
2203
2204         cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2205 }
2206
2207 /*
2208  * When switch from the rt queue, we bring ourselves to a position
2209  * that we might want to pull RT tasks from other runqueues.
2210  */
2211 static void switched_from_rt(struct rq *rq, struct task_struct *p)
2212 {
2213         /*
2214          * If there are other RT tasks then we will reschedule
2215          * and the scheduling of the other RT tasks will handle
2216          * the balancing. But if we are the last RT task
2217          * we may need to handle the pulling of RT tasks
2218          * now.
2219          */
2220         if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2221                 return;
2222
2223         queue_pull_task(rq);
2224 }
2225
2226 void __init init_sched_rt_class(void)
2227 {
2228         unsigned int i;
2229
2230         for_each_possible_cpu(i) {
2231                 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2232                                         GFP_KERNEL, cpu_to_node(i));
2233         }
2234 }
2235 #endif /* CONFIG_SMP */
2236
2237 /*
2238  * When switching a task to RT, we may overload the runqueue
2239  * with RT tasks. In this case we try to push them off to
2240  * other runqueues.
2241  */
2242 static void switched_to_rt(struct rq *rq, struct task_struct *p)
2243 {
2244         /*
2245          * If we are already running, then there's nothing
2246          * that needs to be done. But if we are not running
2247          * we may need to preempt the current running task.
2248          * If that current running task is also an RT task
2249          * then see if we can move to another run queue.
2250          */
2251         if (task_on_rq_queued(p) && rq->curr != p) {
2252 #ifdef CONFIG_SMP
2253                 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2254                         queue_push_tasks(rq);
2255 #endif /* CONFIG_SMP */
2256                 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2257                         resched_curr(rq);
2258         }
2259 }
2260
2261 /*
2262  * Priority of the task has changed. This may cause
2263  * us to initiate a push or pull.
2264  */
2265 static void
2266 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2267 {
2268         if (!task_on_rq_queued(p))
2269                 return;
2270
2271         if (rq->curr == p) {
2272 #ifdef CONFIG_SMP
2273                 /*
2274                  * If our priority decreases while running, we
2275                  * may need to pull tasks to this runqueue.
2276                  */
2277                 if (oldprio < p->prio)
2278                         queue_pull_task(rq);
2279
2280                 /*
2281                  * If there's a higher priority task waiting to run
2282                  * then reschedule.
2283                  */
2284                 if (p->prio > rq->rt.highest_prio.curr)
2285                         resched_curr(rq);
2286 #else
2287                 /* For UP simply resched on drop of prio */
2288                 if (oldprio < p->prio)
2289                         resched_curr(rq);
2290 #endif /* CONFIG_SMP */
2291         } else {
2292                 /*
2293                  * This task is not running, but if it is
2294                  * greater than the current running task
2295                  * then reschedule.
2296                  */
2297                 if (p->prio < rq->curr->prio)
2298                         resched_curr(rq);
2299         }
2300 }
2301
#ifdef CONFIG_POSIX_TIMERS
/*
 * RLIMIT_RTTIME watchdog: count the distinct jiffies on which @p was seen
 * here and, once that count exceeds the limit converted to ticks, set the
 * task's cputime_expires.sched_exp to its current sum_exec_runtime.
 */
static void watchdog(struct rq *rq, struct task_struct *p)
{
	unsigned long soft, hard, limit_ticks;

	/* max may change after cur was read, this will be fixed next tick */
	soft = task_rlimit(p, RLIMIT_RTTIME);
	hard = task_rlimit_max(p, RLIMIT_RTTIME);

	/* No soft limit configured: nothing to enforce. */
	if (soft == RLIM_INFINITY)
		return;

	/* Bump the counter at most once per jiffy. */
	if (p->rt.watchdog_stamp != jiffies) {
		p->rt.timeout++;
		p->rt.watchdog_stamp = jiffies;
	}

	/* Smaller of the two limits, divided by USEC_PER_SEC/HZ, rounded up. */
	limit_ticks = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
	if (p->rt.timeout > limit_ticks)
		p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
}
#else
/* Without CONFIG_POSIX_TIMERS the watchdog is a no-op. */
static inline void watchdog(struct rq *rq, struct task_struct *p) { }
#endif
2327
2328 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2329 {
2330         struct sched_rt_entity *rt_se = &p->rt;
2331
2332         update_curr_rt(rq);
2333         update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
2334
2335         watchdog(rq, p);
2336
2337         /*
2338          * RR tasks need a special form of timeslice management.
2339          * FIFO tasks have no timeslices.
2340          */
2341         if (p->policy != SCHED_RR)
2342                 return;
2343
2344         if (--p->rt.time_slice)
2345                 return;
2346
2347         p->rt.time_slice = sched_rr_timeslice;
2348
2349         /*
2350          * Requeue to the end of queue if we (and all of our ancestors) are not
2351          * the only element on the queue
2352          */
2353         for_each_sched_rt_entity(rt_se) {
2354                 if (rt_se->run_list.prev != rt_se->run_list.next) {
2355                         requeue_task_rt(rq, p, 0);
2356                         resched_curr(rq);
2357                         return;
2358                 }
2359         }
2360 }
2361
2362 static void set_curr_task_rt(struct rq *rq)
2363 {
2364         struct task_struct *p = rq->curr;
2365
2366         p->se.exec_start = rq_clock_task(rq);
2367
2368         /* The running task is never eligible for pushing */
2369         dequeue_pushable_task(rq, p);
2370 }
2371
2372 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2373 {
2374         /*
2375          * Time slice is 0 for SCHED_FIFO tasks
2376          */
2377         if (task->policy == SCHED_RR)
2378                 return sched_rr_timeslice;
2379         else
2380                 return 0;
2381 }
2382
2383 const struct sched_class rt_sched_class = {
2384         .next                   = &fair_sched_class,
2385         .enqueue_task           = enqueue_task_rt,
2386         .dequeue_task           = dequeue_task_rt,
2387         .yield_task             = yield_task_rt,
2388
2389         .check_preempt_curr     = check_preempt_curr_rt,
2390
2391         .pick_next_task         = pick_next_task_rt,
2392         .put_prev_task          = put_prev_task_rt,
2393
2394 #ifdef CONFIG_SMP
2395         .select_task_rq         = select_task_rq_rt,
2396
2397         .set_cpus_allowed       = set_cpus_allowed_common,
2398         .rq_online              = rq_online_rt,
2399         .rq_offline             = rq_offline_rt,
2400         .task_woken             = task_woken_rt,
2401         .switched_from          = switched_from_rt,
2402 #endif
2403
2404         .set_curr_task          = set_curr_task_rt,
2405         .task_tick              = task_tick_rt,
2406
2407         .get_rr_interval        = get_rr_interval_rt,
2408
2409         .prio_changed           = prio_changed_rt,
2410         .switched_to            = switched_to_rt,
2411
2412         .update_curr            = update_curr_rt,
2413 };
2414
2415 #ifdef CONFIG_RT_GROUP_SCHED
2416 /*
2417  * Ensure that the real time constraints are schedulable.
2418  */
2419 static DEFINE_MUTEX(rt_constraints_mutex);
2420
2421 /* Must be called with tasklist_lock held */
2422 static inline int tg_has_rt_tasks(struct task_group *tg)
2423 {
2424         struct task_struct *g, *p;
2425
2426         /*
2427          * Autogroups do not have RT tasks; see autogroup_create().
2428          */
2429         if (task_group_is_autogroup(tg))
2430                 return 0;
2431
2432         for_each_process_thread(g, p) {
2433                 if (rt_task(p) && task_group(p) == tg)
2434                         return 1;
2435         }
2436
2437         return 0;
2438 }
2439
2440 struct rt_schedulable_data {
2441         struct task_group *tg;
2442         u64 rt_period;
2443         u64 rt_runtime;
2444 };
2445
2446 static int tg_rt_schedulable(struct task_group *tg, void *data)
2447 {
2448         struct rt_schedulable_data *d = data;
2449         struct task_group *child;
2450         unsigned long total, sum = 0;
2451         u64 period, runtime;
2452
2453         period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2454         runtime = tg->rt_bandwidth.rt_runtime;
2455
2456         if (tg == d->tg) {
2457                 period = d->rt_period;
2458                 runtime = d->rt_runtime;
2459         }
2460
2461         /*
2462          * Cannot have more runtime than the period.
2463          */
2464         if (runtime > period && runtime != RUNTIME_INF)
2465                 return -EINVAL;
2466
2467         /*
2468          * Ensure we don't starve existing RT tasks.
2469          */
2470         if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
2471                 return -EBUSY;
2472
2473         total = to_ratio(period, runtime);
2474
2475         /*
2476          * Nobody can have more than the global setting allows.
2477          */
2478         if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2479                 return -EINVAL;
2480
2481         /*
2482          * The sum of our children's runtime should not exceed our own.
2483          */
2484         list_for_each_entry_rcu(child, &tg->children, siblings) {
2485                 period = ktime_to_ns(child->rt_bandwidth.rt_period);
2486                 runtime = child->rt_bandwidth.rt_runtime;
2487
2488                 if (child == d->tg) {
2489                         period = d->rt_period;
2490                         runtime = d->rt_runtime;
2491                 }
2492
2493                 sum += to_ratio(period, runtime);
2494         }
2495
2496         if (sum > total)
2497                 return -EINVAL;
2498
2499         return 0;
2500 }
2501
2502 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2503 {
2504         int ret;
2505
2506         struct rt_schedulable_data data = {
2507                 .tg = tg,
2508                 .rt_period = period,
2509                 .rt_runtime = runtime,
2510         };
2511
2512         rcu_read_lock();
2513         ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
2514         rcu_read_unlock();
2515
2516         return ret;
2517 }
2518
2519 static int tg_set_rt_bandwidth(struct task_group *tg,
2520                 u64 rt_period, u64 rt_runtime)
2521 {
2522         int i, err = 0;
2523
2524         /*
2525          * Disallowing the root group RT runtime is BAD, it would disallow the
2526          * kernel creating (and or operating) RT threads.
2527          */
2528         if (tg == &root_task_group && rt_runtime == 0)
2529                 return -EINVAL;
2530
2531         /* No period doesn't make any sense. */
2532         if (rt_period == 0)
2533                 return -EINVAL;
2534
2535         mutex_lock(&rt_constraints_mutex);
2536         read_lock(&tasklist_lock);
2537         err = __rt_schedulable(tg, rt_period, rt_runtime);
2538         if (err)
2539                 goto unlock;
2540
2541         raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2542         tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
2543         tg->rt_bandwidth.rt_runtime = rt_runtime;
2544
2545         for_each_possible_cpu(i) {
2546                 struct rt_rq *rt_rq = tg->rt_rq[i];
2547
2548                 raw_spin_lock(&rt_rq->rt_runtime_lock);
2549                 rt_rq->rt_runtime = rt_runtime;
2550                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2551         }
2552         raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
2553 unlock:
2554         read_unlock(&tasklist_lock);
2555         mutex_unlock(&rt_constraints_mutex);
2556
2557         return err;
2558 }
2559
2560 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
2561 {
2562         u64 rt_runtime, rt_period;
2563
2564         rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2565         rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
2566         if (rt_runtime_us < 0)
2567                 rt_runtime = RUNTIME_INF;
2568         else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
2569                 return -EINVAL;
2570
2571         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2572 }
2573
2574 long sched_group_rt_runtime(struct task_group *tg)
2575 {
2576         u64 rt_runtime_us;
2577
2578         if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
2579                 return -1;
2580
2581         rt_runtime_us = tg->rt_bandwidth.rt_runtime;
2582         do_div(rt_runtime_us, NSEC_PER_USEC);
2583         return rt_runtime_us;
2584 }
2585
2586 int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
2587 {
2588         u64 rt_runtime, rt_period;
2589
2590         if (rt_period_us > U64_MAX / NSEC_PER_USEC)
2591                 return -EINVAL;
2592
2593         rt_period = rt_period_us * NSEC_PER_USEC;
2594         rt_runtime = tg->rt_bandwidth.rt_runtime;
2595
2596         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
2597 }
2598
2599 long sched_group_rt_period(struct task_group *tg)
2600 {
2601         u64 rt_period_us;
2602
2603         rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
2604         do_div(rt_period_us, NSEC_PER_USEC);
2605         return rt_period_us;
2606 }
2607
2608 static int sched_rt_global_constraints(void)
2609 {
2610         int ret = 0;
2611
2612         mutex_lock(&rt_constraints_mutex);
2613         read_lock(&tasklist_lock);
2614         ret = __rt_schedulable(NULL, 0, 0);
2615         read_unlock(&tasklist_lock);
2616         mutex_unlock(&rt_constraints_mutex);
2617
2618         return ret;
2619 }
2620
2621 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
2622 {
2623         /* Don't accept realtime tasks when there is no way for them to run */
2624         if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
2625                 return 0;
2626
2627         return 1;
2628 }
2629
2630 #else /* !CONFIG_RT_GROUP_SCHED */
2631 static int sched_rt_global_constraints(void)
2632 {
2633         unsigned long flags;
2634         int i;
2635
2636         raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2637         for_each_possible_cpu(i) {
2638                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
2639
2640                 raw_spin_lock(&rt_rq->rt_runtime_lock);
2641                 rt_rq->rt_runtime = global_rt_runtime();
2642                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
2643         }
2644         raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2645
2646         return 0;
2647 }
2648 #endif /* CONFIG_RT_GROUP_SCHED */
2649
2650 static int sched_rt_global_validate(void)
2651 {
2652         if (sysctl_sched_rt_period <= 0)
2653                 return -EINVAL;
2654
2655         if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
2656                 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
2657                 return -EINVAL;
2658
2659         return 0;
2660 }
2661
2662 static void sched_rt_do_global(void)
2663 {
2664         unsigned long flags;
2665
2666         raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
2667         def_rt_bandwidth.rt_runtime = global_rt_runtime();
2668         def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
2669         raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
2670 }
2671
2672 int sched_rt_handler(struct ctl_table *table, int write,
2673                 void __user *buffer, size_t *lenp,
2674                 loff_t *ppos)
2675 {
2676         int old_period, old_runtime;
2677         static DEFINE_MUTEX(mutex);
2678         int ret;
2679
2680         mutex_lock(&mutex);
2681         old_period = sysctl_sched_rt_period;
2682         old_runtime = sysctl_sched_rt_runtime;
2683
2684         ret = proc_dointvec(table, write, buffer, lenp, ppos);
2685
2686         if (!ret && write) {
2687                 ret = sched_rt_global_validate();
2688                 if (ret)
2689                         goto undo;
2690
2691                 ret = sched_dl_global_validate();
2692                 if (ret)
2693                         goto undo;
2694
2695                 ret = sched_rt_global_constraints();
2696                 if (ret)
2697                         goto undo;
2698
2699                 sched_rt_do_global();
2700                 sched_dl_do_global();
2701         }
2702         if (0) {
2703 undo:
2704                 sysctl_sched_rt_period = old_period;
2705                 sysctl_sched_rt_runtime = old_runtime;
2706         }
2707         mutex_unlock(&mutex);
2708
2709         return ret;
2710 }
2711
2712 int sched_rr_handler(struct ctl_table *table, int write,
2713                 void __user *buffer, size_t *lenp,
2714                 loff_t *ppos)
2715 {
2716         int ret;
2717         static DEFINE_MUTEX(mutex);
2718
2719         mutex_lock(&mutex);
2720         ret = proc_dointvec(table, write, buffer, lenp, ppos);
2721         /*
2722          * Make sure that internally we keep jiffies.
2723          * Also, writing zero resets the timeslice to default:
2724          */
2725         if (!ret && write) {
2726                 sched_rr_timeslice =
2727                         sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
2728                         msecs_to_jiffies(sysctl_sched_rr_timeslice);
2729         }
2730         mutex_unlock(&mutex);
2731         return ret;
2732 }
2733
2734 #ifdef CONFIG_SCHED_DEBUG
2735 void print_rt_stats(struct seq_file *m, int cpu)
2736 {
2737         rt_rq_iter_t iter;
2738         struct rt_rq *rt_rq;
2739
2740         rcu_read_lock();
2741         for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
2742                 print_rt_rq(m, cpu, rt_rq);
2743         rcu_read_unlock();
2744 }
2745 #endif /* CONFIG_SCHED_DEBUG */