[COMMON] sched/rt: hooking cpufreq callback
kernel/sched/rt.c (LineageOS/android_kernel_motorola_exynos9610)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
4 * policies)
5 */
6
7 #include "sched.h"
8
9 #include <linux/slab.h>
10 #include <linux/irq_work.h>
11 #include "tune.h"
12
13 #include "walt.h"
14
15 int sched_rr_timeslice = RR_TIMESLICE;
16 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
17
18
19 void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se);
20
21 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
22
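/*
 * Default RT bandwidth control: the period/runtime pair is initialized in
 * sched_init() from the sched_rt_period_us / sched_rt_runtime_us sysctls
 * and backs sched_rt_bandwidth() when RT group scheduling is disabled.
 */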
23 struct rt_bandwidth def_rt_bandwidth;
24
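/*
 * Periodic replenishment timer. hrtimer_forward_now() returns how many whole
 * periods have elapsed (the 'overrun'); do_sched_rt_period_timer() then
 * refunds that much runtime to the rt_rqs this bandwidth serves. The timer
 * stops itself once every served rt_rq has gone idle.
 */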
25 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
26 {
27 struct rt_bandwidth *rt_b =
28 container_of(timer, struct rt_bandwidth, rt_period_timer);
29 int idle = 0;
30 int overrun;
31
32 raw_spin_lock(&rt_b->rt_runtime_lock);
33 for (;;) {
34 overrun = hrtimer_forward_now(timer, rt_b->rt_period);
35 if (!overrun)
36 break;
37
38 raw_spin_unlock(&rt_b->rt_runtime_lock);
39 idle = do_sched_rt_period_timer(rt_b, overrun);
40 raw_spin_lock(&rt_b->rt_runtime_lock);
41 }
42 if (idle)
43 rt_b->rt_period_active = 0;
44 raw_spin_unlock(&rt_b->rt_runtime_lock);
45
46 return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
47 }
48
49 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
50 {
51 rt_b->rt_period = ns_to_ktime(period);
52 rt_b->rt_runtime = runtime;
53
54 raw_spin_lock_init(&rt_b->rt_runtime_lock);
55
56 hrtimer_init(&rt_b->rt_period_timer,
57 CLOCK_MONOTONIC, HRTIMER_MODE_REL);
58 rt_b->rt_period_timer.function = sched_rt_period_timer;
59 }
60
61 static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
62 {
63 raw_spin_lock(&rt_b->rt_runtime_lock);
64 if (!rt_b->rt_period_active) {
65 rt_b->rt_period_active = 1;
66 /*
67 * SCHED_DEADLINE updates the bandwidth, as a runaway
68 * RT task with a DL task could hog a CPU. But DL does
69 * not reset the period. If a deadline task was running
70 * without an RT task running, it can cause RT tasks to
71 * throttle when they start up. Kick the timer right away
72 * to update the period.
73 */
74 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
75 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
76 }
77 raw_spin_unlock(&rt_b->rt_runtime_lock);
78 }
79
80 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
81 {
82 if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
83 return;
84
85 do_start_rt_bandwidth(rt_b);
86 }
87
88 void init_rt_rq(struct rt_rq *rt_rq)
89 {
90 struct rt_prio_array *array;
91 int i;
92
93 array = &rt_rq->active;
94 for (i = 0; i < MAX_RT_PRIO; i++) {
95 INIT_LIST_HEAD(array->queue + i);
96 __clear_bit(i, array->bitmap);
97 }
98 /* delimiter for bitsearch: */
99 __set_bit(MAX_RT_PRIO, array->bitmap);
100
101 #if defined CONFIG_SMP
102 rt_rq->highest_prio.curr = MAX_RT_PRIO;
103 rt_rq->highest_prio.next = MAX_RT_PRIO;
104 rt_rq->rt_nr_migratory = 0;
105 rt_rq->overloaded = 0;
106 plist_head_init(&rt_rq->pushable_tasks);
107 atomic_long_set(&rt_rq->removed_util_avg, 0);
108 atomic_long_set(&rt_rq->removed_load_avg, 0);
109 #endif /* CONFIG_SMP */
110 /* We start in dequeued state, because no RT tasks are queued */
111 rt_rq->rt_queued = 0;
112
113 rt_rq->rt_time = 0;
114 rt_rq->rt_throttled = 0;
115 rt_rq->rt_runtime = 0;
116 raw_spin_lock_init(&rt_rq->rt_runtime_lock);
117 }
118
119 #ifdef CONFIG_RT_GROUP_SCHED
120 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
121 {
122 hrtimer_cancel(&rt_b->rt_period_timer);
123 }
124
125 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
126
127 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
128 {
129 #ifdef CONFIG_SCHED_DEBUG
130 WARN_ON_ONCE(!rt_entity_is_task(rt_se));
131 #endif
132 return container_of(rt_se, struct task_struct, rt);
133 }
134
135 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
136 {
137 return rt_rq->rq;
138 }
139
140 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
141 {
142 return rt_se->rt_rq;
143 }
144
145 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
146 {
147 struct rt_rq *rt_rq = rt_se->rt_rq;
148
149 return rt_rq->rq;
150 }
151
152 void free_rt_sched_group(struct task_group *tg)
153 {
154 int i;
155
156 if (tg->rt_se)
157 destroy_rt_bandwidth(&tg->rt_bandwidth);
158
159 for_each_possible_cpu(i) {
160 if (tg->rt_rq)
161 kfree(tg->rt_rq[i]);
162 if (tg->rt_se)
163 kfree(tg->rt_se[i]);
164 }
165
166 kfree(tg->rt_rq);
167 kfree(tg->rt_se);
168 }
169
170 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
171 struct sched_rt_entity *rt_se, int cpu,
172 struct sched_rt_entity *parent)
173 {
174 struct rq *rq = cpu_rq(cpu);
175
176 rt_rq->highest_prio.curr = MAX_RT_PRIO;
177 rt_rq->rt_nr_boosted = 0;
178 rt_rq->rq = rq;
179 rt_rq->tg = tg;
180
181 tg->rt_rq[cpu] = rt_rq;
182 tg->rt_se[cpu] = rt_se;
183
184 if (!rt_se)
185 return;
186
187 if (!parent)
188 rt_se->rt_rq = &rq->rt;
189 else
190 rt_se->rt_rq = parent->my_q;
191
192 rt_se->my_q = rt_rq;
193 rt_se->parent = parent;
194 INIT_LIST_HEAD(&rt_se->run_list);
195 }
196
197 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
198 {
199 struct rt_rq *rt_rq;
200 struct sched_rt_entity *rt_se;
201 int i;
202
203 tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
204 if (!tg->rt_rq)
205 goto err;
206 tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
207 if (!tg->rt_se)
208 goto err;
209
210 init_rt_bandwidth(&tg->rt_bandwidth,
211 ktime_to_ns(def_rt_bandwidth.rt_period), 0);
212
213 for_each_possible_cpu(i) {
214 rt_rq = kzalloc_node(sizeof(struct rt_rq),
215 GFP_KERNEL, cpu_to_node(i));
216 if (!rt_rq)
217 goto err;
218
219 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
220 GFP_KERNEL, cpu_to_node(i));
221 if (!rt_se)
222 goto err_free_rq;
223
224 init_rt_rq(rt_rq);
225 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
226 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
227 init_rt_entity_runnable_average(rt_se);
228 }
229
230 return 1;
231
232 err_free_rq:
233 kfree(rt_rq);
234 err:
235 return 0;
236 }
237
238 #else /* CONFIG_RT_GROUP_SCHED */
239
240 #define rt_entity_is_task(rt_se) (1)
241
242 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
243 {
244 return container_of(rt_se, struct task_struct, rt);
245 }
246
247 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
248 {
249 return container_of(rt_rq, struct rq, rt);
250 }
251
252 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
253 {
254 struct task_struct *p = rt_task_of(rt_se);
255
256 return task_rq(p);
257 }
258
259 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
260 {
261 struct rq *rq = rq_of_rt_se(rt_se);
262
263 return &rq->rt;
264 }
265
266 void free_rt_sched_group(struct task_group *tg) { }
267
268 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
269 {
270 return 1;
271 }
272 #endif /* CONFIG_RT_GROUP_SCHED */
273
274 #ifdef CONFIG_SMP
275
276 #include "sched-pelt.h"
277
278 extern u64 decay_load(u64 val, u64 n);
279
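/*
 * Sum the three segments of a PELT window crossing, mirroring
 * __accumulate_pelt_segments() in the fair class:
 *
 *   c1 = d1 decayed over the 'periods' that have elapsed
 *   c2 = 1024 * (y + y^2 + ... + y^(periods-1))
 *      = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024
 *   c3 = d3, the contribution of the still-open current period
 */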
280 static u32 __accumulate_pelt_segments_rt(u64 periods, u32 d1, u32 d3)
281 {
282 u32 c1, c2, c3 = d3;
283
284 c1 = decay_load((u64)d1, periods);
285
286 c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
287
288 return c1 + c2 + c3;
289 }
290
291 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
292
293 static __always_inline u32
294 accumulate_sum_rt(u64 delta, int cpu, struct sched_avg *sa,
295 unsigned long weight, int running)
296 {
297 unsigned long scale_freq, scale_cpu;
298 u32 contrib = (u32)delta;
299 u64 periods;
300
301 scale_freq = arch_scale_freq_capacity(NULL, cpu);
302 scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
303
304 delta += sa->period_contrib;
305 periods = delta / 1024;
306
307 if (periods) {
308 sa->load_sum = decay_load(sa->load_sum, periods);
309 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
310
311 delta %= 1024;
312 contrib = __accumulate_pelt_segments_rt(periods,
313 1024 - sa->period_contrib, delta);
314 }
315 sa->period_contrib = delta;
316
317 contrib = cap_scale(contrib, scale_freq);
318 if (weight) {
319 sa->load_sum += weight * contrib;
320 }
321 if (running)
322 sa->util_sum += contrib * scale_cpu;
323
324 return periods;
325 }
326
327 /*
328 * We can represent the historical contribution to runnable average as the
329 * coefficients of a geometric series, exactly like fair task load.
330 * Refer to ___update_load_avg() in the fair sched class.
331 */
332 static __always_inline int
333 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
334 unsigned long weight, int running, struct rt_rq *rt_rq)
335 {
336 u64 delta;
337
338 delta = now - sa->last_update_time;
339
340 if ((s64)delta < 0) {
341 sa->last_update_time = now;
342 return 0;
343 }
344
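	/* Convert ns to ~1us units; 1024 of these make up one PELT period. */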
345 delta >>= 10;
346 if (!delta)
347 return 0;
348
349 sa->last_update_time += delta << 10;
350
351 if (!weight)
352 running = 0;
353
354 if (!accumulate_sum_rt(delta, cpu, sa, weight, running))
355 return 0;
356
357 sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
358 sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
359
360 return 1;
361 }
362
363 static void pull_rt_task(struct rq *this_rq);
364
365 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
366 {
367 /* Try to pull RT tasks here if we lower this rq's prio */
368 return rq->rt.highest_prio.curr > prev->prio;
369 }
370
371 static inline int rt_overloaded(struct rq *rq)
372 {
373 return atomic_read(&rq->rd->rto_count);
374 }
375
376 static inline void rt_set_overload(struct rq *rq)
377 {
378 if (!rq->online)
379 return;
380
381 cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
382 /*
383 * Make sure the mask is visible before we set
384 * the overload count. That is checked to determine
385 * if we should look at the mask. It would be a shame
386 * if we looked at the mask, but the mask was not
387 * updated yet.
388 *
389 * Matched by the barrier in pull_rt_task().
390 */
391 smp_wmb();
392 atomic_inc(&rq->rd->rto_count);
393 }
394
395 static inline void rt_clear_overload(struct rq *rq)
396 {
397 if (!rq->online)
398 return;
399
400 /* the order here really doesn't matter */
401 atomic_dec(&rq->rd->rto_count);
402 cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
403 }
404
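/*
 * An rq counts as RT-overloaded when it has more than one RT task queued and
 * at least one of them can migrate; setting rd->rto_mask lets other CPUs know
 * they may pull from it.
 */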
405 static void update_rt_migration(struct rt_rq *rt_rq)
406 {
407 if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
408 if (!rt_rq->overloaded) {
409 rt_set_overload(rq_of_rt_rq(rt_rq));
410 rt_rq->overloaded = 1;
411 }
412 } else if (rt_rq->overloaded) {
413 rt_clear_overload(rq_of_rt_rq(rt_rq));
414 rt_rq->overloaded = 0;
415 }
416 }
417
418 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
419 {
420 struct task_struct *p;
421
422 if (!rt_entity_is_task(rt_se))
423 return;
424
425 p = rt_task_of(rt_se);
426 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
427
428 rt_rq->rt_nr_total++;
429 if (p->nr_cpus_allowed > 1)
430 rt_rq->rt_nr_migratory++;
431
432 update_rt_migration(rt_rq);
433 }
434
435 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
436 {
437 struct task_struct *p;
438
439 if (!rt_entity_is_task(rt_se))
440 return;
441
442 p = rt_task_of(rt_se);
443 rt_rq = &rq_of_rt_rq(rt_rq)->rt;
444
445 rt_rq->rt_nr_total--;
446 if (p->nr_cpus_allowed > 1)
447 rt_rq->rt_nr_migratory--;
448
449 update_rt_migration(rt_rq);
450 }
451
452 static inline int has_pushable_tasks(struct rq *rq)
453 {
454 return !plist_head_empty(&rq->rt.pushable_tasks);
455 }
456
457 static DEFINE_PER_CPU(struct callback_head, rt_push_head);
458 static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
459
460 static void push_rt_tasks(struct rq *);
461 static void pull_rt_task(struct rq *);
462
463 static inline void queue_push_tasks(struct rq *rq)
464 {
465 if (!has_pushable_tasks(rq))
466 return;
467
468 queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
469 }
470
471 static inline void queue_pull_task(struct rq *rq)
472 {
473 queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
474 }
475
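/*
 * pushable_tasks is a priority-sorted plist of queued RT tasks that are not
 * currently running and are allowed to run on more than one CPU; the push
 * logic picks its migration candidates from here.
 */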
476 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
477 {
478 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
479 plist_node_init(&p->pushable_tasks, p->prio);
480 plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
481
482 /* Update the highest prio pushable task */
483 if (p->prio < rq->rt.highest_prio.next)
484 rq->rt.highest_prio.next = p->prio;
485 }
486
487 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
488 {
489 plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
490
491 /* Update the new highest prio pushable task */
492 if (has_pushable_tasks(rq)) {
493 p = plist_first_entry(&rq->rt.pushable_tasks,
494 struct task_struct, pushable_tasks);
495 rq->rt.highest_prio.next = p->prio;
496 } else
497 rq->rt.highest_prio.next = MAX_RT_PRIO;
498 }
499
500 #else
501
502 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
503 {
504 }
505
506 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
507 {
508 }
509
510 static inline
511 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
512 {
513 }
514
515 static inline
516 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
517 {
518 }
519
520 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
521 {
522 return false;
523 }
524
525 static inline void pull_rt_task(struct rq *this_rq)
526 {
527 }
528
529 static inline void queue_push_tasks(struct rq *rq)
530 {
531 }
532 #endif /* CONFIG_SMP */
533
534 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
535 static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
536
537 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
538 {
539 return rt_se->on_rq;
540 }
541
542 #ifdef CONFIG_RT_GROUP_SCHED
543
544 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
545 {
546 if (!rt_rq->tg)
547 return RUNTIME_INF;
548
549 return rt_rq->rt_runtime;
550 }
551
552 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
553 {
554 return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
555 }
556
557 typedef struct task_group *rt_rq_iter_t;
558
559 static inline struct task_group *next_task_group(struct task_group *tg)
560 {
561 do {
562 tg = list_entry_rcu(tg->list.next,
563 typeof(struct task_group), list);
564 } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
565
566 if (&tg->list == &task_groups)
567 tg = NULL;
568
569 return tg;
570 }
571
572 #define for_each_rt_rq(rt_rq, iter, rq) \
573 for (iter = container_of(&task_groups, typeof(*iter), list); \
574 (iter = next_task_group(iter)) && \
575 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
576
577 #define for_each_sched_rt_entity(rt_se) \
578 for (; rt_se; rt_se = rt_se->parent)
579
580 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
581 {
582 return rt_se->my_q;
583 }
584
585 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
586 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
587
588 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
589 {
590 struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
591 struct rq *rq = rq_of_rt_rq(rt_rq);
592 struct sched_rt_entity *rt_se;
593
594 int cpu = cpu_of(rq);
595
596 rt_se = rt_rq->tg->rt_se[cpu];
597
598 if (rt_rq->rt_nr_running) {
599 if (!rt_se)
600 enqueue_top_rt_rq(rt_rq);
601 else if (!on_rt_rq(rt_se))
602 enqueue_rt_entity(rt_se, 0);
603
604 if (rt_rq->highest_prio.curr < curr->prio)
605 resched_curr(rq);
606 }
607 }
608
609 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
610 {
611 struct sched_rt_entity *rt_se;
612 int cpu = cpu_of(rq_of_rt_rq(rt_rq));
613
614 rt_se = rt_rq->tg->rt_se[cpu];
615
616 if (!rt_se)
617 dequeue_top_rt_rq(rt_rq);
618 else if (on_rt_rq(rt_se))
619 dequeue_rt_entity(rt_se, 0);
620 }
621
622 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
623 {
624 return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
625 }
626
627 static int rt_se_boosted(struct sched_rt_entity *rt_se)
628 {
629 struct rt_rq *rt_rq = group_rt_rq(rt_se);
630 struct task_struct *p;
631
632 if (rt_rq)
633 return !!rt_rq->rt_nr_boosted;
634
635 p = rt_task_of(rt_se);
636 return p->prio != p->normal_prio;
637 }
638
639 #ifdef CONFIG_SMP
640 static inline const struct cpumask *sched_rt_period_mask(void)
641 {
642 return this_rq()->rd->span;
643 }
644 #else
645 static inline const struct cpumask *sched_rt_period_mask(void)
646 {
647 return cpu_online_mask;
648 }
649 #endif
650
651 static inline
652 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
653 {
654 return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
655 }
656
657 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
658 {
659 return &rt_rq->tg->rt_bandwidth;
660 }
661
662 #else /* !CONFIG_RT_GROUP_SCHED */
663
664 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
665 {
666 return rt_rq->rt_runtime;
667 }
668
669 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
670 {
671 return ktime_to_ns(def_rt_bandwidth.rt_period);
672 }
673
674 typedef struct rt_rq *rt_rq_iter_t;
675
676 #define for_each_rt_rq(rt_rq, iter, rq) \
677 for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
678
679 #define for_each_sched_rt_entity(rt_se) \
680 for (; rt_se; rt_se = NULL)
681
682 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
683 {
684 return NULL;
685 }
686
687 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
688 {
689 struct rq *rq = rq_of_rt_rq(rt_rq);
690
691 if (!rt_rq->rt_nr_running)
692 return;
693
694 enqueue_top_rt_rq(rt_rq);
695 resched_curr(rq);
696 }
697
698 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
699 {
700 dequeue_top_rt_rq(rt_rq);
701 }
702
703 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
704 {
705 return rt_rq->rt_throttled;
706 }
707
708 static inline const struct cpumask *sched_rt_period_mask(void)
709 {
710 return cpu_online_mask;
711 }
712
713 static inline
714 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
715 {
716 return &cpu_rq(cpu)->rt;
717 }
718
719 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
720 {
721 return &def_rt_bandwidth;
722 }
723
724 #endif /* CONFIG_RT_GROUP_SCHED */
725
726 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
727 {
728 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
729
730 return (hrtimer_active(&rt_b->rt_period_timer) ||
731 rt_rq->rt_time < rt_b->rt_runtime);
732 }
733
734 #ifdef CONFIG_SMP
735 /*
736 * We ran out of runtime, see if we can borrow some from our neighbours.
737 */
738 static void do_balance_runtime(struct rt_rq *rt_rq)
739 {
740 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
741 struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
742 int i, weight;
743 u64 rt_period;
744
745 weight = cpumask_weight(rd->span);
746
747 raw_spin_lock(&rt_b->rt_runtime_lock);
748 rt_period = ktime_to_ns(rt_b->rt_period);
749 for_each_cpu(i, rd->span) {
750 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
751 s64 diff;
752
753 if (iter == rt_rq)
754 continue;
755
756 raw_spin_lock(&iter->rt_runtime_lock);
757 /*
758 * Either all rqs have inf runtime and there's nothing to steal
759 * or __disable_runtime() below sets a specific rq to inf to
760 * indicate it has been disabled and disallow stealing.
761 */
762 if (iter->rt_runtime == RUNTIME_INF)
763 goto next;
764
765 /*
766 * From runqueues with spare time, take 1/n part of their
767 * spare time, but no more than our period.
768 */
769 diff = iter->rt_runtime - iter->rt_time;
770 if (diff > 0) {
771 diff = div_u64((u64)diff, weight);
772 if (rt_rq->rt_runtime + diff > rt_period)
773 diff = rt_period - rt_rq->rt_runtime;
774 iter->rt_runtime -= diff;
775 rt_rq->rt_runtime += diff;
776 if (rt_rq->rt_runtime == rt_period) {
777 raw_spin_unlock(&iter->rt_runtime_lock);
778 break;
779 }
780 }
781 next:
782 raw_spin_unlock(&iter->rt_runtime_lock);
783 }
784 raw_spin_unlock(&rt_b->rt_runtime_lock);
785 }
786
787 /*
788 * Ensure this RQ takes back all the runtime it lent to its neighbours.
789 */
790 static void __disable_runtime(struct rq *rq)
791 {
792 struct root_domain *rd = rq->rd;
793 rt_rq_iter_t iter;
794 struct rt_rq *rt_rq;
795
796 if (unlikely(!scheduler_running))
797 return;
798
799 for_each_rt_rq(rt_rq, iter, rq) {
800 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
801 s64 want;
802 int i;
803
804 raw_spin_lock(&rt_b->rt_runtime_lock);
805 raw_spin_lock(&rt_rq->rt_runtime_lock);
806 /*
807 * Either we're all inf and nobody needs to borrow, or we're
808 * already disabled and thus have nothing to do, or we have
809 * exactly the right amount of runtime to take out.
810 */
811 if (rt_rq->rt_runtime == RUNTIME_INF ||
812 rt_rq->rt_runtime == rt_b->rt_runtime)
813 goto balanced;
814 raw_spin_unlock(&rt_rq->rt_runtime_lock);
815
816 /*
817 * Calculate the difference between what we started out with
818 * and what we currently have; that is the amount of runtime
819 * we lent out and now have to reclaim.
820 */
821 want = rt_b->rt_runtime - rt_rq->rt_runtime;
822
823 /*
824 * Greedy reclaim, take back as much as we can.
825 */
826 for_each_cpu(i, rd->span) {
827 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
828 s64 diff;
829
830 /*
831 * Can't reclaim from ourselves or disabled runqueues.
832 */
833 if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
834 continue;
835
836 raw_spin_lock(&iter->rt_runtime_lock);
837 if (want > 0) {
838 diff = min_t(s64, iter->rt_runtime, want);
839 iter->rt_runtime -= diff;
840 want -= diff;
841 } else {
842 iter->rt_runtime -= want;
843 want -= want;
844 }
845 raw_spin_unlock(&iter->rt_runtime_lock);
846
847 if (!want)
848 break;
849 }
850
851 raw_spin_lock(&rt_rq->rt_runtime_lock);
852 /*
853 * We cannot be left wanting - that would mean some runtime
854 * leaked out of the system.
855 */
856 BUG_ON(want);
857 balanced:
858 /*
859 * Disable all the borrow logic by pretending we have inf
860 * runtime - in which case borrowing doesn't make sense.
861 */
862 rt_rq->rt_runtime = RUNTIME_INF;
863 rt_rq->rt_throttled = 0;
864 raw_spin_unlock(&rt_rq->rt_runtime_lock);
865 raw_spin_unlock(&rt_b->rt_runtime_lock);
866
867 /* Make rt_rq available for pick_next_task() */
868 sched_rt_rq_enqueue(rt_rq);
869 }
870 }
871
872 static void __enable_runtime(struct rq *rq)
873 {
874 rt_rq_iter_t iter;
875 struct rt_rq *rt_rq;
876
877 if (unlikely(!scheduler_running))
878 return;
879
880 /*
881 * Reset each runqueue's bandwidth settings
882 */
883 for_each_rt_rq(rt_rq, iter, rq) {
884 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
885
886 raw_spin_lock(&rt_b->rt_runtime_lock);
887 raw_spin_lock(&rt_rq->rt_runtime_lock);
888 rt_rq->rt_runtime = rt_b->rt_runtime;
889 rt_rq->rt_time = 0;
890 rt_rq->rt_throttled = 0;
891 raw_spin_unlock(&rt_rq->rt_runtime_lock);
892 raw_spin_unlock(&rt_b->rt_runtime_lock);
893 }
894 }
895
896 static void balance_runtime(struct rt_rq *rt_rq)
897 {
898 if (!sched_feat(RT_RUNTIME_SHARE))
899 return;
900
901 if (rt_rq->rt_time > rt_rq->rt_runtime) {
902 raw_spin_unlock(&rt_rq->rt_runtime_lock);
903 do_balance_runtime(rt_rq);
904 raw_spin_lock(&rt_rq->rt_runtime_lock);
905 }
906 }
907 #else /* !CONFIG_SMP */
908 static inline void balance_runtime(struct rt_rq *rt_rq) {}
909 #endif /* CONFIG_SMP */
910
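/*
 * Runs once per expired period from the bandwidth timer: refund runtime to
 * every rt_rq served by this rt_bandwidth, unthrottle and re-enqueue the ones
 * that are back under budget, and return nonzero when the timer no longer
 * needs to run (all served rt_rqs are idle, or RT bandwidth is effectively
 * unlimited).
 */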
911 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
912 {
913 int i, idle = 1, throttled = 0;
914 const struct cpumask *span;
915
916 span = sched_rt_period_mask();
917 #ifdef CONFIG_RT_GROUP_SCHED
918 /*
919 * FIXME: isolated CPUs should really leave the root task group,
920 * whether they are isolcpus or were isolated via cpusets, lest
921 * the timer run on a CPU which does not service all runqueues,
922 * potentially leaving other CPUs indefinitely throttled. If
923 * isolation is really required, the user will turn the throttle
924 * off to kill the perturbations it causes anyway. Meanwhile,
925 * this maintains functionality for boot and/or troubleshooting.
926 */
927 if (rt_b == &root_task_group.rt_bandwidth)
928 span = cpu_online_mask;
929 #endif
930 for_each_cpu(i, span) {
931 int enqueue = 0;
932 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
933 struct rq *rq = rq_of_rt_rq(rt_rq);
934 int skip;
935
936 /*
937 * When span == cpu_online_mask, taking each rq->lock
938 * can be time-consuming. Try to avoid it when possible.
939 */
940 raw_spin_lock(&rt_rq->rt_runtime_lock);
941 if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
942 rt_rq->rt_runtime = rt_b->rt_runtime;
943 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
944 raw_spin_unlock(&rt_rq->rt_runtime_lock);
945 if (skip)
946 continue;
947
948 raw_spin_lock(&rq->lock);
949 update_rq_clock(rq);
950
951 if (rt_rq->rt_time) {
952 u64 runtime;
953
954 raw_spin_lock(&rt_rq->rt_runtime_lock);
955 if (rt_rq->rt_throttled)
956 balance_runtime(rt_rq);
957 runtime = rt_rq->rt_runtime;
958 rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
959 if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
960 rt_rq->rt_throttled = 0;
961 enqueue = 1;
962
963 /*
964 * When we're idle and a woken (rt) task is
965 * throttled check_preempt_curr() will set
966 * skip_update and the time between the wakeup
967 * and this unthrottle will get accounted as
968 * 'runtime'.
969 */
970 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
971 rq_clock_skip_update(rq, false);
972 }
973 if (rt_rq->rt_time || rt_rq->rt_nr_running)
974 idle = 0;
975 raw_spin_unlock(&rt_rq->rt_runtime_lock);
976 } else if (rt_rq->rt_nr_running) {
977 idle = 0;
978 if (!rt_rq_throttled(rt_rq))
979 enqueue = 1;
980 }
981 if (rt_rq->rt_throttled)
982 throttled = 1;
983
984 if (enqueue)
985 sched_rt_rq_enqueue(rt_rq);
986 raw_spin_unlock(&rq->lock);
987 }
988
989 if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
990 return 1;
991
992 return idle;
993 }
994
995 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
996 {
997 #ifdef CONFIG_RT_GROUP_SCHED
998 struct rt_rq *rt_rq = group_rt_rq(rt_se);
999
1000 if (rt_rq)
1001 return rt_rq->highest_prio.curr;
1002 #endif
1003
1004 return rt_task_of(rt_se)->prio;
1005 }
1006
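/*
 * Called with rt_rq->rt_runtime_lock held after charging execution time:
 * returns nonzero if this rt_rq is throttled, i.e. it has used up its runtime
 * for the current period and must wait for the period timer to refill it.
 */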
1007 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
1008 {
1009 u64 runtime = sched_rt_runtime(rt_rq);
1010
1011 if (rt_rq->rt_throttled)
1012 return rt_rq_throttled(rt_rq);
1013
1014 if (runtime >= sched_rt_period(rt_rq))
1015 return 0;
1016
1017 balance_runtime(rt_rq);
1018 runtime = sched_rt_runtime(rt_rq);
1019 if (runtime == RUNTIME_INF)
1020 return 0;
1021
1022 if (rt_rq->rt_time > runtime) {
1023 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
1024
1025 /*
1026 * Don't actually throttle groups that have no runtime assigned
1027 * but accrue some time due to boosting.
1028 */
1029 if (likely(rt_b->rt_runtime)) {
1030 rt_rq->rt_throttled = 1;
1031 printk_deferred_once("sched: RT throttling activated\n");
1032 } else {
1033 /*
1034 * In case we did anyway, make it go away,
1035 * replenishment is a joke, since it will replenish us
1036 * with exactly 0 ns.
1037 */
1038 rt_rq->rt_time = 0;
1039 }
1040
1041 if (rt_rq_throttled(rt_rq)) {
1042 sched_rt_rq_dequeue(rt_rq);
1043 return 1;
1044 }
1045 }
1046
1047 return 0;
1048 }
1049
1050 /*
1051 * Update the current task's runtime statistics. Skip current tasks that
1052 * are not in our scheduling class.
1053 */
1054 static void update_curr_rt(struct rq *rq)
1055 {
1056 struct task_struct *curr = rq->curr;
1057 struct sched_rt_entity *rt_se = &curr->rt;
1058 u64 delta_exec;
1059
1060 if (curr->sched_class != &rt_sched_class)
1061 return;
1062
1063 delta_exec = rq_clock_task(rq) - curr->se.exec_start;
1064 if (unlikely((s64)delta_exec <= 0))
1065 return;
1066
1067 /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1068 cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
1069
1070 schedstat_set(curr->se.statistics.exec_max,
1071 max(curr->se.statistics.exec_max, delta_exec));
1072
1073 curr->se.sum_exec_runtime += delta_exec;
1074 account_group_exec_runtime(curr, delta_exec);
1075
1076 curr->se.exec_start = rq_clock_task(rq);
1077 cpuacct_charge(curr, delta_exec);
1078
1079 sched_rt_avg_update(rq, delta_exec);
1080
1081 if (!rt_bandwidth_enabled())
1082 return;
1083
1084 for_each_sched_rt_entity(rt_se) {
1085 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1086 int exceeded;
1087
1088 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1089 raw_spin_lock(&rt_rq->rt_runtime_lock);
1090 rt_rq->rt_time += delta_exec;
1091 exceeded = sched_rt_runtime_exceeded(rt_rq);
1092 if (exceeded)
1093 resched_curr(rq);
1094 raw_spin_unlock(&rt_rq->rt_runtime_lock);
1095 if (exceeded)
1096 do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
1097 }
1098 }
1099 }
1100
1101 static void
1102 dequeue_top_rt_rq(struct rt_rq *rt_rq)
1103 {
1104 struct rq *rq = rq_of_rt_rq(rt_rq);
1105
1106 BUG_ON(&rq->rt != rt_rq);
1107
1108 if (!rt_rq->rt_queued)
1109 return;
1110
1111 BUG_ON(!rq->nr_running);
1112
1113 sub_nr_running(rq, rt_rq->rt_nr_running);
1114 rt_rq->rt_queued = 0;
1115 }
1116
1117 static void
1118 enqueue_top_rt_rq(struct rt_rq *rt_rq)
1119 {
1120 struct rq *rq = rq_of_rt_rq(rt_rq);
1121
1122 BUG_ON(&rq->rt != rt_rq);
1123
1124 if (rt_rq->rt_queued)
1125 return;
1126 if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
1127 return;
1128
1129 add_nr_running(rq, rt_rq->rt_nr_running);
1130 rt_rq->rt_queued = 1;
1131 }
1132
1133 #if defined CONFIG_SMP
1134
1135 static void
1136 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1137 {
1138 struct rq *rq = rq_of_rt_rq(rt_rq);
1139
1140 #ifdef CONFIG_RT_GROUP_SCHED
1141 /*
1142 * Change rq's cpupri only if rt_rq is the top queue.
1143 */
1144 if (&rq->rt != rt_rq)
1145 return;
1146 #endif
1147 if (rq->online && prio < prev_prio)
1148 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1149 }
1150
1151 static void
1152 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1153 {
1154 struct rq *rq = rq_of_rt_rq(rt_rq);
1155
1156 #ifdef CONFIG_RT_GROUP_SCHED
1157 /*
1158 * Change rq's cpupri only if rt_rq is the top queue.
1159 */
1160 if (&rq->rt != rt_rq)
1161 return;
1162 #endif
1163 if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1164 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1165 }
1166
1167 #else /* CONFIG_SMP */
1168
1169 static inline
1170 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1171 static inline
1172 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1173
1174 #endif /* CONFIG_SMP */
1175
1176 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
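/*
 * Maintain rt_rq->highest_prio.curr as tasks come and go, and mirror it into
 * cpupri (on SMP) so push/pull decisions can be made cheaply.
 */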
1177 static void
1178 inc_rt_prio(struct rt_rq *rt_rq, int prio)
1179 {
1180 int prev_prio = rt_rq->highest_prio.curr;
1181
1182 if (prio < prev_prio)
1183 rt_rq->highest_prio.curr = prio;
1184
1185 inc_rt_prio_smp(rt_rq, prio, prev_prio);
1186 }
1187
1188 static void
1189 dec_rt_prio(struct rt_rq *rt_rq, int prio)
1190 {
1191 int prev_prio = rt_rq->highest_prio.curr;
1192
1193 if (rt_rq->rt_nr_running) {
1194
1195 WARN_ON(prio < prev_prio);
1196
1197 /*
1198 * This may have been our highest task, and therefore
1199 * we may have some recomputation to do
1200 */
1201 if (prio == prev_prio) {
1202 struct rt_prio_array *array = &rt_rq->active;
1203
1204 rt_rq->highest_prio.curr =
1205 sched_find_first_bit(array->bitmap);
1206 }
1207
1208 } else
1209 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1210
1211 dec_rt_prio_smp(rt_rq, prio, prev_prio);
1212 }
1213
1214 #else
1215
1216 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1217 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1218
1219 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1220
1221 #ifdef CONFIG_RT_GROUP_SCHED
1222
1223 static void
1224 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1225 {
1226 if (rt_se_boosted(rt_se))
1227 rt_rq->rt_nr_boosted++;
1228
1229 if (rt_rq->tg)
1230 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1231 }
1232
1233 static void
1234 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1235 {
1236 if (rt_se_boosted(rt_se))
1237 rt_rq->rt_nr_boosted--;
1238
1239 WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1240 }
1241
1242 #else /* CONFIG_RT_GROUP_SCHED */
1243
1244 static void
1245 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1246 {
1247 start_rt_bandwidth(&def_rt_bandwidth);
1248 }
1249
1250 static inline
1251 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1252
1253 #endif /* CONFIG_RT_GROUP_SCHED */
1254
1255 static inline
1256 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1257 {
1258 struct rt_rq *group_rq = group_rt_rq(rt_se);
1259
1260 if (group_rq)
1261 return group_rq->rt_nr_running;
1262 else
1263 return 1;
1264 }
1265
1266 static inline
1267 unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1268 {
1269 struct rt_rq *group_rq = group_rt_rq(rt_se);
1270 struct task_struct *tsk;
1271
1272 if (group_rq)
1273 return group_rq->rr_nr_running;
1274
1275 tsk = rt_task_of(rt_se);
1276
1277 return (tsk->policy == SCHED_RR) ? 1 : 0;
1278 }
1279
1280 static inline
1281 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1282 {
1283 int prio = rt_se_prio(rt_se);
1284
1285 WARN_ON(!rt_prio(prio));
1286 rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1287 rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1288
1289 inc_rt_prio(rt_rq, prio);
1290 inc_rt_migration(rt_se, rt_rq);
1291 inc_rt_group(rt_se, rt_rq);
1292 }
1293
1294 static inline
1295 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1296 {
1297 WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1298 WARN_ON(!rt_rq->rt_nr_running);
1299 rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1300 rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1301
1302 dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1303 dec_rt_migration(rt_se, rt_rq);
1304 dec_rt_group(rt_se, rt_rq);
1305 }
1306
1307 #ifdef CONFIG_SMP
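/*
 * Attach/detach an entity's PELT contribution to/from its rt_rq; used when a
 * task joins or leaves this rt_rq (first enqueue after fork or migration,
 * task group moves).
 */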
1308 static void
1309 attach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1310 {
1311 rt_se->avg.last_update_time = rt_rq->avg.last_update_time;
1312 rt_rq->avg.util_avg += rt_se->avg.util_avg;
1313 rt_rq->avg.util_sum += rt_se->avg.util_sum;
1314 rt_rq->avg.load_avg += rt_se->avg.load_avg;
1315 rt_rq->avg.load_sum += rt_se->avg.load_sum;
1316 #ifdef CONFIG_RT_GROUP_SCHED
1317 rt_rq->propagate_avg = 1;
1318 #endif
1319 rt_rq_util_change(rt_rq);
1320 }
1321
1322 static void
1323 detach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1324 {
1325 sub_positive(&rt_rq->avg.util_avg, rt_se->avg.util_avg);
1326 sub_positive(&rt_rq->avg.util_sum, rt_se->avg.util_sum);
1327 sub_positive(&rt_rq->avg.load_avg, rt_se->avg.load_avg);
1328 sub_positive(&rt_rq->avg.load_sum, rt_se->avg.load_sum);
1329 #ifdef CONFIG_RT_GROUP_SCHED
1330 rt_rq->propagate_avg = 1;
1331 #endif
1332 rt_rq_util_change(rt_rq);
1333 }
1334 #else
1335 static inline void
1336 attach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) {}
1337 static inline void
1338 detach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) {}
1339 #endif
1340
1341 /*
1342 * Change rt_se->run_list location unless SAVE && !MOVE
1343 *
1344 * assumes ENQUEUE/DEQUEUE flags match
1345 */
1346 static inline bool move_entity(unsigned int flags)
1347 {
1348 if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1349 return false;
1350
1351 return true;
1352 }
1353
1354 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1355 {
1356 list_del_init(&rt_se->run_list);
1357
1358 if (list_empty(array->queue + rt_se_prio(rt_se)))
1359 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1360
1361 rt_se->on_list = 0;
1362 }
1363
1364 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1365 {
1366 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1367 struct rt_prio_array *array = &rt_rq->active;
1368 struct rt_rq *group_rq = group_rt_rq(rt_se);
1369 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1370
1371 /*
1372 * Don't enqueue the group if it's throttled, or when it's empty.
1373 * The latter is a consequence of the former when a child group
1374 * gets throttled and the current group doesn't have any other
1375 * active members.
1376 */
1377 if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1378 if (rt_se->on_list)
1379 __delist_rt_entity(rt_se, array);
1380 return;
1381 }
1382
1383 if (move_entity(flags)) {
1384 WARN_ON_ONCE(rt_se->on_list);
1385 if (flags & ENQUEUE_HEAD)
1386 list_add(&rt_se->run_list, queue);
1387 else
1388 list_add_tail(&rt_se->run_list, queue);
1389
1390 __set_bit(rt_se_prio(rt_se), array->bitmap);
1391 rt_se->on_list = 1;
1392 }
1393 rt_se->on_rq = 1;
1394
1395 update_rt_load_avg(rq_clock_task(rq_of_rt_rq(rt_rq)), rt_se);
1396
1397 if (rt_entity_is_task(rt_se) && !rt_se->avg.last_update_time)
1398 attach_rt_entity_load_avg(rt_rq, rt_se);
1399
1400 inc_rt_tasks(rt_se, rt_rq);
1401 }
1402
1403 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1404 {
1405 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1406 struct rt_prio_array *array = &rt_rq->active;
1407
1408 if (move_entity(flags)) {
1409 WARN_ON_ONCE(!rt_se->on_list);
1410 __delist_rt_entity(rt_se, array);
1411 }
1412 rt_se->on_rq = 0;
1413
1414 update_rt_load_avg(rq_clock_task(rq_of_rt_rq(rt_rq)), rt_se);
1415
1416 dec_rt_tasks(rt_se, rt_rq);
1417 }
1418
1419 /*
1420 * Because the prio of an upper entry depends on the lower
1421 * entries, we must remove entries top-down.
1422 */
1423 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1424 {
1425 struct sched_rt_entity *back = NULL;
1426
1427 for_each_sched_rt_entity(rt_se) {
1428 rt_se->back = back;
1429 back = rt_se;
1430 }
1431
1432 dequeue_top_rt_rq(rt_rq_of_se(back));
1433
1434 for (rt_se = back; rt_se; rt_se = rt_se->back) {
1435 if (on_rt_rq(rt_se))
1436 __dequeue_rt_entity(rt_se, flags);
1437 }
1438 }
1439
1440 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1441 {
1442 struct rq *rq = rq_of_rt_se(rt_se);
1443
1444 dequeue_rt_stack(rt_se, flags);
1445 for_each_sched_rt_entity(rt_se)
1446 __enqueue_rt_entity(rt_se, flags);
1447 enqueue_top_rt_rq(&rq->rt);
1448 }
1449
1450 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1451 {
1452 struct rq *rq = rq_of_rt_se(rt_se);
1453
1454 dequeue_rt_stack(rt_se, flags);
1455
1456 for_each_sched_rt_entity(rt_se) {
1457 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1458
1459 if (rt_rq && rt_rq->rt_nr_running)
1460 __enqueue_rt_entity(rt_se, flags);
1461 }
1462 enqueue_top_rt_rq(&rq->rt);
1463 }
1464
1465 /*
1466 * Adding/removing a task to/from a priority array:
1467 */
1468 static void
1469 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1470 {
1471 struct sched_rt_entity *rt_se = &p->rt;
1472
1473 schedtune_enqueue_task(p, cpu_of(rq));
1474
1475 if (flags & ENQUEUE_WAKEUP)
1476 rt_se->timeout = 0;
1477
1478 enqueue_rt_entity(rt_se, flags);
1479 walt_inc_cumulative_runnable_avg(rq, p);
1480
1481 if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1482 enqueue_pushable_task(rq, p);
1483 }
1484
1485 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1486 {
1487 struct sched_rt_entity *rt_se = &p->rt;
1488
1489 schedtune_dequeue_task(p, cpu_of(rq));
1490
1491 update_curr_rt(rq);
1492 dequeue_rt_entity(rt_se, flags);
1493 walt_dec_cumulative_runnable_avg(rq, p);
1494
1495 dequeue_pushable_task(rq, p);
1496 }
1497
1498 /*
1499 * Put task to the head or the end of the run list without the overhead of
1500 * dequeue followed by enqueue.
1501 */
1502 static void
1503 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1504 {
1505 if (on_rt_rq(rt_se)) {
1506 struct rt_prio_array *array = &rt_rq->active;
1507 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1508
1509 if (head)
1510 list_move(&rt_se->run_list, queue);
1511 else
1512 list_move_tail(&rt_se->run_list, queue);
1513 }
1514 }
1515
1516 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1517 {
1518 struct sched_rt_entity *rt_se = &p->rt;
1519 struct rt_rq *rt_rq;
1520
1521 for_each_sched_rt_entity(rt_se) {
1522 rt_rq = rt_rq_of_se(rt_se);
1523 requeue_rt_entity(rt_rq, rt_se, head);
1524 }
1525 }
1526
1527 static void yield_task_rt(struct rq *rq)
1528 {
1529 requeue_task_rt(rq, rq->curr, 0);
1530 }
1531
1532 #ifdef CONFIG_SMP
1533
1534 /* TODO:
1535 * attach/detach/migrate_task_rt_rq() for load tracking
1536 */
1537
1538 static int find_lowest_rq(struct task_struct *task);
1539
1540 static int
1541 select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
1542 int sibling_count_hint)
1543 {
1544 struct task_struct *curr;
1545 struct rq *rq;
1546
1547 /* For anything but wake ups, just return the task_cpu */
1548 if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1549 goto out;
1550
1551 rq = cpu_rq(cpu);
1552
1553 rcu_read_lock();
1554 curr = READ_ONCE(rq->curr); /* unlocked access */
1555
1556 /*
1557 * If the current task on @p's runqueue is an RT task, then
1558 * try to see if we can wake this RT task up on another
1559 * runqueue. Otherwise simply start this RT task
1560 * on its current runqueue.
1561 *
1562 * We want to avoid overloading runqueues. If the woken
1563 * task is a higher priority, then it will stay on this CPU
1564 * and the lower prio task should be moved to another CPU.
1565 * Even though this will probably make the lower prio task
1566 * lose its cache, we do not want to bounce a higher priority task
1567 * around just because it gave up its CPU, perhaps for a
1568 * lock?
1569 *
1570 * For equal prio tasks, we just let the scheduler sort it out.
1571 *
1572 * Otherwise, just let it ride on the affined RQ and the
1573 * post-schedule router will push the preempted task away
1574 *
1575 * This test is optimistic, if we get it wrong the load-balancer
1576 * will have to sort it out.
1577 */
1578 if (curr && unlikely(rt_task(curr)) &&
1579 (curr->nr_cpus_allowed < 2 ||
1580 curr->prio <= p->prio)) {
1581 int target = find_lowest_rq(p);
1582
1583 /*
1584 * Don't bother moving it if the destination CPU is
1585 * not running a lower priority task.
1586 */
1587 if (target != -1 &&
1588 p->prio < cpu_rq(target)->rt.highest_prio.curr)
1589 cpu = target;
1590 }
1591 rcu_read_unlock();
1592
1593 out:
1594 return cpu;
1595 }
1596
1597 #ifdef CONFIG_RT_GROUP_SCHED
1598 /*
1599 * Called within set_task_rq() right before setting a task's cpu. The
1600 * caller only guarantees p->pi_lock is held; no other assumptions,
1601 * including the state of rq->lock, should be made.
1602 */
1603 void set_task_rq_rt(struct sched_rt_entity *rt_se,
1604 struct rt_rq *prev, struct rt_rq *next)
1605 {
1606 u64 p_last_update_time;
1607 u64 n_last_update_time;
1608
1609 if (!sched_feat(ATTACH_AGE_LOAD))
1610 return;
1611 /*
1612 * We are supposed to update the task to "current" time, so that it is up to
1613 * date and ready to go to the new CPU/rt_rq. But we have difficulty in
1614 * getting what the current time is, so simply throw away the out-of-date
1615 * time. This results in the wakee task being less decayed, but giving
1616 * the wakee more load does not sound bad.
1617 */
1618 if (!(rt_se->avg.last_update_time && prev))
1619 return;
1620 #ifndef CONFIG_64BIT
1621 {
1622 u64 p_last_update_time_copy;
1623 u64 n_last_update_time_copy;
1624
1625 do {
1626 p_last_update_time_copy = prev->load_last_update_time_copy;
1627 n_last_update_time_copy = next->load_last_update_time_copy;
1628
1629 smp_rmb();
1630
1631 p_last_update_time = prev->avg.last_update_time;
1632 n_last_update_time = next->avg.last_update_time;
1633
1634 } while (p_last_update_time != p_last_update_time_copy ||
1635 n_last_update_time != n_last_update_time_copy);
1636 }
1637 #else
1638 p_last_update_time = prev->avg.last_update_time;
1639 n_last_update_time = next->avg.last_update_time;
1640 #endif
1641 __update_load_avg(p_last_update_time, cpu_of(rq_of_rt_rq(prev)),
1642 &rt_se->avg, 0, 0, NULL);
1643
1644 rt_se->avg.last_update_time = n_last_update_time;
1645 }
1646 #endif /* CONFIG_RT_GROUP_SCHED */
1647
1648 #ifndef CONFIG_64BIT
1649 static inline u64 rt_rq_last_update_time(struct rt_rq *rt_rq)
1650 {
1651 u64 last_update_time_copy;
1652 u64 last_update_time;
1653
1654 do {
1655 last_update_time_copy = rt_rq->load_last_update_time_copy;
1656 smp_rmb();
1657 last_update_time = rt_rq->avg.last_update_time;
1658 } while (last_update_time != last_update_time_copy);
1659
1660 return last_update_time;
1661 }
1662 #else
1663 static inline u64 rt_rq_last_update_time(struct rt_rq *rt_rq)
1664 {
1665 return rt_rq->avg.last_update_time;
1666 }
1667 #endif
1668
1669 /*
1670 * Synchronize entity load avg of dequeued entity without locking
1671 * the previous rq.
1672 */
1673 void sync_rt_entity_load_avg(struct sched_rt_entity *rt_se)
1674 {
1675 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1676 u64 last_update_time;
1677
1678 last_update_time = rt_rq_last_update_time(rt_rq);
1679 update_rt_load_avg(last_update_time, rt_se);
1680 }
1681
1682 /*
1683 * Task first catches up with the rt_rq, and then subtracts
1684 * itself from the rt_rq (task must be off the queue now).
1685 */
1686 static void remove_rt_entity_load_avg(struct sched_rt_entity *rt_se)
1687 {
1688 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1689
1690 /*
1691 * tasks cannot exit without having gone through wake_up_new_task() ->
1692 * post_init_entity_util_avg() which will have added things to the
1693 * rt_rq, so we can remove unconditionally.
1694 *
1695 * Similarly for groups, they will have passed through
1696 * post_init_entity_util_avg() before unregister_sched_fair_group()
1697 * calls this.
1698 */
1699
1700 sync_rt_entity_load_avg(rt_se);
1701 atomic_long_add(rt_se->avg.load_avg, &rt_rq->removed_load_avg);
1702 atomic_long_add(rt_se->avg.util_avg, &rt_rq->removed_util_avg);
1703 }
1704
1705 static void attach_task_rt_rq(struct task_struct *p)
1706 {
1707 struct sched_rt_entity *rt_se = &p->rt;
1708 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1709 u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
1710
1711 update_rt_load_avg(now, rt_se);
1712 attach_rt_entity_load_avg(rt_rq, rt_se);
1713 }
1714
1715 static void detach_task_rt_rq(struct task_struct *p)
1716 {
1717 struct sched_rt_entity *rt_se = &p->rt;
1718 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1719 u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
1720
1721 update_rt_load_avg(now, rt_se);
1722 detach_rt_entity_load_avg(rt_rq, rt_se);
1723 }
1724
1725 static void migrate_task_rq_rt(struct task_struct *p)
1726 {
1727 /*
1728 * We are supposed to update the task to "current" time, so that it is up to date
1729 * and ready to go to the new CPU/rt_rq. But we have difficulty in getting
1730 * what the current time is, so simply throw away the out-of-date time. This
1731 * results in the wakee task being less decayed, but giving the wakee more
1732 * load does not sound bad.
1733 */
1734 remove_rt_entity_load_avg(&p->rt);
1735
1736 /* Tell new CPU we are migrated */
1737 p->rt.avg.last_update_time = 0;
1738
1739 /* We have migrated, no longer consider this task hot */
1740 p->se.exec_start = 0;
1741 }
1742
1743 static void task_dead_rt(struct task_struct *p)
1744 {
1745 remove_rt_entity_load_avg(&p->rt);
1746 }
1747
1748 #ifdef CONFIG_RT_GROUP_SCHED
1749 static void task_set_group_rt(struct task_struct *p)
1750 {
1751 set_task_rq(p, task_cpu(p));
1752 }
1753
1754 static void task_move_group_rt(struct task_struct *p)
1755 {
1756 detach_task_rt_rq(p);
1757 set_task_rq(p, task_cpu(p));
1758
1759 #ifdef CONFIG_SMP
1760 /* Tell the se that its cfs_rq has changed -- it has migrated */
1761 p->se.avg.last_update_time = 0;
1762 #endif
1763 attach_task_rt_rq(p);
1764 }
1765
1766 static void task_change_group_rt(struct task_struct *p, int type)
1767 {
1768 switch (type) {
1769 case TASK_SET_GROUP:
1770 task_set_group_rt(p);
1771 break;
1772
1773 case TASK_MOVE_GROUP:
1774 task_move_group_rt(p);
1775 break;
1776 }
1777 }
1778 #endif
1779
1780 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1781 {
1782 /*
1783 * Current can't be migrated, useless to reschedule,
1784 * let's hope p can move out.
1785 */
1786 if (rq->curr->nr_cpus_allowed == 1 ||
1787 !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1788 return;
1789
1790 /*
1791 * p is migratable, so let's not schedule it and
1792 * see if it is pushed or pulled somewhere else.
1793 */
1794 if (p->nr_cpus_allowed != 1
1795 && cpupri_find(&rq->rd->cpupri, p, NULL))
1796 return;
1797
1798 /*
1799 * There appear to be other CPUs that can accept
1800 * current and none to run 'p', so let's reschedule
1801 * to try and push current away:
1802 */
1803 requeue_task_rt(rq, p, 1);
1804 resched_curr(rq);
1805 }
1806
1807 /* Initialize the runnable-average (PELT) state of a new sched_rt_entity */
1808 void init_rt_entity_runnable_average(struct sched_rt_entity *rt_se)
1809 {
1810 struct sched_avg *sa = &rt_se->avg;
1811
1812 sa->last_update_time = 0;
1813
1814 sa->period_contrib = 1023;
1815
1816 /*
1817 * Tasks are initialized with zero load.
1818 * Load is not actually used by RT, but can be inherited by a fair task.
1819 */
1820 sa->load_avg = 0;
1821 sa->load_sum = 0;
1822 /*
1823 * At this point, util_avg won't be used in select_task_rq_rt anyway
1824 */
1825 sa->util_avg = 0;
1826 sa->util_sum = 0;
1827 /* when this task is enqueued, it will contribute to its rt_rq's load_avg */
1828 }
1829 #else
1830 void init_rt_entity_runnable_average(struct sched_rt_entity *rt_se) { }
1831 #endif /* CONFIG_SMP */
1832
1833 /*
1834 * Preempt the current task with a newly woken task if needed:
1835 */
1836 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1837 {
1838 if (p->prio < rq->curr->prio) {
1839 resched_curr(rq);
1840 return;
1841 }
1842
1843 #ifdef CONFIG_SMP
1844 /*
1845 * If:
1846 *
1847 * - the newly woken task is of equal priority to the current task
1848 * - the newly woken task is non-migratable while current is migratable
1849 * - current will be preempted on the next reschedule
1850 *
1851 * we should check to see if current can readily move to a different
1852 * cpu. If so, we will reschedule to allow the push logic to try
1853 * to move current somewhere else, making room for our non-migratable
1854 * task.
1855 */
1856 if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1857 check_preempt_equal_prio(rq, p);
1858 #endif
1859 }
1860
1861 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1862 struct rt_rq *rt_rq)
1863 {
1864 struct rt_prio_array *array = &rt_rq->active;
1865 struct sched_rt_entity *next = NULL;
1866 struct list_head *queue;
1867 int idx;
1868
1869 idx = sched_find_first_bit(array->bitmap);
1870 BUG_ON(idx >= MAX_RT_PRIO);
1871
1872 queue = array->queue + idx;
1873 next = list_entry(queue->next, struct sched_rt_entity, run_list);
1874
1875 return next;
1876 }
1877
1878 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1879 {
1880 struct sched_rt_entity *rt_se;
1881 struct task_struct *p;
1882 struct rt_rq *rt_rq = &rq->rt;
1883 u64 now = rq_clock_task(rq);
1884
1885 do {
1886 rt_se = pick_next_rt_entity(rq, rt_rq);
1887 BUG_ON(!rt_se);
1888 update_rt_load_avg(now, rt_se);
1889 rt_rq->curr = rt_se;
1890 rt_rq = group_rt_rq(rt_se);
1891 } while (rt_rq);
1892
1893 p = rt_task_of(rt_se);
1894 p->se.exec_start = now;
1895
1896 return p;
1897 }
1898
1899 extern int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running);
1900
1901 static struct task_struct *
1902 pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1903 {
1904 struct task_struct *p;
1905 struct rt_rq *rt_rq = &rq->rt;
1906
1907 if (need_pull_rt_task(rq, prev)) {
1908 /*
1909 * This is OK, because current is on_cpu, which avoids it being
1910 * picked for load-balance and preemption/IRQs are still
1911 * disabled avoiding further scheduler activity on it and we're
1912 * being very careful to re-start the picking loop.
1913 */
1914 rq_unpin_lock(rq, rf);
1915 pull_rt_task(rq);
1916 rq_repin_lock(rq, rf);
1917 /*
1918 * pull_rt_task() can drop (and re-acquire) rq->lock; this
1919 * means a dl or stop task can slip in, in which case we need
1920 * to re-start task selection.
1921 */
1922 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1923 rq->dl.dl_nr_running))
1924 return RETRY_TASK;
1925 }
1926
1927 /*
1928 * We may dequeue prev's rt_rq in put_prev_task().
1929 * So, we update time before rt_nr_running check.
1930 */
1931 if (prev->sched_class == &rt_sched_class)
1932 update_curr_rt(rq);
1933
1934 if (!rt_rq->rt_queued)
1935 return NULL;
1936
1937 put_prev_task(rq, prev);
1938
1939 p = _pick_next_task_rt(rq);
1940
1941 /* The running task is never eligible for pushing */
1942 dequeue_pushable_task(rq, p);
1943
1944 queue_push_tasks(rq);
1945
1946 if (p)
1947 update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), rt_rq,
1948 rq->curr->sched_class == &rt_sched_class);
1949
1950 return p;
1951 }
1952
1953 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
1954 {
1955 struct sched_rt_entity *rt_se = &p->rt;
1956 u64 now = rq_clock_task(rq);
1957
1958 update_curr_rt(rq);
1959
1960 update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
1961
1962 /*
1963 * The previous task needs to be made eligible for pushing
1964 * if it is still active
1965 */
1966 if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
1967 enqueue_pushable_task(rq, p);
1968
1969 for_each_sched_rt_entity(rt_se) {
1970 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1971 if (rt_se->on_rq)
1972 update_rt_load_avg(now, rt_se);
1973
1974 rt_rq->curr = NULL;
1975 }
1976 }
1977
1978 #ifdef CONFIG_SMP
1979
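/*
 * cpufreq hook for the RT class: whenever the utilization of this CPU's root
 * rt_rq changes, poke the cpufreq governor with the SCHED_CPUFREQ_RT hint
 * (the RT counterpart of cfs_rq_util_change() in the fair class). Only the
 * local rq is reported; remote updates are not handled here.
 */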
1980 void rt_rq_util_change(struct rt_rq *rt_rq)
1981 {
1982 if (&this_rq()->rt == rt_rq)
1983 cpufreq_update_util(rt_rq->rq, SCHED_CPUFREQ_RT);
1984 }
1985
1986 #ifdef CONFIG_RT_GROUP_SCHED
1987 /* Take into account change of utilization of a child task group */
1988 static inline void
1989 update_tg_rt_util(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1990 {
1991 struct rt_rq *grt_rq = rt_se->my_q;
1992 long delta = grt_rq->avg.util_avg - rt_se->avg.util_avg;
1993
1994 /* Nothing to update */
1995 if (!delta)
1996 return;
1997
1998 /* Set new sched_rt_entity's utilization */
1999 rt_se->avg.util_avg = grt_rq->avg.util_avg;
2000 rt_se->avg.util_sum = rt_se->avg.util_avg * LOAD_AVG_MAX;
2001
2002 /* Update parent rt_rq utilization */
2003 add_positive(&rt_rq->avg.util_avg, delta);
2004 rt_rq->avg.util_sum = rt_rq->avg.util_avg * LOAD_AVG_MAX;
2005 }
2006
2007
2008 /* Take into account change of load of a child task group */
2009 static inline void
2010 update_tg_rt_load(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
2011 {
2012 struct rt_rq *grt_rq = rt_se->my_q;
2013 long delta = grt_rq->avg.load_avg - rt_se->avg.load_avg;
2014
2015 /*
2016 * TODO: Need to consider the task group (TG) update
2017 * for the RT runqueue.
2018 */
2019
2020 /* Nothing to update */
2021 if (!delta)
2022 return;
2023
2024 /* Set new sched_rt_entity's load */
2025 rt_se->avg.load_avg = grt_rq->avg.load_avg;
2026 rt_se->avg.load_sum = rt_se->avg.load_avg * LOAD_AVG_MAX;
2027
2028 /* Update parent rt_rq load */
2029 add_positive(&rt_rq->avg.load_avg, delta);
2030 rt_rq->avg.load_sum = rt_rq->avg.load_avg * LOAD_AVG_MAX;
2031
2032 /*
2033 * TODO: If the sched_entity is already enqueued, do we also have to update
2034 * the runnable load avg?
2035 */
2036 }
2037
2038 static inline int test_and_clear_tg_rt_propagate(struct sched_rt_entity *rt_se)
2039 {
2040 struct rt_rq *rt_rq = rt_se->my_q;
2041
2042 if (!rt_rq->propagate_avg)
2043 return 0;
2044
2045 rt_rq->propagate_avg = 0;
2046 return 1;
2047 }
2048
2049 /* Update the task and its rt_rq load average */
2050 static inline int propagate_entity_rt_load_avg(struct sched_rt_entity *rt_se)
2051 {
2052 struct rt_rq *rt_rq;
2053
2054 if (rt_entity_is_task(rt_se))
2055 return 0;
2056
2057 if (!test_and_clear_tg_rt_propagate(rt_se))
2058 return 0;
2059
2060 rt_rq = rt_rq_of_se(rt_se);
2061
2062 rt_rq->propagate_avg = 1;
2063
2064 update_tg_rt_util(rt_rq, rt_se);
2065 update_tg_rt_load(rt_rq, rt_se);
2066
2067 return 1;
2068 }
2069 #else
2070 static inline int propagate_entity_rt_load_avg(struct sched_rt_entity *rt_se) { return 0; }
2071 #endif
2072
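/*
 * Main RT PELT update: age this entity's own average while it is queued,
 * refresh the per-rq RT average via update_rt_rq_load_avg(), and propagate
 * any group contribution changes up the hierarchy (the RT counterpart of
 * update_load_avg() in the fair class).
 */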
2073 void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se)
2074 {
2075 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
2076 struct rq *rq = rq_of_rt_rq(rt_rq);
2077 int cpu = cpu_of(rq);
2078 /*
2079 * Track task load average for carrying it to the new CPU after migration.
2080 */
2081 if (rt_se->avg.last_update_time)
2082 __update_load_avg(now, cpu, &rt_se->avg, scale_load_down(NICE_0_LOAD),
2083 rt_rq->curr == rt_se, NULL);
2084
2085 update_rt_rq_load_avg(now, cpu, rt_rq, true);
2086 propagate_entity_rt_load_avg(rt_se);
2087 }
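/*
 * Illustrative, standalone userspace sketch (not kernel code) of the
 * arithmetic that update_tg_rt_util() above performs when a child group's
 * utilization changes: the group entity mirrors its child runqueue, and only
 * the delta is folded into the parent.  LOAD_AVG_MAX_SKETCH and the concrete
 * numbers are assumed values for illustration.
 */
#include <stdio.h>

#define LOAD_AVG_MAX_SKETCH 47742    /* assumed PELT maximum accumulation */

struct avg_sketch {
    long util_avg;
    long util_sum;
};

static void propagate_util(struct avg_sketch *parent,
                           struct avg_sketch *entity,
                           const struct avg_sketch *child_rq)
{
    long delta = child_rq->util_avg - entity->util_avg;

    if (!delta)
        return;

    /* The group entity takes on its child runqueue's utilization... */
    entity->util_avg = child_rq->util_avg;
    entity->util_sum = entity->util_avg * LOAD_AVG_MAX_SKETCH;

    /* ...and only the difference is added to the parent runqueue. */
    parent->util_avg += delta;
    parent->util_sum = parent->util_avg * LOAD_AVG_MAX_SKETCH;
}

int main(void)
{
    struct avg_sketch parent = { .util_avg = 300 };
    struct avg_sketch entity = { .util_avg = 100 };
    struct avg_sketch child  = { .util_avg = 150 };

    propagate_util(&parent, &entity, &child);
    printf("parent util_avg is now %ld (300 + delta of 50)\n", parent.util_avg);
    return 0;
}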
2088
2089 /* Only try algorithms three times */
2090 #define RT_MAX_TRIES 3
2091
2092 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
2093 {
2094 if (!task_running(rq, p) &&
2095 cpumask_test_cpu(cpu, &p->cpus_allowed))
2096 return 1;
2097 return 0;
2098 }
2099
2100 /*
2101 * Return the highest-priority pushable task of this rq that is suitable to be
2102 * executed on the given cpu, or NULL if there is none.
2103 */
2104 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
2105 {
2106 struct plist_head *head = &rq->rt.pushable_tasks;
2107 struct task_struct *p;
2108
2109 if (!has_pushable_tasks(rq))
2110 return NULL;
2111
2112 plist_for_each_entry(p, head, pushable_tasks) {
2113 if (pick_rt_task(rq, p, cpu))
2114 return p;
2115 }
2116
2117 return NULL;
2118 }
2119
2120 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
2121
2122 static int find_lowest_rq(struct task_struct *task)
2123 {
2124 struct sched_domain *sd;
2125 struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
2126 int this_cpu = smp_processor_id();
2127 int cpu = task_cpu(task);
2128
2129 /* Make sure the mask is initialized first */
2130 if (unlikely(!lowest_mask))
2131 return -1;
2132
2133 if (task->nr_cpus_allowed == 1)
2134 return -1; /* No other targets possible */
2135
2136 if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
2137 return -1; /* No targets found */
2138
2139 /*
2140 * At this point we have built a mask of cpus representing the
2141 * lowest priority tasks in the system. Now we want to elect
2142 * the best one based on our affinity and topology.
2143 *
2144 * We prioritize the last cpu that the task executed on since
2145 * it is most likely cache-hot in that location.
2146 */
2147 if (cpumask_test_cpu(cpu, lowest_mask))
2148 return cpu;
2149
2150 /*
2151 * Otherwise, we consult the sched_domains span maps to figure
2152 * out which cpu is logically closest to our hot cache data.
2153 */
2154 if (!cpumask_test_cpu(this_cpu, lowest_mask))
2155 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
2156
2157 rcu_read_lock();
2158 for_each_domain(cpu, sd) {
2159 if (sd->flags & SD_WAKE_AFFINE) {
2160 int best_cpu;
2161
2162 /*
2163 * "this_cpu" is cheaper to preempt than a
2164 * remote processor.
2165 */
2166 if (this_cpu != -1 &&
2167 cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
2168 rcu_read_unlock();
2169 return this_cpu;
2170 }
2171
2172 best_cpu = cpumask_first_and(lowest_mask,
2173 sched_domain_span(sd));
2174 if (best_cpu < nr_cpu_ids) {
2175 rcu_read_unlock();
2176 return best_cpu;
2177 }
2178 }
2179 }
2180 rcu_read_unlock();
2181
2182 /*
2183 * And finally, if there were no matches within the domains
2184 * just give the caller *something* to work with from the compatible
2185 * locations.
2186 */
2187 if (this_cpu != -1)
2188 return this_cpu;
2189
2190 cpu = cpumask_any(lowest_mask);
2191 if (cpu < nr_cpu_ids)
2192 return cpu;
2193 return -1;
2194 }
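/*
 * Simplified, standalone sketch of the CPU preference order implemented by
 * find_lowest_rq() above, with the sched_domain walk collapsed into a single
 * "any CPU in the mask" fallback.  Purely illustrative; cpupri_find() and the
 * domain topology are what the kernel actually consults.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS_SKETCH 8

static int pick_lowest_cpu(const bool lowest_mask[NR_CPUS_SKETCH],
                           int task_cpu, int this_cpu)
{
    int cpu;

    /* 1) The task's previous CPU wins if it is among the lowest-priority CPUs. */
    if (lowest_mask[task_cpu])
        return task_cpu;

    /* 2) Otherwise prefer the local CPU: it is cheaper to preempt. */
    if (this_cpu >= 0 && lowest_mask[this_cpu])
        return this_cpu;

    /* 3) Fall back to any CPU in the mask. */
    for (cpu = 0; cpu < NR_CPUS_SKETCH; cpu++)
        if (lowest_mask[cpu])
            return cpu;

    return -1;
}

int main(void)
{
    bool lowest_mask[NR_CPUS_SKETCH] = { [2] = true, [5] = true };

    /* Previous CPU 3 is not in the mask, local CPU 5 is: returns 5. */
    printf("%d\n", pick_lowest_cpu(lowest_mask, 3, 5));
    return 0;
}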
2195
2196 /* Will lock the rq it finds */
2197 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
2198 {
2199 struct rq *lowest_rq = NULL;
2200 int tries;
2201 int cpu;
2202
2203 for (tries = 0; tries < RT_MAX_TRIES; tries++) {
2204 cpu = find_lowest_rq(task);
2205
2206 if ((cpu == -1) || (cpu == rq->cpu))
2207 break;
2208
2209 lowest_rq = cpu_rq(cpu);
2210
2211 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
2212 /*
2213 * Target rq has tasks of equal or higher priority,
2214 * retrying does not release any lock and is unlikely
2215 * to yield a different result.
2216 */
2217 lowest_rq = NULL;
2218 break;
2219 }
2220
2221 /* if the prio of this runqueue changed, try again */
2222 if (double_lock_balance(rq, lowest_rq)) {
2223 /*
2224 * We had to unlock the run queue. In
2225 * the meantime, the task could have
2226 * migrated already or had its affinity changed.
2227 * Also make sure that it wasn't scheduled on its rq.
2228 */
2229 if (unlikely(task_rq(task) != rq ||
2230 !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
2231 task_running(rq, task) ||
2232 !rt_task(task) ||
2233 !task_on_rq_queued(task))) {
2234
2235 double_unlock_balance(rq, lowest_rq);
2236 lowest_rq = NULL;
2237 break;
2238 }
2239 }
2240
2241 /* If this rq is still suitable use it. */
2242 if (lowest_rq->rt.highest_prio.curr > task->prio)
2243 break;
2244
2245 /* try again */
2246 double_unlock_balance(rq, lowest_rq);
2247 lowest_rq = NULL;
2248 }
2249
2250 return lowest_rq;
2251 }
2252
2253 static struct task_struct *pick_next_pushable_task(struct rq *rq)
2254 {
2255 struct task_struct *p;
2256
2257 if (!has_pushable_tasks(rq))
2258 return NULL;
2259
2260 p = plist_first_entry(&rq->rt.pushable_tasks,
2261 struct task_struct, pushable_tasks);
2262
2263 BUG_ON(rq->cpu != task_cpu(p));
2264 BUG_ON(task_current(rq, p));
2265 BUG_ON(p->nr_cpus_allowed <= 1);
2266
2267 BUG_ON(!task_on_rq_queued(p));
2268 BUG_ON(!rt_task(p));
2269
2270 return p;
2271 }
2272
2273 /*
2274 * If the current CPU has more than one RT task, see if the non
2275 * running task can migrate over to a CPU that is running a task
2276 * of lesser priority.
2277 */
2278 static int push_rt_task(struct rq *rq)
2279 {
2280 struct task_struct *next_task;
2281 struct rq *lowest_rq;
2282 int ret = 0;
2283
2284 if (!rq->rt.overloaded)
2285 return 0;
2286
2287 next_task = pick_next_pushable_task(rq);
2288 if (!next_task)
2289 return 0;
2290
2291 retry:
2292 if (unlikely(next_task == rq->curr)) {
2293 WARN_ON(1);
2294 return 0;
2295 }
2296
2297 /*
2298 * It's possible that the next_task slipped in of
2299 * higher priority than current. If that's the case
2300 * just reschedule current.
2301 */
2302 if (unlikely(next_task->prio < rq->curr->prio)) {
2303 resched_curr(rq);
2304 return 0;
2305 }
2306
2307 /* We might release rq lock */
2308 get_task_struct(next_task);
2309
2310 /* find_lock_lowest_rq locks the rq if found */
2311 lowest_rq = find_lock_lowest_rq(next_task, rq);
2312 if (!lowest_rq) {
2313 struct task_struct *task;
2314 /*
2315 * find_lock_lowest_rq releases rq->lock
2316 * so it is possible that next_task has migrated.
2317 *
2318 * We need to make sure that the task is still on the same
2319 * run-queue and is also still the next task eligible for
2320 * pushing.
2321 */
2322 task = pick_next_pushable_task(rq);
2323 if (task == next_task) {
2324 /*
2325 * The task hasn't migrated, and is still the next
2326 * eligible task, but we failed to find a run-queue
2327 * to push it to. Do not retry in this case, since
2328 * other cpus will pull from us when ready.
2329 */
2330 goto out;
2331 }
2332
2333 if (!task)
2334 /* No more tasks, just exit */
2335 goto out;
2336
2337 /*
2338 * Something has shifted, try again.
2339 */
2340 put_task_struct(next_task);
2341 next_task = task;
2342 goto retry;
2343 }
2344
2345 deactivate_task(rq, next_task, 0);
2346 next_task->on_rq = TASK_ON_RQ_MIGRATING;
2347 set_task_cpu(next_task, lowest_rq->cpu);
2348 next_task->on_rq = TASK_ON_RQ_QUEUED;
2349 activate_task(lowest_rq, next_task, 0);
2350 ret = 1;
2351
2352 resched_curr(lowest_rq);
2353
2354 double_unlock_balance(rq, lowest_rq);
2355
2356 out:
2357 put_task_struct(next_task);
2358
2359 return ret;
2360 }
2361
2362 static void push_rt_tasks(struct rq *rq)
2363 {
2364 /* push_rt_task will return true if it moved an RT */
2365 while (push_rt_task(rq))
2366 ;
2367 }
2368
2369 #ifdef HAVE_RT_PUSH_IPI
2370
2371 /*
2372 * When a high priority task schedules out from a CPU and a lower priority
2373 * task is scheduled in, a check is made to see if there are any RT tasks
2374 * on other CPUs that are waiting to run because a higher priority RT task
2375 * is currently running on its CPU. In this case, the CPU with multiple RT
2376 * tasks queued on it (overloaded) needs to be notified that a CPU has opened
2377 * up that may be able to run one of its non-running queued RT tasks.
2378 *
2379 * All CPUs with overloaded RT tasks need to be notified as there is currently
2380 * no way to know which of these CPUs have the highest priority task waiting
2381 * to run. Instead of trying to take a spinlock on each of these CPUs,
2382 * which has been shown to cause large latency when done on machines with many
2383 * CPUs, an IPI is sent to the CPUs to have them push off the overloaded
2384 * RT tasks waiting to run.
2385 *
2386 * Just sending an IPI to each of the CPUs is also an issue, as on large
2387 * count CPU machines, this can cause an IPI storm on a CPU, especially
2388 * if it's the only CPU with multiple RT tasks queued, and a large number
2389 * of CPUs scheduling a lower priority task at the same time.
2390 *
2391 * Each root domain has its own irq work function that can iterate over
2392 * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
2393 * tasks must be checked whenever one or many CPUs are lowering
2394 * their priority, there is a single irq work iterator that will try to
2395 * push off the RT tasks that are waiting to run.
2396 *
2397 * When a CPU schedules a lower priority task, it will kick off the
2398 * irq work iterator that will jump to each CPU with overloaded RT tasks.
2399 * As it only takes the first CPU that schedules a lower priority task
2400 * to start the process, the rto_loop_start flag is claimed atomically and
2401 * only the CPU that wins that exchange will try to take the rto_lock.
2402 * This prevents high contention on the lock as the process handles all
2403 * CPUs scheduling lower priority tasks.
2404 *
2405 * All CPUs that are scheduling a lower priority task will increment the
2406 * rto_loop_next variable. This will make sure that the irq work iterator
2407 * checks all RT overloaded CPUs whenever a CPU schedules a new lower
2408 * priority task, even if the iterator is in the middle of a scan. Incrementing
2409 * the rto_loop_next will cause the iterator to perform another scan.
2410 *
2411 */
2412 static int rto_next_cpu(struct root_domain *rd)
2413 {
2414 int next;
2415 int cpu;
2416
2417 /*
2418 * When starting the IPI RT pushing, the rto_cpu is set to -1,
2419 * rto_next_cpu() will simply return the first CPU found in
2420 * the rto_mask.
2421 *
2422 * If rto_next_cpu() is called while rto_cpu is a valid cpu, it
2423 * will return the next CPU found in the rto_mask.
2424 *
2425 * If there are no more CPUs left in the rto_mask, then a check is made
2426 * against rto_loop and rto_loop_next. rto_loop is only updated with
2427 * the rto_lock held, but any CPU may increment the rto_loop_next
2428 * without any locking.
2429 */
2430 for (;;) {
2431
2432 /* When rto_cpu is -1 this acts like cpumask_first() */
2433 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2434
2435 rd->rto_cpu = cpu;
2436
2437 if (cpu < nr_cpu_ids)
2438 return cpu;
2439
2440 rd->rto_cpu = -1;
2441
2442 /*
2443 * ACQUIRE ensures we see the @rto_mask changes
2444 * made prior to the @next value observed.
2445 *
2446 * Matches WMB in rt_set_overload().
2447 */
2448 next = atomic_read_acquire(&rd->rto_loop_next);
2449
2450 if (rd->rto_loop == next)
2451 break;
2452
2453 rd->rto_loop = next;
2454 }
2455
2456 return -1;
2457 }
2458
2459 static inline bool rto_start_trylock(atomic_t *v)
2460 {
2461 return !atomic_cmpxchg_acquire(v, 0, 1);
2462 }
2463
2464 static inline void rto_start_unlock(atomic_t *v)
2465 {
2466 atomic_set_release(v, 0);
2467 }
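/*
 * Standalone userspace analogue (illustrative only) of rto_start_trylock()
 * and rto_start_unlock() above, written with C11 atomics: the kernel's
 * atomic_cmpxchg_acquire(v, 0, 1) returns the old value, so the trylock
 * succeeds only when it observes 0, and the unlock is a release store of 0.
 */
#include <stdatomic.h>
#include <stdbool.h>

static bool rto_start_trylock_sketch(atomic_int *v)
{
    int expected = 0;

    return atomic_compare_exchange_strong_explicit(v, &expected, 1,
                                                   memory_order_acquire,
                                                   memory_order_relaxed);
}

static void rto_start_unlock_sketch(atomic_int *v)
{
    atomic_store_explicit(v, 0, memory_order_release);
}

int main(void)
{
    atomic_int loop_start = 0;

    if (rto_start_trylock_sketch(&loop_start)) {
        /* Only one caller at a time gets here; it would kick the IPI chain. */
        rto_start_unlock_sketch(&loop_start);
    }
    return 0;
}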
2468
2469 static void tell_cpu_to_push(struct rq *rq)
2470 {
2471 int cpu = -1;
2472
2473 /* Keep the loop going if the IPI is currently active */
2474 atomic_inc(&rq->rd->rto_loop_next);
2475
2476 /* Only one CPU can initiate a loop at a time */
2477 if (!rto_start_trylock(&rq->rd->rto_loop_start))
2478 return;
2479
2480 raw_spin_lock(&rq->rd->rto_lock);
2481
2482 /*
2483 * The rto_cpu is updated under the lock. If it holds a valid cpu,
2484 * then the IPI is still running and will continue due to the
2485 * update to loop_next, and nothing needs to be done here.
2486 * Otherwise it is finishing up and an IPI needs to be sent.
2487 */
2488 if (rq->rd->rto_cpu < 0)
2489 cpu = rto_next_cpu(rq->rd);
2490
2491 raw_spin_unlock(&rq->rd->rto_lock);
2492
2493 rto_start_unlock(&rq->rd->rto_loop_start);
2494
2495 if (cpu >= 0) {
2496 /* Make sure the rd does not get freed while pushing */
2497 sched_get_rd(rq->rd);
2498 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2499 }
2500 }
2501
2502 /* Called from hardirq context */
2503 void rto_push_irq_work_func(struct irq_work *work)
2504 {
2505 struct root_domain *rd =
2506 container_of(work, struct root_domain, rto_push_work);
2507 struct rq *rq;
2508 int cpu;
2509
2510 rq = this_rq();
2511
2512 /*
2513 * We do not need to grab the lock to check for has_pushable_tasks.
2514 * When it gets updated, a check is made if a push is possible.
2515 */
2516 if (has_pushable_tasks(rq)) {
2517 raw_spin_lock(&rq->lock);
2518 push_rt_tasks(rq);
2519 raw_spin_unlock(&rq->lock);
2520 }
2521
2522 raw_spin_lock(&rd->rto_lock);
2523
2524 /* Pass the IPI to the next rt overloaded queue */
2525 cpu = rto_next_cpu(rd);
2526
2527 raw_spin_unlock(&rd->rto_lock);
2528
2529 if (cpu < 0) {
2530 sched_put_rd(rd);
2531 return;
2532 }
2533
2534 /* Try the next RT overloaded CPU */
2535 irq_work_queue_on(&rd->rto_push_work, cpu);
2536 }
2537 #endif /* HAVE_RT_PUSH_IPI */
2538
2539 static void pull_rt_task(struct rq *this_rq)
2540 {
2541 int this_cpu = this_rq->cpu, cpu;
2542 bool resched = false;
2543 struct task_struct *p;
2544 struct rq *src_rq;
2545 int rt_overload_count = rt_overloaded(this_rq);
2546
2547 if (likely(!rt_overload_count))
2548 return;
2549
2550 /*
2551 * Match the barrier from rt_set_overload(); this guarantees that if we
2552 * see overloaded we must also see the rto_mask bit.
2553 */
2554 smp_rmb();
2555
2556 /* If we are the only overloaded CPU do nothing */
2557 if (rt_overload_count == 1 &&
2558 cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2559 return;
2560
2561 #ifdef HAVE_RT_PUSH_IPI
2562 if (sched_feat(RT_PUSH_IPI)) {
2563 tell_cpu_to_push(this_rq);
2564 return;
2565 }
2566 #endif
2567
2568 for_each_cpu(cpu, this_rq->rd->rto_mask) {
2569 if (this_cpu == cpu)
2570 continue;
2571
2572 src_rq = cpu_rq(cpu);
2573
2574 /*
2575 * Don't bother taking the src_rq->lock if the next highest
2576 * task is known to be lower-priority than our current task.
2577 * This may look racy, but if this value is about to go
2578 * logically higher, the src_rq will push this task away.
2579 * And if it's going logically lower, we do not care.
2580 */
2581 if (src_rq->rt.highest_prio.next >=
2582 this_rq->rt.highest_prio.curr)
2583 continue;
2584
2585 /*
2586 * We can potentially drop this_rq's lock in
2587 * double_lock_balance, and another CPU could
2588 * alter this_rq
2589 */
2590 double_lock_balance(this_rq, src_rq);
2591
2592 /*
2593 * We can only pull a task that is pushable
2594 * on its rq, and no others.
2595 */
2596 p = pick_highest_pushable_task(src_rq, this_cpu);
2597
2598 /*
2599 * Do we have an RT task that preempts
2600 * the to-be-scheduled task?
2601 */
2602 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2603 WARN_ON(p == src_rq->curr);
2604 WARN_ON(!task_on_rq_queued(p));
2605
2606 /*
2607 * There's a chance that p is higher in priority
2608 * than what's currently running on its cpu.
2609 * This is just that p is waking up and hasn't
2610 * had a chance to schedule. We only pull
2611 * p if it is lower in priority than the
2612 * current task on the run queue
2613 */
2614 if (p->prio < src_rq->curr->prio)
2615 goto skip;
2616
2617 resched = true;
2618
2619 deactivate_task(src_rq, p, 0);
2620 p->on_rq = TASK_ON_RQ_MIGRATING;
2621 set_task_cpu(p, this_cpu);
2622 p->on_rq = TASK_ON_RQ_QUEUED;
2623 activate_task(this_rq, p, 0);
2624 /*
2625 * We continue with the search, just in
2626 * case there's an even higher prio task
2627 * in another runqueue. (low likelihood
2628 * but possible)
2629 */
2630 }
2631 skip:
2632 double_unlock_balance(this_rq, src_rq);
2633 }
2634
2635 if (resched)
2636 resched_curr(this_rq);
2637 }
2638
2639 /*
2640 * If we are not running and we are not going to reschedule soon, we should
2641 * try to push tasks away now
2642 */
2643 static void task_woken_rt(struct rq *rq, struct task_struct *p)
2644 {
2645 if (!task_running(rq, p) &&
2646 !test_tsk_need_resched(rq->curr) &&
2647 p->nr_cpus_allowed > 1 &&
2648 (dl_task(rq->curr) || rt_task(rq->curr)) &&
2649 (rq->curr->nr_cpus_allowed < 2 ||
2650 rq->curr->prio <= p->prio))
2651 push_rt_tasks(rq);
2652 }
2653
2654 /* Assumes rq->lock is held */
2655 static void rq_online_rt(struct rq *rq)
2656 {
2657 if (rq->rt.overloaded)
2658 rt_set_overload(rq);
2659
2660 __enable_runtime(rq);
2661
2662 cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
2663 }
2664
2665 /* Assumes rq->lock is held */
2666 static void rq_offline_rt(struct rq *rq)
2667 {
2668 if (rq->rt.overloaded)
2669 rt_clear_overload(rq);
2670
2671 __disable_runtime(rq);
2672
2673 cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
2674 }
2675
2676 /*
2677 * When switching from the RT queue, we bring ourselves to a position
2678 * where we might want to pull RT tasks from other runqueues.
2679 */
2680 static void switched_from_rt(struct rq *rq, struct task_struct *p)
2681 {
2682 detach_task_rt_rq(p);
2683 /*
2684 * If there are other RT tasks then we will reschedule
2685 * and the scheduling of the other RT tasks will handle
2686 * the balancing. But if we are the last RT task
2687 * we may need to handle the pulling of RT tasks
2688 * now.
2689 */
2690 if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
2691 return;
2692
2693 queue_pull_task(rq);
2694 }
2695
2696 void __init init_sched_rt_class(void)
2697 {
2698 unsigned int i;
2699
2700 for_each_possible_cpu(i) {
2701 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
2702 GFP_KERNEL, cpu_to_node(i));
2703 }
2704 }
2705 #else
2706 void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se)
2707 {
2708 }
2709 #endif /* CONFIG_SMP */
2710
2711 extern void
2712 copy_sched_avg(struct sched_avg *from, struct sched_avg *to, unsigned int ratio);
2713
2714 /*
2715 * When switching a task to RT, we may overload the runqueue
2716 * with RT tasks. In this case we try to push them off to
2717 * other runqueues.
2718 */
2719 static void switched_to_rt(struct rq *rq, struct task_struct *p)
2720 {
2721 /* Copy fair sched avg into rt sched avg */
2722 copy_sched_avg(&p->se.avg, &p->rt.avg, 100);
2723 /*
2724 * If we are already running, then there's nothing
2725 * that needs to be done. But if we are not running
2726 * we may need to preempt the current running task.
2727 * If that current running task is also an RT task
2728 * then see if we can move to another run queue.
2729 */
2730 if (task_on_rq_queued(p) && rq->curr != p) {
2731 #ifdef CONFIG_SMP
2732 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
2733 queue_push_tasks(rq);
2734 #endif /* CONFIG_SMP */
2735 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
2736 resched_curr(rq);
2737 }
2738 }
2739
2740 /*
2741 * Priority of the task has changed. This may cause
2742 * us to initiate a push or pull.
2743 */
2744 static void
2745 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
2746 {
2747 if (!task_on_rq_queued(p))
2748 return;
2749
2750 if (rq->curr == p) {
2751 #ifdef CONFIG_SMP
2752 /*
2753 * If our priority decreases while running, we
2754 * may need to pull tasks to this runqueue.
2755 */
2756 if (oldprio < p->prio)
2757 queue_pull_task(rq);
2758
2759 /*
2760 * If there's a higher priority task waiting to run
2761 * then reschedule.
2762 */
2763 if (p->prio > rq->rt.highest_prio.curr)
2764 resched_curr(rq);
2765 #else
2766 /* For UP simply resched on drop of prio */
2767 if (oldprio < p->prio)
2768 resched_curr(rq);
2769 #endif /* CONFIG_SMP */
2770 } else {
2771 /*
2772 * This task is not running, but if it is
2773 * greater than the current running task
2774 * then reschedule.
2775 */
2776 if (p->prio < rq->curr->prio)
2777 resched_curr(rq);
2778 }
2779 }
2780
2781 #ifdef CONFIG_POSIX_TIMERS
2782 static void watchdog(struct rq *rq, struct task_struct *p)
2783 {
2784 unsigned long soft, hard;
2785
2786 /* max may change after cur was read, this will be fixed next tick */
2787 soft = task_rlimit(p, RLIMIT_RTTIME);
2788 hard = task_rlimit_max(p, RLIMIT_RTTIME);
2789
2790 if (soft != RLIM_INFINITY) {
2791 unsigned long next;
2792
2793 if (p->rt.watchdog_stamp != jiffies) {
2794 p->rt.timeout++;
2795 p->rt.watchdog_stamp = jiffies;
2796 }
2797
2798 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
2799 if (p->rt.timeout > next)
2800 p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
2801 }
2802 }
2803 #else
2804 static inline void watchdog(struct rq *rq, struct task_struct *p) { }
2805 #endif
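/*
 * Worked example for the watchdog() conversion above, as a standalone sketch.
 * RLIMIT_RTTIME is expressed in microseconds; the timeout counter ticks once
 * per scheduler tick, so the limit is converted to ticks with
 * DIV_ROUND_UP(limit, USEC_PER_SEC / HZ).  HZ below is an assumed example
 * value, not taken from this kernel configuration.
 */
#include <stdio.h>

#define HZ_SKETCH                250
#define USEC_PER_SEC_SKETCH      1000000UL
#define DIV_ROUND_UP_SKETCH(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
    unsigned long soft_us = 500000;  /* 500 ms RLIMIT_RTTIME soft limit */
    unsigned long ticks = DIV_ROUND_UP_SKETCH(soft_us,
                                              USEC_PER_SEC_SKETCH / HZ_SKETCH);

    /* 500000 / (1000000 / 250) = 500000 / 4000 = 125 ticks */
    printf("soft limit of %lu us expires after %lu ticks\n", soft_us, ticks);
    return 0;
}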
2806
2807 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
2808 {
2809 struct sched_rt_entity *rt_se = &p->rt;
2810 u64 now = rq_clock_task(rq);
2811
2812 update_curr_rt(rq);
2813 update_rt_rq_load_avg(now, cpu_of(rq), &rq->rt, 1);
2814
2815 for_each_sched_rt_entity(rt_se)
2816 update_rt_load_avg(now, rt_se);
2817
2818 watchdog(rq, p);
2819
2820 /*
2821 * RR tasks need a special form of timeslice management.
2822 * FIFO tasks have no timeslices.
2823 */
2824 if (p->policy != SCHED_RR)
2825 return;
2826
2827 if (--p->rt.time_slice)
2828 return;
2829
2830 p->rt.time_slice = sched_rr_timeslice;
2831
2832 /*
2833 * Requeue to the end of queue if we (and all of our ancestors) are not
2834 * the only element on the queue
2835 */
2836 for_each_sched_rt_entity(rt_se) {
2837 if (rt_se->run_list.prev != rt_se->run_list.next) {
2838 requeue_task_rt(rq, p, 0);
2839 resched_curr(rq);
2840 return;
2841 }
2842 }
2843 }
2844
2845 static void set_curr_task_rt(struct rq *rq)
2846 {
2847 struct task_struct *p = rq->curr;
2848 struct sched_rt_entity *rt_se = &p->rt;
2849
2850 p->se.exec_start = rq_clock_task(rq);
2851
2852 for_each_sched_rt_entity(rt_se) {
2853 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
2854 rt_rq->curr = rt_se;
2855 }
2856
2857 /* The running task is never eligible for pushing */
2858 dequeue_pushable_task(rq, p);
2859 }
2860
2861 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
2862 {
2863 /*
2864 * Time slice is 0 for SCHED_FIFO tasks
2865 */
2866 if (task->policy == SCHED_RR)
2867 return sched_rr_timeslice;
2868 else
2869 return 0;
2870 }
2871
2872 const struct sched_class rt_sched_class = {
2873 .next = &fair_sched_class,
2874 .enqueue_task = enqueue_task_rt,
2875 .dequeue_task = dequeue_task_rt,
2876 .yield_task = yield_task_rt,
2877
2878 .check_preempt_curr = check_preempt_curr_rt,
2879
2880 .pick_next_task = pick_next_task_rt,
2881 .put_prev_task = put_prev_task_rt,
2882
2883 #ifdef CONFIG_SMP
2884 .select_task_rq = select_task_rq_rt,
2885
2886 .migrate_task_rq = migrate_task_rq_rt,
2887 .task_dead = task_dead_rt,
2888 .set_cpus_allowed = set_cpus_allowed_common,
2889 .rq_online = rq_online_rt,
2890 .rq_offline = rq_offline_rt,
2891 .task_woken = task_woken_rt,
2892 .switched_from = switched_from_rt,
2893 #endif
2894
2895 .set_curr_task = set_curr_task_rt,
2896 .task_tick = task_tick_rt,
2897
2898 .get_rr_interval = get_rr_interval_rt,
2899
2900 .prio_changed = prio_changed_rt,
2901 .switched_to = switched_to_rt,
2902
2903 .update_curr = update_curr_rt,
2904 #ifdef CONFIG_RT_GROUP_SCHED
2905 .task_change_group = task_change_group_rt,
2906 #endif
2907 };
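/*
 * Illustrative sketch (not kernel code) of how the .next linkage above is
 * consumed: the core scheduler walks the class list from highest to lowest
 * priority, so when the RT class has nothing runnable, control falls through
 * to the fair class.  The struct and names below are simplified stand-ins,
 * not the kernel's struct sched_class.
 */
#include <stddef.h>
#include <stdio.h>

struct class_sketch {
    const char *name;
    int nr_runnable;
    const struct class_sketch *next;
};

static const struct class_sketch fair_sketch = { "fair", 3, NULL };
static const struct class_sketch rt_sketch   = { "rt",   0, &fair_sketch };

int main(void)
{
    const struct class_sketch *class;

    /* Mirrors the spirit of the class walk in the core scheduler. */
    for (class = &rt_sketch; class; class = class->next) {
        if (class->nr_runnable) {
            printf("picking from the %s class\n", class->name);
            break;
        }
    }
    return 0;
}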
2908
2909 #ifdef CONFIG_RT_GROUP_SCHED
2910 /*
2911 * Ensure that the real time constraints are schedulable.
2912 */
2913 static DEFINE_MUTEX(rt_constraints_mutex);
2914
2915 /* Must be called with tasklist_lock held */
2916 static inline int tg_has_rt_tasks(struct task_group *tg)
2917 {
2918 struct task_struct *g, *p;
2919
2920 /*
2921 * Autogroups do not have RT tasks; see autogroup_create().
2922 */
2923 if (task_group_is_autogroup(tg))
2924 return 0;
2925
2926 for_each_process_thread(g, p) {
2927 if (rt_task(p) && task_group(p) == tg)
2928 return 1;
2929 }
2930
2931 return 0;
2932 }
2933
2934 struct rt_schedulable_data {
2935 struct task_group *tg;
2936 u64 rt_period;
2937 u64 rt_runtime;
2938 };
2939
2940 static int tg_rt_schedulable(struct task_group *tg, void *data)
2941 {
2942 struct rt_schedulable_data *d = data;
2943 struct task_group *child;
2944 unsigned long total, sum = 0;
2945 u64 period, runtime;
2946
2947 period = ktime_to_ns(tg->rt_bandwidth.rt_period);
2948 runtime = tg->rt_bandwidth.rt_runtime;
2949
2950 if (tg == d->tg) {
2951 period = d->rt_period;
2952 runtime = d->rt_runtime;
2953 }
2954
2955 /*
2956 * Cannot have more runtime than the period.
2957 */
2958 if (runtime > period && runtime != RUNTIME_INF)
2959 return -EINVAL;
2960
2961 /*
2962 * Ensure we don't starve existing RT tasks.
2963 */
2964 if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
2965 return -EBUSY;
2966
2967 total = to_ratio(period, runtime);
2968
2969 /*
2970 * Nobody can have more than the global setting allows.
2971 */
2972 if (total > to_ratio(global_rt_period(), global_rt_runtime()))
2973 return -EINVAL;
2974
2975 /*
2976 * The sum of our children's runtime should not exceed our own.
2977 */
2978 list_for_each_entry_rcu(child, &tg->children, siblings) {
2979 period = ktime_to_ns(child->rt_bandwidth.rt_period);
2980 runtime = child->rt_bandwidth.rt_runtime;
2981
2982 if (child == d->tg) {
2983 period = d->rt_period;
2984 runtime = d->rt_runtime;
2985 }
2986
2987 sum += to_ratio(period, runtime);
2988 }
2989
2990 if (sum > total)
2991 return -EINVAL;
2992
2993 return 0;
2994 }
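/*
 * Worked example of the to_ratio() comparison used above, written as a
 * standalone sketch.  The kernel's to_ratio() returns
 * (runtime << BW_SHIFT) / period; BW_SHIFT is assumed to be 20 here for the
 * worked numbers, so the default global limit of 950 ms runtime per 1 s
 * period becomes roughly 0.95 in fixed point.
 */
#include <stdio.h>
#include <stdint.h>

#define BW_SHIFT_SKETCH 20

static uint64_t to_ratio_sketch(uint64_t period_ns, uint64_t runtime_ns)
{
    if (!period_ns)
        return 0;
    return (runtime_ns << BW_SHIFT_SKETCH) / period_ns;
}

int main(void)
{
    uint64_t global = to_ratio_sketch(1000000000ULL, 950000000ULL);
    uint64_t group  = to_ratio_sketch(1000000000ULL, 400000000ULL);

    /* global ~= 0.95 * 2^20 ~= 996147; this group fits well below it */
    printf("global %llu, group %llu -> %s\n",
           (unsigned long long)global, (unsigned long long)group,
           group <= global ? "schedulable" : "-EINVAL");
    return 0;
}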
2995
2996 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
2997 {
2998 int ret;
2999
3000 struct rt_schedulable_data data = {
3001 .tg = tg,
3002 .rt_period = period,
3003 .rt_runtime = runtime,
3004 };
3005
3006 rcu_read_lock();
3007 ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
3008 rcu_read_unlock();
3009
3010 return ret;
3011 }
3012
3013 static int tg_set_rt_bandwidth(struct task_group *tg,
3014 u64 rt_period, u64 rt_runtime)
3015 {
3016 int i, err = 0;
3017
3018 /*
3019 * Disallowing RT runtime for the root group is BAD; it would prevent the
3020 * kernel from creating (and/or operating) RT threads.
3021 */
3022 if (tg == &root_task_group && rt_runtime == 0)
3023 return -EINVAL;
3024
3025 /* No period doesn't make any sense. */
3026 if (rt_period == 0)
3027 return -EINVAL;
3028
3029 mutex_lock(&rt_constraints_mutex);
3030 read_lock(&tasklist_lock);
3031 err = __rt_schedulable(tg, rt_period, rt_runtime);
3032 if (err)
3033 goto unlock;
3034
3035 raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
3036 tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
3037 tg->rt_bandwidth.rt_runtime = rt_runtime;
3038
3039 for_each_possible_cpu(i) {
3040 struct rt_rq *rt_rq = tg->rt_rq[i];
3041
3042 raw_spin_lock(&rt_rq->rt_runtime_lock);
3043 rt_rq->rt_runtime = rt_runtime;
3044 raw_spin_unlock(&rt_rq->rt_runtime_lock);
3045 }
3046 raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
3047 unlock:
3048 read_unlock(&tasklist_lock);
3049 mutex_unlock(&rt_constraints_mutex);
3050
3051 return err;
3052 }
3053
3054 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
3055 {
3056 u64 rt_runtime, rt_period;
3057
3058 rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
3059 rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
3060 if (rt_runtime_us < 0)
3061 rt_runtime = RUNTIME_INF;
3062 else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
3063 return -EINVAL;
3064
3065 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
3066 }
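/*
 * Worked example of the unit handling above: the group interface (e.g. the
 * cgroup cpu.rt_runtime_us file) is in microseconds, the internal bandwidth
 * fields are in nanoseconds, and a negative value selects RUNTIME_INF.
 * Standalone sketch with an assumed RUNTIME_INF encoding.
 */
#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_USEC_SKETCH 1000ULL
#define RUNTIME_INF_SKETCH   ((uint64_t)~0ULL)   /* assumed "unlimited" value */

static uint64_t rt_runtime_us_to_ns(long long rt_runtime_us)
{
    if (rt_runtime_us < 0)
        return RUNTIME_INF_SKETCH;
    return (uint64_t)rt_runtime_us * NSEC_PER_USEC_SKETCH;
}

int main(void)
{
    /* Writing 950000 us stores 950000000 ns of runtime per period. */
    printf("%llu\n", (unsigned long long)rt_runtime_us_to_ns(950000));
    /* Writing -1 selects the unlimited-runtime encoding. */
    printf("%llu\n", (unsigned long long)rt_runtime_us_to_ns(-1));
    return 0;
}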
3067
3068 long sched_group_rt_runtime(struct task_group *tg)
3069 {
3070 u64 rt_runtime_us;
3071
3072 if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
3073 return -1;
3074
3075 rt_runtime_us = tg->rt_bandwidth.rt_runtime;
3076 do_div(rt_runtime_us, NSEC_PER_USEC);
3077 return rt_runtime_us;
3078 }
3079
3080 int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
3081 {
3082 u64 rt_runtime, rt_period;
3083
3084 if (rt_period_us > U64_MAX / NSEC_PER_USEC)
3085 return -EINVAL;
3086
3087 rt_period = rt_period_us * NSEC_PER_USEC;
3088 rt_runtime = tg->rt_bandwidth.rt_runtime;
3089
3090 return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
3091 }
3092
3093 long sched_group_rt_period(struct task_group *tg)
3094 {
3095 u64 rt_period_us;
3096
3097 rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
3098 do_div(rt_period_us, NSEC_PER_USEC);
3099 return rt_period_us;
3100 }
3101
3102 static int sched_rt_global_constraints(void)
3103 {
3104 int ret = 0;
3105
3106 mutex_lock(&rt_constraints_mutex);
3107 read_lock(&tasklist_lock);
3108 ret = __rt_schedulable(NULL, 0, 0);
3109 read_unlock(&tasklist_lock);
3110 mutex_unlock(&rt_constraints_mutex);
3111
3112 return ret;
3113 }
3114
3115 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
3116 {
3117 /* Don't accept realtime tasks when there is no way for them to run */
3118 if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
3119 return 0;
3120
3121 return 1;
3122 }
3123
3124 #else /* !CONFIG_RT_GROUP_SCHED */
3125 static int sched_rt_global_constraints(void)
3126 {
3127 unsigned long flags;
3128 int i;
3129
3130 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
3131 for_each_possible_cpu(i) {
3132 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
3133
3134 raw_spin_lock(&rt_rq->rt_runtime_lock);
3135 rt_rq->rt_runtime = global_rt_runtime();
3136 raw_spin_unlock(&rt_rq->rt_runtime_lock);
3137 }
3138 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
3139
3140 return 0;
3141 }
3142 #endif /* CONFIG_RT_GROUP_SCHED */
3143
3144 static int sched_rt_global_validate(void)
3145 {
3146 if (sysctl_sched_rt_period <= 0)
3147 return -EINVAL;
3148
3149 if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
3150 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
3151 return -EINVAL;
3152
3153 return 0;
3154 }
3155
3156 static void sched_rt_do_global(void)
3157 {
3158 unsigned long flags;
3159
3160 raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
3161 def_rt_bandwidth.rt_runtime = global_rt_runtime();
3162 def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
3163 raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
3164 }
3165
3166 int sched_rt_handler(struct ctl_table *table, int write,
3167 void __user *buffer, size_t *lenp,
3168 loff_t *ppos)
3169 {
3170 int old_period, old_runtime;
3171 static DEFINE_MUTEX(mutex);
3172 int ret;
3173
3174 mutex_lock(&mutex);
3175 old_period = sysctl_sched_rt_period;
3176 old_runtime = sysctl_sched_rt_runtime;
3177
3178 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3179
3180 if (!ret && write) {
3181 ret = sched_rt_global_validate();
3182 if (ret)
3183 goto undo;
3184
3185 ret = sched_dl_global_validate();
3186 if (ret)
3187 goto undo;
3188
3189 ret = sched_rt_global_constraints();
3190 if (ret)
3191 goto undo;
3192
3193 sched_rt_do_global();
3194 sched_dl_do_global();
3195 }
3196 if (0) {
3197 undo:
3198 sysctl_sched_rt_period = old_period;
3199 sysctl_sched_rt_runtime = old_runtime;
3200 }
3201 mutex_unlock(&mutex);
3202
3203 return ret;
3204 }
3205
3206 int sched_rr_handler(struct ctl_table *table, int write,
3207 void __user *buffer, size_t *lenp,
3208 loff_t *ppos)
3209 {
3210 int ret;
3211 static DEFINE_MUTEX(mutex);
3212
3213 mutex_lock(&mutex);
3214 ret = proc_dointvec(table, write, buffer, lenp, ppos);
3215 /*
3216 * Make sure that internally we keep jiffies.
3217 * Also, writing zero resets the timeslice to default:
3218 */
3219 if (!ret && write) {
3220 sched_rr_timeslice =
3221 sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
3222 msecs_to_jiffies(sysctl_sched_rr_timeslice);
3223 }
3224 mutex_unlock(&mutex);
3225 return ret;
3226 }
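/*
 * Worked example of the RR timeslice conversion above: the sysctl is in
 * milliseconds, the scheduler keeps jiffies, and writing zero or a negative
 * value falls back to RR_TIMESLICE.  HZ and the default below are assumed
 * example values, and the conversion is a crude stand-in for
 * msecs_to_jiffies().
 */
#include <stdio.h>

#define HZ_SKETCH           250
#define RR_TIMESLICE_SKETCH (100 * HZ_SKETCH / 1000)   /* 100 ms default */

static int rr_timeslice_from_sysctl(int msecs)
{
    if (msecs <= 0)
        return RR_TIMESLICE_SKETCH;
    return msecs * HZ_SKETCH / 1000;
}

int main(void)
{
    /* sysctl kernel.sched_rr_timeslice_ms = 40 -> 10 jiffies at HZ=250 */
    printf("%d\n", rr_timeslice_from_sysctl(40));
    /* sysctl kernel.sched_rr_timeslice_ms = 0 -> default of 25 jiffies (100 ms) */
    printf("%d\n", rr_timeslice_from_sysctl(0));
    return 0;
}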
3227
3228 #ifdef CONFIG_SCHED_DEBUG
3229 void print_rt_stats(struct seq_file *m, int cpu)
3230 {
3231 rt_rq_iter_t iter;
3232 struct rt_rq *rt_rq;
3233
3234 rcu_read_lock();
3235 for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
3236 print_rt_rq(m, cpu, rt_rq);
3237 rcu_read_unlock();
3238 }
3239 #endif /* CONFIG_SCHED_DEBUG */