kernel/sched/rt.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Real-Time Scheduling Class (mapped to the SCHED_FIFO and SCHED_RR
   4  * policies)
   5  */
   6
   7 #include "sched.h"
   8
   9 #include <linux/slab.h>
  10 #include <linux/irq_work.h>
  11 #include "tune.h"
  12
  13 #include "walt.h"
  14
  15 #include <trace/events/sched.h>
  16
  17 int sched_rr_timeslice = RR_TIMESLICE;
  18 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
  19
  20
  21 void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se);
  22
  23 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
  24
  25 struct rt_bandwidth def_rt_bandwidth;
  26
  27 static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer)
  28 {
  29         struct rt_bandwidth *rt_b =
  30                 container_of(timer, struct rt_bandwidth, rt_period_timer);
  31         int idle = 0;
  32         int overrun;
  33
  34         raw_spin_lock(&rt_b->rt_runtime_lock);
  35         for (;;) {
  36                 overrun = hrtimer_forward_now(timer, rt_b->rt_period);
  37                 if (!overrun)
  38                         break;
  39
  40                 raw_spin_unlock(&rt_b->rt_runtime_lock);
  41                 idle = do_sched_rt_period_timer(rt_b, overrun);
  42                 raw_spin_lock(&rt_b->rt_runtime_lock);
  43         }
  44         if (idle)
  45                 rt_b->rt_period_active = 0;
  46         raw_spin_unlock(&rt_b->rt_runtime_lock);
  47
  48         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
  49 }
  50
  51 void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime)
  52 {
  53         rt_b->rt_period = ns_to_ktime(period);
  54         rt_b->rt_runtime = runtime;
  55
  56         raw_spin_lock_init(&rt_b->rt_runtime_lock);
  57
  58         hrtimer_init(&rt_b->rt_period_timer,
  59                         CLOCK_MONOTONIC, HRTIMER_MODE_REL);
  60         rt_b->rt_period_timer.function = sched_rt_period_timer;
  61 }
  62
  63 static inline void do_start_rt_bandwidth(struct rt_bandwidth *rt_b)
  64 {
  65         raw_spin_lock(&rt_b->rt_runtime_lock);
  66         if (!rt_b->rt_period_active) {
  67                 rt_b->rt_period_active = 1;
  68                 /*
  69                  * SCHED_DEADLINE updates the bandwidth, as a run away
  70                  * RT task with a DL task could hog a CPU. But DL does
  71                  * not reset the period. If a deadline task was running
  72                  * without an RT task running, it can cause RT tasks to
  73                  * throttle when they start up. Kick the timer right away
  74                  * to update the period.
  75                  */
  76                 hrtimer_forward_now(&rt_b->rt_period_timer, ns_to_ktime(0));
  77                 hrtimer_start_expires(&rt_b->rt_period_timer, HRTIMER_MODE_ABS_PINNED);
  78         }
  79         raw_spin_unlock(&rt_b->rt_runtime_lock);
  80 }
  81
  82 static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
  83 {
  84         if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
  85                 return;
  86
  87         do_start_rt_bandwidth(rt_b);
  88 }
  89
  90 void init_rt_rq(struct rt_rq *rt_rq)
  91 {
  92         struct rt_prio_array *array;
  93         int i;
  94
  95         array = &rt_rq->active;
  96         for (i = 0; i < MAX_RT_PRIO; i++) {
  97                 INIT_LIST_HEAD(array->queue + i);
  98                 __clear_bit(i, array->bitmap);
  99         }
 100         /* delimiter for bitsearch: */
 101         __set_bit(MAX_RT_PRIO, array->bitmap);
 102
 103 #if defined CONFIG_SMP
 104         rt_rq->highest_prio.curr = MAX_RT_PRIO;
 105         rt_rq->highest_prio.next = MAX_RT_PRIO;
 106         rt_rq->rt_nr_migratory = 0;
 107         rt_rq->overloaded = 0;
 108         plist_head_init(&rt_rq->pushable_tasks);
 109         atomic_long_set(&rt_rq->removed_util_avg, 0);
 110         atomic_long_set(&rt_rq->removed_load_avg, 0);
 111 #endif /* CONFIG_SMP */
 112         /* We start is dequeued state, because no RT tasks are queued */
 113         rt_rq->rt_queued = 0;
 114
 115         rt_rq->rt_time = 0;
 116         rt_rq->rt_throttled = 0;
 117         rt_rq->rt_runtime = 0;
 118         raw_spin_lock_init(&rt_rq->rt_runtime_lock);
 119 }
 120
 121 #ifdef CONFIG_RT_GROUP_SCHED
 122 static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b)
 123 {
 124         hrtimer_cancel(&rt_b->rt_period_timer);
 125 }
 126
 127 #define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
 128
 129 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 130 {
 131 #ifdef CONFIG_SCHED_DEBUG
 132         WARN_ON_ONCE(!rt_entity_is_task(rt_se));
 133 #endif
 134         return container_of(rt_se, struct task_struct, rt);
 135 }
 136
 137 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 138 {
 139         return rt_rq->rq;
 140 }
 141
 142 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 143 {
 144         return rt_se->rt_rq;
 145 }
 146
 147 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 148 {
 149         struct rt_rq *rt_rq = rt_se->rt_rq;
 150
 151         return rt_rq->rq;
 152 }
 153
 154 void free_rt_sched_group(struct task_group *tg)
 155 {
 156         int i;
 157
 158         if (tg->rt_se)
 159                 destroy_rt_bandwidth(&tg->rt_bandwidth);
 160
 161         for_each_possible_cpu(i) {
 162                 if (tg->rt_rq)
 163                         kfree(tg->rt_rq[i]);
 164                 if (tg->rt_se)
 165                         kfree(tg->rt_se[i]);
 166         }
 167
 168         kfree(tg->rt_rq);
 169         kfree(tg->rt_se);
 170 }
 171
 172 void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
 173                 struct sched_rt_entity *rt_se, int cpu,
 174                 struct sched_rt_entity *parent)
 175 {
 176         struct rq *rq = cpu_rq(cpu);
 177
 178         rt_rq->highest_prio.curr = MAX_RT_PRIO;
 179         rt_rq->rt_nr_boosted = 0;
 180         rt_rq->rq = rq;
 181         rt_rq->tg = tg;
 182
 183         tg->rt_rq[cpu] = rt_rq;
 184         tg->rt_se[cpu] = rt_se;
 185
 186         if (!rt_se)
 187                 return;
 188
 189         if (!parent)
 190                 rt_se->rt_rq = &rq->rt;
 191         else
 192                 rt_se->rt_rq = parent->my_q;
 193
 194         rt_se->my_q = rt_rq;
 195         rt_se->parent = parent;
 196         INIT_LIST_HEAD(&rt_se->run_list);
 197 }
 198
 199 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 200 {
 201         struct rt_rq *rt_rq;
 202         struct sched_rt_entity *rt_se;
 203         int i;
 204
 205         tg->rt_rq = kzalloc(sizeof(rt_rq) * nr_cpu_ids, GFP_KERNEL);
 206         if (!tg->rt_rq)
 207                 goto err;
 208         tg->rt_se = kzalloc(sizeof(rt_se) * nr_cpu_ids, GFP_KERNEL);
 209         if (!tg->rt_se)
 210                 goto err;
 211
 212         init_rt_bandwidth(&tg->rt_bandwidth,
 213                         ktime_to_ns(def_rt_bandwidth.rt_period), 0);
 214
 215         for_each_possible_cpu(i) {
 216                 rt_rq = kzalloc_node(sizeof(struct rt_rq),
 217                                      GFP_KERNEL, cpu_to_node(i));
 218                 if (!rt_rq)
 219                         goto err;
 220
 221                 rt_se = kzalloc_node(sizeof(struct sched_rt_entity),
 222                                      GFP_KERNEL, cpu_to_node(i));
 223                 if (!rt_se)
 224                         goto err_free_rq;
 225
 226                 init_rt_rq(rt_rq);
 227                 rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
 228                 init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
 229                 init_rt_entity_runnable_average(rt_se);
 230         }
 231
 232         return 1;
 233
 234 err_free_rq:
 235         kfree(rt_rq);
 236 err:
 237         return 0;
 238 }
 239
 240 #else /* CONFIG_RT_GROUP_SCHED */
 241
 242 #define rt_entity_is_task(rt_se) (1)
 243
 244 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
 245 {
 246         return container_of(rt_se, struct task_struct, rt);
 247 }
 248
 249 static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
 250 {
 251         return container_of(rt_rq, struct rq, rt);
 252 }
 253
 254 static inline struct rq *rq_of_rt_se(struct sched_rt_entity *rt_se)
 255 {
 256         struct task_struct *p = rt_task_of(rt_se);
 257
 258         return task_rq(p);
 259 }
 260
 261 static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
 262 {
 263         struct rq *rq = rq_of_rt_se(rt_se);
 264
 265         return &rq->rt;
 266 }
 267
 268 void free_rt_sched_group(struct task_group *tg) { }
 269
 270 int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 271 {
 272         return 1;
 273 }
 274 #endif /* CONFIG_RT_GROUP_SCHED */
 275
 276 #ifdef CONFIG_SMP
 277
 278 #include "sched-pelt.h"
 279 #define entity_is_task(se)      (!se->my_q)
 280
 281 extern u64 decay_load(u64 val, u64 n);
 282
 283 static u32 __accumulate_pelt_segments_rt(u64 periods, u32 d1, u32 d3)
 284 {
 285         u32 c1, c2, c3 = d3;
 286
 287         c1 = decay_load((u64)d1, periods);
 288
 289         c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
 290
 291         return c1 + c2 + c3;
 292 }
 293
 294 #define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
 295
 296 static __always_inline u32
 297 accumulate_sum_rt(u64 delta, int cpu, struct sched_avg *sa,
 298                unsigned long weight, int running)
 299 {
 300         unsigned long scale_freq, scale_cpu;
 301         u32 contrib = (u32)delta;
 302         u64 periods;
 303
 304         scale_freq = arch_scale_freq_capacity(NULL, cpu);
 305         scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
 306
 307         delta += sa->period_contrib;
 308         periods = delta / 1024;
 309
 310         if (periods) {
 311                 sa->load_sum = decay_load(sa->load_sum, periods);
 312                 sa->util_sum = decay_load((u64)(sa->util_sum), periods);
 313
 314                 delta %= 1024;
 315                 contrib = __accumulate_pelt_segments_rt(periods,
 316                                 1024 - sa->period_contrib, delta);
 317         }
 318         sa->period_contrib = delta;
 319
 320         contrib = cap_scale(contrib, scale_freq);
 321         if (weight) {
 322                 sa->load_sum += weight * contrib;
 323         }
 324         if (running)
 325                 sa->util_sum += contrib * scale_cpu;
 326
 327         return periods;
 328 }
 329
 330 /*
 331  * We can represent the historical contribution to runnable average as the
 332  * coefficients of a geometric series, exactly like fair task load.
 333  * refer the ___update_load_avg @ fair sched class
 334  */
 335 static __always_inline int
 336 __update_load_avg(u64 now, int cpu, struct sched_avg *sa,
 337         unsigned long weight, int running, struct rt_rq *rt_rq)
 338 {
 339         u64 delta;
 340
 341         delta = now - sa->last_update_time;
 342
 343         if ((s64)delta < 0) {
 344                 sa->last_update_time = now;
 345                 return 0;
 346         }
 347
 348         delta >>= 10;
 349         if (!delta)
 350                 return 0;
 351
 352         sa->last_update_time += delta << 10;
 353
 354         if (!weight)
 355                 running = 0;
 356
 357         if (!accumulate_sum_rt(delta, cpu, sa, weight, running))
 358                 return 0;
 359
 360         sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
 361         sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
 362
 363         return 1;
 364 }
 365
 366 static void pull_rt_task(struct rq *this_rq);
 367
 368 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 369 {
 370         /* Try to pull RT tasks here if we lower this rq's prio */
 371         return rq->rt.highest_prio.curr > prev->prio;
 372 }
 373
 374 static inline int rt_overloaded(struct rq *rq)
 375 {
 376         return atomic_read(&rq->rd->rto_count);
 377 }
 378
 379 static inline void rt_set_overload(struct rq *rq)
 380 {
 381         if (!rq->online)
 382                 return;
 383
 384         cpumask_set_cpu(rq->cpu, rq->rd->rto_mask);
 385         /*
 386          * Make sure the mask is visible before we set
 387          * the overload count. That is checked to determine
 388          * if we should look at the mask. It would be a shame
 389          * if we looked at the mask, but the mask was not
 390          * updated yet.
 391          *
 392          * Matched by the barrier in pull_rt_task().
 393          */
 394         smp_wmb();
 395         atomic_inc(&rq->rd->rto_count);
 396 }
 397
 398 static inline void rt_clear_overload(struct rq *rq)
 399 {
 400         if (!rq->online)
 401                 return;
 402
 403         /* the order here really doesn't matter */
 404         atomic_dec(&rq->rd->rto_count);
 405         cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 406 }
 407
 408 static void update_rt_migration(struct rt_rq *rt_rq)
 409 {
 410         if (rt_rq->rt_nr_migratory && rt_rq->rt_nr_total > 1) {
 411                 if (!rt_rq->overloaded) {
 412                         rt_set_overload(rq_of_rt_rq(rt_rq));
 413                         rt_rq->overloaded = 1;
 414                 }
 415         } else if (rt_rq->overloaded) {
 416                 rt_clear_overload(rq_of_rt_rq(rt_rq));
 417                 rt_rq->overloaded = 0;
 418         }
 419 }
 420
 421 static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 422 {
 423         struct task_struct *p;
 424
 425         if (!rt_entity_is_task(rt_se))
 426                 return;
 427
 428         p = rt_task_of(rt_se);
 429         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 430
 431         rt_rq->rt_nr_total++;
 432         if (p->nr_cpus_allowed > 1)
 433                 rt_rq->rt_nr_migratory++;
 434
 435         update_rt_migration(rt_rq);
 436 }
 437
 438 static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 439 {
 440         struct task_struct *p;
 441
 442         if (!rt_entity_is_task(rt_se))
 443                 return;
 444
 445         p = rt_task_of(rt_se);
 446         rt_rq = &rq_of_rt_rq(rt_rq)->rt;
 447
 448         rt_rq->rt_nr_total--;
 449         if (p->nr_cpus_allowed > 1)
 450                 rt_rq->rt_nr_migratory--;
 451
 452         update_rt_migration(rt_rq);
 453 }
 454
 455 static inline int has_pushable_tasks(struct rq *rq)
 456 {
 457         return !plist_head_empty(&rq->rt.pushable_tasks);
 458 }
 459
 460 static DEFINE_PER_CPU(struct callback_head, rt_push_head);
 461 static DEFINE_PER_CPU(struct callback_head, rt_pull_head);
 462
 463 static void push_rt_tasks(struct rq *);
 464 static void pull_rt_task(struct rq *);
 465
 466 static inline void queue_push_tasks(struct rq *rq)
 467 {
 468         if (!has_pushable_tasks(rq))
 469                 return;
 470
 471         queue_balance_callback(rq, &per_cpu(rt_push_head, rq->cpu), push_rt_tasks);
 472 }
 473
 474 static inline void queue_pull_task(struct rq *rq)
 475 {
 476         queue_balance_callback(rq, &per_cpu(rt_pull_head, rq->cpu), pull_rt_task);
 477 }
 478
 479 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 480 {
 481         plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 482         plist_node_init(&p->pushable_tasks, p->prio);
 483         plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
 484
 485         /* Update the highest prio pushable task */
 486         if (p->prio < rq->rt.highest_prio.next)
 487                 rq->rt.highest_prio.next = p->prio;
 488 }
 489
 490 static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 491 {
 492         plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
 493
 494         /* Update the new highest prio pushable task */
 495         if (has_pushable_tasks(rq)) {
 496                 p = plist_first_entry(&rq->rt.pushable_tasks,
 497                                       struct task_struct, pushable_tasks);
 498                 rq->rt.highest_prio.next = p->prio;
 499         } else
 500                 rq->rt.highest_prio.next = MAX_RT_PRIO;
 501 }
 502
 503 #else
 504
 505 static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 506 {
 507 }
 508
 509 static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 510 {
 511 }
 512
 513 static inline
 514 void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 515 {
 516 }
 517
 518 static inline
 519 void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 520 {
 521 }
 522
 523 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
 524 {
 525         return false;
 526 }
 527
 528 static inline void pull_rt_task(struct rq *this_rq)
 529 {
 530 }
 531
 532 static inline void queue_push_tasks(struct rq *rq)
 533 {
 534 }
 535 #endif /* CONFIG_SMP */
 536
 537 static void enqueue_top_rt_rq(struct rt_rq *rt_rq);
 538 static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
 539
 540 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 541 {
 542         return rt_se->on_rq;
 543 }
 544
 545 #ifdef CONFIG_RT_GROUP_SCHED
 546
 547 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 548 {
 549         if (!rt_rq->tg)
 550                 return RUNTIME_INF;
 551
 552         return rt_rq->rt_runtime;
 553 }
 554
 555 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 556 {
 557         return ktime_to_ns(rt_rq->tg->rt_bandwidth.rt_period);
 558 }
 559
 560 typedef struct task_group *rt_rq_iter_t;
 561
 562 static inline struct task_group *next_task_group(struct task_group *tg)
 563 {
 564         do {
 565                 tg = list_entry_rcu(tg->list.next,
 566                         typeof(struct task_group), list);
 567         } while (&tg->list != &task_groups && task_group_is_autogroup(tg));
 568
 569         if (&tg->list == &task_groups)
 570                 tg = NULL;
 571
 572         return tg;
 573 }
 574
 575 #define for_each_rt_rq(rt_rq, iter, rq)                                 \
 576         for (iter = container_of(&task_groups, typeof(*iter), list);    \
 577                 (iter = next_task_group(iter)) &&                       \
 578                 (rt_rq = iter->rt_rq[cpu_of(rq)]);)
 579
 580 #define for_each_sched_rt_entity(rt_se) \
 581         for (; rt_se; rt_se = rt_se->parent)
 582
 583 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 584 {
 585         return rt_se->my_q;
 586 }
 587
 588 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 589 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 590
 591 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 592 {
 593         struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
 594         struct rq *rq = rq_of_rt_rq(rt_rq);
 595         struct sched_rt_entity *rt_se;
 596
 597         int cpu = cpu_of(rq);
 598
 599         rt_se = rt_rq->tg->rt_se[cpu];
 600
 601         if (rt_rq->rt_nr_running) {
 602                 if (!rt_se)
 603                         enqueue_top_rt_rq(rt_rq);
 604                 else if (!on_rt_rq(rt_se))
 605                         enqueue_rt_entity(rt_se, 0);
 606
 607                 if (rt_rq->highest_prio.curr < curr->prio)
 608                         resched_curr(rq);
 609         }
 610 }
 611
 612 static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 613 {
 614         struct sched_rt_entity *rt_se;
 615         int cpu = cpu_of(rq_of_rt_rq(rt_rq));
 616
 617         rt_se = rt_rq->tg->rt_se[cpu];
 618
 619         if (!rt_se)
 620                 dequeue_top_rt_rq(rt_rq);
 621         else if (on_rt_rq(rt_se))
 622                 dequeue_rt_entity(rt_se, 0);
 623 }
 624
 625 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 626 {
 627         return rt_rq->rt_throttled && !rt_rq->rt_nr_boosted;
 628 }
 629
 630 static int rt_se_boosted(struct sched_rt_entity *rt_se)
 631 {
 632         struct rt_rq *rt_rq = group_rt_rq(rt_se);
 633         struct task_struct *p;
 634
 635         if (rt_rq)
 636                 return !!rt_rq->rt_nr_boosted;
 637
 638         p = rt_task_of(rt_se);
 639         return p->prio != p->normal_prio;
 640 }
 641
 642 #ifdef CONFIG_SMP
 643 static inline const struct cpumask *sched_rt_period_mask(void)
 644 {
 645         return this_rq()->rd->span;
 646 }
 647 #else
 648 static inline const struct cpumask *sched_rt_period_mask(void)
 649 {
 650         return cpu_online_mask;
 651 }
 652 #endif
 653
 654 static inline
 655 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 656 {
 657         return container_of(rt_b, struct task_group, rt_bandwidth)->rt_rq[cpu];
 658 }
 659
 660 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 661 {
 662         return &rt_rq->tg->rt_bandwidth;
 663 }
 664
 665 #else /* !CONFIG_RT_GROUP_SCHED */
 666
 667 static inline u64 sched_rt_runtime(struct rt_rq *rt_rq)
 668 {
 669         return rt_rq->rt_runtime;
 670 }
 671
 672 static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 673 {
 674         return ktime_to_ns(def_rt_bandwidth.rt_period);
 675 }
 676
 677 typedef struct rt_rq *rt_rq_iter_t;
 678
 679 #define for_each_rt_rq(rt_rq, iter, rq) \
 680         for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 681
 682 #define for_each_sched_rt_entity(rt_se) \
 683         for (; rt_se; rt_se = NULL)
 684
 685 static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
 686 {
 687         return NULL;
 688 }
 689
 690 static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 691 {
 692         struct rq *rq = rq_of_rt_rq(rt_rq);
 693
 694         if (!rt_rq->rt_nr_running)
 695                 return;
 696
 697         enqueue_top_rt_rq(rt_rq);
 698         resched_curr(rq);
 699 }
 700
 701 static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
 702 {
 703         dequeue_top_rt_rq(rt_rq);
 704 }
 705
 706 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
 707 {
 708         return rt_rq->rt_throttled;
 709 }
 710
 711 static inline const struct cpumask *sched_rt_period_mask(void)
 712 {
 713         return cpu_online_mask;
 714 }
 715
 716 static inline
 717 struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu)
 718 {
 719         return &cpu_rq(cpu)->rt;
 720 }
 721
 722 static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq)
 723 {
 724         return &def_rt_bandwidth;
 725 }
 726
 727 #endif /* CONFIG_RT_GROUP_SCHED */
 728
 729 bool sched_rt_bandwidth_account(struct rt_rq *rt_rq)
 730 {
 731         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 732
 733         return (hrtimer_active(&rt_b->rt_period_timer) ||
 734                 rt_rq->rt_time < rt_b->rt_runtime);
 735 }
 736
 737 #ifdef CONFIG_SMP
 738 /*
 739  * We ran out of runtime, see if we can borrow some from our neighbours.
 740  */
 741 static void do_balance_runtime(struct rt_rq *rt_rq)
 742 {
 743         struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 744         struct root_domain *rd = rq_of_rt_rq(rt_rq)->rd;
 745         int i, weight;
 746         u64 rt_period;
 747
 748         weight = cpumask_weight(rd->span);
 749
 750         raw_spin_lock(&rt_b->rt_runtime_lock);
 751         rt_period = ktime_to_ns(rt_b->rt_period);
 752         for_each_cpu(i, rd->span) {
 753                 struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 754                 s64 diff;
 755
 756                 if (iter == rt_rq)
 757                         continue;
 758
 759                 raw_spin_lock(&iter->rt_runtime_lock);
 760                 /*
 761                  * Either all rqs have inf runtime and there's nothing to steal
 762                  * or __disable_runtime() below sets a specific rq to inf to
 763                  * indicate its been disabled and disalow stealing.
 764                  */
 765                 if (iter->rt_runtime == RUNTIME_INF)
 766                         goto next;
 767
 768                 /*
 769                  * From runqueues with spare time, take 1/n part of their
 770                  * spare time, but no more than our period.
 771                  */
 772                 diff = iter->rt_runtime - iter->rt_time;
 773                 if (diff > 0) {
 774                         diff = div_u64((u64)diff, weight);
 775                         if (rt_rq->rt_runtime + diff > rt_period)
 776                                 diff = rt_period - rt_rq->rt_runtime;
 777                         iter->rt_runtime -= diff;
 778                         rt_rq->rt_runtime += diff;
 779                         if (rt_rq->rt_runtime == rt_period) {
 780                                 raw_spin_unlock(&iter->rt_runtime_lock);
 781                                 break;
 782                         }
 783                 }
 784 next:
 785                 raw_spin_unlock(&iter->rt_runtime_lock);
 786         }
 787         raw_spin_unlock(&rt_b->rt_runtime_lock);
 788 }
 789
 790 /*
 791  * Ensure this RQ takes back all the runtime it lend to its neighbours.
 792  */
 793 static void __disable_runtime(struct rq *rq)
 794 {
 795         struct root_domain *rd = rq->rd;
 796         rt_rq_iter_t iter;
 797         struct rt_rq *rt_rq;
 798
 799         if (unlikely(!scheduler_running))
 800                 return;
 801
 802         for_each_rt_rq(rt_rq, iter, rq) {
 803                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 804                 s64 want;
 805                 int i;
 806
 807                 raw_spin_lock(&rt_b->rt_runtime_lock);
 808                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 809                 /*
 810                  * Either we're all inf and nobody needs to borrow, or we're
 811                  * already disabled and thus have nothing to do, or we have
 812                  * exactly the right amount of runtime to take out.
 813                  */
 814                 if (rt_rq->rt_runtime == RUNTIME_INF ||
 815                                 rt_rq->rt_runtime == rt_b->rt_runtime)
 816                         goto balanced;
 817                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 818
 819                 /*
 820                  * Calculate the difference between what we started out with
 821                  * and what we current have, that's the amount of runtime
 822                  * we lend and now have to reclaim.
 823                  */
 824                 want = rt_b->rt_runtime - rt_rq->rt_runtime;
 825
 826                 /*
 827                  * Greedy reclaim, take back as much as we can.
 828                  */
 829                 for_each_cpu(i, rd->span) {
 830                         struct rt_rq *iter = sched_rt_period_rt_rq(rt_b, i);
 831                         s64 diff;
 832
 833                         /*
 834                          * Can't reclaim from ourselves or disabled runqueues.
 835                          */
 836                         if (iter == rt_rq || iter->rt_runtime == RUNTIME_INF)
 837                                 continue;
 838
 839                         raw_spin_lock(&iter->rt_runtime_lock);
 840                         if (want > 0) {
 841                                 diff = min_t(s64, iter->rt_runtime, want);
 842                                 iter->rt_runtime -= diff;
 843                                 want -= diff;
 844                         } else {
 845                                 iter->rt_runtime -= want;
 846                                 want -= want;
 847                         }
 848                         raw_spin_unlock(&iter->rt_runtime_lock);
 849
 850                         if (!want)
 851                                 break;
 852                 }
 853
 854                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 855                 /*
 856                  * We cannot be left wanting - that would mean some runtime
 857                  * leaked out of the system.
 858                  */
 859                 BUG_ON(want);
 860 balanced:
 861                 /*
 862                  * Disable all the borrow logic by pretending we have inf
 863                  * runtime - in which case borrowing doesn't make sense.
 864                  */
 865                 rt_rq->rt_runtime = RUNTIME_INF;
 866                 rt_rq->rt_throttled = 0;
 867                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 868                 raw_spin_unlock(&rt_b->rt_runtime_lock);
 869
 870                 /* Make rt_rq available for pick_next_task() */
 871                 sched_rt_rq_enqueue(rt_rq);
 872         }
 873 }
 874
 875 static void __enable_runtime(struct rq *rq)
 876 {
 877         rt_rq_iter_t iter;
 878         struct rt_rq *rt_rq;
 879
 880         if (unlikely(!scheduler_running))
 881                 return;
 882
 883         /*
 884          * Reset each runqueue's bandwidth settings
 885          */
 886         for_each_rt_rq(rt_rq, iter, rq) {
 887                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
 888
 889                 raw_spin_lock(&rt_b->rt_runtime_lock);
 890                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 891                 rt_rq->rt_runtime = rt_b->rt_runtime;
 892                 rt_rq->rt_time = 0;
 893                 rt_rq->rt_throttled = 0;
 894                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 895                 raw_spin_unlock(&rt_b->rt_runtime_lock);
 896         }
 897 }
 898
 899 static void balance_runtime(struct rt_rq *rt_rq)
 900 {
 901         if (!sched_feat(RT_RUNTIME_SHARE))
 902                 return;
 903
 904         if (rt_rq->rt_time > rt_rq->rt_runtime) {
 905                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 906                 do_balance_runtime(rt_rq);
 907                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 908         }
 909 }
 910 #else /* !CONFIG_SMP */
 911 static inline void balance_runtime(struct rt_rq *rt_rq) {}
 912 #endif /* CONFIG_SMP */
 913
 914 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun)
 915 {
 916         int i, idle = 1, throttled = 0;
 917         const struct cpumask *span;
 918
 919         span = sched_rt_period_mask();
 920 #ifdef CONFIG_RT_GROUP_SCHED
 921         /*
 922          * FIXME: isolated CPUs should really leave the root task group,
 923          * whether they are isolcpus or were isolated via cpusets, lest
 924          * the timer run on a CPU which does not service all runqueues,
 925          * potentially leaving other CPUs indefinitely throttled.  If
 926          * isolation is really required, the user will turn the throttle
 927          * off to kill the perturbations it causes anyway.  Meanwhile,
 928          * this maintains functionality for boot and/or troubleshooting.
 929          */
 930         if (rt_b == &root_task_group.rt_bandwidth)
 931                 span = cpu_online_mask;
 932 #endif
 933         for_each_cpu(i, span) {
 934                 int enqueue = 0;
 935                 struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i);
 936                 struct rq *rq = rq_of_rt_rq(rt_rq);
 937                 int skip;
 938
 939                 /*
 940                  * When span == cpu_online_mask, taking each rq->lock
 941                  * can be time-consuming. Try to avoid it when possible.
 942                  */
 943                 raw_spin_lock(&rt_rq->rt_runtime_lock);
 944                 if (!sched_feat(RT_RUNTIME_SHARE) && rt_rq->rt_runtime != RUNTIME_INF)
 945                         rt_rq->rt_runtime = rt_b->rt_runtime;
 946                 skip = !rt_rq->rt_time && !rt_rq->rt_nr_running;
 947                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
 948                 if (skip)
 949                         continue;
 950
 951                 raw_spin_lock(&rq->lock);
 952                 update_rq_clock(rq);
 953
 954                 if (rt_rq->rt_time) {
 955                         u64 runtime;
 956
 957                         raw_spin_lock(&rt_rq->rt_runtime_lock);
 958                         if (rt_rq->rt_throttled)
 959                                 balance_runtime(rt_rq);
 960                         runtime = rt_rq->rt_runtime;
 961                         rt_rq->rt_time -= min(rt_rq->rt_time, overrun*runtime);
 962                         if (rt_rq->rt_throttled && rt_rq->rt_time < runtime) {
 963                                 rt_rq->rt_throttled = 0;
 964                                 enqueue = 1;
 965
 966                                 /*
 967                                  * When we're idle and a woken (rt) task is
 968                                  * throttled check_preempt_curr() will set
 969                                  * skip_update and the time between the wakeup
 970                                  * and this unthrottle will get accounted as
 971                                  * 'runtime'.
 972                                  */
 973                                 if (rt_rq->rt_nr_running && rq->curr == rq->idle)
 974                                         rq_clock_skip_update(rq, false);
 975                         }
 976                         if (rt_rq->rt_time || rt_rq->rt_nr_running)
 977                                 idle = 0;
 978                         raw_spin_unlock(&rt_rq->rt_runtime_lock);
 979                 } else if (rt_rq->rt_nr_running) {
 980                         idle = 0;
 981                         if (!rt_rq_throttled(rt_rq))
 982                                 enqueue = 1;
 983                 }
 984                 if (rt_rq->rt_throttled)
 985                         throttled = 1;
 986
 987                 if (enqueue)
 988                         sched_rt_rq_enqueue(rt_rq);
 989                 raw_spin_unlock(&rq->lock);
 990         }
 991
 992         if (!throttled && (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF))
 993                 return 1;
 994
 995         return idle;
 996 }
 997
 998 static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 999 {
1000 #ifdef CONFIG_RT_GROUP_SCHED
1001         struct rt_rq *rt_rq = group_rt_rq(rt_se);
1002
1003         if (rt_rq)
1004                 return rt_rq->highest_prio.curr;
1005 #endif
1006
1007         return rt_task_of(rt_se)->prio;
1008 }
1009
1010 static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq)
1011 {
1012         u64 runtime = sched_rt_runtime(rt_rq);
1013
1014         if (rt_rq->rt_throttled)
1015                 return rt_rq_throttled(rt_rq);
1016
1017         if (runtime >= sched_rt_period(rt_rq))
1018                 return 0;
1019
1020         balance_runtime(rt_rq);
1021         runtime = sched_rt_runtime(rt_rq);
1022         if (runtime == RUNTIME_INF)
1023                 return 0;
1024
1025         if (rt_rq->rt_time > runtime) {
1026                 struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq);
1027
1028                 /*
1029                  * Don't actually throttle groups that have no runtime assigned
1030                  * but accrue some time due to boosting.
1031                  */
1032                 if (likely(rt_b->rt_runtime)) {
1033                         rt_rq->rt_throttled = 1;
1034                         printk_deferred_once("sched: RT throttling activated\n");
1035                 } else {
1036                         /*
1037                          * In case we did anyway, make it go away,
1038                          * replenishment is a joke, since it will replenish us
1039                          * with exactly 0 ns.
1040                          */
1041                         rt_rq->rt_time = 0;
1042                 }
1043
1044                 if (rt_rq_throttled(rt_rq)) {
1045                         sched_rt_rq_dequeue(rt_rq);
1046                         return 1;
1047                 }
1048         }
1049
1050         return 0;
1051 }
1052
1053 /*
1054  * Update the current task's runtime statistics. Skip current tasks that
1055  * are not in our scheduling class.
1056  */
1057 static void update_curr_rt(struct rq *rq)
1058 {
1059         struct task_struct *curr = rq->curr;
1060         struct sched_rt_entity *rt_se = &curr->rt;
1061         u64 delta_exec;
1062
1063         if (curr->sched_class != &rt_sched_class)
1064                 return;
1065
1066         delta_exec = rq_clock_task(rq) - curr->se.exec_start;
1067         if (unlikely((s64)delta_exec <= 0))
1068                 return;
1069
1070         /* Kick cpufreq (see the comment in kernel/sched/sched.h). */
1071         cpufreq_update_util(rq, SCHED_CPUFREQ_RT);
1072
1073         schedstat_set(curr->se.statistics.exec_max,
1074                       max(curr->se.statistics.exec_max, delta_exec));
1075
1076         curr->se.sum_exec_runtime += delta_exec;
1077         account_group_exec_runtime(curr, delta_exec);
1078
1079         curr->se.exec_start = rq_clock_task(rq);
1080         cpuacct_charge(curr, delta_exec);
1081
1082         sched_rt_avg_update(rq, delta_exec);
1083
1084         if (!rt_bandwidth_enabled())
1085                 return;
1086
1087         for_each_sched_rt_entity(rt_se) {
1088                 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1089                 int exceeded;
1090
1091                 if (sched_rt_runtime(rt_rq) != RUNTIME_INF) {
1092                         raw_spin_lock(&rt_rq->rt_runtime_lock);
1093                         rt_rq->rt_time += delta_exec;
1094                         exceeded = sched_rt_runtime_exceeded(rt_rq);
1095                         if (exceeded)
1096                                 resched_curr(rq);
1097                         raw_spin_unlock(&rt_rq->rt_runtime_lock);
1098                         if (exceeded)
1099                                 do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq));
1100                 }
1101         }
1102 }
1103
1104 static void
1105 dequeue_top_rt_rq(struct rt_rq *rt_rq)
1106 {
1107         struct rq *rq = rq_of_rt_rq(rt_rq);
1108
1109         BUG_ON(&rq->rt != rt_rq);
1110
1111         if (!rt_rq->rt_queued)
1112                 return;
1113
1114         BUG_ON(!rq->nr_running);
1115
1116         sub_nr_running(rq, rt_rq->rt_nr_running);
1117         rt_rq->rt_queued = 0;
1118 }
1119
1120 static void
1121 enqueue_top_rt_rq(struct rt_rq *rt_rq)
1122 {
1123         struct rq *rq = rq_of_rt_rq(rt_rq);
1124
1125         BUG_ON(&rq->rt != rt_rq);
1126
1127         if (rt_rq->rt_queued)
1128                 return;
1129         if (rt_rq_throttled(rt_rq) || !rt_rq->rt_nr_running)
1130                 return;
1131
1132         add_nr_running(rq, rt_rq->rt_nr_running);
1133         rt_rq->rt_queued = 1;
1134 }
1135
1136 #if defined CONFIG_SMP
1137
1138 static void
1139 inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1140 {
1141         struct rq *rq = rq_of_rt_rq(rt_rq);
1142
1143 #ifdef CONFIG_RT_GROUP_SCHED
1144         /*
1145          * Change rq's cpupri only if rt_rq is the top queue.
1146          */
1147         if (&rq->rt != rt_rq)
1148                 return;
1149 #endif
1150         if (rq->online && prio < prev_prio)
1151                 cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
1152 }
1153
1154 static void
1155 dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
1156 {
1157         struct rq *rq = rq_of_rt_rq(rt_rq);
1158
1159 #ifdef CONFIG_RT_GROUP_SCHED
1160         /*
1161          * Change rq's cpupri only if rt_rq is the top queue.
1162          */
1163         if (&rq->rt != rt_rq)
1164                 return;
1165 #endif
1166         if (rq->online && rt_rq->highest_prio.curr != prev_prio)
1167                 cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
1168 }
1169
1170 #else /* CONFIG_SMP */
1171
1172 static inline
1173 void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1174 static inline
1175 void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
1176
1177 #endif /* CONFIG_SMP */
1178
1179 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
1180 static void
1181 inc_rt_prio(struct rt_rq *rt_rq, int prio)
1182 {
1183         int prev_prio = rt_rq->highest_prio.curr;
1184
1185         if (prio < prev_prio)
1186                 rt_rq->highest_prio.curr = prio;
1187
1188         inc_rt_prio_smp(rt_rq, prio, prev_prio);
1189 }
1190
1191 static void
1192 dec_rt_prio(struct rt_rq *rt_rq, int prio)
1193 {
1194         int prev_prio = rt_rq->highest_prio.curr;
1195
1196         if (rt_rq->rt_nr_running) {
1197
1198                 WARN_ON(prio < prev_prio);
1199
1200                 /*
1201                  * This may have been our highest task, and therefore
1202                  * we may have some recomputation to do
1203                  */
1204                 if (prio == prev_prio) {
1205                         struct rt_prio_array *array = &rt_rq->active;
1206
1207                         rt_rq->highest_prio.curr =
1208                                 sched_find_first_bit(array->bitmap);
1209                 }
1210
1211         } else
1212                 rt_rq->highest_prio.curr = MAX_RT_PRIO;
1213
1214         dec_rt_prio_smp(rt_rq, prio, prev_prio);
1215 }
1216
1217 #else
1218
1219 static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
1220 static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
1221
1222 #endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
1223
1224 #ifdef CONFIG_RT_GROUP_SCHED
1225
1226 static void
1227 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1228 {
1229         if (rt_se_boosted(rt_se))
1230                 rt_rq->rt_nr_boosted++;
1231
1232         if (rt_rq->tg)
1233                 start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
1234 }
1235
1236 static void
1237 dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1238 {
1239         if (rt_se_boosted(rt_se))
1240                 rt_rq->rt_nr_boosted--;
1241
1242         WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
1243 }
1244
1245 #else /* CONFIG_RT_GROUP_SCHED */
1246
1247 static void
1248 inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1249 {
1250         start_rt_bandwidth(&def_rt_bandwidth);
1251 }
1252
1253 static inline
1254 void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
1255
1256 #endif /* CONFIG_RT_GROUP_SCHED */
1257
1258 static inline
1259 unsigned int rt_se_nr_running(struct sched_rt_entity *rt_se)
1260 {
1261         struct rt_rq *group_rq = group_rt_rq(rt_se);
1262
1263         if (group_rq)
1264                 return group_rq->rt_nr_running;
1265         else
1266                 return 1;
1267 }
1268
1269 static inline
1270 unsigned int rt_se_rr_nr_running(struct sched_rt_entity *rt_se)
1271 {
1272         struct rt_rq *group_rq = group_rt_rq(rt_se);
1273         struct task_struct *tsk;
1274
1275         if (group_rq)
1276                 return group_rq->rr_nr_running;
1277
1278         tsk = rt_task_of(rt_se);
1279
1280         return (tsk->policy == SCHED_RR) ? 1 : 0;
1281 }
1282
1283 static inline
1284 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1285 {
1286         int prio = rt_se_prio(rt_se);
1287
1288         WARN_ON(!rt_prio(prio));
1289         rt_rq->rt_nr_running += rt_se_nr_running(rt_se);
1290         rt_rq->rr_nr_running += rt_se_rr_nr_running(rt_se);
1291
1292         inc_rt_prio(rt_rq, prio);
1293         inc_rt_migration(rt_se, rt_rq);
1294         inc_rt_group(rt_se, rt_rq);
1295 }
1296
1297 static inline
1298 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
1299 {
1300         WARN_ON(!rt_prio(rt_se_prio(rt_se)));
1301         WARN_ON(!rt_rq->rt_nr_running);
1302         rt_rq->rt_nr_running -= rt_se_nr_running(rt_se);
1303         rt_rq->rr_nr_running -= rt_se_rr_nr_running(rt_se);
1304
1305         dec_rt_prio(rt_rq, rt_se_prio(rt_se));
1306         dec_rt_migration(rt_se, rt_rq);
1307         dec_rt_group(rt_se, rt_rq);
1308 }
1309
1310 #ifdef CONFIG_SMP
1311 static void
1312 attach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1313 {
1314         rt_se->avg.last_update_time = rt_rq->avg.last_update_time;
1315         rt_rq->avg.util_avg += rt_se->avg.util_avg;
1316         rt_rq->avg.util_sum += rt_se->avg.util_sum;
1317         rt_rq->avg.load_avg += rt_se->avg.load_avg;
1318         rt_rq->avg.load_sum += rt_se->avg.load_sum;
1319 #ifdef CONFIG_RT_GROUP_SCHED
1320         rt_rq->propagate_avg = 1;
1321 #endif
1322         rt_rq_util_change(rt_rq);
1323 }
1324
1325 static void
1326 detach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
1327 {
1328         sub_positive(&rt_rq->avg.util_avg, rt_se->avg.util_avg);
1329         sub_positive(&rt_rq->avg.util_sum, rt_se->avg.util_sum);
1330         sub_positive(&rt_rq->avg.load_avg, rt_se->avg.load_avg);
1331         sub_positive(&rt_rq->avg.load_sum, rt_se->avg.load_sum);
1332 #ifdef CONFIG_RT_GROUP_SCHED
1333         rt_rq->propagate_avg = 1;
1334 #endif
1335         rt_rq_util_change(rt_rq);
1336 }
1337 #else
1338 static inline void
1339 attach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) {}
1340 static inline void
1341 detach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) {}
1342 #endif
1343
1344 /*
1345  * Change rt_se->run_list location unless SAVE && !MOVE
1346  *
1347  * assumes ENQUEUE/DEQUEUE flags match
1348  */
1349 static inline bool move_entity(unsigned int flags)
1350 {
1351         if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
1352                 return false;
1353
1354         return true;
1355 }
1356
1357 static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
1358 {
1359         list_del_init(&rt_se->run_list);
1360
1361         if (list_empty(array->queue + rt_se_prio(rt_se)))
1362                 __clear_bit(rt_se_prio(rt_se), array->bitmap);
1363
1364         rt_se->on_list = 0;
1365 }
1366
1367 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1368 {
1369         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1370         struct rt_prio_array *array = &rt_rq->active;
1371         struct rt_rq *group_rq = group_rt_rq(rt_se);
1372         struct list_head *queue = array->queue + rt_se_prio(rt_se);
1373
1374         /*
1375          * Don't enqueue the group if its throttled, or when empty.
1376          * The latter is a consequence of the former when a child group
1377          * get throttled and the current group doesn't have any other
1378          * active members.
1379          */
1380         if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
1381                 if (rt_se->on_list)
1382                         __delist_rt_entity(rt_se, array);
1383                 return;
1384         }
1385
1386         if (move_entity(flags)) {
1387                 WARN_ON_ONCE(rt_se->on_list);
1388                 if (flags & ENQUEUE_HEAD)
1389                         list_add(&rt_se->run_list, queue);
1390                 else
1391                         list_add_tail(&rt_se->run_list, queue);
1392
1393                 __set_bit(rt_se_prio(rt_se), array->bitmap);
1394                 rt_se->on_list = 1;
1395         }
1396         rt_se->on_rq = 1;
1397
1398         update_rt_load_avg(rq_clock_task(rq_of_rt_rq(rt_rq)), rt_se);
1399
1400         if (rt_entity_is_task(rt_se) && !rt_se->avg.last_update_time)
1401                 attach_rt_entity_load_avg(rt_rq, rt_se);
1402
1403         inc_rt_tasks(rt_se, rt_rq);
1404 }
1405
1406 static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1407 {
1408         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1409         struct rt_prio_array *array = &rt_rq->active;
1410
1411         if (move_entity(flags)) {
1412                 WARN_ON_ONCE(!rt_se->on_list);
1413                 __delist_rt_entity(rt_se, array);
1414         }
1415         rt_se->on_rq = 0;
1416
1417         update_rt_load_avg(rq_clock_task(rq_of_rt_rq(rt_rq)), rt_se);
1418
1419         dec_rt_tasks(rt_se, rt_rq);
1420 }
1421
1422 /*
1423  * Because the prio of an upper entry depends on the lower
1424  * entries, we must remove entries top - down.
1425  */
1426 static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
1427 {
1428         struct sched_rt_entity *back = NULL;
1429
1430         for_each_sched_rt_entity(rt_se) {
1431                 rt_se->back = back;
1432                 back = rt_se;
1433         }
1434
1435         dequeue_top_rt_rq(rt_rq_of_se(back));
1436
1437         for (rt_se = back; rt_se; rt_se = rt_se->back) {
1438                 if (on_rt_rq(rt_se))
1439                         __dequeue_rt_entity(rt_se, flags);
1440         }
1441 }
1442
1443 static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1444 {
1445         struct rq *rq = rq_of_rt_se(rt_se);
1446
1447         dequeue_rt_stack(rt_se, flags);
1448         for_each_sched_rt_entity(rt_se)
1449                 __enqueue_rt_entity(rt_se, flags);
1450         enqueue_top_rt_rq(&rq->rt);
1451 }
1452
1453 static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
1454 {
1455         struct rq *rq = rq_of_rt_se(rt_se);
1456
1457         dequeue_rt_stack(rt_se, flags);
1458
1459         for_each_sched_rt_entity(rt_se) {
1460                 struct rt_rq *rt_rq = group_rt_rq(rt_se);
1461
1462                 if (rt_rq && rt_rq->rt_nr_running)
1463                         __enqueue_rt_entity(rt_se, flags);
1464         }
1465         enqueue_top_rt_rq(&rq->rt);
1466 }
1467
1468 /*
1469  * Adding/removing a task to/from a priority array:
1470  */
1471 static void
1472 enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1473 {
1474         struct sched_rt_entity *rt_se = &p->rt;
1475
1476         schedtune_enqueue_task(p, cpu_of(rq));
1477
1478         if (flags & ENQUEUE_WAKEUP)
1479                 rt_se->timeout = 0;
1480
1481         enqueue_rt_entity(rt_se, flags);
1482         walt_inc_cumulative_runnable_avg(rq, p);
1483
1484         if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
1485                 enqueue_pushable_task(rq, p);
1486 }
1487
1488 static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
1489 {
1490         struct sched_rt_entity *rt_se = &p->rt;
1491
1492         schedtune_dequeue_task(p, cpu_of(rq));
1493
1494         update_curr_rt(rq);
1495         dequeue_rt_entity(rt_se, flags);
1496         walt_dec_cumulative_runnable_avg(rq, p);
1497
1498         dequeue_pushable_task(rq, p);
1499 }
1500
1501 /*
1502  * Put task to the head or the end of the run list without the overhead of
1503  * dequeue followed by enqueue.
1504  */
1505 static void
1506 requeue_rt_entity(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se, int head)
1507 {
1508         if (on_rt_rq(rt_se)) {
1509                 struct rt_prio_array *array = &rt_rq->active;
1510                 struct list_head *queue = array->queue + rt_se_prio(rt_se);
1511
1512                 if (head)
1513                         list_move(&rt_se->run_list, queue);
1514                 else
1515                         list_move_tail(&rt_se->run_list, queue);
1516         }
1517 }
1518
1519 static void requeue_task_rt(struct rq *rq, struct task_struct *p, int head)
1520 {
1521         struct sched_rt_entity *rt_se = &p->rt;
1522         struct rt_rq *rt_rq;
1523
1524         for_each_sched_rt_entity(rt_se) {
1525                 rt_rq = rt_rq_of_se(rt_se);
1526                 requeue_rt_entity(rt_rq, rt_se, head);
1527         }
1528 }
1529
1530 static void yield_task_rt(struct rq *rq)
1531 {
1532         requeue_task_rt(rq, rq->curr, 0);
1533 }
1534
1535 #ifdef CONFIG_SMP
1536
1537 /* TODO:
1538  * attach/detach/migrate_task_rt_rq() for load tracking
1539  */
1540
1541 #ifdef CONFIG_SCHED_USE_FLUID_RT
1542 static int find_lowest_rq(struct task_struct *task, int wake_flags);
1543 #else
1544 static int find_lowest_rq(struct task_struct *task);
1545 #endif
1546 static int
1547 select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
1548                   int sibling_count_hint)
1549 {
1550         struct task_struct *curr;
1551         struct rq *rq;
1552
1553         /* For anything but wake ups, just return the task_cpu */
1554         if (sd_flag != SD_BALANCE_WAKE && sd_flag != SD_BALANCE_FORK)
1555                 goto out;
1556
1557         rq = cpu_rq(cpu);
1558
1559         rcu_read_lock();
1560         curr = READ_ONCE(rq->curr); /* unlocked access */
1561
1562         /*
1563          * If the current task on @p's runqueue is an RT task, then
1564          * try to see if we can wake this RT task up on another
1565          * runqueue. Otherwise simply start this RT task
1566          * on its current runqueue.
1567          *
1568          * We want to avoid overloading runqueues. If the woken
1569          * task is a higher priority, then it will stay on this CPU
1570          * and the lower prio task should be moved to another CPU.
1571          * Even though this will probably make the lower prio task
1572          * lose its cache, we do not want to bounce a higher task
1573          * around just because it gave up its CPU, perhaps for a
1574          * lock?
1575          *
1576          * For equal prio tasks, we just let the scheduler sort it out.
1577          *
1578          * Otherwise, just let it ride on the affined RQ and the
1579          * post-schedule router will push the preempted task away
1580          *
1581          * This test is optimistic, if we get it wrong the load-balancer
1582          * will have to sort it out.
1583          */
1584         if (curr && unlikely(rt_task(curr)) &&
1585             (curr->nr_cpus_allowed < 2 ||
1586              curr->prio <= p->prio)) {
1587 #ifdef CONFIG_SCHED_USE_FLUID_RT
1588                 int target = find_lowest_rq(p, flags);
1589                 /*
1590                  * Even though the destination CPU is running
1591                  * a higher priority task, FluidRT can bother moving it
1592                  * when its utilization is very small, and the other CPU is too busy
1593                  * to accomodate the p in the point of priority and utilization.
1594                  *
1595                  * BTW, if the curr has higher priority than p, FluidRT tries to find
1596                  * the other CPUs first. In the worst case, curr can be victim, if it
1597                  * has very small utilization.
1598                  */
1599                 if (likely(target != -1)) {
1600                         cpu = target;
1601                 }
1602 #else
1603                 int target = find_lowest_rq(p);
1604                 /*
1605                  * Don't bother moving it if the destination CPU is
1606                  * not running a lower priority task.
1607                  */
1608                 if (target != -1 &&
1609                     p->prio < cpu_rq(target)->rt.highest_prio.curr)
1610                         cpu = target;
1611 #endif
1612         }
1613         rcu_read_unlock();
1614
1615 out:
1616         return cpu;
1617 }
1618
1619 #ifdef CONFIG_RT_GROUP_SCHED
1620 /*
1621  * Called within set_task_rq() right before setting a task's cpu. The
1622  * caller only guarantees p->pi_lock is held; no other assumptions,
1623  * including the state of rq->lock, should be made.
1624  */
1625 void set_task_rq_rt(struct sched_rt_entity *rt_se,
1626                                     struct rt_rq *prev, struct rt_rq *next)
1627 {
1628         u64 p_last_update_time;
1629         u64 n_last_update_time;
1630
1631         if (!sched_feat(ATTACH_AGE_LOAD))
1632                 return;
1633         /*
1634          * We are supposed to update the task to "current" time, then its up to
1635          * date and ready to go to new CPU/rt_rq. But we have difficulty in
1636          * getting what current time is, so simply throw away the out-of-date
1637          * time. This will result in the wakee task is less decayed, but giving
1638          * the wakee more load sounds not bad.
1639          */
1640         if (!(rt_se->avg.last_update_time && prev))
1641                 return;
1642 #ifndef CONFIG_64BIT
1643         {
1644                 u64 p_last_update_time_copy;
1645                 u64 n_last_update_time_copy;
1646
1647                 do {
1648                         p_last_update_time_copy = prev->load_last_update_time_copy;
1649                         n_last_update_time_copy = next->load_last_update_time_copy;
1650
1651                         smp_rmb();
1652
1653                         p_last_update_time = prev->avg.last_update_time;
1654                         n_last_update_time = next->avg.last_update_time;
1655
1656                 } while (p_last_update_time != p_last_update_time_copy ||
1657                          n_last_update_time != n_last_update_time_copy);
1658         }
1659 #else
1660         p_last_update_time = prev->avg.last_update_time;
1661         n_last_update_time = next->avg.last_update_time;
1662 #endif
1663         __update_load_avg(p_last_update_time, cpu_of(rq_of_rt_rq(prev)),
1664                 &rt_se->avg, 0, 0, NULL);
1665
1666         rt_se->avg.last_update_time = n_last_update_time;
1667 }
1668 #endif /* CONFIG_RT_GROUP_SCHED */
1669
1670 #ifndef CONFIG_64BIT
1671 static inline u64 rt_rq_last_update_time(struct rt_rq *rt_rq)
1672 {
1673         u64 last_update_time_copy;
1674         u64 last_update_time;
1675
1676         do {
1677                 last_update_time_copy = rt_rq->load_last_update_time_copy;
1678                 smp_rmb();
1679                 last_update_time = rt_rq->avg.last_update_time;
1680         } while (last_update_time != last_update_time_copy);
1681
1682         return last_update_time;
1683 }
1684 #else
1685 static inline u64 rt_rq_last_update_time(struct rt_rq *rt_rq)
1686 {
1687         return rt_rq->avg.last_update_time;
1688 }
1689 #endif
1690
1691 /*
1692  * Synchronize entity load avg of dequeued entity without locking
1693  * the previous rq.
1694  */
1695 void sync_rt_entity_load_avg(struct sched_rt_entity *rt_se)
1696 {
1697         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1698         u64 last_update_time;
1699
1700         last_update_time = rt_rq_last_update_time(rt_rq);
1701         __update_load_avg(last_update_time, cpu_of(rq_of_rt_rq(rt_rq)),
1702                                 &rt_se->avg, 0, 0, NULL);
1703 }
1704
1705 /*
1706  * Task first catches up with rt_rq, and then subtract
1707  * itself from the rt_rq (task must be off the queue now).
1708  */
1709 static void remove_rt_entity_load_avg(struct sched_rt_entity *rt_se)
1710 {
1711         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1712
1713         /*
1714          * tasks cannot exit without having gone through wake_up_new_task() ->
1715          * post_init_entity_util_avg() which will have added things to the
1716          * rt_rq, so we can remove unconditionally.
1717          *
1718          * Similarly for groups, they will have passed through
1719          * post_init_entity_util_avg() before unregister_sched_fair_group()
1720          * calls this.
1721          */
1722
1723         sync_rt_entity_load_avg(rt_se);
1724         atomic_long_add(rt_se->avg.load_avg, &rt_rq->removed_load_avg);
1725         atomic_long_add(rt_se->avg.util_avg, &rt_rq->removed_util_avg);
1726 }
1727
1728 static void attach_task_rt_rq(struct task_struct *p)
1729 {
1730         struct sched_rt_entity *rt_se = &p->rt;
1731         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1732         u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
1733
1734         update_rt_load_avg(now, rt_se);
1735         attach_rt_entity_load_avg(rt_rq, rt_se);
1736 }
1737
1738 static void detach_task_rt_rq(struct task_struct *p)
1739 {
1740         struct sched_rt_entity *rt_se = &p->rt;
1741         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
1742         u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
1743
1744         update_rt_load_avg(now, rt_se);
1745         detach_rt_entity_load_avg(rt_rq, rt_se);
1746 }
1747
1748 static void migrate_task_rq_rt(struct task_struct *p)
1749 {
1750         /*
1751          * We are supposed to update the task to "current" time, then its up to date
1752          * and ready to go to new CPU/cfs_rq. But we have difficulty in getting
1753          * what current time is, so simply throw away the out-of-date time. This
1754          * will result in the wakee task is less decayed, but giving the wakee more
1755          * load sounds not bad.
1756          */
1757         remove_rt_entity_load_avg(&p->rt);
1758
1759         /* Tell new CPU we are migrated */
1760         p->rt.avg.last_update_time = 0;
1761
1762         /* We have migrated, no longer consider this task hot */
1763         p->se.exec_start = 0;
1764 }
1765
1766 static void task_dead_rt(struct task_struct *p)
1767 {
1768         remove_rt_entity_load_avg(&p->rt);
1769 }
1770
1771 #ifdef CONFIG_RT_GROUP_SCHED
1772 static void task_set_group_rt(struct task_struct *p)
1773 {
1774         set_task_rq(p, task_cpu(p));
1775 }
1776
1777 static void task_move_group_rt(struct task_struct *p)
1778 {
1779         detach_task_rt_rq(p);
1780         set_task_rq(p, task_cpu(p));
1781
1782 #ifdef CONFIG_SMP
1783         /* Tell se's cfs_rq has been changed -- migrated */
1784         p->se.avg.last_update_time = 0;
1785 #endif
1786         attach_task_rt_rq(p);
1787 }
1788
1789 static void task_change_group_rt(struct task_struct *p, int type)
1790 {
1791         switch (type) {
1792         case TASK_SET_GROUP:
1793                 task_set_group_rt(p);
1794                 break;
1795
1796         case TASK_MOVE_GROUP:
1797                 task_move_group_rt(p);
1798                 break;
1799         }
1800 }
1801 #endif
1802
1803 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
1804 {
1805         /*
1806          * Current can't be migrated, useless to reschedule,
1807          * let's hope p can move out.
1808          */
1809         if (rq->curr->nr_cpus_allowed == 1 ||
1810             !cpupri_find(&rq->rd->cpupri, rq->curr, NULL))
1811                 return;
1812
1813         /*
1814          * p is migratable, so let's not schedule it and
1815          * see if it is pushed or pulled somewhere else.
1816          */
1817         if (p->nr_cpus_allowed != 1
1818             && cpupri_find(&rq->rd->cpupri, p, NULL))
1819                 return;
1820
1821         /*
1822          * There appears to be other cpus that can accept
1823          * current and none to run 'p', so lets reschedule
1824          * to try and push current away:
1825          */
1826         requeue_task_rt(rq, p, 1);
1827         resched_curr(rq);
1828 }
1829
1830 /* Give new sched_entity start runnable values to heavy its load in infant time */
1831 void init_rt_entity_runnable_average(struct sched_rt_entity *rt_se)
1832 {
1833         struct sched_avg *sa = &rt_se->avg;
1834
1835         sa->last_update_time = 0;
1836
1837         sa->period_contrib = 1023;
1838
1839         /*
1840          * Tasks are intialized with zero load.
1841          * Load is not actually used by RT, but can be inherited into fair task.
1842          */
1843         sa->load_avg = 0;
1844         sa->load_sum = 0;
1845         /*
1846          * At this point, util_avg won't be used in select_task_rq_rt anyway
1847          */
1848         sa->util_avg = 0;
1849         sa->util_sum = 0;
1850         /* when this task enqueue'ed, it will contribute to its cfs_rq's load_avg */
1851 }
1852 #else
1853 void init_rt_entity_runnable_average(struct sched_rt_entity *rt_se) { }
1854 #endif /* CONFIG_SMP */
1855
1856 #ifdef CONFIG_SCHED_USE_FLUID_RT
1857 static inline void set_victim_flag(struct task_struct *p)
1858 {
1859         p->victim_flag = 1;
1860 }
1861
1862 static inline void clear_victim_flag(struct task_struct *p)
1863 {
1864         p->victim_flag = 0;
1865 }
1866
1867 static inline bool test_victim_flag(struct task_struct *p)
1868 {
1869         if (p->victim_flag)
1870                 return true;
1871         else
1872                 return false;
1873 }
1874 #else
1875 static inline bool test_victim_flag(struct task_struct *p) { return false; }
1876 static inline void clear_victim_flag(struct task_struct *p) {}
1877 #endif
1878 /*
1879  * Preempt the current task with a newly woken task if needed:
1880  */
1881 static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
1882 {
1883         if (p->prio < rq->curr->prio) {
1884                 resched_curr(rq);
1885                 return;
1886         } else if (test_victim_flag(p)) {
1887                 requeue_task_rt(rq, p, 1);
1888                 resched_curr(rq);
1889                 return;
1890         }
1891
1892 #ifdef CONFIG_SMP
1893         /*
1894          * If:
1895          *
1896          * - the newly woken task is of equal priority to the current task
1897          * - the newly woken task is non-migratable while current is migratable
1898          * - current will be preempted on the next reschedule
1899          *
1900          * we should check to see if current can readily move to a different
1901          * cpu.  If so, we will reschedule to allow the push logic to try
1902          * to move current somewhere else, making room for our non-migratable
1903          * task.
1904          */
1905         if (p->prio == rq->curr->prio && !test_tsk_need_resched(rq->curr))
1906                 check_preempt_equal_prio(rq, p);
1907 #endif
1908 }
1909
1910 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
1911                                                    struct rt_rq *rt_rq)
1912 {
1913         struct rt_prio_array *array = &rt_rq->active;
1914         struct sched_rt_entity *next = NULL;
1915         struct list_head *queue;
1916         int idx;
1917
1918         idx = sched_find_first_bit(array->bitmap);
1919         BUG_ON(idx >= MAX_RT_PRIO);
1920
1921         queue = array->queue + idx;
1922         next = list_entry(queue->next, struct sched_rt_entity, run_list);
1923
1924         return next;
1925 }
1926
1927 static struct task_struct *_pick_next_task_rt(struct rq *rq)
1928 {
1929         struct sched_rt_entity *rt_se;
1930         struct task_struct *p;
1931         struct rt_rq *rt_rq  = &rq->rt;
1932         u64 now = rq_clock_task(rq);
1933
1934         do {
1935                 rt_se = pick_next_rt_entity(rq, rt_rq);
1936                 BUG_ON(!rt_se);
1937                 update_rt_load_avg(now, rt_se);
1938                 rt_rq->curr = rt_se;
1939                 rt_rq = group_rt_rq(rt_se);
1940         } while (rt_rq);
1941
1942         p = rt_task_of(rt_se);
1943         p->se.exec_start = now;
1944
1945         return p;
1946 }
1947
1948 extern int update_rt_rq_load_avg(u64 now, int cpu, struct rt_rq *rt_rq, int running);
1949
1950 static struct task_struct *
1951 pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
1952 {
1953         struct task_struct *p;
1954         struct rt_rq *rt_rq = &rq->rt;
1955
1956         if (need_pull_rt_task(rq, prev)) {
1957                 /*
1958                  * This is OK, because current is on_cpu, which avoids it being
1959                  * picked for load-balance and preemption/IRQs are still
1960                  * disabled avoiding further scheduler activity on it and we're
1961                  * being very careful to re-start the picking loop.
1962                  */
1963                 rq_unpin_lock(rq, rf);
1964                 pull_rt_task(rq);
1965                 rq_repin_lock(rq, rf);
1966                 /*
1967                  * pull_rt_task() can drop (and re-acquire) rq->lock; this
1968                  * means a dl or stop task can slip in, in which case we need
1969                  * to re-start task selection.
1970                  */
1971                 if (unlikely((rq->stop && task_on_rq_queued(rq->stop)) ||
1972                              rq->dl.dl_nr_running))
1973                         return RETRY_TASK;
1974         }
1975
1976         /*
1977          * We may dequeue prev's rt_rq in put_prev_task().
1978          * So, we update time before rt_nr_running check.
1979          */
1980         if (prev->sched_class == &rt_sched_class)
1981                 update_curr_rt(rq);
1982
1983         if (!rt_rq->rt_queued)
1984                 return NULL;
1985
1986         put_prev_task(rq, prev);
1987
1988         p = _pick_next_task_rt(rq);
1989
1990         /* The running task is never eligible for pushing */
1991         dequeue_pushable_task(rq, p);
1992
1993         queue_push_tasks(rq);
1994
1995         if (p)
1996                 update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), rt_rq,
1997                                         rq->curr->sched_class == &rt_sched_class);
1998
1999         clear_victim_flag(p);
2000
2001         return p;
2002 }
2003
2004 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
2005 {
2006         struct sched_rt_entity *rt_se = &p->rt;
2007         u64 now = rq_clock_task(rq);
2008
2009         update_curr_rt(rq);
2010
2011         /*
2012          * The previous task needs to be made eligible for pushing
2013          * if it is still active
2014          */
2015         if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
2016                 enqueue_pushable_task(rq, p);
2017
2018         for_each_sched_rt_entity(rt_se) {
2019                 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
2020                 if (rt_se->on_rq)
2021                         update_rt_load_avg(now, rt_se);
2022
2023                 rt_rq->curr = NULL;
2024         }
2025 }
2026
2027 #ifdef CONFIG_SMP
2028
2029 void rt_rq_util_change(struct rt_rq *rt_rq)
2030 {
2031         if (&this_rq()->rt == rt_rq)
2032                 cpufreq_update_util(rt_rq->rq, SCHED_CPUFREQ_RT);
2033 }
2034
2035 #ifdef CONFIG_RT_GROUP_SCHED
2036 /* Take into account change of utilization of a child task group */
2037 static inline void
2038 update_tg_rt_util(struct rt_rq *cfs_rq, struct sched_rt_entity *rt_se)
2039 {
2040         struct rt_rq *grt_rq = rt_se->my_q;
2041         long delta = grt_rq->avg.util_avg - rt_se->avg.util_avg;
2042
2043         /* Nothing to update */
2044         if (!delta)
2045                 return;
2046
2047         /* Set new sched_rt_entity's utilization */
2048         rt_se->avg.util_avg = grt_rq->avg.util_avg;
2049         rt_se->avg.util_sum = rt_se->avg.util_avg * LOAD_AVG_MAX;
2050
2051         /* Update parent rt_rq utilization */
2052         add_positive(&cfs_rq->avg.util_avg, delta);
2053         cfs_rq->avg.util_sum = cfs_rq->avg.util_avg * LOAD_AVG_MAX;
2054 }
2055
2056
2057 /* Take into account change of load of a child task group */
2058 static inline void
2059 update_tg_rt_load(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
2060 {
2061         struct rt_rq *grt_rq = rt_se->my_q;
2062         long delta = grt_rq->avg.load_avg - rt_se->avg.load_avg;
2063
2064         /*
2065          * TODO: Need to consider the TG group update
2066          * for RT RQ
2067          */
2068
2069         /* Nothing to update */
2070         if (!delta)
2071                 return;
2072
2073         /* Set new sched_rt_entity's load */
2074         rt_se->avg.load_avg = grt_rq->avg.load_avg;
2075         rt_se->avg.load_sum = rt_se->avg.load_avg * LOAD_AVG_MAX;
2076
2077         /* Update parent cfs_rq load */
2078         add_positive(&rt_rq->avg.load_avg, delta);
2079         rt_rq->avg.load_sum = rt_rq->avg.load_avg * LOAD_AVG_MAX;
2080
2081         /*
2082          * TODO: If the sched_entity is already enqueued, should we have to update the
2083          * runnable load avg.
2084          */
2085 }
2086
2087 static inline int test_and_clear_tg_rt_propagate(struct sched_rt_entity *rt_se)
2088 {
2089         struct rt_rq *rt_rq = rt_se->my_q;
2090
2091         if (!rt_rq->propagate_avg)
2092                 return 0;
2093
2094         rt_rq->propagate_avg = 0;
2095         return 1;
2096 }
2097
2098 /* Update task and its cfs_rq load average */
2099 static inline int propagate_entity_rt_load_avg(struct sched_rt_entity *rt_se)
2100 {
2101         struct rt_rq *rt_rq;
2102
2103         if (rt_entity_is_task(rt_se))
2104                 return 0;
2105
2106         if (!test_and_clear_tg_rt_propagate(rt_se))
2107                 return 0;
2108
2109         rt_rq = rt_rq_of_se(rt_se);
2110
2111         rt_rq->propagate_avg = 1;
2112
2113         update_tg_rt_util(rt_rq, rt_se);
2114         update_tg_rt_load(rt_rq, rt_se);
2115
2116         return 1;
2117 }
2118 #else
2119 static inline int propagate_entity_rt_load_avg(struct sched_rt_entity *rt_se) { };
2120 #endif
2121
2122 void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se)
2123 {
2124         struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
2125         struct rq *rq = rq_of_rt_rq(rt_rq);
2126         int cpu = cpu_of(rq);
2127         /*
2128          * Track task load average for carrying it to new CPU after migrated.
2129          */
2130         if (rt_se->avg.last_update_time)
2131                 __update_load_avg(now, cpu, &rt_se->avg, scale_load_down(NICE_0_LOAD),
2132                         rt_rq->curr == rt_se, NULL);
2133
2134         update_rt_rq_load_avg(now, cpu, rt_rq, true);
2135         propagate_entity_rt_load_avg(rt_se);
2136
2137         if (entity_is_task(rt_se))
2138                 trace_sched_rt_load_avg_task(rt_task_of(rt_se), &rt_se->avg);
2139 }
2140
2141 /* Only try algorithms three times */
2142 #define RT_MAX_TRIES 3
2143
2144 static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
2145 {
2146         if (!task_running(rq, p) &&
2147             cpumask_test_cpu(cpu, &p->cpus_allowed))
2148                 return 1;
2149         return 0;
2150 }
2151
2152 /*
2153  * Return the highest pushable rq's task, which is suitable to be executed
2154  * on the cpu, NULL otherwise
2155  */
2156 static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
2157 {
2158         struct plist_head *head = &rq->rt.pushable_tasks;
2159         struct task_struct *p;
2160
2161         if (!has_pushable_tasks(rq))
2162                 return NULL;
2163
2164         plist_for_each_entry(p, head, pushable_tasks) {
2165                 if (pick_rt_task(rq, p, cpu))
2166                         return p;
2167         }
2168
2169         return NULL;
2170 }
2171
2172 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
2173
2174 #ifdef CONFIG_SCHED_USE_FLUID_RT
2175 static unsigned int sched_rt_boost_threshold = 60;
2176
2177 static inline struct cpumask *sched_group_cpus_rt(struct sched_group *sg)
2178 {
2179         return to_cpumask(sg->cpumask);
2180 }
2181
2182 static inline int weight_from_rtprio(int prio)
2183 {
2184         int idx = (prio >> 1);
2185
2186         if (!rt_prio(prio))
2187                 return sched_prio_to_weight[prio - MAX_RT_PRIO];
2188
2189         if ((idx << 1) == prio)
2190                 return rtprio_to_weight[idx];
2191         else
2192                 return ((rtprio_to_weight[idx] + rtprio_to_weight[idx+1]) >> 1);
2193 }
2194
2195 /* Affordable CPU:
2196  * to find the best CPU in which the data is kept in cache-hot
2197  *
2198  * In most of time, RT task is invoked because,
2199  *  Case - I : it is already scheduled some time ago, or
2200  *  Case - II: it is requested by some task without timedelay
2201  *
2202  * In case-I, it's hardly to find the best CPU in cache-hot if the time is relatively long.
2203  * But in case-II, waker CPU is likely to keep the cache-hot data useful to wakee RT task.
2204  */
2205 static inline int affordable_cpu(int cpu, unsigned long task_load)
2206 {
2207         /*
2208          * If the task.state is 'TASK_INTERRUPTIBLE',
2209          * she is likely to call 'schedule()' explicitely, for waking up RT task.
2210          *   and have something in common with it.
2211          */
2212         if (cpu_curr(cpu)->state != TASK_INTERRUPTIBLE)
2213                 return 0;
2214
2215         /*
2216          * Waker CPU must accommodate the target RT task.
2217          */
2218         if (capacity_of(cpu) <= task_load)
2219                 return 0;
2220
2221         /*
2222          * Future work (More concerns if needed):
2223          * - Min opportunity cost between the eviction of tasks and dismiss of target RT
2224          *      : If evicted tasks are expecting too many damage for its execution,
2225          *              Target RT should not be this CPU.
2226          *      load(RT) >= Capa(CPU)/3 && load(evicted tasks) >= Capa(CPU)/3
2227          * - Identifying the relation:
2228          *      : Is it possible to identify the relation (such as mutex owner and waiter)
2229          * -
2230          */
2231
2232         return 1;
2233 }
2234
2235 extern unsigned long cpu_util_wake(int cpu, struct task_struct *p);
2236 extern unsigned long task_util(struct task_struct *p);
2237
2238 /*
2239  * Must find the victim or recessive (not in lowest_mask)
2240  *
2241  */
2242 /* Future-safe accessor for struct task_struct's cpus_allowed. */
2243 #define rttsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
2244
2245 static int find_victim_rt_rq(struct task_struct *task, struct sched_group *sg, int *best_cpu) {
2246         struct cpumask *sg_cpus = sched_group_cpus_rt(sg);
2247         int i;
2248         unsigned long victim_rtweight, target_rtweight, min_rtweight;
2249         unsigned int victim_cpu_cap, min_cpu_cap = arch_scale_cpu_capacity(NULL, task_cpu(task));
2250         bool victim_rt = true;
2251
2252         if (!rt_task(task))
2253                 return *best_cpu;
2254
2255         target_rtweight = task->rt.avg.util_avg * weight_from_rtprio(task->prio);
2256         min_rtweight = target_rtweight;
2257
2258         for_each_cpu_and(i, sg_cpus, rttsk_cpus_allowed(task)) {
2259                 struct task_struct *victim = cpu_rq(i)->curr;
2260
2261                 if (victim->nr_cpus_allowed < 2)
2262                         continue;
2263
2264                 if (rt_task(victim)) {
2265                         victim_cpu_cap = arch_scale_cpu_capacity(NULL, i);
2266                         victim_rtweight = victim->rt.avg.util_avg * weight_from_rtprio(victim->prio);
2267
2268                         if (min_cpu_cap == victim_cpu_cap) {
2269                                 if (victim_rtweight < min_rtweight) {
2270                                         min_rtweight = victim_rtweight;
2271                                         *best_cpu = i;
2272                                         min_cpu_cap = victim_cpu_cap;
2273                                 }
2274                         } else {
2275                                 /*
2276                                  * It's necessary to un-cap the cpu capacity when comparing
2277                                  * utilization of each CPU. This is why the Fluid RT tries to give
2278                                  * the green light on big CPU to the long-run RT task
2279                                  * in accordance with the priority.
2280                                  */
2281                                 if (victim_rtweight * min_cpu_cap < min_rtweight * victim_cpu_cap) {
2282                                         min_rtweight = victim_rtweight;
2283                                         *best_cpu = i;
2284                                         min_cpu_cap = victim_cpu_cap;
2285                                 }
2286                         }
2287                 } else {
2288                         /* If Non-RT CPU is exist, select it first. */
2289                         *best_cpu = i;
2290                         victim_rt = false;
2291                         break;
2292                 }
2293         }
2294
2295         if (*best_cpu >= 0 && victim_rt) {
2296                 set_victim_flag(cpu_rq(*best_cpu)->curr);
2297         }
2298
2299         if (victim_rt)
2300                 trace_sched_fluid_stat(task, &task->se.avg, *best_cpu, "VICTIM-FAIR");
2301         else
2302                 trace_sched_fluid_stat(task, &task->se.avg, *best_cpu, "VICTIM-RT");
2303
2304         return *best_cpu;
2305
2306 }
2307
2308 static int find_lowest_rq_fluid(struct task_struct *task, int wake_flags)
2309 {
2310         int cpu, best_cpu = -1;
2311         int prefer_cpu = smp_processor_id();    /* Cache-hot with itself or waker (default). */
2312         int boosted = 0;
2313         struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
2314         struct sched_domain *sd;
2315         struct sched_group *sg;
2316         u64 cpu_load = ULLONG_MAX, min_load = ULLONG_MAX, min_rt_load = ULLONG_MAX;
2317         int min_cpu = -1, min_rt_cpu = -1;
2318
2319         /* Make sure the mask is initialized first */
2320         if (unlikely(!lowest_mask))
2321                 goto out;
2322
2323         if (task->nr_cpus_allowed == 1)
2324                 goto out; /* No other targets possible */
2325
2326         /* update the per-cpu local_cpu_mask (lowest_mask) */
2327         cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask);
2328
2329         /*
2330          *
2331          * Fluid Sched Core selection procedure:
2332          *
2333          * 1. Cache hot : this cpu (waker if wake_list is null)
2334          * 2. idle CPU selection (prev_cpu first)
2335          * 3. recessive task first (prev_cpu first)
2336          * 4. victim task first (prev_cpu first)
2337          */
2338
2339         /*
2340          * 1. Cache hot : packing the callee and caller,
2341          *      when there is nothing to run except callee
2342          */
2343         if ((wake_flags || affordable_cpu(prefer_cpu, task_util(task))) &&
2344                 cpumask_test_cpu(prefer_cpu, cpu_online_mask)) {
2345                 best_cpu = prefer_cpu;
2346                 trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "CACHE-HOT");
2347                 goto out;
2348         }
2349
2350         prefer_cpu = task_cpu(task);
2351
2352         /*
2353          * 2. idle CPU selection
2354          */
2355         boosted = (task->rt.avg.util_avg > sched_rt_boost_threshold) ? (1) : (0);
2356
2357         /* TODO: Need to refer the scheduling status of eHMP */
2358         for_each_cpu_and(cpu, rttsk_cpus_allowed(task), cpu_online_mask){
2359                 if (boosted && cpu < cpumask_first(cpu_coregroup_mask(prefer_cpu)))
2360                         continue;
2361
2362                 if (idle_cpu(cpu)) {
2363                         best_cpu = cpu;
2364                         trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "IDLE-FIRST");
2365                         goto out;
2366                 }
2367         }
2368
2369         rcu_read_lock();
2370
2371         sd = boosted ?
2372                 rcu_dereference(per_cpu(sd_ea, 0)) :
2373                 rcu_dereference(per_cpu(sd_ea, prefer_cpu));
2374
2375         if (!sd)
2376                 goto unlock;
2377
2378         sg = sd->groups;
2379
2380         /*
2381          * 3. recessive task first
2382          */
2383         do {
2384                 for_each_cpu_and(cpu, sched_group_span(sg), lowest_mask) {
2385
2386                         cpu_load = cpu_util_wake(cpu, task) + task_util(task);
2387
2388                         if (rt_task(cpu_rq(cpu)->curr)) {
2389                                 if (cpu_load < min_rt_load ||
2390                                         (cpu_load == min_rt_load && cpu == prefer_cpu)) {
2391                                         min_rt_load = cpu_load;
2392                                         min_rt_cpu = cpu;
2393                                 }
2394
2395                                 continue;
2396                         }
2397                         if (cpu_load < min_load ||
2398                                 (cpu_load == min_load && cpu == prefer_cpu)) {
2399                                 min_load = cpu_load;
2400                                 min_cpu = cpu;
2401                         }
2402
2403                 }
2404
2405                 /* Fair recessive task : best min-load of non-rt cpu is exist? */
2406                 if (min_cpu >= 0 &&
2407                         ((capacity_of(min_cpu) >= min_load) || (min_cpu == prefer_cpu))) {
2408                         best_cpu = min_cpu;
2409                         trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "FAIR-RECESS");
2410                         goto unlock;
2411                 }
2412
2413                 /* RT recessive task : best min-load of rt cpu is exist? */
2414                 if (min_rt_cpu >= 0 &&
2415                         ((capacity_of(min_rt_cpu) >= min_rt_load) || (min_rt_cpu == prefer_cpu))) {
2416                         best_cpu = min_rt_cpu;
2417                         trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "RT-RECESS");
2418                         goto unlock;
2419                 }
2420
2421         } while (sg = sg->next, sg != sd->groups);
2422         /* need to check the method for traversing the sg */
2423
2424         sg = sd->groups;
2425
2426         /*
2427          * 4. victim task first
2428          */
2429         do {
2430                 if (find_victim_rt_rq(task, sg, &best_cpu) != -1)
2431                         break;
2432         } while (sg = sg->next, sg != sd->groups);
2433
2434         if (best_cpu < 0)
2435                 best_cpu = prefer_cpu;
2436 unlock:
2437         rcu_read_unlock();
2438 out:
2439
2440         if (!cpumask_test_cpu(best_cpu, cpu_online_mask))
2441                 best_cpu = -1;
2442
2443         return best_cpu;
2444 }
2445 #endif /* CONFIG_SCHED_USE_FLUID_RT */
2446
2447 #ifdef CONFIG_SCHED_USE_FLUID_RT
2448 static int find_lowest_rq(struct task_struct *task, int wake_flags)
2449 #else
2450 static int find_lowest_rq(struct task_struct *task)
2451 #endif
2452 {
2453 #ifdef CONFIG_SCHED_USE_FLUID_RT
2454         return find_lowest_rq_fluid(task, wake_flags);
2455 #else
2456         struct sched_domain *sd;
2457         struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
2458         int this_cpu = smp_processor_id();
2459         int cpu      = task_cpu(task);
2460
2461         /* Make sure the mask is initialized first */
2462         if (unlikely(!lowest_mask))
2463                 return -1;
2464
2465         if (task->nr_cpus_allowed == 1)
2466                 return -1; /* No other targets possible */
2467
2468         if (!cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask))
2469                 return -1; /* No targets found */
2470
2471         /*
2472          * At this point we have built a mask of cpus representing the
2473          * lowest priority tasks in the system.  Now we want to elect
2474          * the best one based on our affinity and topology.
2475          *
2476          * We prioritize the last cpu that the task executed on since
2477          * it is most likely cache-hot in that location.
2478          */
2479         if (cpumask_test_cpu(cpu, lowest_mask))
2480                 return cpu;
2481
2482         /*
2483          * Otherwise, we consult the sched_domains span maps to figure
2484          * out which cpu is logically closest to our hot cache data.
2485          */
2486         if (!cpumask_test_cpu(this_cpu, lowest_mask))
2487                 this_cpu = -1; /* Skip this_cpu opt if not among lowest */
2488
2489         rcu_read_lock();
2490         for_each_domain(cpu, sd) {
2491                 if (sd->flags & SD_WAKE_AFFINE) {
2492                         int best_cpu;
2493
2494                         /*
2495                          * "this_cpu" is cheaper to preempt than a
2496                          * remote processor.
2497                          */
2498                         if (this_cpu != -1 &&
2499                             cpumask_test_cpu(this_cpu, sched_domain_span(sd))) {
2500                                 rcu_read_unlock();
2501                                 return this_cpu;
2502                         }
2503
2504                         best_cpu = cpumask_first_and(lowest_mask,
2505                                                      sched_domain_span(sd));
2506                         if (best_cpu < nr_cpu_ids) {
2507                                 rcu_read_unlock();
2508                                 return best_cpu;
2509                         }
2510                 }
2511         }
2512         rcu_read_unlock();
2513
2514         /*
2515          * And finally, if there were no matches within the domains
2516          * just give the caller *something* to work with from the compatible
2517          * locations.
2518          */
2519         if (this_cpu != -1)
2520                 return this_cpu;
2521
2522         cpu = cpumask_any(lowest_mask);
2523         if (cpu < nr_cpu_ids)
2524                 return cpu;
2525         return -1;
2526 #endif /* CONFIG_SCHED_USE_FLUID_RT */
2527 }
2528
2529 /* Will lock the rq it finds */
2530 static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
2531 {
2532         struct rq *lowest_rq = NULL;
2533         int tries;
2534         int cpu;
2535
2536         for (tries = 0; tries < RT_MAX_TRIES; tries++) {
2537 #ifdef CONFIG_SCHED_USE_FLUID_RT
2538                 cpu = find_lowest_rq(task, 0);
2539 #else
2540                 cpu = find_lowest_rq(task);
2541 #endif
2542                 if ((cpu == -1) || (cpu == rq->cpu))
2543                         break;
2544
2545                 lowest_rq = cpu_rq(cpu);
2546
2547 #ifdef CONFIG_SCHED_USE_FLUID_RT
2548                 /*
2549                  * Even though the lowest rq has a task of higher priority,
2550                  * FluidRT can expel it (victim task) if it has small utilization,
2551                  * or is not current task. Just keep trying.
2552                  */
2553 #else
2554                 if (lowest_rq->rt.highest_prio.curr <= task->prio) {
2555                         /*
2556                          * Target rq has tasks of equal or higher priority,
2557                          * retrying does not release any lock and is unlikely
2558                          * to yield a different result.
2559                          */
2560                         lowest_rq = NULL;
2561                         break;
2562                 }
2563 #endif
2564
2565                 /* if the prio of this runqueue changed, try again */
2566                 if (double_lock_balance(rq, lowest_rq)) {
2567                         /*
2568                          * We had to unlock the run queue. In
2569                          * the mean time, task could have
2570                          * migrated already or had its affinity changed.
2571                          * Also make sure that it wasn't scheduled on its rq.
2572                          */
2573                         if (unlikely(task_rq(task) != rq ||
2574                                      !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_allowed) ||
2575                                      task_running(rq, task) ||
2576                                      !rt_task(task) ||
2577                                      !task_on_rq_queued(task))) {
2578
2579                                 double_unlock_balance(rq, lowest_rq);
2580                                 lowest_rq = NULL;
2581                                 break;
2582                         }
2583                 }
2584
2585 #ifdef CONFIG_SCHED_USE_FLUID_RT
2586                 /* task is still rt task */
2587                 if (likely(rt_task(task)))
2588                         break;
2589 #else
2590                 /* If this rq is still suitable use it. */
2591                 if (lowest_rq->rt.highest_prio.curr > task->prio)
2592                         break;
2593
2594                 /* try again */
2595                 double_unlock_balance(rq, lowest_rq);
2596                 lowest_rq = NULL;
2597 #endif
2598         }
2599
2600         return lowest_rq;
2601 }
2602
2603 static struct task_struct *pick_next_pushable_task(struct rq *rq)
2604 {
2605         struct task_struct *p;
2606
2607         if (!has_pushable_tasks(rq))
2608                 return NULL;
2609
2610         p = plist_first_entry(&rq->rt.pushable_tasks,
2611                               struct task_struct, pushable_tasks);
2612
2613         BUG_ON(rq->cpu != task_cpu(p));
2614         BUG_ON(task_current(rq, p));
2615         BUG_ON(p->nr_cpus_allowed <= 1);
2616
2617         BUG_ON(!task_on_rq_queued(p));
2618         BUG_ON(!rt_task(p));
2619
2620         return p;
2621 }
2622
2623 /*
2624  * If the current CPU has more than one RT task, see if the non
2625  * running task can migrate over to a CPU that is running a task
2626  * of lesser priority.
2627  */
2628 static int push_rt_task(struct rq *rq)
2629 {
2630         struct task_struct *next_task;
2631         struct rq *lowest_rq;
2632         int ret = 0;
2633
2634         if (!rq->rt.overloaded)
2635                 return 0;
2636
2637         next_task = pick_next_pushable_task(rq);
2638         if (!next_task)
2639                 return 0;
2640
2641 retry:
2642         if (unlikely(next_task == rq->curr)) {
2643                 WARN_ON(1);
2644                 return 0;
2645         }
2646
2647         /*
2648          * It's possible that the next_task slipped in of
2649          * higher priority than current. If that's the case
2650          * just reschedule current.
2651          */
2652         if (unlikely(next_task->prio < rq->curr->prio)) {
2653                 resched_curr(rq);
2654                 return 0;
2655         }
2656
2657         /* We might release rq lock */
2658         get_task_struct(next_task);
2659
2660         /* find_lock_lowest_rq locks the rq if found */
2661         lowest_rq = find_lock_lowest_rq(next_task, rq);
2662         if (!lowest_rq) {
2663                 struct task_struct *task;
2664                 /*
2665                  * find_lock_lowest_rq releases rq->lock
2666                  * so it is possible that next_task has migrated.
2667                  *
2668                  * We need to make sure that the task is still on the same
2669                  * run-queue and is also still the next task eligible for
2670                  * pushing.
2671                  */
2672                 task = pick_next_pushable_task(rq);
2673                 if (task == next_task) {
2674                         /*
2675                          * The task hasn't migrated, and is still the next
2676                          * eligible task, but we failed to find a run-queue
2677                          * to push it to.  Do not retry in this case, since
2678                          * other cpus will pull from us when ready.
2679                          */
2680                         goto out;
2681                 }
2682
2683                 if (!task)
2684                         /* No more tasks, just exit */
2685                         goto out;
2686
2687                 /*
2688                  * Something has shifted, try again.
2689                  */
2690                 put_task_struct(next_task);
2691                 next_task = task;
2692                 goto retry;
2693         }
2694
2695         deactivate_task(rq, next_task, 0);
2696         next_task->on_rq = TASK_ON_RQ_MIGRATING;
2697         set_task_cpu(next_task, lowest_rq->cpu);
2698         next_task->on_rq = TASK_ON_RQ_QUEUED;
2699         activate_task(lowest_rq, next_task, 0);
2700         ret = 1;
2701
2702         resched_curr(lowest_rq);
2703
2704         double_unlock_balance(rq, lowest_rq);
2705
2706 out:
2707         put_task_struct(next_task);
2708
2709         return ret;
2710 }
2711
2712 static void push_rt_tasks(struct rq *rq)
2713 {
2714         /* push_rt_task will return true if it moved an RT */
2715         while (push_rt_task(rq))
2716                 ;
2717 }
2718
2719 #ifdef HAVE_RT_PUSH_IPI
2720
2721 /*
2722  * When a high priority task schedules out from a CPU and a lower priority
2723  * task is scheduled in, a check is made to see if there's any RT tasks
2724  * on other CPUs that are waiting to run because a higher priority RT task
2725  * is currently running on its CPU. In this case, the CPU with multiple RT
2726  * tasks queued on it (overloaded) needs to be notified that a CPU has opened
2727  * up that may be able to run one of its non-running queued RT tasks.
2728  *
2729  * All CPUs with overloaded RT tasks need to be notified as there is currently
2730  * no way to know which of these CPUs have the highest priority task waiting
2731  * to run. Instead of trying to take a spinlock on each of these CPUs,
2732  * which has shown to cause large latency when done on machines with many
2733  * CPUs, sending an IPI to the CPUs to have them push off the overloaded
2734  * RT tasks waiting to run.
2735  *
2736  * Just sending an IPI to each of the CPUs is also an issue, as on large
2737  * count CPU machines, this can cause an IPI storm on a CPU, especially
2738  * if its the only CPU with multiple RT tasks queued, and a large number
2739  * of CPUs scheduling a lower priority task at the same time.
2740  *
2741  * Each root domain has its own irq work function that can iterate over
2742  * all CPUs with RT overloaded tasks. Since all CPUs with overloaded RT
2743  * tassk must be checked if there's one or many CPUs that are lowering
2744  * their priority, there's a single irq work iterator that will try to
2745  * push off RT tasks that are waiting to run.
2746  *
2747  * When a CPU schedules a lower priority task, it will kick off the
2748  * irq work iterator that will jump to each CPU with overloaded RT tasks.
2749  * As it only takes the first CPU that schedules a lower priority task
2750  * to start the process, the rto_start variable is incremented and if
2751  * the atomic result is one, then that CPU will try to take the rto_lock.
2752  * This prevents high contention on the lock as the process handles all
2753  * CPUs scheduling lower priority tasks.
2754  *
2755  * All CPUs that are scheduling a lower priority task will increment the
2756  * rt_loop_next variable. This will make sure that the irq work iterator
2757  * checks all RT overloaded CPUs whenever a CPU schedules a new lower
2758  * priority task, even if the iterator is in the middle of a scan. Incrementing
2759  * the rt_loop_next will cause the iterator to perform another scan.
2760  *
2761  */
2762 static int rto_next_cpu(struct root_domain *rd)
2763 {
2764         int next;
2765         int cpu;
2766
2767         /*
2768          * When starting the IPI RT pushing, the rto_cpu is set to -1,
2769          * rt_next_cpu() will simply return the first CPU found in
2770          * the rto_mask.
2771          *
2772          * If rto_next_cpu() is called with rto_cpu is a valid cpu, it
2773          * will return the next CPU found in the rto_mask.
2774          *
2775          * If there are no more CPUs left in the rto_mask, then a check is made
2776          * against rto_loop and rto_loop_next. rto_loop is only updated with
2777          * the rto_lock held, but any CPU may increment the rto_loop_next
2778          * without any locking.
2779          */
2780         for (;;) {
2781
2782                 /* When rto_cpu is -1 this acts like cpumask_first() */
2783                 cpu = cpumask_next(rd->rto_cpu, rd->rto_mask);
2784
2785                 rd->rto_cpu = cpu;
2786
2787                 if (cpu < nr_cpu_ids)
2788                         return cpu;
2789
2790                 rd->rto_cpu = -1;
2791
2792                 /*
2793                  * ACQUIRE ensures we see the @rto_mask changes
2794                  * made prior to the @next value observed.
2795                  *
2796                  * Matches WMB in rt_set_overload().
2797                  */
2798                 next = atomic_read_acquire(&rd->rto_loop_next);
2799
2800                 if (rd->rto_loop == next)
2801                         break;
2802
2803                 rd->rto_loop = next;
2804         }
2805
2806         return -1;
2807 }
2808
2809 static inline bool rto_start_trylock(atomic_t *v)
2810 {
2811         return !atomic_cmpxchg_acquire(v, 0, 1);
2812 }
2813
2814 static inline void rto_start_unlock(atomic_t *v)
2815 {
2816         atomic_set_release(v, 0);
2817 }
2818
2819 static void tell_cpu_to_push(struct rq *rq)
2820 {
2821         int cpu = -1;
2822
2823         /* Keep the loop going if the IPI is currently active */
2824         atomic_inc(&rq->rd->rto_loop_next);
2825
2826         /* Only one CPU can initiate a loop at a time */
2827         if (!rto_start_trylock(&rq->rd->rto_loop_start))
2828                 return;
2829
2830         raw_spin_lock(&rq->rd->rto_lock);
2831
2832         /*
2833          * The rto_cpu is updated under the lock, if it has a valid cpu
2834          * then the IPI is still running and will continue due to the
2835          * update to loop_next, and nothing needs to be done here.
2836          * Otherwise it is finishing up and an ipi needs to be sent.
2837          */
2838         if (rq->rd->rto_cpu < 0)
2839                 cpu = rto_next_cpu(rq->rd);
2840
2841         raw_spin_unlock(&rq->rd->rto_lock);
2842
2843         rto_start_unlock(&rq->rd->rto_loop_start);
2844
2845         if (cpu >= 0) {
2846                 /* Make sure the rd does not get freed while pushing */
2847                 sched_get_rd(rq->rd);
2848                 irq_work_queue_on(&rq->rd->rto_push_work, cpu);
2849         }
2850 }
2851
2852 /* Called from hardirq context */
2853 void rto_push_irq_work_func(struct irq_work *work)
2854 {
2855         struct root_domain *rd =
2856                 container_of(work, struct root_domain, rto_push_work);
2857         struct rq *rq;
2858         int cpu;
2859
2860         rq = this_rq();
2861
2862         /*
2863          * We do not need to grab the lock to check for has_pushable_tasks.
2864          * When it gets updated, a check is made if a push is possible.
2865          */
2866         if (has_pushable_tasks(rq)) {
2867                 raw_spin_lock(&rq->lock);
2868                 push_rt_tasks(rq);
2869                 raw_spin_unlock(&rq->lock);
2870         }
2871
2872         raw_spin_lock(&rd->rto_lock);
2873
2874         /* Pass the IPI to the next rt overloaded queue */
2875         cpu = rto_next_cpu(rd);
2876
2877         raw_spin_unlock(&rd->rto_lock);
2878
2879         if (cpu < 0) {
2880                 sched_put_rd(rd);
2881                 return;
2882         }
2883
2884         /* Try the next RT overloaded CPU */
2885         irq_work_queue_on(&rd->rto_push_work, cpu);
2886 }
2887 #endif /* HAVE_RT_PUSH_IPI */
2888
2889 static void pull_rt_task(struct rq *this_rq)
2890 {
2891         int this_cpu = this_rq->cpu, cpu;
2892         bool resched = false;
2893         struct task_struct *p;
2894         struct rq *src_rq;
2895         int rt_overload_count = rt_overloaded(this_rq);
2896
2897         if (likely(!rt_overload_count))
2898                 return;
2899
2900         /*
2901          * Match the barrier from rt_set_overloaded; this guarantees that if we
2902          * see overloaded we must also see the rto_mask bit.
2903          */
2904         smp_rmb();
2905
2906         /* If we are the only overloaded CPU do nothing */
2907         if (rt_overload_count == 1 &&
2908             cpumask_test_cpu(this_rq->cpu, this_rq->rd->rto_mask))
2909                 return;
2910
2911 #ifdef HAVE_RT_PUSH_IPI
2912         if (sched_feat(RT_PUSH_IPI)) {
2913                 tell_cpu_to_push(this_rq);
2914                 return;
2915         }
2916 #endif
2917
2918         for_each_cpu(cpu, this_rq->rd->rto_mask) {
2919                 if (this_cpu == cpu)
2920                         continue;
2921
2922                 src_rq = cpu_rq(cpu);
2923
2924                 /*
2925                  * Don't bother taking the src_rq->lock if the next highest
2926                  * task is known to be lower-priority than our current task.
2927                  * This may look racy, but if this value is about to go
2928                  * logically higher, the src_rq will push this task away.
2929                  * And if its going logically lower, we do not care
2930                  */
2931                 if (src_rq->rt.highest_prio.next >=
2932                     this_rq->rt.highest_prio.curr)
2933                         continue;
2934
2935                 /*
2936                  * We can potentially drop this_rq's lock in
2937                  * double_lock_balance, and another CPU could
2938                  * alter this_rq
2939                  */
2940                 double_lock_balance(this_rq, src_rq);
2941
2942                 /*
2943                  * We can pull only a task, which is pushable
2944                  * on its rq, and no others.
2945                  */
2946                 p = pick_highest_pushable_task(src_rq, this_cpu);
2947
2948                 /*
2949                  * Do we have an RT task that preempts
2950                  * the to-be-scheduled task?
2951                  */
2952                 if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
2953                         WARN_ON(p == src_rq->curr);
2954                         WARN_ON(!task_on_rq_queued(p));
2955
2956                         /*
2957                          * There's a chance that p is higher in priority
2958                          * than what's currently running on its cpu.
2959                          * This is just that p is wakeing up and hasn't
2960                          * had a chance to schedule. We only pull
2961                          * p if it is lower in priority than the
2962                          * current task on the run queue
2963                          */
2964                         if (p->prio < src_rq->curr->prio)
2965                                 goto skip;
2966
2967                         resched = true;
2968
2969                         deactivate_task(src_rq, p, 0);
2970                         p->on_rq = TASK_ON_RQ_MIGRATING;
2971                         set_task_cpu(p, this_cpu);
2972                         p->on_rq = TASK_ON_RQ_QUEUED;
2973                         activate_task(this_rq, p, 0);
2974                         /*
2975                          * We continue with the search, just in
2976                          * case there's an even higher prio task
2977                          * in another runqueue. (low likelihood
2978                          * but possible)
2979                          */
2980                 }
2981 skip:
2982                 double_unlock_balance(this_rq, src_rq);
2983         }
2984
2985         if (resched)
2986                 resched_curr(this_rq);
2987 }
2988
2989 /*
2990  * If we are not running and we are not going to reschedule soon, we should
2991  * try to push tasks away now
2992  */
2993 static void task_woken_rt(struct rq *rq, struct task_struct *p)
2994 {
2995         if (!task_running(rq, p) &&
2996             !test_tsk_need_resched(rq->curr) &&
2997             p->nr_cpus_allowed > 1 &&
2998             (dl_task(rq->curr) || rt_task(rq->curr)) &&
2999             (rq->curr->nr_cpus_allowed < 2 ||
3000              rq->curr->prio <= p->prio))
3001                 push_rt_tasks(rq);
3002 }
3003
3004 /* Assumes rq->lock is held */
3005 static void rq_online_rt(struct rq *rq)
3006 {
3007         if (rq->rt.overloaded)
3008                 rt_set_overload(rq);
3009
3010         __enable_runtime(rq);
3011
3012         cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
3013 }
3014
3015 /* Assumes rq->lock is held */
3016 static void rq_offline_rt(struct rq *rq)
3017 {
3018         if (rq->rt.overloaded)
3019                 rt_clear_overload(rq);
3020
3021         __disable_runtime(rq);
3022
3023         cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_INVALID);
3024 }
3025
3026 /*
3027  * When switch from the rt queue, we bring ourselves to a position
3028  * that we might want to pull RT tasks from other runqueues.
3029  */
3030 static void switched_from_rt(struct rq *rq, struct task_struct *p)
3031 {
3032         detach_task_rt_rq(p);
3033         /*
3034          * If there are other RT tasks then we will reschedule
3035          * and the scheduling of the other RT tasks will handle
3036          * the balancing. But if we are the last RT task
3037          * we may need to handle the pulling of RT tasks
3038          * now.
3039          */
3040         if (!task_on_rq_queued(p) || rq->rt.rt_nr_running)
3041                 return;
3042
3043         queue_pull_task(rq);
3044 }
3045
3046 void __init init_sched_rt_class(void)
3047 {
3048         unsigned int i;
3049
3050         for_each_possible_cpu(i) {
3051                 zalloc_cpumask_var_node(&per_cpu(local_cpu_mask, i),
3052                                         GFP_KERNEL, cpu_to_node(i));
3053         }
3054 }
3055 #else
3056 void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se)
3057 {
3058 }
3059 #endif /* CONFIG_SMP */
3060
3061 extern void
3062 copy_sched_avg(struct sched_avg *from, struct sched_avg *to, unsigned int ratio);
3063
3064 /*
3065  * When switching a task to RT, we may overload the runqueue
3066  * with RT tasks. In this case we try to push them off to
3067  * other runqueues.
3068  */
3069 static void switched_to_rt(struct rq *rq, struct task_struct *p)
3070 {
3071         /* Copy fair sched avg into rt sched avg */
3072         copy_sched_avg(&p->se.avg, &p->rt.avg, 100);
3073         /*
3074          * If we are already running, then there's nothing
3075          * that needs to be done. But if we are not running
3076          * we may need to preempt the current running task.
3077          * If that current running task is also an RT task
3078          * then see if we can move to another run queue.
3079          */
3080         if (task_on_rq_queued(p) && rq->curr != p) {
3081 #ifdef CONFIG_SMP
3082                 if (p->nr_cpus_allowed > 1 && rq->rt.overloaded)
3083                         queue_push_tasks(rq);
3084 #endif /* CONFIG_SMP */
3085                 if (p->prio < rq->curr->prio && cpu_online(cpu_of(rq)))
3086                         resched_curr(rq);
3087         }
3088 }
3089
3090 /*
3091  * Priority of the task has changed. This may cause
3092  * us to initiate a push or pull.
3093  */
3094 static void
3095 prio_changed_rt(struct rq *rq, struct task_struct *p, int oldprio)
3096 {
3097         if (!task_on_rq_queued(p))
3098                 return;
3099
3100         if (rq->curr == p) {
3101 #ifdef CONFIG_SMP
3102                 /*
3103                  * If our priority decreases while running, we
3104                  * may need to pull tasks to this runqueue.
3105                  */
3106                 if (oldprio < p->prio)
3107                         queue_pull_task(rq);
3108
3109                 /*
3110                  * If there's a higher priority task waiting to run
3111                  * then reschedule.
3112                  */
3113                 if (p->prio > rq->rt.highest_prio.curr)
3114                         resched_curr(rq);
3115 #else
3116                 /* For UP simply resched on drop of prio */
3117                 if (oldprio < p->prio)
3118                         resched_curr(rq);
3119 #endif /* CONFIG_SMP */
3120         } else {
3121                 /*
3122                  * This task is not running, but if it is
3123                  * greater than the current running task
3124                  * then reschedule.
3125                  */
3126                 if (p->prio < rq->curr->prio)
3127                         resched_curr(rq);
3128         }
3129 }
3130
3131 #ifdef CONFIG_POSIX_TIMERS
3132 static void watchdog(struct rq *rq, struct task_struct *p)
3133 {
3134         unsigned long soft, hard;
3135
3136         /* max may change after cur was read, this will be fixed next tick */
3137         soft = task_rlimit(p, RLIMIT_RTTIME);
3138         hard = task_rlimit_max(p, RLIMIT_RTTIME);
3139
3140         if (soft != RLIM_INFINITY) {
3141                 unsigned long next;
3142
3143                 if (p->rt.watchdog_stamp != jiffies) {
3144                         p->rt.timeout++;
3145                         p->rt.watchdog_stamp = jiffies;
3146                 }
3147
3148                 next = DIV_ROUND_UP(min(soft, hard), USEC_PER_SEC/HZ);
3149                 if (p->rt.timeout > next)
3150                         p->cputime_expires.sched_exp = p->se.sum_exec_runtime;
3151         }
3152 }
3153 #else
3154 static inline void watchdog(struct rq *rq, struct task_struct *p) { }
3155 #endif
3156
3157 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
3158 {
3159         struct sched_rt_entity *rt_se = &p->rt;
3160         u64 now = rq_clock_task(rq);
3161
3162         update_curr_rt(rq);
3163
3164         for_each_sched_rt_entity(rt_se)
3165                 update_rt_load_avg(now, rt_se);
3166
3167         watchdog(rq, p);
3168
3169         /*
3170          * RR tasks need a special form of timeslice management.
3171          * FIFO tasks have no timeslices.
3172          */
3173         if (p->policy != SCHED_RR)
3174                 return;
3175
3176         if (--p->rt.time_slice)
3177                 return;
3178
3179         p->rt.time_slice = sched_rr_timeslice;
3180
3181         /*
3182          * Requeue to the end of queue if we (and all of our ancestors) are not
3183          * the only element on the queue
3184          */
3185         for_each_sched_rt_entity(rt_se) {
3186                 if (rt_se->run_list.prev != rt_se->run_list.next) {
3187                         requeue_task_rt(rq, p, 0);
3188                         resched_curr(rq);
3189                         return;
3190                 }
3191         }
3192 }
3193
3194 static void set_curr_task_rt(struct rq *rq)
3195 {
3196         struct task_struct *p = rq->curr;
3197         struct sched_rt_entity *rt_se = &p->rt;
3198
3199         p->se.exec_start = rq_clock_task(rq);
3200
3201         for_each_sched_rt_entity(rt_se) {
3202                 struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
3203                 rt_rq->curr = rt_se;
3204         }
3205
3206         /* The running task is never eligible for pushing */
3207         dequeue_pushable_task(rq, p);
3208 }
3209
3210 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
3211 {
3212         /*
3213          * Time slice is 0 for SCHED_FIFO tasks
3214          */
3215         if (task->policy == SCHED_RR)
3216                 return sched_rr_timeslice;
3217         else
3218                 return 0;
3219 }
3220
3221 const struct sched_class rt_sched_class = {
3222         .next                   = &fair_sched_class,
3223         .enqueue_task           = enqueue_task_rt,
3224         .dequeue_task           = dequeue_task_rt,
3225         .yield_task             = yield_task_rt,
3226
3227         .check_preempt_curr     = check_preempt_curr_rt,
3228
3229         .pick_next_task         = pick_next_task_rt,
3230         .put_prev_task          = put_prev_task_rt,
3231
3232 #ifdef CONFIG_SMP
3233         .select_task_rq         = select_task_rq_rt,
3234
3235         .migrate_task_rq                = migrate_task_rq_rt,
3236         .task_dead                              = task_dead_rt,
3237         .set_cpus_allowed       = set_cpus_allowed_common,
3238         .rq_online              = rq_online_rt,
3239         .rq_offline             = rq_offline_rt,
3240         .task_woken             = task_woken_rt,
3241         .switched_from          = switched_from_rt,
3242 #endif
3243
3244         .set_curr_task          = set_curr_task_rt,
3245         .task_tick              = task_tick_rt,
3246
3247         .get_rr_interval        = get_rr_interval_rt,
3248
3249         .prio_changed           = prio_changed_rt,
3250         .switched_to            = switched_to_rt,
3251
3252         .update_curr            = update_curr_rt,
3253 #ifdef CONFIG_RT_GROUP_SCHED
3254         .task_change_group      = task_change_group_rt,
3255 #endif
3256 };
3257
3258 #ifdef CONFIG_RT_GROUP_SCHED
3259 /*
3260  * Ensure that the real time constraints are schedulable.
3261  */
3262 static DEFINE_MUTEX(rt_constraints_mutex);
3263
3264 /* Must be called with tasklist_lock held */
3265 static inline int tg_has_rt_tasks(struct task_group *tg)
3266 {
3267         struct task_struct *g, *p;
3268
3269         /*
3270          * Autogroups do not have RT tasks; see autogroup_create().
3271          */
3272         if (task_group_is_autogroup(tg))
3273                 return 0;
3274
3275         for_each_process_thread(g, p) {
3276                 if (rt_task(p) && task_group(p) == tg)
3277                         return 1;
3278         }
3279
3280         return 0;
3281 }
3282
3283 struct rt_schedulable_data {
3284         struct task_group *tg;
3285         u64 rt_period;
3286         u64 rt_runtime;
3287 };
3288
3289 static int tg_rt_schedulable(struct task_group *tg, void *data)
3290 {
3291         struct rt_schedulable_data *d = data;
3292         struct task_group *child;
3293         unsigned long total, sum = 0;
3294         u64 period, runtime;
3295
3296         period = ktime_to_ns(tg->rt_bandwidth.rt_period);
3297         runtime = tg->rt_bandwidth.rt_runtime;
3298
3299         if (tg == d->tg) {
3300                 period = d->rt_period;
3301                 runtime = d->rt_runtime;
3302         }
3303
3304         /*
3305          * Cannot have more runtime than the period.
3306          */
3307         if (runtime > period && runtime != RUNTIME_INF)
3308                 return -EINVAL;
3309
3310         /*
3311          * Ensure we don't starve existing RT tasks.
3312          */
3313         if (rt_bandwidth_enabled() && !runtime && tg_has_rt_tasks(tg))
3314                 return -EBUSY;
3315
3316         total = to_ratio(period, runtime);
3317
3318         /*
3319          * Nobody can have more than the global setting allows.
3320          */
3321         if (total > to_ratio(global_rt_period(), global_rt_runtime()))
3322                 return -EINVAL;
3323
3324         /*
3325          * The sum of our children's runtime should not exceed our own.
3326          */
3327         list_for_each_entry_rcu(child, &tg->children, siblings) {
3328                 period = ktime_to_ns(child->rt_bandwidth.rt_period);
3329                 runtime = child->rt_bandwidth.rt_runtime;
3330
3331                 if (child == d->tg) {
3332                         period = d->rt_period;
3333                         runtime = d->rt_runtime;
3334                 }
3335
3336                 sum += to_ratio(period, runtime);
3337         }
3338
3339         if (sum > total)
3340                 return -EINVAL;
3341
3342         return 0;
3343 }
3344
3345 static int __rt_schedulable(struct task_group *tg, u64 period, u64 runtime)
3346 {
3347         int ret;
3348
3349         struct rt_schedulable_data data = {
3350                 .tg = tg,
3351                 .rt_period = period,
3352                 .rt_runtime = runtime,
3353         };
3354
3355         rcu_read_lock();
3356         ret = walk_tg_tree(tg_rt_schedulable, tg_nop, &data);
3357         rcu_read_unlock();
3358
3359         return ret;
3360 }
3361
3362 static int tg_set_rt_bandwidth(struct task_group *tg,
3363                 u64 rt_period, u64 rt_runtime)
3364 {
3365         int i, err = 0;
3366
3367         /*
3368          * Disallowing the root group RT runtime is BAD, it would disallow the
3369          * kernel creating (and or operating) RT threads.
3370          */
3371         if (tg == &root_task_group && rt_runtime == 0)
3372                 return -EINVAL;
3373
3374         /* No period doesn't make any sense. */
3375         if (rt_period == 0)
3376                 return -EINVAL;
3377
3378         mutex_lock(&rt_constraints_mutex);
3379         read_lock(&tasklist_lock);
3380         err = __rt_schedulable(tg, rt_period, rt_runtime);
3381         if (err)
3382                 goto unlock;
3383
3384         raw_spin_lock_irq(&tg->rt_bandwidth.rt_runtime_lock);
3385         tg->rt_bandwidth.rt_period = ns_to_ktime(rt_period);
3386         tg->rt_bandwidth.rt_runtime = rt_runtime;
3387
3388         for_each_possible_cpu(i) {
3389                 struct rt_rq *rt_rq = tg->rt_rq[i];
3390
3391                 raw_spin_lock(&rt_rq->rt_runtime_lock);
3392                 rt_rq->rt_runtime = rt_runtime;
3393                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
3394         }
3395         raw_spin_unlock_irq(&tg->rt_bandwidth.rt_runtime_lock);
3396 unlock:
3397         read_unlock(&tasklist_lock);
3398         mutex_unlock(&rt_constraints_mutex);
3399
3400         return err;
3401 }
3402
3403 int sched_group_set_rt_runtime(struct task_group *tg, long rt_runtime_us)
3404 {
3405         u64 rt_runtime, rt_period;
3406
3407         rt_period = ktime_to_ns(tg->rt_bandwidth.rt_period);
3408         rt_runtime = (u64)rt_runtime_us * NSEC_PER_USEC;
3409         if (rt_runtime_us < 0)
3410                 rt_runtime = RUNTIME_INF;
3411         else if ((u64)rt_runtime_us > U64_MAX / NSEC_PER_USEC)
3412                 return -EINVAL;
3413
3414         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
3415 }
3416
3417 long sched_group_rt_runtime(struct task_group *tg)
3418 {
3419         u64 rt_runtime_us;
3420
3421         if (tg->rt_bandwidth.rt_runtime == RUNTIME_INF)
3422                 return -1;
3423
3424         rt_runtime_us = tg->rt_bandwidth.rt_runtime;
3425         do_div(rt_runtime_us, NSEC_PER_USEC);
3426         return rt_runtime_us;
3427 }
3428
3429 int sched_group_set_rt_period(struct task_group *tg, u64 rt_period_us)
3430 {
3431         u64 rt_runtime, rt_period;
3432
3433         if (rt_period_us > U64_MAX / NSEC_PER_USEC)
3434                 return -EINVAL;
3435
3436         rt_period = rt_period_us * NSEC_PER_USEC;
3437         rt_runtime = tg->rt_bandwidth.rt_runtime;
3438
3439         return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
3440 }
3441
3442 long sched_group_rt_period(struct task_group *tg)
3443 {
3444         u64 rt_period_us;
3445
3446         rt_period_us = ktime_to_ns(tg->rt_bandwidth.rt_period);
3447         do_div(rt_period_us, NSEC_PER_USEC);
3448         return rt_period_us;
3449 }
3450
3451 static int sched_rt_global_constraints(void)
3452 {
3453         int ret = 0;
3454
3455         mutex_lock(&rt_constraints_mutex);
3456         read_lock(&tasklist_lock);
3457         ret = __rt_schedulable(NULL, 0, 0);
3458         read_unlock(&tasklist_lock);
3459         mutex_unlock(&rt_constraints_mutex);
3460
3461         return ret;
3462 }
3463
3464 int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
3465 {
3466         /* Don't accept realtime tasks when there is no way for them to run */
3467         if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
3468                 return 0;
3469
3470         return 1;
3471 }
3472
3473 #else /* !CONFIG_RT_GROUP_SCHED */
3474 static int sched_rt_global_constraints(void)
3475 {
3476         unsigned long flags;
3477         int i;
3478
3479         raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
3480         for_each_possible_cpu(i) {
3481                 struct rt_rq *rt_rq = &cpu_rq(i)->rt;
3482
3483                 raw_spin_lock(&rt_rq->rt_runtime_lock);
3484                 rt_rq->rt_runtime = global_rt_runtime();
3485                 raw_spin_unlock(&rt_rq->rt_runtime_lock);
3486         }
3487         raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
3488
3489         return 0;
3490 }
3491 #endif /* CONFIG_RT_GROUP_SCHED */
3492
3493 static int sched_rt_global_validate(void)
3494 {
3495         if (sysctl_sched_rt_period <= 0)
3496                 return -EINVAL;
3497
3498         if ((sysctl_sched_rt_runtime != RUNTIME_INF) &&
3499                 (sysctl_sched_rt_runtime > sysctl_sched_rt_period))
3500                 return -EINVAL;
3501
3502         return 0;
3503 }
3504
3505 static void sched_rt_do_global(void)
3506 {
3507         unsigned long flags;
3508
3509         raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags);
3510         def_rt_bandwidth.rt_runtime = global_rt_runtime();
3511         def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period());
3512         raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags);
3513 }
3514
3515 int sched_rt_handler(struct ctl_table *table, int write,
3516                 void __user *buffer, size_t *lenp,
3517                 loff_t *ppos)
3518 {
3519         int old_period, old_runtime;
3520         static DEFINE_MUTEX(mutex);
3521         int ret;
3522
3523         mutex_lock(&mutex);
3524         old_period = sysctl_sched_rt_period;
3525         old_runtime = sysctl_sched_rt_runtime;
3526
3527         ret = proc_dointvec(table, write, buffer, lenp, ppos);
3528
3529         if (!ret && write) {
3530                 ret = sched_rt_global_validate();
3531                 if (ret)
3532                         goto undo;
3533
3534                 ret = sched_dl_global_validate();
3535                 if (ret)
3536                         goto undo;
3537
3538                 ret = sched_rt_global_constraints();
3539                 if (ret)
3540                         goto undo;
3541
3542                 sched_rt_do_global();
3543                 sched_dl_do_global();
3544         }
3545         if (0) {
3546 undo:
3547                 sysctl_sched_rt_period = old_period;
3548                 sysctl_sched_rt_runtime = old_runtime;
3549         }
3550         mutex_unlock(&mutex);
3551
3552         return ret;
3553 }
3554
3555 int sched_rr_handler(struct ctl_table *table, int write,
3556                 void __user *buffer, size_t *lenp,
3557                 loff_t *ppos)
3558 {
3559         int ret;
3560         static DEFINE_MUTEX(mutex);
3561
3562         mutex_lock(&mutex);
3563         ret = proc_dointvec(table, write, buffer, lenp, ppos);
3564         /*
3565          * Make sure that internally we keep jiffies.
3566          * Also, writing zero resets the timeslice to default:
3567          */
3568         if (!ret && write) {
3569                 sched_rr_timeslice =
3570                         sysctl_sched_rr_timeslice <= 0 ? RR_TIMESLICE :
3571                         msecs_to_jiffies(sysctl_sched_rr_timeslice);
3572         }
3573         mutex_unlock(&mutex);
3574         return ret;
3575 }
3576
3577 #ifdef CONFIG_SCHED_DEBUG
3578 void print_rt_stats(struct seq_file *m, int cpu)
3579 {
3580         rt_rq_iter_t iter;
3581         struct rt_rq *rt_rq;
3582
3583         rcu_read_lock();
3584         for_each_rt_rq(rt_rq, iter, cpu_rq(cpu))
3585                 print_rt_rq(m, cpu, rt_rq);
3586         rcu_read_unlock();
3587 }
3588 #endif /* CONFIG_SCHED_DEBUG */