kernel/sched/core.c

   1 /*
   2  *  kernel/sched/core.c
   3  *
   4  *  Core kernel scheduler code and related syscalls
   5  *
   6  *  Copyright (C) 1991-2002  Linus Torvalds
   7  */
   8 #include <linux/sched.h>
   9 #include <linux/sched/clock.h>
  10 #include <uapi/linux/sched/types.h>
  11 #include <linux/sched/loadavg.h>
  12 #include <linux/sched/hotplug.h>
  13 #include <linux/wait_bit.h>
  14 #include <linux/cpuset.h>
  15 #include <linux/delayacct.h>
  16 #include <linux/init_task.h>
  17 #include <linux/context_tracking.h>
  18 #include <linux/rcupdate_wait.h>
  19
  20 #include <linux/blkdev.h>
  21 #include <linux/kprobes.h>
  22 #include <linux/mmu_context.h>
  23 #include <linux/module.h>
  24 #include <linux/nmi.h>
  25 #include <linux/prefetch.h>
  26 #include <linux/profile.h>
  27 #include <linux/security.h>
  28 #include <linux/syscalls.h>
  29
  30 #include <asm/switch_to.h>
  31 #include <asm/tlb.h>
  32 #ifdef CONFIG_PARAVIRT
  33 #include <asm/paravirt.h>
  34 #endif
  35
  36 #include "sched.h"
  37 #include "../workqueue_internal.h"
  38 #include "../smpboot.h"
  39
  40 #define CREATE_TRACE_POINTS
  41 #include <trace/events/sched.h>
  42 #include "walt.h"
  43
  44 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
  45
  46 /*
  47  * Debugging: various feature bits
  48  */
  49
  50 #define SCHED_FEAT(name, enabled)       \
  51         (1UL << __SCHED_FEAT_##name) * enabled |
  52
  53 const_debug unsigned int sysctl_sched_features =
  54 #include "features.h"
  55         0;
  56
  57 #undef SCHED_FEAT
  58
  59 /*
  60  * Number of tasks to iterate in a single balance run.
  61  * Limited because this is done with IRQs disabled.
  62  */
  63 const_debug unsigned int sysctl_sched_nr_migrate = 32;
  64
  65 /*
  66  * period over which we average the RT time consumption, measured
  67  * in ms.
  68  *
  69  * default: 1s
  70  */
  71 const_debug unsigned int sysctl_sched_time_avg = MSEC_PER_SEC;
  72
  73 /*
  74  * period over which we measure -rt task CPU usage in us.
  75  * default: 1s
  76  */
  77 unsigned int sysctl_sched_rt_period = 1000000;
  78
  79 __read_mostly int scheduler_running;
  80
  81 /*
  82  * part of the period that we allow rt tasks to run in us.
  83  * default: 0.95s
  84  */
  85 int sysctl_sched_rt_runtime = 950000;
  86
  87 /* CPUs with isolated domains */
  88 cpumask_var_t cpu_isolated_map;
  89
  90 /*
  91  * __task_rq_lock - lock the rq @p resides on.
  92  */
  93 struct rq *__task_rq_lock(struct task_struct *p, struct rq_flags *rf)
  94         __acquires(rq->lock)
  95 {
  96         struct rq *rq;
  97
  98         lockdep_assert_held(&p->pi_lock);
  99
 100         for (;;) {
 101                 rq = task_rq(p);
 102                 raw_spin_lock(&rq->lock);
 103                 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
 104                         rq_pin_lock(rq, rf);
 105                         return rq;
 106                 }
 107                 raw_spin_unlock(&rq->lock);
 108
 109                 while (unlikely(task_on_rq_migrating(p)))
 110                         cpu_relax();
 111         }
 112 }
 113
 114 /*
 115  * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 116  */
 117 struct rq *task_rq_lock(struct task_struct *p, struct rq_flags *rf)
 118         __acquires(p->pi_lock)
 119         __acquires(rq->lock)
 120 {
 121         struct rq *rq;
 122
 123         for (;;) {
 124                 raw_spin_lock_irqsave(&p->pi_lock, rf->flags);
 125                 rq = task_rq(p);
 126                 raw_spin_lock(&rq->lock);
 127                 /*
 128                  *      move_queued_task()              task_rq_lock()
 129                  *
 130                  *      ACQUIRE (rq->lock)
 131                  *      [S] ->on_rq = MIGRATING         [L] rq = task_rq()
 132                  *      WMB (__set_task_cpu())          ACQUIRE (rq->lock);
 133                  *      [S] ->cpu = new_cpu             [L] task_rq()
 134                  *                                      [L] ->on_rq
 135                  *      RELEASE (rq->lock)
 136                  *
 137                  * If we observe the old cpu in task_rq_lock, the acquire of
 138                  * the old rq->lock will fully serialize against the stores.
 139                  *
 140                  * If we observe the new CPU in task_rq_lock, the acquire will
 141                  * pair with the WMB to ensure we must then also see migrating.
 142                  */
 143                 if (likely(rq == task_rq(p) && !task_on_rq_migrating(p))) {
 144                         rq_pin_lock(rq, rf);
 145                         return rq;
 146                 }
 147                 raw_spin_unlock(&rq->lock);
 148                 raw_spin_unlock_irqrestore(&p->pi_lock, rf->flags);
 149
 150                 while (unlikely(task_on_rq_migrating(p)))
 151                         cpu_relax();
 152         }
 153 }
 154
 155 /*
 156  * RQ-clock updating methods:
 157  */
 158
 159 static void update_rq_clock_task(struct rq *rq, s64 delta)
 160 {
 161 /*
 162  * In theory, the compile should just see 0 here, and optimize out the call
 163  * to sched_rt_avg_update. But I don't trust it...
 164  */
 165 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
 166         s64 steal = 0, irq_delta = 0;
 167 #endif
 168 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 169         irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
 170
 171         /*
 172          * Since irq_time is only updated on {soft,}irq_exit, we might run into
 173          * this case when a previous update_rq_clock() happened inside a
 174          * {soft,}irq region.
 175          *
 176          * When this happens, we stop ->clock_task and only update the
 177          * prev_irq_time stamp to account for the part that fit, so that a next
 178          * update will consume the rest. This ensures ->clock_task is
 179          * monotonic.
 180          *
 181          * It does however cause some slight miss-attribution of {soft,}irq
 182          * time, a more accurate solution would be to update the irq_time using
 183          * the current rq->clock timestamp, except that would require using
 184          * atomic ops.
 185          */
 186         if (irq_delta > delta)
 187                 irq_delta = delta;
 188
 189         rq->prev_irq_time += irq_delta;
 190         delta -= irq_delta;
 191 #endif
 192 #ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING
 193         if (static_key_false((&paravirt_steal_rq_enabled))) {
 194                 steal = paravirt_steal_clock(cpu_of(rq));
 195                 steal -= rq->prev_steal_time_rq;
 196
 197                 if (unlikely(steal > delta))
 198                         steal = delta;
 199
 200                 rq->prev_steal_time_rq += steal;
 201                 delta -= steal;
 202         }
 203 #endif
 204
 205         rq->clock_task += delta;
 206
 207 #if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
 208         if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
 209                 sched_rt_avg_update(rq, irq_delta + steal);
 210 #endif
 211 }
 212
 213 void update_rq_clock(struct rq *rq)
 214 {
 215         s64 delta;
 216
 217         lockdep_assert_held(&rq->lock);
 218
 219         if (rq->clock_update_flags & RQCF_ACT_SKIP)
 220                 return;
 221
 222 #ifdef CONFIG_SCHED_DEBUG
 223         if (sched_feat(WARN_DOUBLE_CLOCK))
 224                 SCHED_WARN_ON(rq->clock_update_flags & RQCF_UPDATED);
 225         rq->clock_update_flags |= RQCF_UPDATED;
 226 #endif
 227
 228         delta = sched_clock_cpu(cpu_of(rq)) - rq->clock;
 229         if (delta < 0)
 230                 return;
 231         rq->clock += delta;
 232         update_rq_clock_task(rq, delta);
 233 }
 234
 235
 236 #ifdef CONFIG_SCHED_HRTICK
 237 /*
 238  * Use HR-timers to deliver accurate preemption points.
 239  */
 240
 241 static void hrtick_clear(struct rq *rq)
 242 {
 243         if (hrtimer_active(&rq->hrtick_timer))
 244                 hrtimer_cancel(&rq->hrtick_timer);
 245 }
 246
 247 /*
 248  * High-resolution timer tick.
 249  * Runs from hardirq context with interrupts disabled.
 250  */
 251 static enum hrtimer_restart hrtick(struct hrtimer *timer)
 252 {
 253         struct rq *rq = container_of(timer, struct rq, hrtick_timer);
 254         struct rq_flags rf;
 255
 256         WARN_ON_ONCE(cpu_of(rq) != smp_processor_id());
 257
 258         rq_lock(rq, &rf);
 259         update_rq_clock(rq);
 260         rq->curr->sched_class->task_tick(rq, rq->curr, 1);
 261         rq_unlock(rq, &rf);
 262
 263         return HRTIMER_NORESTART;
 264 }
 265
 266 #ifdef CONFIG_SMP
 267
 268 static void __hrtick_restart(struct rq *rq)
 269 {
 270         struct hrtimer *timer = &rq->hrtick_timer;
 271
 272         hrtimer_start_expires(timer, HRTIMER_MODE_ABS_PINNED);
 273 }
 274
 275 /*
 276  * called from hardirq (IPI) context
 277  */
 278 static void __hrtick_start(void *arg)
 279 {
 280         struct rq *rq = arg;
 281         struct rq_flags rf;
 282
 283         rq_lock(rq, &rf);
 284         __hrtick_restart(rq);
 285         rq->hrtick_csd_pending = 0;
 286         rq_unlock(rq, &rf);
 287 }
 288
 289 /*
 290  * Called to set the hrtick timer state.
 291  *
 292  * called with rq->lock held and irqs disabled
 293  */
 294 void hrtick_start(struct rq *rq, u64 delay)
 295 {
 296         struct hrtimer *timer = &rq->hrtick_timer;
 297         ktime_t time;
 298         s64 delta;
 299
 300         /*
 301          * Don't schedule slices shorter than 10000ns, that just
 302          * doesn't make sense and can cause timer DoS.
 303          */
 304         delta = max_t(s64, delay, 10000LL);
 305         time = ktime_add_ns(timer->base->get_time(), delta);
 306
 307         hrtimer_set_expires(timer, time);
 308
 309         if (rq == this_rq()) {
 310                 __hrtick_restart(rq);
 311         } else if (!rq->hrtick_csd_pending) {
 312                 smp_call_function_single_async(cpu_of(rq), &rq->hrtick_csd);
 313                 rq->hrtick_csd_pending = 1;
 314         }
 315 }
 316
 317 #else
 318 /*
 319  * Called to set the hrtick timer state.
 320  *
 321  * called with rq->lock held and irqs disabled
 322  */
 323 void hrtick_start(struct rq *rq, u64 delay)
 324 {
 325         /*
 326          * Don't schedule slices shorter than 10000ns, that just
 327          * doesn't make sense. Rely on vruntime for fairness.
 328          */
 329         delay = max_t(u64, delay, 10000LL);
 330         hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay),
 331                       HRTIMER_MODE_REL_PINNED);
 332 }
 333 #endif /* CONFIG_SMP */
 334
 335 static void init_rq_hrtick(struct rq *rq)
 336 {
 337 #ifdef CONFIG_SMP
 338         rq->hrtick_csd_pending = 0;
 339
 340         rq->hrtick_csd.flags = 0;
 341         rq->hrtick_csd.func = __hrtick_start;
 342         rq->hrtick_csd.info = rq;
 343 #endif
 344
 345         hrtimer_init(&rq->hrtick_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
 346         rq->hrtick_timer.function = hrtick;
 347 }
 348 #else   /* CONFIG_SCHED_HRTICK */
 349 static inline void hrtick_clear(struct rq *rq)
 350 {
 351 }
 352
 353 static inline void init_rq_hrtick(struct rq *rq)
 354 {
 355 }
 356 #endif  /* CONFIG_SCHED_HRTICK */
 357
 358 /*
 359  * cmpxchg based fetch_or, macro so it works for different integer types
 360  */
 361 #define fetch_or(ptr, mask)                                             \
 362         ({                                                              \
 363                 typeof(ptr) _ptr = (ptr);                               \
 364                 typeof(mask) _mask = (mask);                            \
 365                 typeof(*_ptr) _old, _val = *_ptr;                       \
 366                                                                         \
 367                 for (;;) {                                              \
 368                         _old = cmpxchg(_ptr, _val, _val | _mask);       \
 369                         if (_old == _val)                               \
 370                                 break;                                  \
 371                         _val = _old;                                    \
 372                 }                                                       \
 373         _old;                                                           \
 374 })
 375
 376 #if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG)
 377 /*
 378  * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG,
 379  * this avoids any races wrt polling state changes and thereby avoids
 380  * spurious IPIs.
 381  */
 382 static bool set_nr_and_not_polling(struct task_struct *p)
 383 {
 384         struct thread_info *ti = task_thread_info(p);
 385         return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG);
 386 }
 387
 388 /*
 389  * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set.
 390  *
 391  * If this returns true, then the idle task promises to call
 392  * sched_ttwu_pending() and reschedule soon.
 393  */
 394 static bool set_nr_if_polling(struct task_struct *p)
 395 {
 396         struct thread_info *ti = task_thread_info(p);
 397         typeof(ti->flags) old, val = READ_ONCE(ti->flags);
 398
 399         for (;;) {
 400                 if (!(val & _TIF_POLLING_NRFLAG))
 401                         return false;
 402                 if (val & _TIF_NEED_RESCHED)
 403                         return true;
 404                 old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED);
 405                 if (old == val)
 406                         break;
 407                 val = old;
 408         }
 409         return true;
 410 }
 411
 412 #else
 413 static bool set_nr_and_not_polling(struct task_struct *p)
 414 {
 415         set_tsk_need_resched(p);
 416         return true;
 417 }
 418
 419 #ifdef CONFIG_SMP
 420 static bool set_nr_if_polling(struct task_struct *p)
 421 {
 422         return false;
 423 }
 424 #endif
 425 #endif
 426
 427 void wake_q_add(struct wake_q_head *head, struct task_struct *task)
 428 {
 429         struct wake_q_node *node = &task->wake_q;
 430
 431         /*
 432          * Atomically grab the task, if ->wake_q is !nil already it means
 433          * its already queued (either by us or someone else) and will get the
 434          * wakeup due to that.
 435          *
 436          * This cmpxchg() implies a full barrier, which pairs with the write
 437          * barrier implied by the wakeup in wake_up_q().
 438          */
 439         if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL))
 440                 return;
 441
 442         head->count++;
 443
 444         get_task_struct(task);
 445
 446         /*
 447          * The head is context local, there can be no concurrency.
 448          */
 449         *head->lastp = node;
 450         head->lastp = &node->next;
 451 }
 452
 453 static int
 454 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
 455                int sibling_count_hint);
 456
 457 void wake_up_q(struct wake_q_head *head)
 458 {
 459         struct wake_q_node *node = head->first;
 460
 461         while (node != WAKE_Q_TAIL) {
 462                 struct task_struct *task;
 463
 464                 task = container_of(node, struct task_struct, wake_q);
 465                 BUG_ON(!task);
 466                 /* Task can safely be re-inserted now: */
 467                 node = node->next;
 468                 task->wake_q.next = NULL;
 469
 470                 /*
 471                  * try_to_wake_up() implies a wmb() to pair with the queueing
 472                  * in wake_q_add() so as not to miss wakeups.
 473                  */
 474                 try_to_wake_up(task, TASK_NORMAL, 0, head->count);
 475                 put_task_struct(task);
 476         }
 477 }
 478
 479 /*
 480  * resched_curr - mark rq's current task 'to be rescheduled now'.
 481  *
 482  * On UP this means the setting of the need_resched flag, on SMP it
 483  * might also involve a cross-CPU call to trigger the scheduler on
 484  * the target CPU.
 485  */
 486 void resched_curr(struct rq *rq)
 487 {
 488         struct task_struct *curr = rq->curr;
 489         int cpu;
 490
 491         lockdep_assert_held(&rq->lock);
 492
 493         if (test_tsk_need_resched(curr))
 494                 return;
 495
 496         cpu = cpu_of(rq);
 497
 498         if (cpu == smp_processor_id()) {
 499                 set_tsk_need_resched(curr);
 500                 set_preempt_need_resched();
 501                 return;
 502         }
 503
 504         if (set_nr_and_not_polling(curr))
 505                 smp_send_reschedule(cpu);
 506         else
 507                 trace_sched_wake_idle_without_ipi(cpu);
 508 }
 509
 510 void resched_cpu(int cpu)
 511 {
 512         struct rq *rq = cpu_rq(cpu);
 513         unsigned long flags;
 514
 515         raw_spin_lock_irqsave(&rq->lock, flags);
 516         resched_curr(rq);
 517         raw_spin_unlock_irqrestore(&rq->lock, flags);
 518 }
 519
 520 #ifdef CONFIG_SMP
 521 #ifdef CONFIG_NO_HZ_COMMON
 522 /*
 523  * In the semi idle case, use the nearest busy CPU for migrating timers
 524  * from an idle CPU.  This is good for power-savings.
 525  *
 526  * We don't do similar optimization for completely idle system, as
 527  * selecting an idle CPU will add more delays to the timers than intended
 528  * (as that CPU's timer base may not be uptodate wrt jiffies etc).
 529  */
 530 int get_nohz_timer_target(void)
 531 {
 532         int i, cpu = smp_processor_id();
 533         struct sched_domain *sd;
 534
 535         if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu))
 536                 return cpu;
 537
 538         rcu_read_lock();
 539         for_each_domain(cpu, sd) {
 540                 for_each_cpu(i, sched_domain_span(sd)) {
 541                         if (cpu == i)
 542                                 continue;
 543
 544                         if (!idle_cpu(i) && is_housekeeping_cpu(i)) {
 545                                 cpu = i;
 546                                 goto unlock;
 547                         }
 548                 }
 549         }
 550
 551         if (!is_housekeeping_cpu(cpu))
 552                 cpu = housekeeping_any_cpu();
 553 unlock:
 554         rcu_read_unlock();
 555         return cpu;
 556 }
 557
 558 /*
 559  * When add_timer_on() enqueues a timer into the timer wheel of an
 560  * idle CPU then this timer might expire before the next timer event
 561  * which is scheduled to wake up that CPU. In case of a completely
 562  * idle system the next event might even be infinite time into the
 563  * future. wake_up_idle_cpu() ensures that the CPU is woken up and
 564  * leaves the inner idle loop so the newly added timer is taken into
 565  * account when the CPU goes back to idle and evaluates the timer
 566  * wheel for the next timer event.
 567  */
 568 static void wake_up_idle_cpu(int cpu)
 569 {
 570         struct rq *rq = cpu_rq(cpu);
 571
 572         if (cpu == smp_processor_id())
 573                 return;
 574
 575         if (set_nr_and_not_polling(rq->idle))
 576                 smp_send_reschedule(cpu);
 577         else
 578                 trace_sched_wake_idle_without_ipi(cpu);
 579 }
 580
 581 static bool wake_up_full_nohz_cpu(int cpu)
 582 {
 583         /*
 584          * We just need the target to call irq_exit() and re-evaluate
 585          * the next tick. The nohz full kick at least implies that.
 586          * If needed we can still optimize that later with an
 587          * empty IRQ.
 588          */
 589         if (cpu_is_offline(cpu))
 590                 return true;  /* Don't try to wake offline CPUs. */
 591         if (tick_nohz_full_cpu(cpu)) {
 592                 if (cpu != smp_processor_id() ||
 593                     tick_nohz_tick_stopped())
 594                         tick_nohz_full_kick_cpu(cpu);
 595                 return true;
 596         }
 597
 598         return false;
 599 }
 600
 601 /*
 602  * Wake up the specified CPU.  If the CPU is going offline, it is the
 603  * caller's responsibility to deal with the lost wakeup, for example,
 604  * by hooking into the CPU_DEAD notifier like timers and hrtimers do.
 605  */
 606 void wake_up_nohz_cpu(int cpu)
 607 {
 608         if (!wake_up_full_nohz_cpu(cpu))
 609                 wake_up_idle_cpu(cpu);
 610 }
 611
 612 static inline bool got_nohz_idle_kick(void)
 613 {
 614         int cpu = smp_processor_id();
 615
 616         if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)))
 617                 return false;
 618
 619         if (idle_cpu(cpu) && !need_resched())
 620                 return true;
 621
 622         /*
 623          * We can't run Idle Load Balance on this CPU for this time so we
 624          * cancel it and clear NOHZ_BALANCE_KICK
 625          */
 626         clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu));
 627         return false;
 628 }
 629
 630 #else /* CONFIG_NO_HZ_COMMON */
 631
 632 static inline bool got_nohz_idle_kick(void)
 633 {
 634         return false;
 635 }
 636
 637 #endif /* CONFIG_NO_HZ_COMMON */
 638
 639 #ifdef CONFIG_NO_HZ_FULL
 640 bool sched_can_stop_tick(struct rq *rq)
 641 {
 642         int fifo_nr_running;
 643
 644         /* Deadline tasks, even if single, need the tick */
 645         if (rq->dl.dl_nr_running)
 646                 return false;
 647
 648         /*
 649          * If there are more than one RR tasks, we need the tick to effect the
 650          * actual RR behaviour.
 651          */
 652         if (rq->rt.rr_nr_running) {
 653                 if (rq->rt.rr_nr_running == 1)
 654                         return true;
 655                 else
 656                         return false;
 657         }
 658
 659         /*
 660          * If there's no RR tasks, but FIFO tasks, we can skip the tick, no
 661          * forced preemption between FIFO tasks.
 662          */
 663         fifo_nr_running = rq->rt.rt_nr_running - rq->rt.rr_nr_running;
 664         if (fifo_nr_running)
 665                 return true;
 666
 667         /*
 668          * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left;
 669          * if there's more than one we need the tick for involuntary
 670          * preemption.
 671          */
 672         if (rq->nr_running > 1)
 673                 return false;
 674
 675         return true;
 676 }
 677 #endif /* CONFIG_NO_HZ_FULL */
 678
 679 void sched_avg_update(struct rq *rq)
 680 {
 681         s64 period = sched_avg_period();
 682
 683         while ((s64)(rq_clock(rq) - rq->age_stamp) > period) {
 684                 /*
 685                  * Inline assembly required to prevent the compiler
 686                  * optimising this loop into a divmod call.
 687                  * See __iter_div_u64_rem() for another example of this.
 688                  */
 689                 asm("" : "+rm" (rq->age_stamp));
 690                 rq->age_stamp += period;
 691                 rq->rt_avg /= 2;
 692         }
 693 }
 694
 695 #endif /* CONFIG_SMP */
 696
 697 #if defined(CONFIG_RT_GROUP_SCHED) || (defined(CONFIG_FAIR_GROUP_SCHED) && \
 698                         (defined(CONFIG_SMP) || defined(CONFIG_CFS_BANDWIDTH)))
 699 /*
 700  * Iterate task_group tree rooted at *from, calling @down when first entering a
 701  * node and @up when leaving it for the final time.
 702  *
 703  * Caller must hold rcu_lock or sufficient equivalent.
 704  */
 705 int walk_tg_tree_from(struct task_group *from,
 706                              tg_visitor down, tg_visitor up, void *data)
 707 {
 708         struct task_group *parent, *child;
 709         int ret;
 710
 711         parent = from;
 712
 713 down:
 714         ret = (*down)(parent, data);
 715         if (ret)
 716                 goto out;
 717         list_for_each_entry_rcu(child, &parent->children, siblings) {
 718                 parent = child;
 719                 goto down;
 720
 721 up:
 722                 continue;
 723         }
 724         ret = (*up)(parent, data);
 725         if (ret || parent == from)
 726                 goto out;
 727
 728         child = parent;
 729         parent = parent->parent;
 730         if (parent)
 731                 goto up;
 732 out:
 733         return ret;
 734 }
 735
 736 int tg_nop(struct task_group *tg, void *data)
 737 {
 738         return 0;
 739 }
 740 #endif
 741
 742 static void set_load_weight(struct task_struct *p)
 743 {
 744         int prio = p->static_prio - MAX_RT_PRIO;
 745         struct load_weight *load = &p->se.load;
 746
 747         /*
 748          * SCHED_IDLE tasks get minimal weight:
 749          */
 750         if (idle_policy(p->policy)) {
 751                 load->weight = scale_load(WEIGHT_IDLEPRIO);
 752                 load->inv_weight = WMULT_IDLEPRIO;
 753                 return;
 754         }
 755
 756         load->weight = scale_load(sched_prio_to_weight[prio]);
 757         load->inv_weight = sched_prio_to_wmult[prio];
 758 }
 759
 760 static inline void enqueue_task(struct rq *rq, struct task_struct *p, int flags)
 761 {
 762         if (!(flags & ENQUEUE_NOCLOCK))
 763                 update_rq_clock(rq);
 764
 765         if (!(flags & ENQUEUE_RESTORE))
 766                 sched_info_queued(rq, p);
 767
 768         p->sched_class->enqueue_task(rq, p, flags);
 769 }
 770
 771 static inline void dequeue_task(struct rq *rq, struct task_struct *p, int flags)
 772 {
 773         if (!(flags & DEQUEUE_NOCLOCK))
 774                 update_rq_clock(rq);
 775
 776         if (!(flags & DEQUEUE_SAVE))
 777                 sched_info_dequeued(rq, p);
 778
 779         p->sched_class->dequeue_task(rq, p, flags);
 780 }
 781
 782 void activate_task(struct rq *rq, struct task_struct *p, int flags)
 783 {
 784         if (task_contributes_to_load(p))
 785                 rq->nr_uninterruptible--;
 786
 787         enqueue_task(rq, p, flags);
 788 }
 789
 790 void deactivate_task(struct rq *rq, struct task_struct *p, int flags)
 791 {
 792         if (task_contributes_to_load(p))
 793                 rq->nr_uninterruptible++;
 794
 795         dequeue_task(rq, p, flags);
 796 }
 797
 798 /*
 799  * __normal_prio - return the priority that is based on the static prio
 800  */
 801 static inline int __normal_prio(struct task_struct *p)
 802 {
 803         return p->static_prio;
 804 }
 805
 806 /*
 807  * Calculate the expected normal priority: i.e. priority
 808  * without taking RT-inheritance into account. Might be
 809  * boosted by interactivity modifiers. Changes upon fork,
 810  * setprio syscalls, and whenever the interactivity
 811  * estimator recalculates.
 812  */
 813 static inline int normal_prio(struct task_struct *p)
 814 {
 815         int prio;
 816
 817         if (task_has_dl_policy(p))
 818                 prio = MAX_DL_PRIO-1;
 819         else if (task_has_rt_policy(p))
 820                 prio = MAX_RT_PRIO-1 - p->rt_priority;
 821         else
 822                 prio = __normal_prio(p);
 823         return prio;
 824 }
 825
 826 /*
 827  * Calculate the current priority, i.e. the priority
 828  * taken into account by the scheduler. This value might
 829  * be boosted by RT tasks, or might be boosted by
 830  * interactivity modifiers. Will be RT if the task got
 831  * RT-boosted. If not then it returns p->normal_prio.
 832  */
 833 static int effective_prio(struct task_struct *p)
 834 {
 835         p->normal_prio = normal_prio(p);
 836         /*
 837          * If we are RT tasks or we were boosted to RT priority,
 838          * keep the priority unchanged. Otherwise, update priority
 839          * to the normal priority:
 840          */
 841         if (!rt_prio(p->prio))
 842                 return p->normal_prio;
 843         return p->prio;
 844 }
 845
 846 /**
 847  * task_curr - is this task currently executing on a CPU?
 848  * @p: the task in question.
 849  *
 850  * Return: 1 if the task is currently executing. 0 otherwise.
 851  */
 852 inline int task_curr(const struct task_struct *p)
 853 {
 854         return cpu_curr(task_cpu(p)) == p;
 855 }
 856
 857 /*
 858  * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock,
 859  * use the balance_callback list if you want balancing.
 860  *
 861  * this means any call to check_class_changed() must be followed by a call to
 862  * balance_callback().
 863  */
 864 static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 865                                        const struct sched_class *prev_class,
 866                                        int oldprio)
 867 {
 868         if (prev_class != p->sched_class) {
 869                 if (prev_class->switched_from)
 870                         prev_class->switched_from(rq, p);
 871
 872                 p->sched_class->switched_to(rq, p);
 873         } else if (oldprio != p->prio || dl_task(p))
 874                 p->sched_class->prio_changed(rq, p, oldprio);
 875 }
 876
 877 void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
 878 {
 879         const struct sched_class *class;
 880
 881         if (p->sched_class == rq->curr->sched_class) {
 882                 rq->curr->sched_class->check_preempt_curr(rq, p, flags);
 883         } else {
 884                 for_each_class(class) {
 885                         if (class == rq->curr->sched_class)
 886                                 break;
 887                         if (class == p->sched_class) {
 888                                 resched_curr(rq);
 889                                 break;
 890                         }
 891                 }
 892         }
 893
 894         /*
 895          * A queue event has occurred, and we're going to schedule.  In
 896          * this case, we can save a useless back to back clock update.
 897          */
 898         if (task_on_rq_queued(rq->curr) && test_tsk_need_resched(rq->curr))
 899                 rq_clock_skip_update(rq, true);
 900 }
 901
 902 #ifdef CONFIG_SMP
 903 /*
 904  * This is how migration works:
 905  *
 906  * 1) we invoke migration_cpu_stop() on the target CPU using
 907  *    stop_one_cpu().
 908  * 2) stopper starts to run (implicitly forcing the migrated thread
 909  *    off the CPU)
 910  * 3) it checks whether the migrated task is still in the wrong runqueue.
 911  * 4) if it's in the wrong runqueue then the migration thread removes
 912  *    it and puts it into the right queue.
 913  * 5) stopper completes and stop_one_cpu() returns and the migration
 914  *    is done.
 915  */
 916
 917 /*
 918  * move_queued_task - move a queued task to new rq.
 919  *
 920  * Returns (locked) new rq. Old rq's lock is released.
 921  */
 922 static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf,
 923                                    struct task_struct *p, int new_cpu)
 924 {
 925         lockdep_assert_held(&rq->lock);
 926
 927         p->on_rq = TASK_ON_RQ_MIGRATING;
 928         dequeue_task(rq, p, DEQUEUE_NOCLOCK);
 929         set_task_cpu(p, new_cpu);
 930         rq_unlock(rq, rf);
 931
 932         rq = cpu_rq(new_cpu);
 933
 934         rq_lock(rq, rf);
 935         BUG_ON(task_cpu(p) != new_cpu);
 936         enqueue_task(rq, p, 0);
 937         p->on_rq = TASK_ON_RQ_QUEUED;
 938         check_preempt_curr(rq, p, 0);
 939
 940         return rq;
 941 }
 942
 943 struct migration_arg {
 944         struct task_struct *task;
 945         int dest_cpu;
 946 };
 947
 948 /*
 949  * Move (not current) task off this CPU, onto the destination CPU. We're doing
 950  * this because either it can't run here any more (set_cpus_allowed()
 951  * away from this CPU, or CPU going down), or because we're
 952  * attempting to rebalance this task on exec (sched_exec).
 953  *
 954  * So we race with normal scheduler movements, but that's OK, as long
 955  * as the task is no longer on this CPU.
 956  */
 957 static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf,
 958                                  struct task_struct *p, int dest_cpu)
 959 {
 960         if (p->flags & PF_KTHREAD) {
 961                 if (unlikely(!cpu_online(dest_cpu)))
 962                         return rq;
 963         } else {
 964                 if (unlikely(!cpu_active(dest_cpu)))
 965                         return rq;
 966         }
 967
 968         /* Affinity changed (again). */
 969         if (!cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
 970                 return rq;
 971
 972         update_rq_clock(rq);
 973         rq = move_queued_task(rq, rf, p, dest_cpu);
 974
 975         return rq;
 976 }
 977
 978 /*
 979  * migration_cpu_stop - this will be executed by a highprio stopper thread
 980  * and performs thread migration by bumping thread off CPU then
 981  * 'pushing' onto another runqueue.
 982  */
 983 static int migration_cpu_stop(void *data)
 984 {
 985         struct migration_arg *arg = data;
 986         struct task_struct *p = arg->task;
 987         struct rq *rq = this_rq();
 988         struct rq_flags rf;
 989
 990         /*
 991          * The original target CPU might have gone down and we might
 992          * be on another CPU but it doesn't matter.
 993          */
 994         local_irq_disable();
 995         /*
 996          * We need to explicitly wake pending tasks before running
 997          * __migrate_task() such that we will not miss enforcing cpus_allowed
 998          * during wakeups, see set_cpus_allowed_ptr()'s TASK_WAKING test.
 999          */
1000         sched_ttwu_pending();
1001
1002         raw_spin_lock(&p->pi_lock);
1003         rq_lock(rq, &rf);
1004         /*
1005          * If task_rq(p) != rq, it cannot be migrated here, because we're
1006          * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because
1007          * we're holding p->pi_lock.
1008          */
1009         if (task_rq(p) == rq) {
1010                 if (task_on_rq_queued(p))
1011                         rq = __migrate_task(rq, &rf, p, arg->dest_cpu);
1012                 else
1013                         p->wake_cpu = arg->dest_cpu;
1014         }
1015         rq_unlock(rq, &rf);
1016         raw_spin_unlock(&p->pi_lock);
1017
1018         local_irq_enable();
1019         return 0;
1020 }
1021
1022 /*
1023  * sched_class::set_cpus_allowed must do the below, but is not required to
1024  * actually call this function.
1025  */
1026 void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask)
1027 {
1028         cpumask_copy(&p->cpus_allowed, new_mask);
1029         p->nr_cpus_allowed = cpumask_weight(new_mask);
1030 }
1031
1032 void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask)
1033 {
1034         struct rq *rq = task_rq(p);
1035         bool queued, running;
1036
1037         lockdep_assert_held(&p->pi_lock);
1038
1039         queued = task_on_rq_queued(p);
1040         running = task_current(rq, p);
1041
1042         if (queued) {
1043                 /*
1044                  * Because __kthread_bind() calls this on blocked tasks without
1045                  * holding rq->lock.
1046                  */
1047                 lockdep_assert_held(&rq->lock);
1048                 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
1049         }
1050         if (running)
1051                 put_prev_task(rq, p);
1052
1053         p->sched_class->set_cpus_allowed(p, new_mask);
1054
1055         if (queued)
1056                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
1057         if (running)
1058                 set_curr_task(rq, p);
1059 }
1060
1061 /*
1062  * Change a given task's CPU affinity. Migrate the thread to a
1063  * proper CPU and schedule it away if the CPU it's executing on
1064  * is removed from the allowed bitmask.
1065  *
1066  * NOTE: the caller must have a valid reference to the task, the
1067  * task must not exit() & deallocate itself prematurely. The
1068  * call is not atomic; no spinlocks may be held.
1069  */
1070 static int __set_cpus_allowed_ptr(struct task_struct *p,
1071                                   const struct cpumask *new_mask, bool check)
1072 {
1073         const struct cpumask *cpu_valid_mask = cpu_active_mask;
1074         unsigned int dest_cpu;
1075         struct rq_flags rf;
1076         struct rq *rq;
1077         int ret = 0;
1078
1079         rq = task_rq_lock(p, &rf);
1080         update_rq_clock(rq);
1081
1082         if (p->flags & PF_KTHREAD) {
1083                 /*
1084                  * Kernel threads are allowed on online && !active CPUs
1085                  */
1086                 cpu_valid_mask = cpu_online_mask;
1087         }
1088
1089         /*
1090          * Must re-check here, to close a race against __kthread_bind(),
1091          * sched_setaffinity() is not guaranteed to observe the flag.
1092          */
1093         if (check && (p->flags & PF_NO_SETAFFINITY)) {
1094                 ret = -EINVAL;
1095                 goto out;
1096         }
1097
1098         if (cpumask_equal(&p->cpus_allowed, new_mask))
1099                 goto out;
1100
1101         if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
1102                 ret = -EINVAL;
1103                 goto out;
1104         }
1105
1106         do_set_cpus_allowed(p, new_mask);
1107
1108         if (p->flags & PF_KTHREAD) {
1109                 /*
1110                  * For kernel threads that do indeed end up on online &&
1111                  * !active we want to ensure they are strict per-CPU threads.
1112                  */
1113                 WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
1114                         !cpumask_intersects(new_mask, cpu_active_mask) &&
1115                         p->nr_cpus_allowed != 1);
1116         }
1117
1118         /* Can the task run on the task's current CPU? If so, we're done */
1119         if (cpumask_test_cpu(task_cpu(p), new_mask))
1120                 goto out;
1121
1122         dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
1123         if (task_running(rq, p) || p->state == TASK_WAKING) {
1124                 struct migration_arg arg = { p, dest_cpu };
1125                 /* Need help from migration thread: drop lock and wait. */
1126                 task_rq_unlock(rq, p, &rf);
1127                 stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg);
1128                 tlb_migrate_finish(p->mm);
1129                 return 0;
1130         } else if (task_on_rq_queued(p)) {
1131                 /*
1132                  * OK, since we're going to drop the lock immediately
1133                  * afterwards anyway.
1134                  */
1135                 rq = move_queued_task(rq, &rf, p, dest_cpu);
1136         }
1137 out:
1138         task_rq_unlock(rq, p, &rf);
1139
1140         return ret;
1141 }
1142
1143 int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask)
1144 {
1145         return __set_cpus_allowed_ptr(p, new_mask, false);
1146 }
1147 EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr);
1148
1149 void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
1150 {
1151 #ifdef CONFIG_SCHED_DEBUG
1152         /*
1153          * We should never call set_task_cpu() on a blocked task,
1154          * ttwu() will sort out the placement.
1155          */
1156         WARN_ON_ONCE(p->state != TASK_RUNNING && p->state != TASK_WAKING &&
1157                         !p->on_rq);
1158
1159         /*
1160          * Migrating fair class task must have p->on_rq = TASK_ON_RQ_MIGRATING,
1161          * because schedstat_wait_{start,end} rebase migrating task's wait_start
1162          * time relying on p->on_rq.
1163          */
1164         WARN_ON_ONCE(p->state == TASK_RUNNING &&
1165                      p->sched_class == &fair_sched_class &&
1166                      (p->on_rq && !task_on_rq_migrating(p)));
1167
1168 #ifdef CONFIG_LOCKDEP
1169         /*
1170          * The caller should hold either p->pi_lock or rq->lock, when changing
1171          * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks.
1172          *
1173          * sched_move_task() holds both and thus holding either pins the cgroup,
1174          * see task_group().
1175          *
1176          * Furthermore, all task_rq users should acquire both locks, see
1177          * task_rq_lock().
1178          */
1179         WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) ||
1180                                       lockdep_is_held(&task_rq(p)->lock)));
1181 #endif
1182         /*
1183          * Clearly, migrating tasks to offline CPUs is a fairly daft thing.
1184          */
1185         WARN_ON_ONCE(!cpu_online(new_cpu));
1186 #endif
1187
1188         trace_sched_migrate_task(p, new_cpu);
1189
1190         if (task_cpu(p) != new_cpu) {
1191                 if (p->sched_class->migrate_task_rq)
1192                         p->sched_class->migrate_task_rq(p);
1193                 p->se.nr_migrations++;
1194                 perf_event_task_migrate(p);
1195
1196                 walt_fixup_busy_time(p, new_cpu);
1197         }
1198
1199         __set_task_cpu(p, new_cpu);
1200 }
1201
1202 static void __migrate_swap_task(struct task_struct *p, int cpu)
1203 {
1204         if (task_on_rq_queued(p)) {
1205                 struct rq *src_rq, *dst_rq;
1206                 struct rq_flags srf, drf;
1207
1208                 src_rq = task_rq(p);
1209                 dst_rq = cpu_rq(cpu);
1210
1211                 rq_pin_lock(src_rq, &srf);
1212                 rq_pin_lock(dst_rq, &drf);
1213
1214                 p->on_rq = TASK_ON_RQ_MIGRATING;
1215                 deactivate_task(src_rq, p, 0);
1216                 set_task_cpu(p, cpu);
1217                 activate_task(dst_rq, p, 0);
1218                 p->on_rq = TASK_ON_RQ_QUEUED;
1219                 check_preempt_curr(dst_rq, p, 0);
1220
1221                 rq_unpin_lock(dst_rq, &drf);
1222                 rq_unpin_lock(src_rq, &srf);
1223
1224         } else {
1225                 /*
1226                  * Task isn't running anymore; make it appear like we migrated
1227                  * it before it went to sleep. This means on wakeup we make the
1228                  * previous CPU our target instead of where it really is.
1229                  */
1230                 p->wake_cpu = cpu;
1231         }
1232 }
1233
1234 struct migration_swap_arg {
1235         struct task_struct *src_task, *dst_task;
1236         int src_cpu, dst_cpu;
1237 };
1238
1239 static int migrate_swap_stop(void *data)
1240 {
1241         struct migration_swap_arg *arg = data;
1242         struct rq *src_rq, *dst_rq;
1243         int ret = -EAGAIN;
1244
1245         if (!cpu_active(arg->src_cpu) || !cpu_active(arg->dst_cpu))
1246                 return -EAGAIN;
1247
1248         src_rq = cpu_rq(arg->src_cpu);
1249         dst_rq = cpu_rq(arg->dst_cpu);
1250
1251         double_raw_lock(&arg->src_task->pi_lock,
1252                         &arg->dst_task->pi_lock);
1253         double_rq_lock(src_rq, dst_rq);
1254
1255         if (task_cpu(arg->dst_task) != arg->dst_cpu)
1256                 goto unlock;
1257
1258         if (task_cpu(arg->src_task) != arg->src_cpu)
1259                 goto unlock;
1260
1261         if (!cpumask_test_cpu(arg->dst_cpu, &arg->src_task->cpus_allowed))
1262                 goto unlock;
1263
1264         if (!cpumask_test_cpu(arg->src_cpu, &arg->dst_task->cpus_allowed))
1265                 goto unlock;
1266
1267         __migrate_swap_task(arg->src_task, arg->dst_cpu);
1268         __migrate_swap_task(arg->dst_task, arg->src_cpu);
1269
1270         ret = 0;
1271
1272 unlock:
1273         double_rq_unlock(src_rq, dst_rq);
1274         raw_spin_unlock(&arg->dst_task->pi_lock);
1275         raw_spin_unlock(&arg->src_task->pi_lock);
1276
1277         return ret;
1278 }
1279
1280 /*
1281  * Cross migrate two tasks
1282  */
1283 int migrate_swap(struct task_struct *cur, struct task_struct *p)
1284 {
1285         struct migration_swap_arg arg;
1286         int ret = -EINVAL;
1287
1288         arg = (struct migration_swap_arg){
1289                 .src_task = cur,
1290                 .src_cpu = task_cpu(cur),
1291                 .dst_task = p,
1292                 .dst_cpu = task_cpu(p),
1293         };
1294
1295         if (arg.src_cpu == arg.dst_cpu)
1296                 goto out;
1297
1298         /*
1299          * These three tests are all lockless; this is OK since all of them
1300          * will be re-checked with proper locks held further down the line.
1301          */
1302         if (!cpu_active(arg.src_cpu) || !cpu_active(arg.dst_cpu))
1303                 goto out;
1304
1305         if (!cpumask_test_cpu(arg.dst_cpu, &arg.src_task->cpus_allowed))
1306                 goto out;
1307
1308         if (!cpumask_test_cpu(arg.src_cpu, &arg.dst_task->cpus_allowed))
1309                 goto out;
1310
1311         trace_sched_swap_numa(cur, arg.src_cpu, p, arg.dst_cpu);
1312         ret = stop_two_cpus(arg.dst_cpu, arg.src_cpu, migrate_swap_stop, &arg);
1313
1314 out:
1315         return ret;
1316 }
1317
1318 /*
1319  * wait_task_inactive - wait for a thread to unschedule.
1320  *
1321  * If @match_state is nonzero, it's the @p->state value just checked and
1322  * not expected to change.  If it changes, i.e. @p might have woken up,
1323  * then return zero.  When we succeed in waiting for @p to be off its CPU,
1324  * we return a positive number (its total switch count).  If a second call
1325  * a short while later returns the same number, the caller can be sure that
1326  * @p has remained unscheduled the whole time.
1327  *
1328  * The caller must ensure that the task *will* unschedule sometime soon,
1329  * else this function might spin for a *long* time. This function can't
1330  * be called with interrupts off, or it may introduce deadlock with
1331  * smp_call_function() if an IPI is sent by the same process we are
1332  * waiting to become inactive.
1333  */
1334 unsigned long wait_task_inactive(struct task_struct *p, long match_state)
1335 {
1336         int running, queued;
1337         struct rq_flags rf;
1338         unsigned long ncsw;
1339         struct rq *rq;
1340
1341         for (;;) {
1342                 /*
1343                  * We do the initial early heuristics without holding
1344                  * any task-queue locks at all. We'll only try to get
1345                  * the runqueue lock when things look like they will
1346                  * work out!
1347                  */
1348                 rq = task_rq(p);
1349
1350                 /*
1351                  * If the task is actively running on another CPU
1352                  * still, just relax and busy-wait without holding
1353                  * any locks.
1354                  *
1355                  * NOTE! Since we don't hold any locks, it's not
1356                  * even sure that "rq" stays as the right runqueue!
1357                  * But we don't care, since "task_running()" will
1358                  * return false if the runqueue has changed and p
1359                  * is actually now running somewhere else!
1360                  */
1361                 while (task_running(rq, p)) {
1362                         if (match_state && unlikely(p->state != match_state))
1363                                 return 0;
1364                         cpu_relax();
1365                 }
1366
1367                 /*
1368                  * Ok, time to look more closely! We need the rq
1369                  * lock now, to be *sure*. If we're wrong, we'll
1370                  * just go back and repeat.
1371                  */
1372                 rq = task_rq_lock(p, &rf);
1373                 trace_sched_wait_task(p);
1374                 running = task_running(rq, p);
1375                 queued = task_on_rq_queued(p);
1376                 ncsw = 0;
1377                 if (!match_state || p->state == match_state)
1378                         ncsw = p->nvcsw | LONG_MIN; /* sets MSB */
1379                 task_rq_unlock(rq, p, &rf);
1380
1381                 /*
1382                  * If it changed from the expected state, bail out now.
1383                  */
1384                 if (unlikely(!ncsw))
1385                         break;
1386
1387                 /*
1388                  * Was it really running after all now that we
1389                  * checked with the proper locks actually held?
1390                  *
1391                  * Oops. Go back and try again..
1392                  */
1393                 if (unlikely(running)) {
1394                         cpu_relax();
1395                         continue;
1396                 }
1397
1398                 /*
1399                  * It's not enough that it's not actively running,
1400                  * it must be off the runqueue _entirely_, and not
1401                  * preempted!
1402                  *
1403                  * So if it was still runnable (but just not actively
1404                  * running right now), it's preempted, and we should
1405                  * yield - it could be a while.
1406                  */
1407                 if (unlikely(queued)) {
1408                         ktime_t to = NSEC_PER_SEC / HZ;
1409
1410                         set_current_state(TASK_UNINTERRUPTIBLE);
1411                         schedule_hrtimeout(&to, HRTIMER_MODE_REL);
1412                         continue;
1413                 }
1414
1415                 /*
1416                  * Ahh, all good. It wasn't running, and it wasn't
1417                  * runnable, which means that it will never become
1418                  * running in the future either. We're all done!
1419                  */
1420                 break;
1421         }
1422
1423         return ncsw;
1424 }
1425
1426 /***
1427  * kick_process - kick a running thread to enter/exit the kernel
1428  * @p: the to-be-kicked thread
1429  *
1430  * Cause a process which is running on another CPU to enter
1431  * kernel-mode, without any delay. (to get signals handled.)
1432  *
1433  * NOTE: this function doesn't have to take the runqueue lock,
1434  * because all it wants to ensure is that the remote task enters
1435  * the kernel. If the IPI races and the task has been migrated
1436  * to another CPU then no harm is done and the purpose has been
1437  * achieved as well.
1438  */
1439 void kick_process(struct task_struct *p)
1440 {
1441         int cpu;
1442
1443         preempt_disable();
1444         cpu = task_cpu(p);
1445         if ((cpu != smp_processor_id()) && task_curr(p))
1446                 smp_send_reschedule(cpu);
1447         preempt_enable();
1448 }
1449 EXPORT_SYMBOL_GPL(kick_process);
1450
1451 /*
1452  * ->cpus_allowed is protected by both rq->lock and p->pi_lock
1453  *
1454  * A few notes on cpu_active vs cpu_online:
1455  *
1456  *  - cpu_active must be a subset of cpu_online
1457  *
1458  *  - on cpu-up we allow per-cpu kthreads on the online && !active cpu,
1459  *    see __set_cpus_allowed_ptr(). At this point the newly online
1460  *    CPU isn't yet part of the sched domains, and balancing will not
1461  *    see it.
1462  *
1463  *  - on CPU-down we clear cpu_active() to mask the sched domains and
1464  *    avoid the load balancer to place new tasks on the to be removed
1465  *    CPU. Existing tasks will remain running there and will be taken
1466  *    off.
1467  *
1468  * This means that fallback selection must not select !active CPUs.
1469  * And can assume that any active CPU must be online. Conversely
1470  * select_task_rq() below may allow selection of !active CPUs in order
1471  * to satisfy the above rules.
1472  */
1473 static int select_fallback_rq(int cpu, struct task_struct *p)
1474 {
1475         int nid = cpu_to_node(cpu);
1476         const struct cpumask *nodemask = NULL;
1477         enum { cpuset, possible, fail } state = cpuset;
1478         int dest_cpu;
1479
1480         /*
1481          * If the node that the CPU is on has been offlined, cpu_to_node()
1482          * will return -1. There is no CPU on the node, and we should
1483          * select the CPU on the other node.
1484          */
1485         if (nid != -1) {
1486                 nodemask = cpumask_of_node(nid);
1487
1488                 /* Look for allowed, online CPU in same node. */
1489                 for_each_cpu(dest_cpu, nodemask) {
1490                         if (!cpu_active(dest_cpu))
1491                                 continue;
1492                         if (cpumask_test_cpu(dest_cpu, &p->cpus_allowed))
1493                                 return dest_cpu;
1494                 }
1495         }
1496
1497         for (;;) {
1498                 /* Any allowed, online CPU? */
1499                 for_each_cpu(dest_cpu, &p->cpus_allowed) {
1500                         if (!(p->flags & PF_KTHREAD) && !cpu_active(dest_cpu))
1501                                 continue;
1502                         if (!cpu_online(dest_cpu))
1503                                 continue;
1504                         goto out;
1505                 }
1506
1507                 /* No more Mr. Nice Guy. */
1508                 switch (state) {
1509                 case cpuset:
1510                         if (IS_ENABLED(CONFIG_CPUSETS)) {
1511                                 cpuset_cpus_allowed_fallback(p);
1512                                 state = possible;
1513                                 break;
1514                         }
1515                         /* Fall-through */
1516                 case possible:
1517                         do_set_cpus_allowed(p, cpu_possible_mask);
1518                         state = fail;
1519                         break;
1520
1521                 case fail:
1522                         BUG();
1523                         break;
1524                 }
1525         }
1526
1527 out:
1528         if (state != cpuset) {
1529                 /*
1530                  * Don't tell them about moving exiting tasks or
1531                  * kernel threads (both mm NULL), since they never
1532                  * leave kernel.
1533                  */
1534                 if (p->mm && printk_ratelimit()) {
1535                         printk_deferred("process %d (%s) no longer affine to cpu%d\n",
1536                                         task_pid_nr(p), p->comm, cpu);
1537                 }
1538         }
1539
1540         return dest_cpu;
1541 }
1542
1543 /*
1544  * The caller (fork, wakeup) owns p->pi_lock, ->cpus_allowed is stable.
1545  */
1546 static inline
1547 int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags,
1548                    int sibling_count_hint)
1549 {
1550         lockdep_assert_held(&p->pi_lock);
1551
1552         if (p->nr_cpus_allowed > 1)
1553                 cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags,
1554                                                      sibling_count_hint);
1555         else
1556                 cpu = cpumask_any(&p->cpus_allowed);
1557
1558         /*
1559          * In order not to call set_task_cpu() on a blocking task we need
1560          * to rely on ttwu() to place the task on a valid ->cpus_allowed
1561          * CPU.
1562          *
1563          * Since this is common to all placement strategies, this lives here.
1564          *
1565          * [ this allows ->select_task() to simply return task_cpu(p) and
1566          *   not worry about this generic constraint ]
1567          */
1568         if (unlikely(!cpumask_test_cpu(cpu, &p->cpus_allowed) ||
1569                      !cpu_online(cpu)))
1570                 cpu = select_fallback_rq(task_cpu(p), p);
1571
1572         return cpu;
1573 }
1574
1575 static void update_avg(u64 *avg, u64 sample)
1576 {
1577         s64 diff = sample - *avg;
1578         *avg += diff >> 3;
1579 }
1580
1581 void sched_set_stop_task(int cpu, struct task_struct *stop)
1582 {
1583         struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 };
1584         struct task_struct *old_stop = cpu_rq(cpu)->stop;
1585
1586         if (stop) {
1587                 /*
1588                  * Make it appear like a SCHED_FIFO task, its something
1589                  * userspace knows about and won't get confused about.
1590                  *
1591                  * Also, it will make PI more or less work without too
1592                  * much confusion -- but then, stop work should not
1593                  * rely on PI working anyway.
1594                  */
1595                 sched_setscheduler_nocheck(stop, SCHED_FIFO, &param);
1596
1597                 stop->sched_class = &stop_sched_class;
1598         }
1599
1600         cpu_rq(cpu)->stop = stop;
1601
1602         if (old_stop) {
1603                 /*
1604                  * Reset it back to a normal scheduling class so that
1605                  * it can die in pieces.
1606                  */
1607                 old_stop->sched_class = &rt_sched_class;
1608         }
1609 }
1610
1611 #else
1612
1613 static inline int __set_cpus_allowed_ptr(struct task_struct *p,
1614                                          const struct cpumask *new_mask, bool check)
1615 {
1616         return set_cpus_allowed_ptr(p, new_mask);
1617 }
1618
1619 #endif /* CONFIG_SMP */
1620
1621 static void
1622 ttwu_stat(struct task_struct *p, int cpu, int wake_flags)
1623 {
1624         struct rq *rq;
1625
1626         if (!schedstat_enabled())
1627                 return;
1628
1629         rq = this_rq();
1630
1631 #ifdef CONFIG_SMP
1632         if (cpu == rq->cpu) {
1633                 schedstat_inc(rq->ttwu_local);
1634                 schedstat_inc(p->se.statistics.nr_wakeups_local);
1635         } else {
1636                 struct sched_domain *sd;
1637
1638                 schedstat_inc(p->se.statistics.nr_wakeups_remote);
1639                 rcu_read_lock();
1640                 for_each_domain(rq->cpu, sd) {
1641                         if (cpumask_test_cpu(cpu, sched_domain_span(sd))) {
1642                                 schedstat_inc(sd->ttwu_wake_remote);
1643                                 break;
1644                         }
1645                 }
1646                 rcu_read_unlock();
1647         }
1648
1649         if (wake_flags & WF_MIGRATED)
1650                 schedstat_inc(p->se.statistics.nr_wakeups_migrate);
1651 #endif /* CONFIG_SMP */
1652
1653         schedstat_inc(rq->ttwu_count);
1654         schedstat_inc(p->se.statistics.nr_wakeups);
1655
1656         if (wake_flags & WF_SYNC)
1657                 schedstat_inc(p->se.statistics.nr_wakeups_sync);
1658 }
1659
1660 static inline void ttwu_activate(struct rq *rq, struct task_struct *p, int en_flags)
1661 {
1662         activate_task(rq, p, en_flags);
1663         p->on_rq = TASK_ON_RQ_QUEUED;
1664
1665         /* If a worker is waking up, notify the workqueue: */
1666         if (p->flags & PF_WQ_WORKER)
1667                 wq_worker_waking_up(p, cpu_of(rq));
1668 }
1669
1670 /*
1671  * Mark the task runnable and perform wakeup-preemption.
1672  */
1673 static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags,
1674                            struct rq_flags *rf)
1675 {
1676         check_preempt_curr(rq, p, wake_flags);
1677         p->state = TASK_RUNNING;
1678         trace_sched_wakeup(p);
1679
1680 #ifdef CONFIG_SMP
1681         if (p->sched_class->task_woken) {
1682                 /*
1683                  * Our task @p is fully woken up and running; so its safe to
1684                  * drop the rq->lock, hereafter rq is only used for statistics.
1685                  */
1686                 rq_unpin_lock(rq, rf);
1687                 p->sched_class->task_woken(rq, p);
1688                 rq_repin_lock(rq, rf);
1689         }
1690
1691         if (rq->idle_stamp) {
1692                 u64 delta = rq_clock(rq) - rq->idle_stamp;
1693                 u64 max = 2*rq->max_idle_balance_cost;
1694
1695                 update_avg(&rq->avg_idle, delta);
1696
1697                 if (rq->avg_idle > max)
1698                         rq->avg_idle = max;
1699
1700                 rq->idle_stamp = 0;
1701         }
1702 #endif
1703 }
1704
1705 static void
1706 ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags,
1707                  struct rq_flags *rf)
1708 {
1709         int en_flags = ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK;
1710
1711         lockdep_assert_held(&rq->lock);
1712
1713 #ifdef CONFIG_SMP
1714         if (p->sched_contributes_to_load)
1715                 rq->nr_uninterruptible--;
1716
1717         if (wake_flags & WF_MIGRATED)
1718                 en_flags |= ENQUEUE_MIGRATED;
1719 #endif
1720
1721         ttwu_activate(rq, p, en_flags);
1722         ttwu_do_wakeup(rq, p, wake_flags, rf);
1723 }
1724
1725 /*
1726  * Called in case the task @p isn't fully descheduled from its runqueue,
1727  * in this case we must do a remote wakeup. Its a 'light' wakeup though,
1728  * since all we need to do is flip p->state to TASK_RUNNING, since
1729  * the task is still ->on_rq.
1730  */
1731 static int ttwu_remote(struct task_struct *p, int wake_flags)
1732 {
1733         struct rq_flags rf;
1734         struct rq *rq;
1735         int ret = 0;
1736
1737         rq = __task_rq_lock(p, &rf);
1738         if (task_on_rq_queued(p)) {
1739                 /* check_preempt_curr() may use rq clock */
1740                 update_rq_clock(rq);
1741                 ttwu_do_wakeup(rq, p, wake_flags, &rf);
1742                 ret = 1;
1743         }
1744         __task_rq_unlock(rq, &rf);
1745
1746         return ret;
1747 }
1748
1749 #ifdef CONFIG_SMP
1750 void sched_ttwu_pending(void)
1751 {
1752         struct rq *rq = this_rq();
1753         struct llist_node *llist = llist_del_all(&rq->wake_list);
1754         struct task_struct *p, *t;
1755         struct rq_flags rf;
1756
1757         if (!llist)
1758                 return;
1759
1760         rq_lock_irqsave(rq, &rf);
1761         update_rq_clock(rq);
1762
1763         llist_for_each_entry_safe(p, t, llist, wake_entry)
1764                 ttwu_do_activate(rq, p, p->sched_remote_wakeup ? WF_MIGRATED : 0, &rf);
1765
1766         rq_unlock_irqrestore(rq, &rf);
1767 }
1768
1769 void scheduler_ipi(void)
1770 {
1771         /*
1772          * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting
1773          * TIF_NEED_RESCHED remotely (for the first time) will also send
1774          * this IPI.
1775          */
1776         preempt_fold_need_resched();
1777
1778         if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick())
1779                 return;
1780
1781         /*
1782          * Not all reschedule IPI handlers call irq_enter/irq_exit, since
1783          * traditionally all their work was done from the interrupt return
1784          * path. Now that we actually do some work, we need to make sure
1785          * we do call them.
1786          *
1787          * Some archs already do call them, luckily irq_enter/exit nest
1788          * properly.
1789          *
1790          * Arguably we should visit all archs and update all handlers,
1791          * however a fair share of IPIs are still resched only so this would
1792          * somewhat pessimize the simple resched case.
1793          */
1794         irq_enter();
1795         sched_ttwu_pending();
1796
1797         /*
1798          * Check if someone kicked us for doing the nohz idle load balance.
1799          */
1800         if (unlikely(got_nohz_idle_kick())) {
1801                 this_rq()->idle_balance = 1;
1802                 raise_softirq_irqoff(SCHED_SOFTIRQ);
1803         }
1804         irq_exit();
1805 }
1806
1807 static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags)
1808 {
1809         struct rq *rq = cpu_rq(cpu);
1810
1811         p->sched_remote_wakeup = !!(wake_flags & WF_MIGRATED);
1812
1813         if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) {
1814                 if (!set_nr_if_polling(rq->idle))
1815                         smp_send_reschedule(cpu);
1816                 else
1817                         trace_sched_wake_idle_without_ipi(cpu);
1818         }
1819 }
1820
1821 void wake_up_if_idle(int cpu)
1822 {
1823         struct rq *rq = cpu_rq(cpu);
1824         struct rq_flags rf;
1825
1826         rcu_read_lock();
1827
1828         if (!is_idle_task(rcu_dereference(rq->curr)))
1829                 goto out;
1830
1831         if (set_nr_if_polling(rq->idle)) {
1832                 trace_sched_wake_idle_without_ipi(cpu);
1833         } else {
1834                 rq_lock_irqsave(rq, &rf);
1835                 if (is_idle_task(rq->curr))
1836                         smp_send_reschedule(cpu);
1837                 /* Else CPU is not idle, do nothing here: */
1838                 rq_unlock_irqrestore(rq, &rf);
1839         }
1840
1841 out:
1842         rcu_read_unlock();
1843 }
1844
1845 bool cpus_share_cache(int this_cpu, int that_cpu)
1846 {
1847         return per_cpu(sd_llc_id, this_cpu) == per_cpu(sd_llc_id, that_cpu);
1848 }
1849 #endif /* CONFIG_SMP */
1850
1851 static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
1852 {
1853         struct rq *rq = cpu_rq(cpu);
1854         struct rq_flags rf;
1855
1856 #if defined(CONFIG_SMP)
1857         if (sched_feat(TTWU_QUEUE) && !cpus_share_cache(smp_processor_id(), cpu)) {
1858                 sched_clock_cpu(cpu); /* Sync clocks across CPUs */
1859                 ttwu_queue_remote(p, cpu, wake_flags);
1860                 return;
1861         }
1862 #endif
1863
1864         rq_lock(rq, &rf);
1865         update_rq_clock(rq);
1866         ttwu_do_activate(rq, p, wake_flags, &rf);
1867         rq_unlock(rq, &rf);
1868 }
1869
1870 /*
1871  * Notes on Program-Order guarantees on SMP systems.
1872  *
1873  *  MIGRATION
1874  *
1875  * The basic program-order guarantee on SMP systems is that when a task [t]
1876  * migrates, all its activity on its old CPU [c0] happens-before any subsequent
1877  * execution on its new CPU [c1].
1878  *
1879  * For migration (of runnable tasks) this is provided by the following means:
1880  *
1881  *  A) UNLOCK of the rq(c0)->lock scheduling out task t
1882  *  B) migration for t is required to synchronize *both* rq(c0)->lock and
1883  *     rq(c1)->lock (if not at the same time, then in that order).
1884  *  C) LOCK of the rq(c1)->lock scheduling in task
1885  *
1886  * Transitivity guarantees that B happens after A and C after B.
1887  * Note: we only require RCpc transitivity.
1888  * Note: the CPU doing B need not be c0 or c1
1889  *
1890  * Example:
1891  *
1892  *   CPU0            CPU1            CPU2
1893  *
1894  *   LOCK rq(0)->lock
1895  *   sched-out X
1896  *   sched-in Y
1897  *   UNLOCK rq(0)->lock
1898  *
1899  *                                   LOCK rq(0)->lock // orders against CPU0
1900  *                                   dequeue X
1901  *                                   UNLOCK rq(0)->lock
1902  *
1903  *                                   LOCK rq(1)->lock
1904  *                                   enqueue X
1905  *                                   UNLOCK rq(1)->lock
1906  *
1907  *                   LOCK rq(1)->lock // orders against CPU2
1908  *                   sched-out Z
1909  *                   sched-in X
1910  *                   UNLOCK rq(1)->lock
1911  *
1912  *
1913  *  BLOCKING -- aka. SLEEP + WAKEUP
1914  *
1915  * For blocking we (obviously) need to provide the same guarantee as for
1916  * migration. However the means are completely different as there is no lock
1917  * chain to provide order. Instead we do:
1918  *
1919  *   1) smp_store_release(X->on_cpu, 0)
1920  *   2) smp_cond_load_acquire(!X->on_cpu)
1921  *
1922  * Example:
1923  *
1924  *   CPU0 (schedule)  CPU1 (try_to_wake_up) CPU2 (schedule)
1925  *
1926  *   LOCK rq(0)->lock LOCK X->pi_lock
1927  *   dequeue X
1928  *   sched-out X
1929  *   smp_store_release(X->on_cpu, 0);
1930  *
1931  *                    smp_cond_load_acquire(&X->on_cpu, !VAL);
1932  *                    X->state = WAKING
1933  *                    set_task_cpu(X,2)
1934  *
1935  *                    LOCK rq(2)->lock
1936  *                    enqueue X
1937  *                    X->state = RUNNING
1938  *                    UNLOCK rq(2)->lock
1939  *
1940  *                                          LOCK rq(2)->lock // orders against CPU1
1941  *                                          sched-out Z
1942  *                                          sched-in X
1943  *                                          UNLOCK rq(2)->lock
1944  *
1945  *                    UNLOCK X->pi_lock
1946  *   UNLOCK rq(0)->lock
1947  *
1948  *
1949  * However; for wakeups there is a second guarantee we must provide, namely we
1950  * must observe the state that lead to our wakeup. That is, not only must our
1951  * task observe its own prior state, it must also observe the stores prior to
1952  * its wakeup.
1953  *
1954  * This means that any means of doing remote wakeups must order the CPU doing
1955  * the wakeup against the CPU the task is going to end up running on. This,
1956  * however, is already required for the regular Program-Order guarantee above,
1957  * since the waking CPU is the one issueing the ACQUIRE (smp_cond_load_acquire).
1958  *
1959  */
1960
1961 #ifdef CONFIG_SMP
1962 #ifdef CONFIG_SCHED_WALT
1963 /* utility function to update walt signals at wakeup */
1964 static inline void walt_try_to_wake_up(struct task_struct *p)
1965 {
1966         struct rq *rq = cpu_rq(task_cpu(p));
1967         struct rq_flags rf;
1968         u64 wallclock;
1969
1970         rq_lock_irqsave(rq, &rf);
1971         wallclock = walt_ktime_clock();
1972         walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
1973         walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
1974         rq_unlock_irqrestore(rq, &rf);
1975 }
1976 #else
1977 #define walt_try_to_wake_up(a) {}
1978 #endif
1979 #endif
1980
1981 /**
1982  * try_to_wake_up - wake up a thread
1983  * @p: the thread to be awakened
1984  * @state: the mask of task states that can be woken
1985  * @wake_flags: wake modifier flags (WF_*)
1986  * @sibling_count_hint: A hint at the number of threads that are being woken up
1987  *                      in this event.
1988  *
1989  * If (@state & @p->state) @p->state = TASK_RUNNING.
1990  *
1991  * If the task was not queued/runnable, also place it back on a runqueue.
1992  *
1993  * Atomic against schedule() which would dequeue a task, also see
1994  * set_current_state().
1995  *
1996  * Return: %true if @p->state changes (an actual wakeup was done),
1997  *         %false otherwise.
1998  */
1999 static int
2000 try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
2001                int sibling_count_hint)
2002 {
2003         unsigned long flags;
2004         int cpu, success = 0;
2005
2006         /*
2007          * If we are going to wake up a thread waiting for CONDITION we
2008          * need to ensure that CONDITION=1 done by the caller can not be
2009          * reordered with p->state check below. This pairs with mb() in
2010          * set_current_state() the waiting thread does.
2011          */
2012         raw_spin_lock_irqsave(&p->pi_lock, flags);
2013         smp_mb__after_spinlock();
2014         if (!(p->state & state))
2015                 goto out;
2016
2017         trace_sched_waking(p);
2018
2019         /* We're going to change ->state: */
2020         success = 1;
2021         cpu = task_cpu(p);
2022
2023         /*
2024          * Ensure we load p->on_rq _after_ p->state, otherwise it would
2025          * be possible to, falsely, observe p->on_rq == 0 and get stuck
2026          * in smp_cond_load_acquire() below.
2027          *
2028          * sched_ttwu_pending()                 try_to_wake_up()
2029          *   [S] p->on_rq = 1;                  [L] P->state
2030          *       UNLOCK rq->lock  -----.
2031          *                              \
2032          *                               +---   RMB
2033          * schedule()                   /
2034          *       LOCK rq->lock    -----'
2035          *       UNLOCK rq->lock
2036          *
2037          * [task p]
2038          *   [S] p->state = UNINTERRUPTIBLE     [L] p->on_rq
2039          *
2040          * Pairs with the UNLOCK+LOCK on rq->lock from the
2041          * last wakeup of our task and the schedule that got our task
2042          * current.
2043          */
2044         smp_rmb();
2045         if (p->on_rq && ttwu_remote(p, wake_flags))
2046                 goto stat;
2047
2048 #ifdef CONFIG_SMP
2049         /*
2050          * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be
2051          * possible to, falsely, observe p->on_cpu == 0.
2052          *
2053          * One must be running (->on_cpu == 1) in order to remove oneself
2054          * from the runqueue.
2055          *
2056          *  [S] ->on_cpu = 1;   [L] ->on_rq
2057          *      UNLOCK rq->lock
2058          *                      RMB
2059          *      LOCK   rq->lock
2060          *  [S] ->on_rq = 0;    [L] ->on_cpu
2061          *
2062          * Pairs with the full barrier implied in the UNLOCK+LOCK on rq->lock
2063          * from the consecutive calls to schedule(); the first switching to our
2064          * task, the second putting it to sleep.
2065          */
2066         smp_rmb();
2067
2068         /*
2069          * If the owning (remote) CPU is still in the middle of schedule() with
2070          * this task as prev, wait until its done referencing the task.
2071          *
2072          * Pairs with the smp_store_release() in finish_lock_switch().
2073          *
2074          * This ensures that tasks getting woken will be fully ordered against
2075          * their previous state and preserve Program Order.
2076          */
2077         smp_cond_load_acquire(&p->on_cpu, !VAL);
2078
2079         walt_try_to_wake_up(p);
2080
2081         p->sched_contributes_to_load = !!task_contributes_to_load(p);
2082         p->state = TASK_WAKING;
2083
2084         if (p->in_iowait) {
2085                 delayacct_blkio_end();
2086                 atomic_dec(&task_rq(p)->nr_iowait);
2087         }
2088
2089         cpu = select_task_rq(p, p->wake_cpu, SD_BALANCE_WAKE, wake_flags,
2090                              sibling_count_hint);
2091         if (task_cpu(p) != cpu) {
2092                 wake_flags |= WF_MIGRATED;
2093                 set_task_cpu(p, cpu);
2094         }
2095
2096 #else /* CONFIG_SMP */
2097
2098         if (p->in_iowait) {
2099                 delayacct_blkio_end();
2100                 atomic_dec(&task_rq(p)->nr_iowait);
2101         }
2102
2103 #endif /* CONFIG_SMP */
2104
2105         ttwu_queue(p, cpu, wake_flags);
2106 stat:
2107         ttwu_stat(p, cpu, wake_flags);
2108 out:
2109         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2110
2111         return success;
2112 }
2113
2114 /**
2115  * try_to_wake_up_local - try to wake up a local task with rq lock held
2116  * @p: the thread to be awakened
2117  * @rf: request-queue flags for pinning
2118  *
2119  * Put @p on the run-queue if it's not already there. The caller must
2120  * ensure that this_rq() is locked, @p is bound to this_rq() and not
2121  * the current task.
2122  */
2123 static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
2124 {
2125         struct rq *rq = task_rq(p);
2126
2127         if (WARN_ON_ONCE(rq != this_rq()) ||
2128             WARN_ON_ONCE(p == current))
2129                 return;
2130
2131         lockdep_assert_held(&rq->lock);
2132
2133         if (!raw_spin_trylock(&p->pi_lock)) {
2134                 /*
2135                  * This is OK, because current is on_cpu, which avoids it being
2136                  * picked for load-balance and preemption/IRQs are still
2137                  * disabled avoiding further scheduler activity on it and we've
2138                  * not yet picked a replacement task.
2139                  */
2140                 rq_unlock(rq, rf);
2141                 raw_spin_lock(&p->pi_lock);
2142                 rq_relock(rq, rf);
2143         }
2144
2145         if (!(p->state & TASK_NORMAL))
2146                 goto out;
2147
2148         trace_sched_waking(p);
2149
2150         if (!task_on_rq_queued(p)) {
2151                 u64 wallclock = walt_ktime_clock();
2152
2153                 walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
2154                 walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
2155
2156                 if (p->in_iowait) {
2157                         delayacct_blkio_end();
2158                         atomic_dec(&rq->nr_iowait);
2159                 }
2160                 ttwu_activate(rq, p, ENQUEUE_WAKEUP | ENQUEUE_NOCLOCK);
2161         }
2162
2163         ttwu_do_wakeup(rq, p, 0, rf);
2164         ttwu_stat(p, smp_processor_id(), 0);
2165 out:
2166         raw_spin_unlock(&p->pi_lock);
2167 }
2168
2169 /**
2170  * wake_up_process - Wake up a specific process
2171  * @p: The process to be woken up.
2172  *
2173  * Attempt to wake up the nominated process and move it to the set of runnable
2174  * processes.
2175  *
2176  * Return: 1 if the process was woken up, 0 if it was already running.
2177  *
2178  * It may be assumed that this function implies a write memory barrier before
2179  * changing the task state if and only if any tasks are woken up.
2180  */
2181 int wake_up_process(struct task_struct *p)
2182 {
2183         return try_to_wake_up(p, TASK_NORMAL, 0, 1);
2184 }
2185 EXPORT_SYMBOL(wake_up_process);
2186
2187 int wake_up_state(struct task_struct *p, unsigned int state)
2188 {
2189         return try_to_wake_up(p, state, 0, 1);
2190 }
2191
2192 /*
2193  * Perform scheduler related setup for a newly forked process p.
2194  * p is forked by current.
2195  *
2196  * __sched_fork() is basic setup used by init_idle() too:
2197  */
2198 static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
2199 {
2200         p->on_rq                        = 0;
2201
2202         p->se.on_rq                     = 0;
2203         p->se.exec_start                = 0;
2204         p->se.sum_exec_runtime          = 0;
2205         p->se.prev_sum_exec_runtime     = 0;
2206         p->se.nr_migrations             = 0;
2207         p->se.vruntime                  = 0;
2208 #ifdef CONFIG_SCHED_WALT
2209         p->last_sleep_ts                = 0;
2210 #endif
2211
2212         INIT_LIST_HEAD(&p->se.group_node);
2213         walt_init_new_task_load(p);
2214
2215 #ifdef CONFIG_FAIR_GROUP_SCHED
2216         p->se.cfs_rq                    = NULL;
2217 #endif
2218
2219 #ifdef CONFIG_SCHEDSTATS
2220         /* Even if schedstat is disabled, there should not be garbage */
2221         memset(&p->se.statistics, 0, sizeof(p->se.statistics));
2222 #endif
2223
2224         RB_CLEAR_NODE(&p->dl.rb_node);
2225         init_dl_task_timer(&p->dl);
2226         init_dl_inactive_task_timer(&p->dl);
2227         __dl_clear_params(p);
2228
2229         INIT_LIST_HEAD(&p->rt.run_list);
2230         p->rt.timeout           = 0;
2231         p->rt.time_slice        = sched_rr_timeslice;
2232         p->rt.on_rq             = 0;
2233         p->rt.on_list           = 0;
2234
2235 #ifdef CONFIG_PREEMPT_NOTIFIERS
2236         INIT_HLIST_HEAD(&p->preempt_notifiers);
2237 #endif
2238
2239 #ifdef CONFIG_NUMA_BALANCING
2240         if (p->mm && atomic_read(&p->mm->mm_users) == 1) {
2241                 p->mm->numa_next_scan = jiffies + msecs_to_jiffies(sysctl_numa_balancing_scan_delay);
2242                 p->mm->numa_scan_seq = 0;
2243         }
2244
2245         if (clone_flags & CLONE_VM)
2246                 p->numa_preferred_nid = current->numa_preferred_nid;
2247         else
2248                 p->numa_preferred_nid = -1;
2249
2250         p->node_stamp = 0ULL;
2251         p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
2252         p->numa_scan_period = sysctl_numa_balancing_scan_delay;
2253         p->numa_work.next = &p->numa_work;
2254         p->numa_faults = NULL;
2255         p->last_task_numa_placement = 0;
2256         p->last_sum_exec_runtime = 0;
2257
2258         p->numa_group = NULL;
2259 #endif /* CONFIG_NUMA_BALANCING */
2260 }
2261
2262 DEFINE_STATIC_KEY_FALSE(sched_numa_balancing);
2263
2264 #ifdef CONFIG_NUMA_BALANCING
2265
2266 void set_numabalancing_state(bool enabled)
2267 {
2268         if (enabled)
2269                 static_branch_enable(&sched_numa_balancing);
2270         else
2271                 static_branch_disable(&sched_numa_balancing);
2272 }
2273
2274 #ifdef CONFIG_PROC_SYSCTL
2275 int sysctl_numa_balancing(struct ctl_table *table, int write,
2276                          void __user *buffer, size_t *lenp, loff_t *ppos)
2277 {
2278         struct ctl_table t;
2279         int err;
2280         int state = static_branch_likely(&sched_numa_balancing);
2281
2282         if (write && !capable(CAP_SYS_ADMIN))
2283                 return -EPERM;
2284
2285         t = *table;
2286         t.data = &state;
2287         err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2288         if (err < 0)
2289                 return err;
2290         if (write)
2291                 set_numabalancing_state(state);
2292         return err;
2293 }
2294 #endif
2295 #endif
2296
2297 #ifdef CONFIG_SCHEDSTATS
2298
2299 DEFINE_STATIC_KEY_FALSE(sched_schedstats);
2300 static bool __initdata __sched_schedstats = false;
2301
2302 static void set_schedstats(bool enabled)
2303 {
2304         if (enabled)
2305                 static_branch_enable(&sched_schedstats);
2306         else
2307                 static_branch_disable(&sched_schedstats);
2308 }
2309
2310 void force_schedstat_enabled(void)
2311 {
2312         if (!schedstat_enabled()) {
2313                 pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n");
2314                 static_branch_enable(&sched_schedstats);
2315         }
2316 }
2317
2318 static int __init setup_schedstats(char *str)
2319 {
2320         int ret = 0;
2321         if (!str)
2322                 goto out;
2323
2324         /*
2325          * This code is called before jump labels have been set up, so we can't
2326          * change the static branch directly just yet.  Instead set a temporary
2327          * variable so init_schedstats() can do it later.
2328          */
2329         if (!strcmp(str, "enable")) {
2330                 __sched_schedstats = true;
2331                 ret = 1;
2332         } else if (!strcmp(str, "disable")) {
2333                 __sched_schedstats = false;
2334                 ret = 1;
2335         }
2336 out:
2337         if (!ret)
2338                 pr_warn("Unable to parse schedstats=\n");
2339
2340         return ret;
2341 }
2342 __setup("schedstats=", setup_schedstats);
2343
2344 static void __init init_schedstats(void)
2345 {
2346         set_schedstats(__sched_schedstats);
2347 }
2348
2349 #ifdef CONFIG_PROC_SYSCTL
2350 int sysctl_schedstats(struct ctl_table *table, int write,
2351                          void __user *buffer, size_t *lenp, loff_t *ppos)
2352 {
2353         struct ctl_table t;
2354         int err;
2355         int state = static_branch_likely(&sched_schedstats);
2356
2357         if (write && !capable(CAP_SYS_ADMIN))
2358                 return -EPERM;
2359
2360         t = *table;
2361         t.data = &state;
2362         err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
2363         if (err < 0)
2364                 return err;
2365         if (write)
2366                 set_schedstats(state);
2367         return err;
2368 }
2369 #endif /* CONFIG_PROC_SYSCTL */
2370 #else  /* !CONFIG_SCHEDSTATS */
2371 static inline void init_schedstats(void) {}
2372 #endif /* CONFIG_SCHEDSTATS */
2373
2374 /*
2375  * fork()/clone()-time setup:
2376  */
2377 int sched_fork(unsigned long clone_flags, struct task_struct *p)
2378 {
2379         unsigned long flags;
2380         int cpu = get_cpu();
2381
2382         __sched_fork(clone_flags, p);
2383         /*
2384          * We mark the process as NEW here. This guarantees that
2385          * nobody will actually run it, and a signal or other external
2386          * event cannot wake it up and insert it on the runqueue either.
2387          */
2388         p->state = TASK_NEW;
2389
2390         /*
2391          * Make sure we do not leak PI boosting priority to the child.
2392          */
2393         p->prio = current->normal_prio;
2394
2395         /*
2396          * Revert to default priority/policy on fork if requested.
2397          */
2398         if (unlikely(p->sched_reset_on_fork)) {
2399                 if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
2400                         p->policy = SCHED_NORMAL;
2401                         p->static_prio = NICE_TO_PRIO(0);
2402                         p->rt_priority = 0;
2403                 } else if (PRIO_TO_NICE(p->static_prio) < 0)
2404                         p->static_prio = NICE_TO_PRIO(0);
2405
2406                 p->prio = p->normal_prio = __normal_prio(p);
2407                 set_load_weight(p);
2408
2409                 /*
2410                  * We don't need the reset flag anymore after the fork. It has
2411                  * fulfilled its duty:
2412                  */
2413                 p->sched_reset_on_fork = 0;
2414         }
2415
2416         if (dl_prio(p->prio)) {
2417                 put_cpu();
2418                 return -EAGAIN;
2419         } else if (rt_prio(p->prio)) {
2420                 p->sched_class = &rt_sched_class;
2421         } else {
2422                 p->sched_class = &fair_sched_class;
2423         }
2424
2425         init_entity_runnable_average(&p->se);
2426
2427         /*
2428          * The child is not yet in the pid-hash so no cgroup attach races,
2429          * and the cgroup is pinned to this child due to cgroup_fork()
2430          * is ran before sched_fork().
2431          *
2432          * Silence PROVE_RCU.
2433          */
2434         raw_spin_lock_irqsave(&p->pi_lock, flags);
2435         /*
2436          * We're setting the CPU for the first time, we don't migrate,
2437          * so use __set_task_cpu().
2438          */
2439         __set_task_cpu(p, cpu);
2440         if (p->sched_class->task_fork)
2441                 p->sched_class->task_fork(p);
2442         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2443
2444 #ifdef CONFIG_SCHED_INFO
2445         if (likely(sched_info_on()))
2446                 memset(&p->sched_info, 0, sizeof(p->sched_info));
2447 #endif
2448 #if defined(CONFIG_SMP)
2449         p->on_cpu = 0;
2450 #endif
2451         init_task_preempt_count(p);
2452 #ifdef CONFIG_SMP
2453         plist_node_init(&p->pushable_tasks, MAX_PRIO);
2454         RB_CLEAR_NODE(&p->pushable_dl_tasks);
2455 #endif
2456
2457         put_cpu();
2458         return 0;
2459 }
2460
2461 unsigned long to_ratio(u64 period, u64 runtime)
2462 {
2463         if (runtime == RUNTIME_INF)
2464                 return BW_UNIT;
2465
2466         /*
2467          * Doing this here saves a lot of checks in all
2468          * the calling paths, and returning zero seems
2469          * safe for them anyway.
2470          */
2471         if (period == 0)
2472                 return 0;
2473
2474         return div64_u64(runtime << BW_SHIFT, period);
2475 }
2476
2477 /*
2478  * wake_up_new_task - wake up a newly created task for the first time.
2479  *
2480  * This function will do some initial scheduler statistics housekeeping
2481  * that must be done for every newly created context, then puts the task
2482  * on the runqueue and wakes it.
2483  */
2484 void wake_up_new_task(struct task_struct *p)
2485 {
2486         struct rq_flags rf;
2487         struct rq *rq;
2488
2489         raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
2490
2491         walt_init_new_task_load(p);
2492
2493         p->state = TASK_RUNNING;
2494 #ifdef CONFIG_SMP
2495         /*
2496          * Fork balancing, do it here and not earlier because:
2497          *  - cpus_allowed can change in the fork path
2498          *  - any previously selected CPU might disappear through hotplug
2499          *
2500          * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
2501          * as we're not fully set-up yet.
2502          */
2503         __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0, 1));
2504 #endif
2505         rq = __task_rq_lock(p, &rf);
2506         update_rq_clock(rq);
2507         post_init_entity_util_avg(&p->se);
2508
2509         activate_task(rq, p, ENQUEUE_NOCLOCK);
2510         walt_mark_task_starting(p);
2511
2512         p->on_rq = TASK_ON_RQ_QUEUED;
2513         trace_sched_wakeup_new(p);
2514         check_preempt_curr(rq, p, WF_FORK);
2515 #ifdef CONFIG_SMP
2516         if (p->sched_class->task_woken) {
2517                 /*
2518                  * Nothing relies on rq->lock after this, so its fine to
2519                  * drop it.
2520                  */
2521                 rq_unpin_lock(rq, &rf);
2522                 p->sched_class->task_woken(rq, p);
2523                 rq_repin_lock(rq, &rf);
2524         }
2525 #endif
2526         task_rq_unlock(rq, p, &rf);
2527 }
2528
2529 #ifdef CONFIG_PREEMPT_NOTIFIERS
2530
2531 static struct static_key preempt_notifier_key = STATIC_KEY_INIT_FALSE;
2532
2533 void preempt_notifier_inc(void)
2534 {
2535         static_key_slow_inc(&preempt_notifier_key);
2536 }
2537 EXPORT_SYMBOL_GPL(preempt_notifier_inc);
2538
2539 void preempt_notifier_dec(void)
2540 {
2541         static_key_slow_dec(&preempt_notifier_key);
2542 }
2543 EXPORT_SYMBOL_GPL(preempt_notifier_dec);
2544
2545 /**
2546  * preempt_notifier_register - tell me when current is being preempted & rescheduled
2547  * @notifier: notifier struct to register
2548  */
2549 void preempt_notifier_register(struct preempt_notifier *notifier)
2550 {
2551         if (!static_key_false(&preempt_notifier_key))
2552                 WARN(1, "registering preempt_notifier while notifiers disabled\n");
2553
2554         hlist_add_head(&notifier->link, &current->preempt_notifiers);
2555 }
2556 EXPORT_SYMBOL_GPL(preempt_notifier_register);
2557
2558 /**
2559  * preempt_notifier_unregister - no longer interested in preemption notifications
2560  * @notifier: notifier struct to unregister
2561  *
2562  * This is *not* safe to call from within a preemption notifier.
2563  */
2564 void preempt_notifier_unregister(struct preempt_notifier *notifier)
2565 {
2566         hlist_del(&notifier->link);
2567 }
2568 EXPORT_SYMBOL_GPL(preempt_notifier_unregister);
2569
2570 static void __fire_sched_in_preempt_notifiers(struct task_struct *curr)
2571 {
2572         struct preempt_notifier *notifier;
2573
2574         hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2575                 notifier->ops->sched_in(notifier, raw_smp_processor_id());
2576 }
2577
2578 static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2579 {
2580         if (static_key_false(&preempt_notifier_key))
2581                 __fire_sched_in_preempt_notifiers(curr);
2582 }
2583
2584 static void
2585 __fire_sched_out_preempt_notifiers(struct task_struct *curr,
2586                                    struct task_struct *next)
2587 {
2588         struct preempt_notifier *notifier;
2589
2590         hlist_for_each_entry(notifier, &curr->preempt_notifiers, link)
2591                 notifier->ops->sched_out(notifier, next);
2592 }
2593
2594 static __always_inline void
2595 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2596                                  struct task_struct *next)
2597 {
2598         if (static_key_false(&preempt_notifier_key))
2599                 __fire_sched_out_preempt_notifiers(curr, next);
2600 }
2601
2602 #else /* !CONFIG_PREEMPT_NOTIFIERS */
2603
2604 static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr)
2605 {
2606 }
2607
2608 static inline void
2609 fire_sched_out_preempt_notifiers(struct task_struct *curr,
2610                                  struct task_struct *next)
2611 {
2612 }
2613
2614 #endif /* CONFIG_PREEMPT_NOTIFIERS */
2615
2616 /**
2617  * prepare_task_switch - prepare to switch tasks
2618  * @rq: the runqueue preparing to switch
2619  * @prev: the current task that is being switched out
2620  * @next: the task we are going to switch to.
2621  *
2622  * This is called with the rq lock held and interrupts off. It must
2623  * be paired with a subsequent finish_task_switch after the context
2624  * switch.
2625  *
2626  * prepare_task_switch sets up locking and calls architecture specific
2627  * hooks.
2628  */
2629 static inline void
2630 prepare_task_switch(struct rq *rq, struct task_struct *prev,
2631                     struct task_struct *next)
2632 {
2633         sched_info_switch(rq, prev, next);
2634         perf_event_task_sched_out(prev, next);
2635         fire_sched_out_preempt_notifiers(prev, next);
2636         prepare_lock_switch(rq, next);
2637         prepare_arch_switch(next);
2638 }
2639
2640 /**
2641  * finish_task_switch - clean up after a task-switch
2642  * @prev: the thread we just switched away from.
2643  *
2644  * finish_task_switch must be called after the context switch, paired
2645  * with a prepare_task_switch call before the context switch.
2646  * finish_task_switch will reconcile locking set up by prepare_task_switch,
2647  * and do any other architecture-specific cleanup actions.
2648  *
2649  * Note that we may have delayed dropping an mm in context_switch(). If
2650  * so, we finish that here outside of the runqueue lock. (Doing it
2651  * with the lock held can cause deadlocks; see schedule() for
2652  * details.)
2653  *
2654  * The context switch have flipped the stack from under us and restored the
2655  * local variables which were saved when this task called schedule() in the
2656  * past. prev == current is still correct but we need to recalculate this_rq
2657  * because prev may have moved to another CPU.
2658  */
2659 static struct rq *finish_task_switch(struct task_struct *prev)
2660         __releases(rq->lock)
2661 {
2662         struct rq *rq = this_rq();
2663         struct mm_struct *mm = rq->prev_mm;
2664         long prev_state;
2665
2666         /*
2667          * The previous task will have left us with a preempt_count of 2
2668          * because it left us after:
2669          *
2670          *      schedule()
2671          *        preempt_disable();                    // 1
2672          *        __schedule()
2673          *          raw_spin_lock_irq(&rq->lock)        // 2
2674          *
2675          * Also, see FORK_PREEMPT_COUNT.
2676          */
2677         if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET,
2678                       "corrupted preempt_count: %s/%d/0x%x\n",
2679                       current->comm, current->pid, preempt_count()))
2680                 preempt_count_set(FORK_PREEMPT_COUNT);
2681
2682         rq->prev_mm = NULL;
2683
2684         /*
2685          * A task struct has one reference for the use as "current".
2686          * If a task dies, then it sets TASK_DEAD in tsk->state and calls
2687          * schedule one last time. The schedule call will never return, and
2688          * the scheduled task must drop that reference.
2689          *
2690          * We must observe prev->state before clearing prev->on_cpu (in
2691          * finish_lock_switch), otherwise a concurrent wakeup can get prev
2692          * running on another CPU and we could rave with its RUNNING -> DEAD
2693          * transition, resulting in a double drop.
2694          */
2695         prev_state = prev->state;
2696         vtime_task_switch(prev);
2697         perf_event_task_sched_in(prev, current);
2698         /*
2699          * The membarrier system call requires a full memory barrier
2700          * after storing to rq->curr, before going back to user-space.
2701          *
2702          * TODO: This smp_mb__after_unlock_lock can go away if PPC end
2703          * up adding a full barrier to switch_mm(), or we should figure
2704          * out if a smp_mb__after_unlock_lock is really the proper API
2705          * to use.
2706          */
2707         smp_mb__after_unlock_lock();
2708         finish_lock_switch(rq, prev);
2709         finish_arch_post_lock_switch();
2710
2711         fire_sched_in_preempt_notifiers(current);
2712         if (mm)
2713                 mmdrop(mm);
2714         if (unlikely(prev_state == TASK_DEAD)) {
2715                 if (prev->sched_class->task_dead)
2716                         prev->sched_class->task_dead(prev);
2717
2718                 /*
2719                  * Remove function-return probe instances associated with this
2720                  * task and put them back on the free list.
2721                  */
2722                 kprobe_flush_task(prev);
2723
2724                 /* Task is done with its stack. */
2725                 put_task_stack(prev);
2726
2727                 put_task_struct(prev);
2728         }
2729
2730         tick_nohz_task_switch();
2731         return rq;
2732 }
2733
2734 #ifdef CONFIG_SMP
2735
2736 /* rq->lock is NOT held, but preemption is disabled */
2737 static void __balance_callback(struct rq *rq)
2738 {
2739         struct callback_head *head, *next;
2740         void (*func)(struct rq *rq);
2741         unsigned long flags;
2742
2743         raw_spin_lock_irqsave(&rq->lock, flags);
2744         head = rq->balance_callback;
2745         rq->balance_callback = NULL;
2746         while (head) {
2747                 func = (void (*)(struct rq *))head->func;
2748                 next = head->next;
2749                 head->next = NULL;
2750                 head = next;
2751
2752                 func(rq);
2753         }
2754         raw_spin_unlock_irqrestore(&rq->lock, flags);
2755 }
2756
2757 static inline void balance_callback(struct rq *rq)
2758 {
2759         if (unlikely(rq->balance_callback))
2760                 __balance_callback(rq);
2761 }
2762
2763 #else
2764
2765 static inline void balance_callback(struct rq *rq)
2766 {
2767 }
2768
2769 #endif
2770
2771 /**
2772  * schedule_tail - first thing a freshly forked thread must call.
2773  * @prev: the thread we just switched away from.
2774  */
2775 asmlinkage __visible void schedule_tail(struct task_struct *prev)
2776         __releases(rq->lock)
2777 {
2778         struct rq *rq;
2779
2780         /*
2781          * New tasks start with FORK_PREEMPT_COUNT, see there and
2782          * finish_task_switch() for details.
2783          *
2784          * finish_task_switch() will drop rq->lock() and lower preempt_count
2785          * and the preempt_enable() will end up enabling preemption (on
2786          * PREEMPT_COUNT kernels).
2787          */
2788
2789         rq = finish_task_switch(prev);
2790         balance_callback(rq);
2791         preempt_enable();
2792
2793         if (current->set_child_tid)
2794                 put_user(task_pid_vnr(current), current->set_child_tid);
2795 }
2796
2797 /*
2798  * context_switch - switch to the new MM and the new thread's register state.
2799  */
2800 static __always_inline struct rq *
2801 context_switch(struct rq *rq, struct task_struct *prev,
2802                struct task_struct *next, struct rq_flags *rf)
2803 {
2804         struct mm_struct *mm, *oldmm;
2805
2806         prepare_task_switch(rq, prev, next);
2807
2808         mm = next->mm;
2809         oldmm = prev->active_mm;
2810         /*
2811          * For paravirt, this is coupled with an exit in switch_to to
2812          * combine the page table reload and the switch backend into
2813          * one hypercall.
2814          */
2815         arch_start_context_switch(prev);
2816
2817         if (!mm) {
2818                 next->active_mm = oldmm;
2819                 mmgrab(oldmm);
2820                 enter_lazy_tlb(oldmm, next);
2821         } else
2822                 switch_mm_irqs_off(oldmm, mm, next);
2823
2824         if (!prev->mm) {
2825                 prev->active_mm = NULL;
2826                 rq->prev_mm = oldmm;
2827         }
2828
2829         rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
2830
2831         /*
2832          * Since the runqueue lock will be released by the next
2833          * task (which is an invalid locking op but in the case
2834          * of the scheduler it's an obvious special-case), so we
2835          * do an early lockdep release here:
2836          */
2837         rq_unpin_lock(rq, rf);
2838         spin_release(&rq->lock.dep_map, 1, _THIS_IP_);
2839
2840         /* Here we just switch the register state and the stack. */
2841         switch_to(prev, next, prev);
2842         barrier();
2843
2844         return finish_task_switch(prev);
2845 }
2846
2847 /*
2848  * nr_running and nr_context_switches:
2849  *
2850  * externally visible scheduler statistics: current number of runnable
2851  * threads, total number of context switches performed since bootup.
2852  */
2853 unsigned long nr_running(void)
2854 {
2855         unsigned long i, sum = 0;
2856
2857         for_each_online_cpu(i)
2858                 sum += cpu_rq(i)->nr_running;
2859
2860         return sum;
2861 }
2862
2863 /*
2864  * Check if only the current task is running on the CPU.
2865  *
2866  * Caution: this function does not check that the caller has disabled
2867  * preemption, thus the result might have a time-of-check-to-time-of-use
2868  * race.  The caller is responsible to use it correctly, for example:
2869  *
2870  * - from a non-preemptable section (of course)
2871  *
2872  * - from a thread that is bound to a single CPU
2873  *
2874  * - in a loop with very short iterations (e.g. a polling loop)
2875  */
2876 bool single_task_running(void)
2877 {
2878         return raw_rq()->nr_running == 1;
2879 }
2880 EXPORT_SYMBOL(single_task_running);
2881
2882 unsigned long long nr_context_switches(void)
2883 {
2884         int i;
2885         unsigned long long sum = 0;
2886
2887         for_each_possible_cpu(i)
2888                 sum += cpu_rq(i)->nr_switches;
2889
2890         return sum;
2891 }
2892
2893 /*
2894  * IO-wait accounting, and how its mostly bollocks (on SMP).
2895  *
2896  * The idea behind IO-wait account is to account the idle time that we could
2897  * have spend running if it were not for IO. That is, if we were to improve the
2898  * storage performance, we'd have a proportional reduction in IO-wait time.
2899  *
2900  * This all works nicely on UP, where, when a task blocks on IO, we account
2901  * idle time as IO-wait, because if the storage were faster, it could've been
2902  * running and we'd not be idle.
2903  *
2904  * This has been extended to SMP, by doing the same for each CPU. This however
2905  * is broken.
2906  *
2907  * Imagine for instance the case where two tasks block on one CPU, only the one
2908  * CPU will have IO-wait accounted, while the other has regular idle. Even
2909  * though, if the storage were faster, both could've ran at the same time,
2910  * utilising both CPUs.
2911  *
2912  * This means, that when looking globally, the current IO-wait accounting on
2913  * SMP is a lower bound, by reason of under accounting.
2914  *
2915  * Worse, since the numbers are provided per CPU, they are sometimes
2916  * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly
2917  * associated with any one particular CPU, it can wake to another CPU than it
2918  * blocked on. This means the per CPU IO-wait number is meaningless.
2919  *
2920  * Task CPU affinities can make all that even more 'interesting'.
2921  */
2922
2923 unsigned long nr_iowait(void)
2924 {
2925         unsigned long i, sum = 0;
2926
2927         for_each_possible_cpu(i)
2928                 sum += atomic_read(&cpu_rq(i)->nr_iowait);
2929
2930         return sum;
2931 }
2932
2933 /*
2934  * Consumers of these two interfaces, like for example the cpufreq menu
2935  * governor are using nonsensical data. Boosting frequency for a CPU that has
2936  * IO-wait which might not even end up running the task when it does become
2937  * runnable.
2938  */
2939
2940 unsigned long nr_iowait_cpu(int cpu)
2941 {
2942         struct rq *this = cpu_rq(cpu);
2943         return atomic_read(&this->nr_iowait);
2944 }
2945
2946 void get_iowait_load(unsigned long *nr_waiters, unsigned long *load)
2947 {
2948         struct rq *rq = this_rq();
2949         *nr_waiters = atomic_read(&rq->nr_iowait);
2950         *load = rq->load.weight;
2951 }
2952
2953 #ifdef CONFIG_SMP
2954
2955 /*
2956  * sched_exec - execve() is a valuable balancing opportunity, because at
2957  * this point the task has the smallest effective memory and cache footprint.
2958  */
2959 void sched_exec(void)
2960 {
2961         struct task_struct *p = current;
2962         unsigned long flags;
2963         int dest_cpu;
2964
2965         raw_spin_lock_irqsave(&p->pi_lock, flags);
2966         dest_cpu = p->sched_class->select_task_rq(p, task_cpu(p), SD_BALANCE_EXEC, 0, 1);
2967         if (dest_cpu == smp_processor_id())
2968                 goto unlock;
2969
2970         if (likely(cpu_active(dest_cpu))) {
2971                 struct migration_arg arg = { p, dest_cpu };
2972
2973                 raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2974                 stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
2975                 return;
2976         }
2977 unlock:
2978         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
2979 }
2980
2981 #endif
2982
2983 DEFINE_PER_CPU(struct kernel_stat, kstat);
2984 DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat);
2985
2986 EXPORT_PER_CPU_SYMBOL(kstat);
2987 EXPORT_PER_CPU_SYMBOL(kernel_cpustat);
2988
2989 /*
2990  * The function fair_sched_class.update_curr accesses the struct curr
2991  * and its field curr->exec_start; when called from task_sched_runtime(),
2992  * we observe a high rate of cache misses in practice.
2993  * Prefetching this data results in improved performance.
2994  */
2995 static inline void prefetch_curr_exec_start(struct task_struct *p)
2996 {
2997 #ifdef CONFIG_FAIR_GROUP_SCHED
2998         struct sched_entity *curr = (&p->se)->cfs_rq->curr;
2999 #else
3000         struct sched_entity *curr = (&task_rq(p)->cfs)->curr;
3001 #endif
3002         prefetch(curr);
3003         prefetch(&curr->exec_start);
3004 }
3005
3006 /*
3007  * Return accounted runtime for the task.
3008  * In case the task is currently running, return the runtime plus current's
3009  * pending runtime that have not been accounted yet.
3010  */
3011 unsigned long long task_sched_runtime(struct task_struct *p)
3012 {
3013         struct rq_flags rf;
3014         struct rq *rq;
3015         u64 ns;
3016
3017 #if defined(CONFIG_64BIT) && defined(CONFIG_SMP)
3018         /*
3019          * 64-bit doesn't need locks to atomically read a 64bit value.
3020          * So we have a optimization chance when the task's delta_exec is 0.
3021          * Reading ->on_cpu is racy, but this is ok.
3022          *
3023          * If we race with it leaving CPU, we'll take a lock. So we're correct.
3024          * If we race with it entering CPU, unaccounted time is 0. This is
3025          * indistinguishable from the read occurring a few cycles earlier.
3026          * If we see ->on_cpu without ->on_rq, the task is leaving, and has
3027          * been accounted, so we're correct here as well.
3028          */
3029         if (!p->on_cpu || !task_on_rq_queued(p))
3030                 return p->se.sum_exec_runtime;
3031 #endif
3032
3033         rq = task_rq_lock(p, &rf);
3034         /*
3035          * Must be ->curr _and_ ->on_rq.  If dequeued, we would
3036          * project cycles that may never be accounted to this
3037          * thread, breaking clock_gettime().
3038          */
3039         if (task_current(rq, p) && task_on_rq_queued(p)) {
3040                 prefetch_curr_exec_start(p);
3041                 update_rq_clock(rq);
3042                 p->sched_class->update_curr(rq);
3043         }
3044         ns = p->se.sum_exec_runtime;
3045         task_rq_unlock(rq, p, &rf);
3046
3047         return ns;
3048 }
3049
3050 /*
3051  * This function gets called by the timer code, with HZ frequency.
3052  * We call it with interrupts disabled.
3053  */
3054 void scheduler_tick(void)
3055 {
3056         int cpu = smp_processor_id();
3057         struct rq *rq = cpu_rq(cpu);
3058         struct task_struct *curr = rq->curr;
3059         struct rq_flags rf;
3060
3061         sched_clock_tick();
3062
3063         rq_lock(rq, &rf);
3064
3065         walt_set_window_start(rq, &rf);
3066         walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
3067                         walt_ktime_clock(), 0);
3068         update_rq_clock(rq);
3069         curr->sched_class->task_tick(rq, curr, 0);
3070         cpu_load_update_active(rq);
3071         calc_global_load_tick(rq);
3072
3073         rq_unlock(rq, &rf);
3074
3075         perf_event_task_tick();
3076
3077 #ifdef CONFIG_SMP
3078         rq->idle_balance = idle_cpu(cpu);
3079         trigger_load_balance(rq);
3080 #endif
3081         rq_last_tick_reset(rq);
3082 }
3083
3084 #ifdef CONFIG_NO_HZ_FULL
3085 /**
3086  * scheduler_tick_max_deferment
3087  *
3088  * Keep at least one tick per second when a single
3089  * active task is running because the scheduler doesn't
3090  * yet completely support full dynticks environment.
3091  *
3092  * This makes sure that uptime, CFS vruntime, load
3093  * balancing, etc... continue to move forward, even
3094  * with a very low granularity.
3095  *
3096  * Return: Maximum deferment in nanoseconds.
3097  */
3098 u64 scheduler_tick_max_deferment(void)
3099 {
3100         struct rq *rq = this_rq();
3101         unsigned long next, now = READ_ONCE(jiffies);
3102
3103         next = rq->last_sched_tick + HZ;
3104
3105         if (time_before_eq(next, now))
3106                 return 0;
3107
3108         return jiffies_to_nsecs(next - now);
3109 }
3110 #endif
3111
3112 #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
3113                                 defined(CONFIG_PREEMPT_TRACER))
3114 /*
3115  * If the value passed in is equal to the current preempt count
3116  * then we just disabled preemption. Start timing the latency.
3117  */
3118 static inline void preempt_latency_start(int val)
3119 {
3120         if (preempt_count() == val) {
3121                 unsigned long ip = get_lock_parent_ip();
3122 #ifdef CONFIG_DEBUG_PREEMPT
3123                 current->preempt_disable_ip = ip;
3124 #endif
3125                 trace_preempt_off(CALLER_ADDR0, ip);
3126         }
3127 }
3128
3129 void preempt_count_add(int val)
3130 {
3131 #ifdef CONFIG_DEBUG_PREEMPT
3132         /*
3133          * Underflow?
3134          */
3135         if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0)))
3136                 return;
3137 #endif
3138         __preempt_count_add(val);
3139 #ifdef CONFIG_DEBUG_PREEMPT
3140         /*
3141          * Spinlock count overflowing soon?
3142          */
3143         DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >=
3144                                 PREEMPT_MASK - 10);
3145 #endif
3146         preempt_latency_start(val);
3147 }
3148 EXPORT_SYMBOL(preempt_count_add);
3149 NOKPROBE_SYMBOL(preempt_count_add);
3150
3151 /*
3152  * If the value passed in equals to the current preempt count
3153  * then we just enabled preemption. Stop timing the latency.
3154  */
3155 static inline void preempt_latency_stop(int val)
3156 {
3157         if (preempt_count() == val)
3158                 trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip());
3159 }
3160
3161 void preempt_count_sub(int val)
3162 {
3163 #ifdef CONFIG_DEBUG_PREEMPT
3164         /*
3165          * Underflow?
3166          */
3167         if (DEBUG_LOCKS_WARN_ON(val > preempt_count()))
3168                 return;
3169         /*
3170          * Is the spinlock portion underflowing?
3171          */
3172         if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) &&
3173                         !(preempt_count() & PREEMPT_MASK)))
3174                 return;
3175 #endif
3176
3177         preempt_latency_stop(val);
3178         __preempt_count_sub(val);
3179 }
3180 EXPORT_SYMBOL(preempt_count_sub);
3181 NOKPROBE_SYMBOL(preempt_count_sub);
3182
3183 #else
3184 static inline void preempt_latency_start(int val) { }
3185 static inline void preempt_latency_stop(int val) { }
3186 #endif
3187
3188 static inline unsigned long get_preempt_disable_ip(struct task_struct *p)
3189 {
3190 #ifdef CONFIG_DEBUG_PREEMPT
3191         return p->preempt_disable_ip;
3192 #else
3193         return 0;
3194 #endif
3195 }
3196
3197 /*
3198  * Print scheduling while atomic bug:
3199  */
3200 static noinline void __schedule_bug(struct task_struct *prev)
3201 {
3202         /* Save this before calling printk(), since that will clobber it */
3203         unsigned long preempt_disable_ip = get_preempt_disable_ip(current);
3204
3205         if (oops_in_progress)
3206                 return;
3207
3208         printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n",
3209                 prev->comm, prev->pid, preempt_count());
3210
3211         debug_show_held_locks(prev);
3212         print_modules();
3213         if (irqs_disabled())
3214                 print_irqtrace_events(prev);
3215         if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
3216             && in_atomic_preempt_off()) {
3217                 pr_err("Preemption disabled at:");
3218                 print_ip_sym(preempt_disable_ip);
3219                 pr_cont("\n");
3220         }
3221         if (panic_on_warn)
3222                 panic("scheduling while atomic\n");
3223
3224         dump_stack();
3225         add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
3226 }
3227
3228 /*
3229  * Various schedule()-time debugging checks and statistics:
3230  */
3231 static inline void schedule_debug(struct task_struct *prev)
3232 {
3233 #ifdef CONFIG_SCHED_STACK_END_CHECK
3234         if (task_stack_end_corrupted(prev))
3235                 panic("corrupted stack end detected inside scheduler\n");
3236 #endif
3237
3238         if (unlikely(in_atomic_preempt_off())) {
3239                 __schedule_bug(prev);
3240                 preempt_count_set(PREEMPT_DISABLED);
3241         }
3242         rcu_sleep_check();
3243
3244         profile_hit(SCHED_PROFILING, __builtin_return_address(0));
3245
3246         schedstat_inc(this_rq()->sched_count);
3247 }
3248
3249 /*
3250  * Pick up the highest-prio task:
3251  */
3252 static inline struct task_struct *
3253 pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
3254 {
3255         const struct sched_class *class;
3256         struct task_struct *p;
3257
3258         /*
3259          * Optimization: we know that if all tasks are in the fair class we can
3260          * call that function directly, but only if the @prev task wasn't of a
3261          * higher scheduling class, because otherwise those loose the
3262          * opportunity to pull in more work from other CPUs.
3263          */
3264         if (likely((prev->sched_class == &idle_sched_class ||
3265                     prev->sched_class == &fair_sched_class) &&
3266                    rq->nr_running == rq->cfs.h_nr_running)) {
3267
3268                 p = fair_sched_class.pick_next_task(rq, prev, rf);
3269                 if (unlikely(p == RETRY_TASK))
3270                         goto again;
3271
3272                 /* Assumes fair_sched_class->next == idle_sched_class */
3273                 if (unlikely(!p))
3274                         p = idle_sched_class.pick_next_task(rq, prev, rf);
3275
3276                 return p;
3277         }
3278
3279 again:
3280         for_each_class(class) {
3281                 p = class->pick_next_task(rq, prev, rf);
3282                 if (p) {
3283                         if (unlikely(p == RETRY_TASK))
3284                                 goto again;
3285                         return p;
3286                 }
3287         }
3288
3289         /* The idle class should always have a runnable task: */
3290         BUG();
3291 }
3292
3293 /*
3294  * __schedule() is the main scheduler function.
3295  *
3296  * The main means of driving the scheduler and thus entering this function are:
3297  *
3298  *   1. Explicit blocking: mutex, semaphore, waitqueue, etc.
3299  *
3300  *   2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return
3301  *      paths. For example, see arch/x86/entry_64.S.
3302  *
3303  *      To drive preemption between tasks, the scheduler sets the flag in timer
3304  *      interrupt handler scheduler_tick().
3305  *
3306  *   3. Wakeups don't really cause entry into schedule(). They add a
3307  *      task to the run-queue and that's it.
3308  *
3309  *      Now, if the new task added to the run-queue preempts the current
3310  *      task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets
3311  *      called on the nearest possible occasion:
3312  *
3313  *       - If the kernel is preemptible (CONFIG_PREEMPT=y):
3314  *
3315  *         - in syscall or exception context, at the next outmost
3316  *           preempt_enable(). (this might be as soon as the wake_up()'s
3317  *           spin_unlock()!)
3318  *
3319  *         - in IRQ context, return from interrupt-handler to
3320  *           preemptible context
3321  *
3322  *       - If the kernel is not preemptible (CONFIG_PREEMPT is not set)
3323  *         then at the next:
3324  *
3325  *          - cond_resched() call
3326  *          - explicit schedule() call
3327  *          - return from syscall or exception to user-space
3328  *          - return from interrupt-handler to user-space
3329  *
3330  * WARNING: must be called with preemption disabled!
3331  */
3332 static void __sched notrace __schedule(bool preempt)
3333 {
3334         struct task_struct *prev, *next;
3335         unsigned long *switch_count;
3336         struct rq_flags rf;
3337         struct rq *rq;
3338         int cpu;
3339         u64 wallclock;
3340
3341         cpu = smp_processor_id();
3342         rq = cpu_rq(cpu);
3343         prev = rq->curr;
3344
3345         schedule_debug(prev);
3346
3347         if (sched_feat(HRTICK))
3348                 hrtick_clear(rq);
3349
3350         local_irq_disable();
3351         rcu_note_context_switch(preempt);
3352
3353         /*
3354          * Make sure that signal_pending_state()->signal_pending() below
3355          * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE)
3356          * done by the caller to avoid the race with signal_wake_up().
3357          */
3358         rq_lock(rq, &rf);
3359         smp_mb__after_spinlock();
3360
3361         /* Promote REQ to ACT */
3362         rq->clock_update_flags <<= 1;
3363         update_rq_clock(rq);
3364
3365         switch_count = &prev->nivcsw;
3366         if (!preempt && prev->state) {
3367                 if (unlikely(signal_pending_state(prev->state, prev))) {
3368                         prev->state = TASK_RUNNING;
3369                 } else {
3370                         deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK);
3371                         prev->on_rq = 0;
3372
3373                         if (prev->in_iowait) {
3374                                 atomic_inc(&rq->nr_iowait);
3375                                 delayacct_blkio_start();
3376                         }
3377
3378                         /*
3379                          * If a worker went to sleep, notify and ask workqueue
3380                          * whether it wants to wake up a task to maintain
3381                          * concurrency.
3382                          */
3383                         if (prev->flags & PF_WQ_WORKER) {
3384                                 struct task_struct *to_wakeup;
3385
3386                                 to_wakeup = wq_worker_sleeping(prev);
3387                                 if (to_wakeup)
3388                                         try_to_wake_up_local(to_wakeup, &rf);
3389                         }
3390                 }
3391                 switch_count = &prev->nvcsw;
3392         }
3393
3394         next = pick_next_task(rq, prev, &rf);
3395         wallclock = walt_ktime_clock();
3396         walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
3397         walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
3398         clear_tsk_need_resched(prev);
3399         clear_preempt_need_resched();
3400
3401         if (likely(prev != next)) {
3402 #ifdef CONFIG_SCHED_WALT
3403                 if (!prev->on_rq)
3404                         prev->last_sleep_ts = wallclock;
3405 #endif
3406                 rq->nr_switches++;
3407                 rq->curr = next;
3408                 /*
3409                  * The membarrier system call requires each architecture
3410                  * to have a full memory barrier after updating
3411                  * rq->curr, before returning to user-space. For TSO
3412                  * (e.g. x86), the architecture must provide its own
3413                  * barrier in switch_mm(). For weakly ordered machines
3414                  * for which spin_unlock() acts as a full memory
3415                  * barrier, finish_lock_switch() in common code takes
3416                  * care of this barrier. For weakly ordered machines for
3417                  * which spin_unlock() acts as a RELEASE barrier (only
3418                  * arm64 and PowerPC), arm64 has a full barrier in
3419                  * switch_to(), and PowerPC has
3420                  * smp_mb__after_unlock_lock() before
3421                  * finish_lock_switch().
3422                  */
3423                 ++*switch_count;
3424
3425                 trace_sched_switch(preempt, prev, next);
3426
3427                 /* Also unlocks the rq: */
3428                 rq = context_switch(rq, prev, next, &rf);
3429         } else {
3430                 rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP);
3431                 rq_unlock_irq(rq, &rf);
3432         }
3433
3434         balance_callback(rq);
3435 }
3436
3437 void __noreturn do_task_dead(void)
3438 {
3439         /*
3440          * The setting of TASK_RUNNING by try_to_wake_up() may be delayed
3441          * when the following two conditions become true.
3442          *   - There is race condition of mmap_sem (It is acquired by
3443          *     exit_mm()), and
3444          *   - SMI occurs before setting TASK_RUNINNG.
3445          *     (or hypervisor of virtual machine switches to other guest)
3446          *  As a result, we may become TASK_RUNNING after becoming TASK_DEAD
3447          *
3448          * To avoid it, we have to wait for releasing tsk->pi_lock which
3449          * is held by try_to_wake_up()
3450          */
3451         raw_spin_lock_irq(&current->pi_lock);
3452         raw_spin_unlock_irq(&current->pi_lock);
3453
3454         /* Causes final put_task_struct in finish_task_switch(): */
3455         __set_current_state(TASK_DEAD);
3456
3457         /* Tell freezer to ignore us: */
3458         current->flags |= PF_NOFREEZE;
3459
3460         __schedule(false);
3461         BUG();
3462
3463         /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */
3464         for (;;)
3465                 cpu_relax();
3466 }
3467
3468 static inline void sched_submit_work(struct task_struct *tsk)
3469 {
3470         if (!tsk->state || tsk_is_pi_blocked(tsk))
3471                 return;
3472         /*
3473          * If we are going to sleep and we have plugged IO queued,
3474          * make sure to submit it to avoid deadlocks.
3475          */
3476         if (blk_needs_flush_plug(tsk))
3477                 blk_schedule_flush_plug(tsk);
3478 }
3479
3480 asmlinkage __visible void __sched schedule(void)
3481 {
3482         struct task_struct *tsk = current;
3483
3484         sched_submit_work(tsk);
3485         do {
3486                 preempt_disable();
3487                 __schedule(false);
3488                 sched_preempt_enable_no_resched();
3489         } while (need_resched());
3490 }
3491 EXPORT_SYMBOL(schedule);
3492
3493 /*
3494  * synchronize_rcu_tasks() makes sure that no task is stuck in preempted
3495  * state (have scheduled out non-voluntarily) by making sure that all
3496  * tasks have either left the run queue or have gone into user space.
3497  * As idle tasks do not do either, they must not ever be preempted
3498  * (schedule out non-voluntarily).
3499  *
3500  * schedule_idle() is similar to schedule_preempt_disable() except that it
3501  * never enables preemption because it does not call sched_submit_work().
3502  */
3503 void __sched schedule_idle(void)
3504 {
3505         /*
3506          * As this skips calling sched_submit_work(), which the idle task does
3507          * regardless because that function is a nop when the task is in a
3508          * TASK_RUNNING state, make sure this isn't used someplace that the
3509          * current task can be in any other state. Note, idle is always in the
3510          * TASK_RUNNING state.
3511          */
3512         WARN_ON_ONCE(current->state);
3513         do {
3514                 __schedule(false);
3515         } while (need_resched());
3516 }
3517
3518 #ifdef CONFIG_CONTEXT_TRACKING
3519 asmlinkage __visible void __sched schedule_user(void)
3520 {
3521         /*
3522          * If we come here after a random call to set_need_resched(),
3523          * or we have been woken up remotely but the IPI has not yet arrived,
3524          * we haven't yet exited the RCU idle mode. Do it here manually until
3525          * we find a better solution.
3526          *
3527          * NB: There are buggy callers of this function.  Ideally we
3528          * should warn if prev_state != CONTEXT_USER, but that will trigger
3529          * too frequently to make sense yet.
3530          */
3531         enum ctx_state prev_state = exception_enter();
3532         schedule();
3533         exception_exit(prev_state);
3534 }
3535 #endif
3536
3537 /**
3538  * schedule_preempt_disabled - called with preemption disabled
3539  *
3540  * Returns with preemption disabled. Note: preempt_count must be 1
3541  */
3542 void __sched schedule_preempt_disabled(void)
3543 {
3544         sched_preempt_enable_no_resched();
3545         schedule();
3546         preempt_disable();
3547 }
3548
3549 static void __sched notrace preempt_schedule_common(void)
3550 {
3551         do {
3552                 /*
3553                  * Because the function tracer can trace preempt_count_sub()
3554                  * and it also uses preempt_enable/disable_notrace(), if
3555                  * NEED_RESCHED is set, the preempt_enable_notrace() called
3556                  * by the function tracer will call this function again and
3557                  * cause infinite recursion.
3558                  *
3559                  * Preemption must be disabled here before the function
3560                  * tracer can trace. Break up preempt_disable() into two
3561                  * calls. One to disable preemption without fear of being
3562                  * traced. The other to still record the preemption latency,
3563                  * which can also be traced by the function tracer.
3564                  */
3565                 preempt_disable_notrace();
3566                 preempt_latency_start(1);
3567                 __schedule(true);
3568                 preempt_latency_stop(1);
3569                 preempt_enable_no_resched_notrace();
3570
3571                 /*
3572                  * Check again in case we missed a preemption opportunity
3573                  * between schedule and now.
3574                  */
3575         } while (need_resched());
3576 }
3577
3578 #ifdef CONFIG_PREEMPT
3579 /*
3580  * this is the entry point to schedule() from in-kernel preemption
3581  * off of preempt_enable. Kernel preemptions off return from interrupt
3582  * occur there and call schedule directly.
3583  */
3584 asmlinkage __visible void __sched notrace preempt_schedule(void)
3585 {
3586         /*
3587          * If there is a non-zero preempt_count or interrupts are disabled,
3588          * we do not want to preempt the current task. Just return..
3589          */
3590         if (likely(!preemptible()))
3591                 return;
3592
3593         preempt_schedule_common();
3594 }
3595 NOKPROBE_SYMBOL(preempt_schedule);
3596 EXPORT_SYMBOL(preempt_schedule);
3597
3598 /**
3599  * preempt_schedule_notrace - preempt_schedule called by tracing
3600  *
3601  * The tracing infrastructure uses preempt_enable_notrace to prevent
3602  * recursion and tracing preempt enabling caused by the tracing
3603  * infrastructure itself. But as tracing can happen in areas coming
3604  * from userspace or just about to enter userspace, a preempt enable
3605  * can occur before user_exit() is called. This will cause the scheduler
3606  * to be called when the system is still in usermode.
3607  *
3608  * To prevent this, the preempt_enable_notrace will use this function
3609  * instead of preempt_schedule() to exit user context if needed before
3610  * calling the scheduler.
3611  */
3612 asmlinkage __visible void __sched notrace preempt_schedule_notrace(void)
3613 {
3614         enum ctx_state prev_ctx;
3615
3616         if (likely(!preemptible()))
3617                 return;
3618
3619         do {
3620                 /*
3621                  * Because the function tracer can trace preempt_count_sub()
3622                  * and it also uses preempt_enable/disable_notrace(), if
3623                  * NEED_RESCHED is set, the preempt_enable_notrace() called
3624                  * by the function tracer will call this function again and
3625                  * cause infinite recursion.
3626                  *
3627                  * Preemption must be disabled here before the function
3628                  * tracer can trace. Break up preempt_disable() into two
3629                  * calls. One to disable preemption without fear of being
3630                  * traced. The other to still record the preemption latency,
3631                  * which can also be traced by the function tracer.
3632                  */
3633                 preempt_disable_notrace();
3634                 preempt_latency_start(1);
3635                 /*
3636                  * Needs preempt disabled in case user_exit() is traced
3637                  * and the tracer calls preempt_enable_notrace() causing
3638                  * an infinite recursion.
3639                  */
3640                 prev_ctx = exception_enter();
3641                 __schedule(true);
3642                 exception_exit(prev_ctx);
3643
3644                 preempt_latency_stop(1);
3645                 preempt_enable_no_resched_notrace();
3646         } while (need_resched());
3647 }
3648 EXPORT_SYMBOL_GPL(preempt_schedule_notrace);
3649
3650 #endif /* CONFIG_PREEMPT */
3651
3652 /*
3653  * this is the entry point to schedule() from kernel preemption
3654  * off of irq context.
3655  * Note, that this is called and return with irqs disabled. This will
3656  * protect us against recursive calling from irq.
3657  */
3658 asmlinkage __visible void __sched preempt_schedule_irq(void)
3659 {
3660         enum ctx_state prev_state;
3661
3662         /* Catch callers which need to be fixed */
3663         BUG_ON(preempt_count() || !irqs_disabled());
3664
3665         prev_state = exception_enter();
3666
3667         do {
3668                 preempt_disable();
3669                 local_irq_enable();
3670                 __schedule(true);
3671                 local_irq_disable();
3672                 sched_preempt_enable_no_resched();
3673         } while (need_resched());
3674
3675         exception_exit(prev_state);
3676 }
3677
3678 int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags,
3679                           void *key)
3680 {
3681         return try_to_wake_up(curr->private, mode, wake_flags, 1);
3682 }
3683 EXPORT_SYMBOL(default_wake_function);
3684
3685 #ifdef CONFIG_RT_MUTEXES
3686
3687 static inline int __rt_effective_prio(struct task_struct *pi_task, int prio)
3688 {
3689         if (pi_task)
3690                 prio = min(prio, pi_task->prio);
3691
3692         return prio;
3693 }
3694
3695 static inline int rt_effective_prio(struct task_struct *p, int prio)
3696 {
3697         struct task_struct *pi_task = rt_mutex_get_top_task(p);
3698
3699         return __rt_effective_prio(pi_task, prio);
3700 }
3701
3702 /*
3703  * rt_mutex_setprio - set the current priority of a task
3704  * @p: task to boost
3705  * @pi_task: donor task
3706  *
3707  * This function changes the 'effective' priority of a task. It does
3708  * not touch ->normal_prio like __setscheduler().
3709  *
3710  * Used by the rt_mutex code to implement priority inheritance
3711  * logic. Call site only calls if the priority of the task changed.
3712  */
3713 void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task)
3714 {
3715         int prio, oldprio, queued, running, queue_flag =
3716                 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
3717         const struct sched_class *prev_class;
3718         struct rq_flags rf;
3719         struct rq *rq;
3720
3721         /* XXX used to be waiter->prio, not waiter->task->prio */
3722         prio = __rt_effective_prio(pi_task, p->normal_prio);
3723
3724         /*
3725          * If nothing changed; bail early.
3726          */
3727         if (p->pi_top_task == pi_task && prio == p->prio && !dl_prio(prio))
3728                 return;
3729
3730         rq = __task_rq_lock(p, &rf);
3731         update_rq_clock(rq);
3732         /*
3733          * Set under pi_lock && rq->lock, such that the value can be used under
3734          * either lock.
3735          *
3736          * Note that there is loads of tricky to make this pointer cache work
3737          * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to
3738          * ensure a task is de-boosted (pi_task is set to NULL) before the
3739          * task is allowed to run again (and can exit). This ensures the pointer
3740          * points to a blocked task -- which guaratees the task is present.
3741          */
3742         p->pi_top_task = pi_task;
3743
3744         /*
3745          * For FIFO/RR we only need to set prio, if that matches we're done.
3746          */
3747         if (prio == p->prio && !dl_prio(prio))
3748                 goto out_unlock;
3749
3750         /*
3751          * Idle task boosting is a nono in general. There is one
3752          * exception, when PREEMPT_RT and NOHZ is active:
3753          *
3754          * The idle task calls get_next_timer_interrupt() and holds
3755          * the timer wheel base->lock on the CPU and another CPU wants
3756          * to access the timer (probably to cancel it). We can safely
3757          * ignore the boosting request, as the idle CPU runs this code
3758          * with interrupts disabled and will complete the lock
3759          * protected section without being interrupted. So there is no
3760          * real need to boost.
3761          */
3762         if (unlikely(p == rq->idle)) {
3763                 WARN_ON(p != rq->curr);
3764                 WARN_ON(p->pi_blocked_on);
3765                 goto out_unlock;
3766         }
3767
3768         trace_sched_pi_setprio(p, pi_task);
3769         oldprio = p->prio;
3770
3771         if (oldprio == prio)
3772                 queue_flag &= ~DEQUEUE_MOVE;
3773
3774         prev_class = p->sched_class;
3775         queued = task_on_rq_queued(p);
3776         running = task_current(rq, p);
3777         if (queued)
3778                 dequeue_task(rq, p, queue_flag);
3779         if (running)
3780                 put_prev_task(rq, p);
3781
3782         /*
3783          * Boosting condition are:
3784          * 1. -rt task is running and holds mutex A
3785          *      --> -dl task blocks on mutex A
3786          *
3787          * 2. -dl task is running and holds mutex A
3788          *      --> -dl task blocks on mutex A and could preempt the
3789          *          running task
3790          */
3791         if (dl_prio(prio)) {
3792                 if (!dl_prio(p->normal_prio) ||
3793                     (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
3794                         p->dl.dl_boosted = 1;
3795                         queue_flag |= ENQUEUE_REPLENISH;
3796                 } else
3797                         p->dl.dl_boosted = 0;
3798                 p->sched_class = &dl_sched_class;
3799         } else if (rt_prio(prio)) {
3800                 if (dl_prio(oldprio))
3801                         p->dl.dl_boosted = 0;
3802                 if (oldprio < prio)
3803                         queue_flag |= ENQUEUE_HEAD;
3804                 p->sched_class = &rt_sched_class;
3805         } else {
3806                 if (dl_prio(oldprio))
3807                         p->dl.dl_boosted = 0;
3808                 if (rt_prio(oldprio))
3809                         p->rt.timeout = 0;
3810                 p->sched_class = &fair_sched_class;
3811         }
3812
3813         p->prio = prio;
3814
3815         if (queued)
3816                 enqueue_task(rq, p, queue_flag);
3817         if (running)
3818                 set_curr_task(rq, p);
3819
3820         check_class_changed(rq, p, prev_class, oldprio);
3821 out_unlock:
3822         /* Avoid rq from going away on us: */
3823         preempt_disable();
3824         __task_rq_unlock(rq, &rf);
3825
3826         balance_callback(rq);
3827         preempt_enable();
3828 }
3829 #else
3830 static inline int rt_effective_prio(struct task_struct *p, int prio)
3831 {
3832         return prio;
3833 }
3834 #endif
3835
3836 void set_user_nice(struct task_struct *p, long nice)
3837 {
3838         bool queued, running;
3839         int old_prio, delta;
3840         struct rq_flags rf;
3841         struct rq *rq;
3842
3843         if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE)
3844                 return;
3845         /*
3846          * We have to be careful, if called from sys_setpriority(),
3847          * the task might be in the middle of scheduling on another CPU.
3848          */
3849         rq = task_rq_lock(p, &rf);
3850         update_rq_clock(rq);
3851
3852         /*
3853          * The RT priorities are set via sched_setscheduler(), but we still
3854          * allow the 'normal' nice value to be set - but as expected
3855          * it wont have any effect on scheduling until the task is
3856          * SCHED_DEADLINE, SCHED_FIFO or SCHED_RR:
3857          */
3858         if (task_has_dl_policy(p) || task_has_rt_policy(p)) {
3859                 p->static_prio = NICE_TO_PRIO(nice);
3860                 goto out_unlock;
3861         }
3862         queued = task_on_rq_queued(p);
3863         running = task_current(rq, p);
3864         if (queued)
3865                 dequeue_task(rq, p, DEQUEUE_SAVE | DEQUEUE_NOCLOCK);
3866         if (running)
3867                 put_prev_task(rq, p);
3868
3869         p->static_prio = NICE_TO_PRIO(nice);
3870         set_load_weight(p);
3871         old_prio = p->prio;
3872         p->prio = effective_prio(p);
3873         delta = p->prio - old_prio;
3874
3875         if (queued) {
3876                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
3877                 /*
3878                  * If the task increased its priority or is running and
3879                  * lowered its priority, then reschedule its CPU:
3880                  */
3881                 if (delta < 0 || (delta > 0 && task_running(rq, p)))
3882                         resched_curr(rq);
3883         }
3884         if (running)
3885                 set_curr_task(rq, p);
3886 out_unlock:
3887         task_rq_unlock(rq, p, &rf);
3888 }
3889 EXPORT_SYMBOL(set_user_nice);
3890
3891 /*
3892  * can_nice - check if a task can reduce its nice value
3893  * @p: task
3894  * @nice: nice value
3895  */
3896 int can_nice(const struct task_struct *p, const int nice)
3897 {
3898         /* Convert nice value [19,-20] to rlimit style value [1,40]: */
3899         int nice_rlim = nice_to_rlimit(nice);
3900
3901         return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) ||
3902                 capable(CAP_SYS_NICE));
3903 }
3904
3905 #ifdef __ARCH_WANT_SYS_NICE
3906
3907 /*
3908  * sys_nice - change the priority of the current process.
3909  * @increment: priority increment
3910  *
3911  * sys_setpriority is a more generic, but much slower function that
3912  * does similar things.
3913  */
3914 SYSCALL_DEFINE1(nice, int, increment)
3915 {
3916         long nice, retval;
3917
3918         /*
3919          * Setpriority might change our priority at the same moment.
3920          * We don't have to worry. Conceptually one call occurs first
3921          * and we have a single winner.
3922          */
3923         increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH);
3924         nice = task_nice(current) + increment;
3925
3926         nice = clamp_val(nice, MIN_NICE, MAX_NICE);
3927         if (increment < 0 && !can_nice(current, nice))
3928                 return -EPERM;
3929
3930         retval = security_task_setnice(current, nice);
3931         if (retval)
3932                 return retval;
3933
3934         set_user_nice(current, nice);
3935         return 0;
3936 }
3937
3938 #endif
3939
3940 /**
3941  * task_prio - return the priority value of a given task.
3942  * @p: the task in question.
3943  *
3944  * Return: The priority value as seen by users in /proc.
3945  * RT tasks are offset by -200. Normal tasks are centered
3946  * around 0, value goes from -16 to +15.
3947  */
3948 int task_prio(const struct task_struct *p)
3949 {
3950         return p->prio - MAX_RT_PRIO;
3951 }
3952
3953 /**
3954  * idle_cpu - is a given CPU idle currently?
3955  * @cpu: the processor in question.
3956  *
3957  * Return: 1 if the CPU is currently idle. 0 otherwise.
3958  */
3959 int idle_cpu(int cpu)
3960 {
3961         struct rq *rq = cpu_rq(cpu);
3962
3963         if (rq->curr != rq->idle)
3964                 return 0;
3965
3966         if (rq->nr_running)
3967                 return 0;
3968
3969 #ifdef CONFIG_SMP
3970         if (!llist_empty(&rq->wake_list))
3971                 return 0;
3972 #endif
3973
3974         return 1;
3975 }
3976
3977 /**
3978  * idle_task - return the idle task for a given CPU.
3979  * @cpu: the processor in question.
3980  *
3981  * Return: The idle task for the CPU @cpu.
3982  */
3983 struct task_struct *idle_task(int cpu)
3984 {
3985         return cpu_rq(cpu)->idle;
3986 }
3987
3988 /**
3989  * find_process_by_pid - find a process with a matching PID value.
3990  * @pid: the pid in question.
3991  *
3992  * The task of @pid, if found. %NULL otherwise.
3993  */
3994 static struct task_struct *find_process_by_pid(pid_t pid)
3995 {
3996         return pid ? find_task_by_vpid(pid) : current;
3997 }
3998
3999 /*
4000  * sched_setparam() passes in -1 for its policy, to let the functions
4001  * it calls know not to change it.
4002  */
4003 #define SETPARAM_POLICY -1
4004
4005 static void __setscheduler_params(struct task_struct *p,
4006                 const struct sched_attr *attr)
4007 {
4008         int policy = attr->sched_policy;
4009
4010         if (policy == SETPARAM_POLICY)
4011                 policy = p->policy;
4012
4013         p->policy = policy;
4014
4015         if (dl_policy(policy))
4016                 __setparam_dl(p, attr);
4017         else if (fair_policy(policy))
4018                 p->static_prio = NICE_TO_PRIO(attr->sched_nice);
4019
4020         /*
4021          * __sched_setscheduler() ensures attr->sched_priority == 0 when
4022          * !rt_policy. Always setting this ensures that things like
4023          * getparam()/getattr() don't report silly values for !rt tasks.
4024          */
4025         p->rt_priority = attr->sched_priority;
4026         p->normal_prio = normal_prio(p);
4027         set_load_weight(p);
4028 }
4029
4030 /* Actually do priority change: must hold pi & rq lock. */
4031 static void __setscheduler(struct rq *rq, struct task_struct *p,
4032                            const struct sched_attr *attr, bool keep_boost)
4033 {
4034         __setscheduler_params(p, attr);
4035
4036         /*
4037          * Keep a potential priority boosting if called from
4038          * sched_setscheduler().
4039          */
4040         p->prio = normal_prio(p);
4041         if (keep_boost)
4042                 p->prio = rt_effective_prio(p, p->prio);
4043
4044         if (dl_prio(p->prio))
4045                 p->sched_class = &dl_sched_class;
4046         else if (rt_prio(p->prio))
4047                 p->sched_class = &rt_sched_class;
4048         else
4049                 p->sched_class = &fair_sched_class;
4050 }
4051
4052 /*
4053  * Check the target process has a UID that matches the current process's:
4054  */
4055 static bool check_same_owner(struct task_struct *p)
4056 {
4057         const struct cred *cred = current_cred(), *pcred;
4058         bool match;
4059
4060         rcu_read_lock();
4061         pcred = __task_cred(p);
4062         match = (uid_eq(cred->euid, pcred->euid) ||
4063                  uid_eq(cred->euid, pcred->uid));
4064         rcu_read_unlock();
4065         return match;
4066 }
4067
4068 static int __sched_setscheduler(struct task_struct *p,
4069                                 const struct sched_attr *attr,
4070                                 bool user, bool pi)
4071 {
4072         int newprio = dl_policy(attr->sched_policy) ? MAX_DL_PRIO - 1 :
4073                       MAX_RT_PRIO - 1 - attr->sched_priority;
4074         int retval, oldprio, oldpolicy = -1, queued, running;
4075         int new_effective_prio, policy = attr->sched_policy;
4076         const struct sched_class *prev_class;
4077         struct rq_flags rf;
4078         int reset_on_fork;
4079         int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
4080         struct rq *rq;
4081
4082         /* The pi code expects interrupts enabled */
4083         BUG_ON(pi && in_interrupt());
4084 recheck:
4085         /* Double check policy once rq lock held: */
4086         if (policy < 0) {
4087                 reset_on_fork = p->sched_reset_on_fork;
4088                 policy = oldpolicy = p->policy;
4089         } else {
4090                 reset_on_fork = !!(attr->sched_flags & SCHED_FLAG_RESET_ON_FORK);
4091
4092                 if (!valid_policy(policy))
4093                         return -EINVAL;
4094         }
4095
4096         if (attr->sched_flags &
4097                 ~(SCHED_FLAG_RESET_ON_FORK | SCHED_FLAG_RECLAIM))
4098                 return -EINVAL;
4099
4100         /*
4101          * Valid priorities for SCHED_FIFO and SCHED_RR are
4102          * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL,
4103          * SCHED_BATCH and SCHED_IDLE is 0.
4104          */
4105         if ((p->mm && attr->sched_priority > MAX_USER_RT_PRIO-1) ||
4106             (!p->mm && attr->sched_priority > MAX_RT_PRIO-1))
4107                 return -EINVAL;
4108         if ((dl_policy(policy) && !__checkparam_dl(attr)) ||
4109             (rt_policy(policy) != (attr->sched_priority != 0)))
4110                 return -EINVAL;
4111
4112         /*
4113          * Allow unprivileged RT tasks to decrease priority:
4114          */
4115         if (user && !capable(CAP_SYS_NICE)) {
4116                 if (fair_policy(policy)) {
4117                         if (attr->sched_nice < task_nice(p) &&
4118                             !can_nice(p, attr->sched_nice))
4119                                 return -EPERM;
4120                 }
4121
4122                 if (rt_policy(policy)) {
4123                         unsigned long rlim_rtprio =
4124                                         task_rlimit(p, RLIMIT_RTPRIO);
4125
4126                         /* Can't set/change the rt policy: */
4127                         if (policy != p->policy && !rlim_rtprio)
4128                                 return -EPERM;
4129
4130                         /* Can't increase priority: */
4131                         if (attr->sched_priority > p->rt_priority &&
4132                             attr->sched_priority > rlim_rtprio)
4133                                 return -EPERM;
4134                 }
4135
4136                  /*
4137                   * Can't set/change SCHED_DEADLINE policy at all for now
4138                   * (safest behavior); in the future we would like to allow
4139                   * unprivileged DL tasks to increase their relative deadline
4140                   * or reduce their runtime (both ways reducing utilization)
4141                   */
4142                 if (dl_policy(policy))
4143                         return -EPERM;
4144
4145                 /*
4146                  * Treat SCHED_IDLE as nice 20. Only allow a switch to
4147                  * SCHED_NORMAL if the RLIMIT_NICE would normally permit it.
4148                  */
4149                 if (idle_policy(p->policy) && !idle_policy(policy)) {
4150                         if (!can_nice(p, task_nice(p)))
4151                                 return -EPERM;
4152                 }
4153
4154                 /* Can't change other user's priorities: */
4155                 if (!check_same_owner(p))
4156                         return -EPERM;
4157
4158                 /* Normal users shall not reset the sched_reset_on_fork flag: */
4159                 if (p->sched_reset_on_fork && !reset_on_fork)
4160                         return -EPERM;
4161         }
4162
4163         if (user) {
4164                 retval = security_task_setscheduler(p);
4165                 if (retval)
4166                         return retval;
4167         }
4168
4169         /*
4170          * Make sure no PI-waiters arrive (or leave) while we are
4171          * changing the priority of the task:
4172          *
4173          * To be able to change p->policy safely, the appropriate
4174          * runqueue lock must be held.
4175          */
4176         rq = task_rq_lock(p, &rf);
4177         update_rq_clock(rq);
4178
4179         /*
4180          * Changing the policy of the stop threads its a very bad idea:
4181          */
4182         if (p == rq->stop) {
4183                 task_rq_unlock(rq, p, &rf);
4184                 return -EINVAL;
4185         }
4186
4187         /*
4188          * If not changing anything there's no need to proceed further,
4189          * but store a possible modification of reset_on_fork.
4190          */
4191         if (unlikely(policy == p->policy)) {
4192                 if (fair_policy(policy) && attr->sched_nice != task_nice(p))
4193                         goto change;
4194                 if (rt_policy(policy) && attr->sched_priority != p->rt_priority)
4195                         goto change;
4196                 if (dl_policy(policy) && dl_param_changed(p, attr))
4197                         goto change;
4198
4199                 p->sched_reset_on_fork = reset_on_fork;
4200                 task_rq_unlock(rq, p, &rf);
4201                 return 0;
4202         }
4203 change:
4204
4205         if (user) {
4206 #ifdef CONFIG_RT_GROUP_SCHED
4207                 /*
4208                  * Do not allow realtime tasks into groups that have no runtime
4209                  * assigned.
4210                  */
4211                 if (rt_bandwidth_enabled() && rt_policy(policy) &&
4212                                 task_group(p)->rt_bandwidth.rt_runtime == 0 &&
4213                                 !task_group_is_autogroup(task_group(p))) {
4214                         task_rq_unlock(rq, p, &rf);
4215                         return -EPERM;
4216                 }
4217 #endif
4218 #ifdef CONFIG_SMP
4219                 if (dl_bandwidth_enabled() && dl_policy(policy)) {
4220                         cpumask_t *span = rq->rd->span;
4221
4222                         /*
4223                          * Don't allow tasks with an affinity mask smaller than
4224                          * the entire root_domain to become SCHED_DEADLINE. We
4225                          * will also fail if there's no bandwidth available.
4226                          */
4227                         if (!cpumask_subset(span, &p->cpus_allowed) ||
4228                             rq->rd->dl_bw.bw == 0) {
4229                                 task_rq_unlock(rq, p, &rf);
4230                                 return -EPERM;
4231                         }
4232                 }
4233 #endif
4234         }
4235
4236         /* Re-check policy now with rq lock held: */
4237         if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) {
4238                 policy = oldpolicy = -1;
4239                 task_rq_unlock(rq, p, &rf);
4240                 goto recheck;
4241         }
4242
4243         /*
4244          * If setscheduling to SCHED_DEADLINE (or changing the parameters
4245          * of a SCHED_DEADLINE task) we need to check if enough bandwidth
4246          * is available.
4247          */
4248         if ((dl_policy(policy) || dl_task(p)) && sched_dl_overflow(p, policy, attr)) {
4249                 task_rq_unlock(rq, p, &rf);
4250                 return -EBUSY;
4251         }
4252
4253         p->sched_reset_on_fork = reset_on_fork;
4254         oldprio = p->prio;
4255
4256         if (pi) {
4257                 /*
4258                  * Take priority boosted tasks into account. If the new
4259                  * effective priority is unchanged, we just store the new
4260                  * normal parameters and do not touch the scheduler class and
4261                  * the runqueue. This will be done when the task deboost
4262                  * itself.
4263                  */
4264                 new_effective_prio = rt_effective_prio(p, newprio);
4265                 if (new_effective_prio == oldprio)
4266                         queue_flags &= ~DEQUEUE_MOVE;
4267         }
4268
4269         queued = task_on_rq_queued(p);
4270         running = task_current(rq, p);
4271         if (queued)
4272                 dequeue_task(rq, p, queue_flags);
4273         if (running)
4274                 put_prev_task(rq, p);
4275
4276         prev_class = p->sched_class;
4277         __setscheduler(rq, p, attr, pi);
4278
4279         if (queued) {
4280                 /*
4281                  * We enqueue to tail when the priority of a task is
4282                  * increased (user space view).
4283                  */
4284                 if (oldprio < p->prio)
4285                         queue_flags |= ENQUEUE_HEAD;
4286
4287                 enqueue_task(rq, p, queue_flags);
4288         }
4289         if (running)
4290                 set_curr_task(rq, p);
4291
4292         check_class_changed(rq, p, prev_class, oldprio);
4293
4294         /* Avoid rq from going away on us: */
4295         preempt_disable();
4296         task_rq_unlock(rq, p, &rf);
4297
4298         if (pi)
4299                 rt_mutex_adjust_pi(p);
4300
4301         /* Run balance callbacks after we've adjusted the PI chain: */
4302         balance_callback(rq);
4303         preempt_enable();
4304
4305         return 0;
4306 }
4307
4308 static int _sched_setscheduler(struct task_struct *p, int policy,
4309                                const struct sched_param *param, bool check)
4310 {
4311         struct sched_attr attr = {
4312                 .sched_policy   = policy,
4313                 .sched_priority = param->sched_priority,
4314                 .sched_nice     = PRIO_TO_NICE(p->static_prio),
4315         };
4316
4317         /* Fixup the legacy SCHED_RESET_ON_FORK hack. */
4318         if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) {
4319                 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4320                 policy &= ~SCHED_RESET_ON_FORK;
4321                 attr.sched_policy = policy;
4322         }
4323
4324         return __sched_setscheduler(p, &attr, check, true);
4325 }
4326 /**
4327  * sched_setscheduler - change the scheduling policy and/or RT priority of a thread.
4328  * @p: the task in question.
4329  * @policy: new policy.
4330  * @param: structure containing the new RT priority.
4331  *
4332  * Return: 0 on success. An error code otherwise.
4333  *
4334  * NOTE that the task may be already dead.
4335  */
4336 int sched_setscheduler(struct task_struct *p, int policy,
4337                        const struct sched_param *param)
4338 {
4339         return _sched_setscheduler(p, policy, param, true);
4340 }
4341 EXPORT_SYMBOL_GPL(sched_setscheduler);
4342
4343 int sched_setattr(struct task_struct *p, const struct sched_attr *attr)
4344 {
4345         return __sched_setscheduler(p, attr, true, true);
4346 }
4347 EXPORT_SYMBOL_GPL(sched_setattr);
4348
4349 /**
4350  * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace.
4351  * @p: the task in question.
4352  * @policy: new policy.
4353  * @param: structure containing the new RT priority.
4354  *
4355  * Just like sched_setscheduler, only don't bother checking if the
4356  * current context has permission.  For example, this is needed in
4357  * stop_machine(): we create temporary high priority worker threads,
4358  * but our caller might not have that capability.
4359  *
4360  * Return: 0 on success. An error code otherwise.
4361  */
4362 int sched_setscheduler_nocheck(struct task_struct *p, int policy,
4363                                const struct sched_param *param)
4364 {
4365         return _sched_setscheduler(p, policy, param, false);
4366 }
4367 EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck);
4368
4369 static int
4370 do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param)
4371 {
4372         struct sched_param lparam;
4373         struct task_struct *p;
4374         int retval;
4375
4376         if (!param || pid < 0)
4377                 return -EINVAL;
4378         if (copy_from_user(&lparam, param, sizeof(struct sched_param)))
4379                 return -EFAULT;
4380
4381         rcu_read_lock();
4382         retval = -ESRCH;
4383         p = find_process_by_pid(pid);
4384         if (p != NULL)
4385                 retval = sched_setscheduler(p, policy, &lparam);
4386         rcu_read_unlock();
4387
4388         return retval;
4389 }
4390
4391 /*
4392  * Mimics kernel/events/core.c perf_copy_attr().
4393  */
4394 static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *attr)
4395 {
4396         u32 size;
4397         int ret;
4398
4399         if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0))
4400                 return -EFAULT;
4401
4402         /* Zero the full structure, so that a short copy will be nice: */
4403         memset(attr, 0, sizeof(*attr));
4404
4405         ret = get_user(size, &uattr->size);
4406         if (ret)
4407                 return ret;
4408
4409         /* Bail out on silly large: */
4410         if (size > PAGE_SIZE)
4411                 goto err_size;
4412
4413         /* ABI compatibility quirk: */
4414         if (!size)
4415                 size = SCHED_ATTR_SIZE_VER0;
4416
4417         if (size < SCHED_ATTR_SIZE_VER0)
4418                 goto err_size;
4419
4420         /*
4421          * If we're handed a bigger struct than we know of,
4422          * ensure all the unknown bits are 0 - i.e. new
4423          * user-space does not rely on any kernel feature
4424          * extensions we dont know about yet.
4425          */
4426         if (size > sizeof(*attr)) {
4427                 unsigned char __user *addr;
4428                 unsigned char __user *end;
4429                 unsigned char val;
4430
4431                 addr = (void __user *)uattr + sizeof(*attr);
4432                 end  = (void __user *)uattr + size;
4433
4434                 for (; addr < end; addr++) {
4435                         ret = get_user(val, addr);
4436                         if (ret)
4437                                 return ret;
4438                         if (val)
4439                                 goto err_size;
4440                 }
4441                 size = sizeof(*attr);
4442         }
4443
4444         ret = copy_from_user(attr, uattr, size);
4445         if (ret)
4446                 return -EFAULT;
4447
4448         /*
4449          * XXX: Do we want to be lenient like existing syscalls; or do we want
4450          * to be strict and return an error on out-of-bounds values?
4451          */
4452         attr->sched_nice = clamp(attr->sched_nice, MIN_NICE, MAX_NICE);
4453
4454         return 0;
4455
4456 err_size:
4457         put_user(sizeof(*attr), &uattr->size);
4458         return -E2BIG;
4459 }
4460
4461 /**
4462  * sys_sched_setscheduler - set/change the scheduler policy and RT priority
4463  * @pid: the pid in question.
4464  * @policy: new policy.
4465  * @param: structure containing the new RT priority.
4466  *
4467  * Return: 0 on success. An error code otherwise.
4468  */
4469 SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param)
4470 {
4471         if (policy < 0)
4472                 return -EINVAL;
4473
4474         return do_sched_setscheduler(pid, policy, param);
4475 }
4476
4477 /**
4478  * sys_sched_setparam - set/change the RT priority of a thread
4479  * @pid: the pid in question.
4480  * @param: structure containing the new RT priority.
4481  *
4482  * Return: 0 on success. An error code otherwise.
4483  */
4484 SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param)
4485 {
4486         return do_sched_setscheduler(pid, SETPARAM_POLICY, param);
4487 }
4488
4489 /**
4490  * sys_sched_setattr - same as above, but with extended sched_attr
4491  * @pid: the pid in question.
4492  * @uattr: structure containing the extended parameters.
4493  * @flags: for future extension.
4494  */
4495 SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr,
4496                                unsigned int, flags)
4497 {
4498         struct sched_attr attr;
4499         struct task_struct *p;
4500         int retval;
4501
4502         if (!uattr || pid < 0 || flags)
4503                 return -EINVAL;
4504
4505         retval = sched_copy_attr(uattr, &attr);
4506         if (retval)
4507                 return retval;
4508
4509         if ((int)attr.sched_policy < 0)
4510                 return -EINVAL;
4511
4512         rcu_read_lock();
4513         retval = -ESRCH;
4514         p = find_process_by_pid(pid);
4515         if (p != NULL)
4516                 retval = sched_setattr(p, &attr);
4517         rcu_read_unlock();
4518
4519         return retval;
4520 }
4521
4522 /**
4523  * sys_sched_getscheduler - get the policy (scheduling class) of a thread
4524  * @pid: the pid in question.
4525  *
4526  * Return: On success, the policy of the thread. Otherwise, a negative error
4527  * code.
4528  */
4529 SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid)
4530 {
4531         struct task_struct *p;
4532         int retval;
4533
4534         if (pid < 0)
4535                 return -EINVAL;
4536
4537         retval = -ESRCH;
4538         rcu_read_lock();
4539         p = find_process_by_pid(pid);
4540         if (p) {
4541                 retval = security_task_getscheduler(p);
4542                 if (!retval)
4543                         retval = p->policy
4544                                 | (p->sched_reset_on_fork ? SCHED_RESET_ON_FORK : 0);
4545         }
4546         rcu_read_unlock();
4547         return retval;
4548 }
4549
4550 /**
4551  * sys_sched_getparam - get the RT priority of a thread
4552  * @pid: the pid in question.
4553  * @param: structure containing the RT priority.
4554  *
4555  * Return: On success, 0 and the RT priority is in @param. Otherwise, an error
4556  * code.
4557  */
4558 SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param)
4559 {
4560         struct sched_param lp = { .sched_priority = 0 };
4561         struct task_struct *p;
4562         int retval;
4563
4564         if (!param || pid < 0)
4565                 return -EINVAL;
4566
4567         rcu_read_lock();
4568         p = find_process_by_pid(pid);
4569         retval = -ESRCH;
4570         if (!p)
4571                 goto out_unlock;
4572
4573         retval = security_task_getscheduler(p);
4574         if (retval)
4575                 goto out_unlock;
4576
4577         if (task_has_rt_policy(p))
4578                 lp.sched_priority = p->rt_priority;
4579         rcu_read_unlock();
4580
4581         /*
4582          * This one might sleep, we cannot do it with a spinlock held ...
4583          */
4584         retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0;
4585
4586         return retval;
4587
4588 out_unlock:
4589         rcu_read_unlock();
4590         return retval;
4591 }
4592
4593 static int sched_read_attr(struct sched_attr __user *uattr,
4594                            struct sched_attr *attr,
4595                            unsigned int usize)
4596 {
4597         int ret;
4598
4599         if (!access_ok(VERIFY_WRITE, uattr, usize))
4600                 return -EFAULT;
4601
4602         /*
4603          * If we're handed a smaller struct than we know of,
4604          * ensure all the unknown bits are 0 - i.e. old
4605          * user-space does not get uncomplete information.
4606          */
4607         if (usize < sizeof(*attr)) {
4608                 unsigned char *addr;
4609                 unsigned char *end;
4610
4611                 addr = (void *)attr + usize;
4612                 end  = (void *)attr + sizeof(*attr);
4613
4614                 for (; addr < end; addr++) {
4615                         if (*addr)
4616                                 return -EFBIG;
4617                 }
4618
4619                 attr->size = usize;
4620         }
4621
4622         ret = copy_to_user(uattr, attr, attr->size);
4623         if (ret)
4624                 return -EFAULT;
4625
4626         return 0;
4627 }
4628
4629 /**
4630  * sys_sched_getattr - similar to sched_getparam, but with sched_attr
4631  * @pid: the pid in question.
4632  * @uattr: structure containing the extended parameters.
4633  * @size: sizeof(attr) for fwd/bwd comp.
4634  * @flags: for future extension.
4635  */
4636 SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr,
4637                 unsigned int, size, unsigned int, flags)
4638 {
4639         struct sched_attr attr = {
4640                 .size = sizeof(struct sched_attr),
4641         };
4642         struct task_struct *p;
4643         int retval;
4644
4645         if (!uattr || pid < 0 || size > PAGE_SIZE ||
4646             size < SCHED_ATTR_SIZE_VER0 || flags)
4647                 return -EINVAL;
4648
4649         rcu_read_lock();
4650         p = find_process_by_pid(pid);
4651         retval = -ESRCH;
4652         if (!p)
4653                 goto out_unlock;
4654
4655         retval = security_task_getscheduler(p);
4656         if (retval)
4657                 goto out_unlock;
4658
4659         attr.sched_policy = p->policy;
4660         if (p->sched_reset_on_fork)
4661                 attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK;
4662         if (task_has_dl_policy(p))
4663                 __getparam_dl(p, &attr);
4664         else if (task_has_rt_policy(p))
4665                 attr.sched_priority = p->rt_priority;
4666         else
4667                 attr.sched_nice = task_nice(p);
4668
4669         rcu_read_unlock();
4670
4671         retval = sched_read_attr(uattr, &attr, size);
4672         return retval;
4673
4674 out_unlock:
4675         rcu_read_unlock();
4676         return retval;
4677 }
4678
4679 long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
4680 {
4681         cpumask_var_t cpus_allowed, new_mask;
4682         struct task_struct *p;
4683         int retval;
4684
4685         rcu_read_lock();
4686
4687         p = find_process_by_pid(pid);
4688         if (!p) {
4689                 rcu_read_unlock();
4690                 return -ESRCH;
4691         }
4692
4693         /* Prevent p going away */
4694         get_task_struct(p);
4695         rcu_read_unlock();
4696
4697         if (p->flags & PF_NO_SETAFFINITY) {
4698                 retval = -EINVAL;
4699                 goto out_put_task;
4700         }
4701         if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) {
4702                 retval = -ENOMEM;
4703                 goto out_put_task;
4704         }
4705         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) {
4706                 retval = -ENOMEM;
4707                 goto out_free_cpus_allowed;
4708         }
4709         retval = -EPERM;
4710         if (!check_same_owner(p)) {
4711                 rcu_read_lock();
4712                 if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) {
4713                         rcu_read_unlock();
4714                         goto out_free_new_mask;
4715                 }
4716                 rcu_read_unlock();
4717         }
4718
4719         retval = security_task_setscheduler(p);
4720         if (retval)
4721                 goto out_free_new_mask;
4722
4723
4724         cpuset_cpus_allowed(p, cpus_allowed);
4725         cpumask_and(new_mask, in_mask, cpus_allowed);
4726
4727         /*
4728          * Since bandwidth control happens on root_domain basis,
4729          * if admission test is enabled, we only admit -deadline
4730          * tasks allowed to run on all the CPUs in the task's
4731          * root_domain.
4732          */
4733 #ifdef CONFIG_SMP
4734         if (task_has_dl_policy(p) && dl_bandwidth_enabled()) {
4735                 rcu_read_lock();
4736                 if (!cpumask_subset(task_rq(p)->rd->span, new_mask)) {
4737                         retval = -EBUSY;
4738                         rcu_read_unlock();
4739                         goto out_free_new_mask;
4740                 }
4741                 rcu_read_unlock();
4742         }
4743 #endif
4744 again:
4745         retval = __set_cpus_allowed_ptr(p, new_mask, true);
4746
4747         if (!retval) {
4748                 cpuset_cpus_allowed(p, cpus_allowed);
4749                 if (!cpumask_subset(new_mask, cpus_allowed)) {
4750                         /*
4751                          * We must have raced with a concurrent cpuset
4752                          * update. Just reset the cpus_allowed to the
4753                          * cpuset's cpus_allowed
4754                          */
4755                         cpumask_copy(new_mask, cpus_allowed);
4756                         goto again;
4757                 }
4758         }
4759 out_free_new_mask:
4760         free_cpumask_var(new_mask);
4761 out_free_cpus_allowed:
4762         free_cpumask_var(cpus_allowed);
4763 out_put_task:
4764         put_task_struct(p);
4765         return retval;
4766 }
4767
4768 static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len,
4769                              struct cpumask *new_mask)
4770 {
4771         if (len < cpumask_size())
4772                 cpumask_clear(new_mask);
4773         else if (len > cpumask_size())
4774                 len = cpumask_size();
4775
4776         return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0;
4777 }
4778
4779 /**
4780  * sys_sched_setaffinity - set the CPU affinity of a process
4781  * @pid: pid of the process
4782  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4783  * @user_mask_ptr: user-space pointer to the new CPU mask
4784  *
4785  * Return: 0 on success. An error code otherwise.
4786  */
4787 SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len,
4788                 unsigned long __user *, user_mask_ptr)
4789 {
4790         cpumask_var_t new_mask;
4791         int retval;
4792
4793         if (!alloc_cpumask_var(&new_mask, GFP_KERNEL))
4794                 return -ENOMEM;
4795
4796         retval = get_user_cpu_mask(user_mask_ptr, len, new_mask);
4797         if (retval == 0)
4798                 retval = sched_setaffinity(pid, new_mask);
4799         free_cpumask_var(new_mask);
4800         return retval;
4801 }
4802
4803 long sched_getaffinity(pid_t pid, struct cpumask *mask)
4804 {
4805         struct task_struct *p;
4806         unsigned long flags;
4807         int retval;
4808
4809         rcu_read_lock();
4810
4811         retval = -ESRCH;
4812         p = find_process_by_pid(pid);
4813         if (!p)
4814                 goto out_unlock;
4815
4816         retval = security_task_getscheduler(p);
4817         if (retval)
4818                 goto out_unlock;
4819
4820         raw_spin_lock_irqsave(&p->pi_lock, flags);
4821         cpumask_and(mask, &p->cpus_allowed, cpu_active_mask);
4822         raw_spin_unlock_irqrestore(&p->pi_lock, flags);
4823
4824 out_unlock:
4825         rcu_read_unlock();
4826
4827         return retval;
4828 }
4829
4830 /**
4831  * sys_sched_getaffinity - get the CPU affinity of a process
4832  * @pid: pid of the process
4833  * @len: length in bytes of the bitmask pointed to by user_mask_ptr
4834  * @user_mask_ptr: user-space pointer to hold the current CPU mask
4835  *
4836  * Return: size of CPU mask copied to user_mask_ptr on success. An
4837  * error code otherwise.
4838  */
4839 SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len,
4840                 unsigned long __user *, user_mask_ptr)
4841 {
4842         int ret;
4843         cpumask_var_t mask;
4844
4845         if ((len * BITS_PER_BYTE) < nr_cpu_ids)
4846                 return -EINVAL;
4847         if (len & (sizeof(unsigned long)-1))
4848                 return -EINVAL;
4849
4850         if (!alloc_cpumask_var(&mask, GFP_KERNEL))
4851                 return -ENOMEM;
4852
4853         ret = sched_getaffinity(pid, mask);
4854         if (ret == 0) {
4855                 size_t retlen = min_t(size_t, len, cpumask_size());
4856
4857                 if (copy_to_user(user_mask_ptr, mask, retlen))
4858                         ret = -EFAULT;
4859                 else
4860                         ret = retlen;
4861         }
4862         free_cpumask_var(mask);
4863
4864         return ret;
4865 }
4866
4867 /**
4868  * sys_sched_yield - yield the current processor to other threads.
4869  *
4870  * This function yields the current CPU to other tasks. If there are no
4871  * other threads running on this CPU then this function will return.
4872  *
4873  * Return: 0.
4874  */
4875 SYSCALL_DEFINE0(sched_yield)
4876 {
4877         struct rq_flags rf;
4878         struct rq *rq;
4879
4880         local_irq_disable();
4881         rq = this_rq();
4882         rq_lock(rq, &rf);
4883
4884         schedstat_inc(rq->yld_count);
4885         current->sched_class->yield_task(rq);
4886
4887         /*
4888          * Since we are going to call schedule() anyway, there's
4889          * no need to preempt or enable interrupts:
4890          */
4891         preempt_disable();
4892         rq_unlock(rq, &rf);
4893         sched_preempt_enable_no_resched();
4894
4895         schedule();
4896
4897         return 0;
4898 }
4899
4900 #ifndef CONFIG_PREEMPT
4901 int __sched _cond_resched(void)
4902 {
4903         if (should_resched(0)) {
4904                 preempt_schedule_common();
4905                 return 1;
4906         }
4907         return 0;
4908 }
4909 EXPORT_SYMBOL(_cond_resched);
4910 #endif
4911
4912 /*
4913  * __cond_resched_lock() - if a reschedule is pending, drop the given lock,
4914  * call schedule, and on return reacquire the lock.
4915  *
4916  * This works OK both with and without CONFIG_PREEMPT. We do strange low-level
4917  * operations here to prevent schedule() from being called twice (once via
4918  * spin_unlock(), once by hand).
4919  */
4920 int __cond_resched_lock(spinlock_t *lock)
4921 {
4922         int resched = should_resched(PREEMPT_LOCK_OFFSET);
4923         int ret = 0;
4924
4925         lockdep_assert_held(lock);
4926
4927         if (spin_needbreak(lock) || resched) {
4928                 spin_unlock(lock);
4929                 if (resched)
4930                         preempt_schedule_common();
4931                 else
4932                         cpu_relax();
4933                 ret = 1;
4934                 spin_lock(lock);
4935         }
4936         return ret;
4937 }
4938 EXPORT_SYMBOL(__cond_resched_lock);
4939
4940 int __sched __cond_resched_softirq(void)
4941 {
4942         BUG_ON(!in_softirq());
4943
4944         if (should_resched(SOFTIRQ_DISABLE_OFFSET)) {
4945                 local_bh_enable();
4946                 preempt_schedule_common();
4947                 local_bh_disable();
4948                 return 1;
4949         }
4950         return 0;
4951 }
4952 EXPORT_SYMBOL(__cond_resched_softirq);
4953
4954 /**
4955  * yield - yield the current processor to other threads.
4956  *
4957  * Do not ever use this function, there's a 99% chance you're doing it wrong.
4958  *
4959  * The scheduler is at all times free to pick the calling task as the most
4960  * eligible task to run, if removing the yield() call from your code breaks
4961  * it, its already broken.
4962  *
4963  * Typical broken usage is:
4964  *
4965  * while (!event)
4966  *      yield();
4967  *
4968  * where one assumes that yield() will let 'the other' process run that will
4969  * make event true. If the current task is a SCHED_FIFO task that will never
4970  * happen. Never use yield() as a progress guarantee!!
4971  *
4972  * If you want to use yield() to wait for something, use wait_event().
4973  * If you want to use yield() to be 'nice' for others, use cond_resched().
4974  * If you still want to use yield(), do not!
4975  */
4976 void __sched yield(void)
4977 {
4978         set_current_state(TASK_RUNNING);
4979         sys_sched_yield();
4980 }
4981 EXPORT_SYMBOL(yield);
4982
4983 /**
4984  * yield_to - yield the current processor to another thread in
4985  * your thread group, or accelerate that thread toward the
4986  * processor it's on.
4987  * @p: target task
4988  * @preempt: whether task preemption is allowed or not
4989  *
4990  * It's the caller's job to ensure that the target task struct
4991  * can't go away on us before we can do any checks.
4992  *
4993  * Return:
4994  *      true (>0) if we indeed boosted the target task.
4995  *      false (0) if we failed to boost the target.
4996  *      -ESRCH if there's no task to yield to.
4997  */
4998 int __sched yield_to(struct task_struct *p, bool preempt)
4999 {
5000         struct task_struct *curr = current;
5001         struct rq *rq, *p_rq;
5002         unsigned long flags;
5003         int yielded = 0;
5004
5005         local_irq_save(flags);
5006         rq = this_rq();
5007
5008 again:
5009         p_rq = task_rq(p);
5010         /*
5011          * If we're the only runnable task on the rq and target rq also
5012          * has only one task, there's absolutely no point in yielding.
5013          */
5014         if (rq->nr_running == 1 && p_rq->nr_running == 1) {
5015                 yielded = -ESRCH;
5016                 goto out_irq;
5017         }
5018
5019         double_rq_lock(rq, p_rq);
5020         if (task_rq(p) != p_rq) {
5021                 double_rq_unlock(rq, p_rq);
5022                 goto again;
5023         }
5024
5025         if (!curr->sched_class->yield_to_task)
5026                 goto out_unlock;
5027
5028         if (curr->sched_class != p->sched_class)
5029                 goto out_unlock;
5030
5031         if (task_running(p_rq, p) || p->state)
5032                 goto out_unlock;
5033
5034         yielded = curr->sched_class->yield_to_task(rq, p, preempt);
5035         if (yielded) {
5036                 schedstat_inc(rq->yld_count);
5037                 /*
5038                  * Make p's CPU reschedule; pick_next_entity takes care of
5039                  * fairness.
5040                  */
5041                 if (preempt && rq != p_rq)
5042                         resched_curr(p_rq);
5043         }
5044
5045 out_unlock:
5046         double_rq_unlock(rq, p_rq);
5047 out_irq:
5048         local_irq_restore(flags);
5049
5050         if (yielded > 0)
5051                 schedule();
5052
5053         return yielded;
5054 }
5055 EXPORT_SYMBOL_GPL(yield_to);
5056
5057 int io_schedule_prepare(void)
5058 {
5059         int old_iowait = current->in_iowait;
5060
5061         current->in_iowait = 1;
5062         blk_schedule_flush_plug(current);
5063
5064         return old_iowait;
5065 }
5066
5067 void io_schedule_finish(int token)
5068 {
5069         current->in_iowait = token;
5070 }
5071
5072 /*
5073  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
5074  * that process accounting knows that this is a task in IO wait state.
5075  */
5076 long __sched io_schedule_timeout(long timeout)
5077 {
5078         int token;
5079         long ret;
5080
5081         token = io_schedule_prepare();
5082         ret = schedule_timeout(timeout);
5083         io_schedule_finish(token);
5084
5085         return ret;
5086 }
5087 EXPORT_SYMBOL(io_schedule_timeout);
5088
5089 void io_schedule(void)
5090 {
5091         int token;
5092
5093         token = io_schedule_prepare();
5094         schedule();
5095         io_schedule_finish(token);
5096 }
5097 EXPORT_SYMBOL(io_schedule);
5098
5099 /**
5100  * sys_sched_get_priority_max - return maximum RT priority.
5101  * @policy: scheduling class.
5102  *
5103  * Return: On success, this syscall returns the maximum
5104  * rt_priority that can be used by a given scheduling class.
5105  * On failure, a negative error code is returned.
5106  */
5107 SYSCALL_DEFINE1(sched_get_priority_max, int, policy)
5108 {
5109         int ret = -EINVAL;
5110
5111         switch (policy) {
5112         case SCHED_FIFO:
5113         case SCHED_RR:
5114                 ret = MAX_USER_RT_PRIO-1;
5115                 break;
5116         case SCHED_DEADLINE:
5117         case SCHED_NORMAL:
5118         case SCHED_BATCH:
5119         case SCHED_IDLE:
5120                 ret = 0;
5121                 break;
5122         }
5123         return ret;
5124 }
5125
5126 /**
5127  * sys_sched_get_priority_min - return minimum RT priority.
5128  * @policy: scheduling class.
5129  *
5130  * Return: On success, this syscall returns the minimum
5131  * rt_priority that can be used by a given scheduling class.
5132  * On failure, a negative error code is returned.
5133  */
5134 SYSCALL_DEFINE1(sched_get_priority_min, int, policy)
5135 {
5136         int ret = -EINVAL;
5137
5138         switch (policy) {
5139         case SCHED_FIFO:
5140         case SCHED_RR:
5141                 ret = 1;
5142                 break;
5143         case SCHED_DEADLINE:
5144         case SCHED_NORMAL:
5145         case SCHED_BATCH:
5146         case SCHED_IDLE:
5147                 ret = 0;
5148         }
5149         return ret;
5150 }
5151
5152 /**
5153  * sys_sched_rr_get_interval - return the default timeslice of a process.
5154  * @pid: pid of the process.
5155  * @interval: userspace pointer to the timeslice value.
5156  *
5157  * this syscall writes the default timeslice value of a given process
5158  * into the user-space timespec buffer. A value of '0' means infinity.
5159  *
5160  * Return: On success, 0 and the timeslice is in @interval. Otherwise,
5161  * an error code.
5162  */
5163 SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid,
5164                 struct timespec __user *, interval)
5165 {
5166         struct task_struct *p;
5167         unsigned int time_slice;
5168         struct rq_flags rf;
5169         struct timespec t;
5170         struct rq *rq;
5171         int retval;
5172
5173         if (pid < 0)
5174                 return -EINVAL;
5175
5176         retval = -ESRCH;
5177         rcu_read_lock();
5178         p = find_process_by_pid(pid);
5179         if (!p)
5180                 goto out_unlock;
5181
5182         retval = security_task_getscheduler(p);
5183         if (retval)
5184                 goto out_unlock;
5185
5186         rq = task_rq_lock(p, &rf);
5187         time_slice = 0;
5188         if (p->sched_class->get_rr_interval)
5189                 time_slice = p->sched_class->get_rr_interval(rq, p);
5190         task_rq_unlock(rq, p, &rf);
5191
5192         rcu_read_unlock();
5193         jiffies_to_timespec(time_slice, &t);
5194         retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0;
5195         return retval;
5196
5197 out_unlock:
5198         rcu_read_unlock();
5199         return retval;
5200 }
5201
5202 void sched_show_task(struct task_struct *p)
5203 {
5204         unsigned long free = 0;
5205         int ppid;
5206
5207         if (!try_get_task_stack(p))
5208                 return;
5209
5210         printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p));
5211
5212         if (p->state == TASK_RUNNING)
5213                 printk(KERN_CONT "  running task    ");
5214 #ifdef CONFIG_DEBUG_STACK_USAGE
5215         free = stack_not_used(p);
5216 #endif
5217         ppid = 0;
5218         rcu_read_lock();
5219         if (pid_alive(p))
5220                 ppid = task_pid_nr(rcu_dereference(p->real_parent));
5221         rcu_read_unlock();
5222         printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free,
5223                 task_pid_nr(p), ppid,
5224                 (unsigned long)task_thread_info(p)->flags);
5225
5226         print_worker_info(KERN_INFO, p);
5227         show_stack(p, NULL);
5228         put_task_stack(p);
5229 }
5230
5231 static inline bool
5232 state_filter_match(unsigned long state_filter, struct task_struct *p)
5233 {
5234         /* no filter, everything matches */
5235         if (!state_filter)
5236                 return true;
5237
5238         /* filter, but doesn't match */
5239         if (!(p->state & state_filter))
5240                 return false;
5241
5242         /*
5243          * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows
5244          * TASK_KILLABLE).
5245          */
5246         if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE)
5247                 return false;
5248
5249         return true;
5250 }
5251
5252
5253 void show_state_filter(unsigned long state_filter)
5254 {
5255         struct task_struct *g, *p;
5256
5257 #if BITS_PER_LONG == 32
5258         printk(KERN_INFO
5259                 "  task                PC stack   pid father\n");
5260 #else
5261         printk(KERN_INFO
5262                 "  task                        PC stack   pid father\n");
5263 #endif
5264         rcu_read_lock();
5265         for_each_process_thread(g, p) {
5266                 /*
5267                  * reset the NMI-timeout, listing all files on a slow
5268                  * console might take a lot of time:
5269                  * Also, reset softlockup watchdogs on all CPUs, because
5270                  * another CPU might be blocked waiting for us to process
5271                  * an IPI.
5272                  */
5273                 touch_nmi_watchdog();
5274                 touch_all_softlockup_watchdogs();
5275                 if (state_filter_match(state_filter, p))
5276                         sched_show_task(p);
5277         }
5278
5279 #ifdef CONFIG_SCHED_DEBUG
5280         if (!state_filter)
5281                 sysrq_sched_debug_show();
5282 #endif
5283         rcu_read_unlock();
5284         /*
5285          * Only show locks if all tasks are dumped:
5286          */
5287         if (!state_filter)
5288                 debug_show_all_locks();
5289 }
5290
5291 /**
5292  * init_idle - set up an idle thread for a given CPU
5293  * @idle: task in question
5294  * @cpu: CPU the idle task belongs to
5295  *
5296  * NOTE: this function does not set the idle thread's NEED_RESCHED
5297  * flag, to make booting more robust.
5298  */
5299 void init_idle(struct task_struct *idle, int cpu)
5300 {
5301         struct rq *rq = cpu_rq(cpu);
5302         unsigned long flags;
5303
5304         raw_spin_lock_irqsave(&idle->pi_lock, flags);
5305         raw_spin_lock(&rq->lock);
5306
5307         __sched_fork(0, idle);
5308         idle->state = TASK_RUNNING;
5309         idle->se.exec_start = sched_clock();
5310         idle->flags |= PF_IDLE;
5311
5312         kasan_unpoison_task_stack(idle);
5313
5314 #ifdef CONFIG_SMP
5315         /*
5316          * Its possible that init_idle() gets called multiple times on a task,
5317          * in that case do_set_cpus_allowed() will not do the right thing.
5318          *
5319          * And since this is boot we can forgo the serialization.
5320          */
5321         set_cpus_allowed_common(idle, cpumask_of(cpu));
5322 #endif
5323         /*
5324          * We're having a chicken and egg problem, even though we are
5325          * holding rq->lock, the CPU isn't yet set to this CPU so the
5326          * lockdep check in task_group() will fail.
5327          *
5328          * Similar case to sched_fork(). / Alternatively we could
5329          * use task_rq_lock() here and obtain the other rq->lock.
5330          *
5331          * Silence PROVE_RCU
5332          */
5333         rcu_read_lock();
5334         __set_task_cpu(idle, cpu);
5335         rcu_read_unlock();
5336
5337         rq->curr = rq->idle = idle;
5338         idle->on_rq = TASK_ON_RQ_QUEUED;
5339 #ifdef CONFIG_SMP
5340         idle->on_cpu = 1;
5341 #endif
5342         raw_spin_unlock(&rq->lock);
5343         raw_spin_unlock_irqrestore(&idle->pi_lock, flags);
5344
5345         /* Set the preempt count _outside_ the spinlocks! */
5346         init_idle_preempt_count(idle, cpu);
5347
5348         /*
5349          * The idle tasks have their own, simple scheduling class:
5350          */
5351         idle->sched_class = &idle_sched_class;
5352         ftrace_graph_init_idle_task(idle, cpu);
5353         vtime_init_idle(idle, cpu);
5354 #ifdef CONFIG_SMP
5355         sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu);
5356 #endif
5357 }
5358
5359 #ifdef CONFIG_SMP
5360
5361 int cpuset_cpumask_can_shrink(const struct cpumask *cur,
5362                               const struct cpumask *trial)
5363 {
5364         int ret = 1;
5365
5366         if (!cpumask_weight(cur))
5367                 return ret;
5368
5369         ret = dl_cpuset_cpumask_can_shrink(cur, trial);
5370
5371         return ret;
5372 }
5373
5374 int task_can_attach(struct task_struct *p,
5375                     const struct cpumask *cs_cpus_allowed)
5376 {
5377         int ret = 0;
5378
5379         /*
5380          * Kthreads which disallow setaffinity shouldn't be moved
5381          * to a new cpuset; we don't want to change their CPU
5382          * affinity and isolating such threads by their set of
5383          * allowed nodes is unnecessary.  Thus, cpusets are not
5384          * applicable for such threads.  This prevents checking for
5385          * success of set_cpus_allowed_ptr() on all attached tasks
5386          * before cpus_allowed may be changed.
5387          */
5388         if (p->flags & PF_NO_SETAFFINITY) {
5389                 ret = -EINVAL;
5390                 goto out;
5391         }
5392
5393         if (dl_task(p) && !cpumask_intersects(task_rq(p)->rd->span,
5394                                               cs_cpus_allowed))
5395                 ret = dl_task_can_attach(p, cs_cpus_allowed);
5396
5397 out:
5398         return ret;
5399 }
5400
5401 bool sched_smp_initialized __read_mostly;
5402
5403 #ifdef CONFIG_NUMA_BALANCING
5404 /* Migrate current task p to target_cpu */
5405 int migrate_task_to(struct task_struct *p, int target_cpu)
5406 {
5407         struct migration_arg arg = { p, target_cpu };
5408         int curr_cpu = task_cpu(p);
5409
5410         if (curr_cpu == target_cpu)
5411                 return 0;
5412
5413         if (!cpumask_test_cpu(target_cpu, &p->cpus_allowed))
5414                 return -EINVAL;
5415
5416         /* TODO: This is not properly updating schedstats */
5417
5418         trace_sched_move_numa(p, curr_cpu, target_cpu);
5419         return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
5420 }
5421
5422 /*
5423  * Requeue a task on a given node and accurately track the number of NUMA
5424  * tasks on the runqueues
5425  */
5426 void sched_setnuma(struct task_struct *p, int nid)
5427 {
5428         bool queued, running;
5429         struct rq_flags rf;
5430         struct rq *rq;
5431
5432         rq = task_rq_lock(p, &rf);
5433         queued = task_on_rq_queued(p);
5434         running = task_current(rq, p);
5435
5436         if (queued)
5437                 dequeue_task(rq, p, DEQUEUE_SAVE);
5438         if (running)
5439                 put_prev_task(rq, p);
5440
5441         p->numa_preferred_nid = nid;
5442
5443         if (queued)
5444                 enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
5445         if (running)
5446                 set_curr_task(rq, p);
5447         task_rq_unlock(rq, p, &rf);
5448 }
5449 #endif /* CONFIG_NUMA_BALANCING */
5450
5451 #ifdef CONFIG_HOTPLUG_CPU
5452 /*
5453  * Ensure that the idle task is using init_mm right before its CPU goes
5454  * offline.
5455  */
5456 void idle_task_exit(void)
5457 {
5458         struct mm_struct *mm = current->active_mm;
5459
5460         BUG_ON(cpu_online(smp_processor_id()));
5461
5462         if (mm != &init_mm) {
5463                 switch_mm(mm, &init_mm, current);
5464                 finish_arch_post_lock_switch();
5465         }
5466         mmdrop(mm);
5467 }
5468
5469 /*
5470  * Since this CPU is going 'away' for a while, fold any nr_active delta
5471  * we might have. Assumes we're called after migrate_tasks() so that the
5472  * nr_active count is stable. We need to take the teardown thread which
5473  * is calling this into account, so we hand in adjust = 1 to the load
5474  * calculation.
5475  *
5476  * Also see the comment "Global load-average calculations".
5477  */
5478 static void calc_load_migrate(struct rq *rq)
5479 {
5480         long delta = calc_load_fold_active(rq, 1);
5481         if (delta)
5482                 atomic_long_add(delta, &calc_load_tasks);
5483 }
5484
5485 static void put_prev_task_fake(struct rq *rq, struct task_struct *prev)
5486 {
5487 }
5488
5489 static const struct sched_class fake_sched_class = {
5490         .put_prev_task = put_prev_task_fake,
5491 };
5492
5493 static struct task_struct fake_task = {
5494         /*
5495          * Avoid pull_{rt,dl}_task()
5496          */
5497         .prio = MAX_PRIO + 1,
5498         .sched_class = &fake_sched_class,
5499 };
5500
5501 /*
5502  * Migrate all tasks from the rq, sleeping tasks will be migrated by
5503  * try_to_wake_up()->select_task_rq().
5504  *
5505  * Called with rq->lock held even though we'er in stop_machine() and
5506  * there's no concurrency possible, we hold the required locks anyway
5507  * because of lock validation efforts.
5508  */
5509 static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf)
5510 {
5511         struct rq *rq = dead_rq;
5512         struct task_struct *next, *stop = rq->stop;
5513         struct rq_flags orf = *rf;
5514         int dest_cpu;
5515
5516         /*
5517          * Fudge the rq selection such that the below task selection loop
5518          * doesn't get stuck on the currently eligible stop task.
5519          *
5520          * We're currently inside stop_machine() and the rq is either stuck
5521          * in the stop_machine_cpu_stop() loop, or we're executing this code,
5522          * either way we should never end up calling schedule() until we're
5523          * done here.
5524          */
5525         rq->stop = NULL;
5526
5527         /*
5528          * put_prev_task() and pick_next_task() sched
5529          * class method both need to have an up-to-date
5530          * value of rq->clock[_task]
5531          */
5532         update_rq_clock(rq);
5533
5534         for (;;) {
5535                 /*
5536                  * There's this thread running, bail when that's the only
5537                  * remaining thread:
5538                  */
5539                 if (rq->nr_running == 1)
5540                         break;
5541
5542                 /*
5543                  * pick_next_task() assumes pinned rq->lock:
5544                  */
5545                 next = pick_next_task(rq, &fake_task, rf);
5546                 BUG_ON(!next);
5547                 put_prev_task(rq, next);
5548
5549                 /*
5550                  * Rules for changing task_struct::cpus_allowed are holding
5551                  * both pi_lock and rq->lock, such that holding either
5552                  * stabilizes the mask.
5553                  *
5554                  * Drop rq->lock is not quite as disastrous as it usually is
5555                  * because !cpu_active at this point, which means load-balance
5556                  * will not interfere. Also, stop-machine.
5557                  */
5558                 rq_unlock(rq, rf);
5559                 raw_spin_lock(&next->pi_lock);
5560                 rq_relock(rq, rf);
5561
5562                 /*
5563                  * Since we're inside stop-machine, _nothing_ should have
5564                  * changed the task, WARN if weird stuff happened, because in
5565                  * that case the above rq->lock drop is a fail too.
5566                  */
5567                 if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) {
5568                         raw_spin_unlock(&next->pi_lock);
5569                         continue;
5570                 }
5571
5572                 /* Find suitable destination for @next, with force if needed. */
5573                 dest_cpu = select_fallback_rq(dead_rq->cpu, next);
5574                 rq = __migrate_task(rq, rf, next, dest_cpu);
5575                 if (rq != dead_rq) {
5576                         rq_unlock(rq, rf);
5577                         rq = dead_rq;
5578                         *rf = orf;
5579                         rq_relock(rq, rf);
5580                 }
5581                 raw_spin_unlock(&next->pi_lock);
5582         }
5583
5584         rq->stop = stop;
5585 }
5586 #endif /* CONFIG_HOTPLUG_CPU */
5587
5588 void set_rq_online(struct rq *rq)
5589 {
5590         if (!rq->online) {
5591                 const struct sched_class *class;
5592
5593                 cpumask_set_cpu(rq->cpu, rq->rd->online);
5594                 rq->online = 1;
5595
5596                 for_each_class(class) {
5597                         if (class->rq_online)
5598                                 class->rq_online(rq);
5599                 }
5600         }
5601 }
5602
5603 void set_rq_offline(struct rq *rq)
5604 {
5605         if (rq->online) {
5606                 const struct sched_class *class;
5607
5608                 for_each_class(class) {
5609                         if (class->rq_offline)
5610                                 class->rq_offline(rq);
5611                 }
5612
5613                 cpumask_clear_cpu(rq->cpu, rq->rd->online);
5614                 rq->online = 0;
5615         }
5616 }
5617
5618 static void set_cpu_rq_start_time(unsigned int cpu)
5619 {
5620         struct rq *rq = cpu_rq(cpu);
5621
5622         rq->age_stamp = sched_clock_cpu(cpu);
5623 }
5624
5625 /*
5626  * used to mark begin/end of suspend/resume:
5627  */
5628 static int num_cpus_frozen;
5629
5630 /*
5631  * Update cpusets according to cpu_active mask.  If cpusets are
5632  * disabled, cpuset_update_active_cpus() becomes a simple wrapper
5633  * around partition_sched_domains().
5634  *
5635  * If we come here as part of a suspend/resume, don't touch cpusets because we
5636  * want to restore it back to its original state upon resume anyway.
5637  */
5638 static void cpuset_cpu_active(void)
5639 {
5640         if (cpuhp_tasks_frozen) {
5641                 /*
5642                  * num_cpus_frozen tracks how many CPUs are involved in suspend
5643                  * resume sequence. As long as this is not the last online
5644                  * operation in the resume sequence, just build a single sched
5645                  * domain, ignoring cpusets.
5646                  */
5647                 partition_sched_domains(1, NULL, NULL);
5648                 if (--num_cpus_frozen)
5649                         return;
5650                 /*
5651                  * This is the last CPU online operation. So fall through and
5652                  * restore the original sched domains by considering the
5653                  * cpuset configurations.
5654                  */
5655                 cpuset_force_rebuild();
5656         }
5657         cpuset_update_active_cpus();
5658 }
5659
5660 static int cpuset_cpu_inactive(unsigned int cpu)
5661 {
5662         if (!cpuhp_tasks_frozen) {
5663                 if (dl_cpu_busy(cpu))
5664                         return -EBUSY;
5665                 cpuset_update_active_cpus();
5666         } else {
5667                 num_cpus_frozen++;
5668                 partition_sched_domains(1, NULL, NULL);
5669         }
5670         return 0;
5671 }
5672
5673 int sched_cpu_activate(unsigned int cpu)
5674 {
5675         struct rq *rq = cpu_rq(cpu);
5676         struct rq_flags rf;
5677
5678         set_cpu_active(cpu, true);
5679
5680         if (sched_smp_initialized) {
5681                 sched_domains_numa_masks_set(cpu);
5682                 cpuset_cpu_active();
5683         }
5684
5685         /*
5686          * Put the rq online, if not already. This happens:
5687          *
5688          * 1) In the early boot process, because we build the real domains
5689          *    after all CPUs have been brought up.
5690          *
5691          * 2) At runtime, if cpuset_cpu_active() fails to rebuild the
5692          *    domains.
5693          */
5694         rq_lock_irqsave(rq, &rf);
5695         if (rq->rd) {
5696                 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5697                 set_rq_online(rq);
5698         }
5699         rq_unlock_irqrestore(rq, &rf);
5700
5701         update_max_interval();
5702
5703         return 0;
5704 }
5705
5706 int sched_cpu_deactivate(unsigned int cpu)
5707 {
5708         int ret;
5709
5710         set_cpu_active(cpu, false);
5711         /*
5712          * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU
5713          * users of this state to go away such that all new such users will
5714          * observe it.
5715          *
5716          * Do sync before park smpboot threads to take care the rcu boost case.
5717          */
5718         synchronize_rcu_mult(call_rcu, call_rcu_sched);
5719
5720         if (!sched_smp_initialized)
5721                 return 0;
5722
5723         ret = cpuset_cpu_inactive(cpu);
5724         if (ret) {
5725                 set_cpu_active(cpu, true);
5726                 return ret;
5727         }
5728         sched_domains_numa_masks_clear(cpu);
5729         return 0;
5730 }
5731
5732 static void sched_rq_cpu_starting(unsigned int cpu)
5733 {
5734         struct rq *rq = cpu_rq(cpu);
5735
5736         rq->calc_load_update = calc_load_update;
5737         update_max_interval();
5738 }
5739
5740 int sched_cpu_starting(unsigned int cpu)
5741 {
5742         set_cpu_rq_start_time(cpu);
5743         sched_rq_cpu_starting(cpu);
5744         return 0;
5745 }
5746
5747 #ifdef CONFIG_HOTPLUG_CPU
5748 int sched_cpu_dying(unsigned int cpu)
5749 {
5750         struct rq *rq = cpu_rq(cpu);
5751         struct rq_flags rf;
5752
5753         /* Handle pending wakeups and then migrate everything off */
5754         sched_ttwu_pending();
5755
5756         rq_lock_irqsave(rq, &rf);
5757
5758         walt_migrate_sync_cpu(cpu);
5759
5760         if (rq->rd) {
5761                 BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
5762                 set_rq_offline(rq);
5763         }
5764         migrate_tasks(rq, &rf);
5765         BUG_ON(rq->nr_running != 1);
5766         rq_unlock_irqrestore(rq, &rf);
5767
5768         calc_load_migrate(rq);
5769         update_max_interval();
5770         nohz_balance_exit_idle(cpu);
5771         hrtick_clear(rq);
5772         return 0;
5773 }
5774 #endif
5775
5776 #ifdef CONFIG_SCHED_SMT
5777 DEFINE_STATIC_KEY_FALSE(sched_smt_present);
5778
5779 static void sched_init_smt(void)
5780 {
5781         /*
5782          * We've enumerated all CPUs and will assume that if any CPU
5783          * has SMT siblings, CPU0 will too.
5784          */
5785         if (cpumask_weight(cpu_smt_mask(0)) > 1)
5786                 static_branch_enable(&sched_smt_present);
5787 }
5788 #else
5789 static inline void sched_init_smt(void) { }
5790 #endif
5791
5792 void __init sched_init_smp(void)
5793 {
5794         cpumask_var_t non_isolated_cpus;
5795
5796         alloc_cpumask_var(&non_isolated_cpus, GFP_KERNEL);
5797
5798         sched_init_numa();
5799
5800         /*
5801          * There's no userspace yet to cause hotplug operations; hence all the
5802          * CPU masks are stable and all blatant races in the below code cannot
5803          * happen.
5804          */
5805         mutex_lock(&sched_domains_mutex);
5806         sched_init_domains(cpu_active_mask);
5807         cpumask_andnot(non_isolated_cpus, cpu_possible_mask, cpu_isolated_map);
5808         if (cpumask_empty(non_isolated_cpus))
5809                 cpumask_set_cpu(smp_processor_id(), non_isolated_cpus);
5810         mutex_unlock(&sched_domains_mutex);
5811
5812         /* Move init over to a non-isolated CPU */
5813         if (set_cpus_allowed_ptr(current, non_isolated_cpus) < 0)
5814                 BUG();
5815         sched_init_granularity();
5816         free_cpumask_var(non_isolated_cpus);
5817
5818         init_sched_rt_class();
5819         init_sched_dl_class();
5820
5821         sched_init_smt();
5822
5823         sched_smp_initialized = true;
5824 }
5825
5826 static int __init migration_init(void)
5827 {
5828         sched_rq_cpu_starting(smp_processor_id());
5829         return 0;
5830 }
5831 early_initcall(migration_init);
5832
5833 #else
5834 void __init sched_init_smp(void)
5835 {
5836         sched_init_granularity();
5837 }
5838 #endif /* CONFIG_SMP */
5839
5840 int in_sched_functions(unsigned long addr)
5841 {
5842         return in_lock_functions(addr) ||
5843                 (addr >= (unsigned long)__sched_text_start
5844                 && addr < (unsigned long)__sched_text_end);
5845 }
5846
5847 #ifdef CONFIG_CGROUP_SCHED
5848 /*
5849  * Default task group.
5850  * Every task in system belongs to this group at bootup.
5851  */
5852 struct task_group root_task_group;
5853 LIST_HEAD(task_groups);
5854
5855 /* Cacheline aligned slab cache for task_group */
5856 static struct kmem_cache *task_group_cache __read_mostly;
5857 #endif
5858
5859 DECLARE_PER_CPU(cpumask_var_t, load_balance_mask);
5860 DECLARE_PER_CPU(cpumask_var_t, select_idle_mask);
5861
5862 void __init sched_init(void)
5863 {
5864         int i, j;
5865         unsigned long alloc_size = 0, ptr;
5866
5867         sched_clock_init();
5868         wait_bit_init();
5869
5870 #ifdef CONFIG_FAIR_GROUP_SCHED
5871         alloc_size += 2 * nr_cpu_ids * sizeof(void **);
5872 #endif
5873 #ifdef CONFIG_RT_GROUP_SCHED
5874         alloc_size += 2 * nr_cpu_ids * sizeof(void **);
5875 #endif
5876         if (alloc_size) {
5877                 ptr = (unsigned long)kzalloc(alloc_size, GFP_NOWAIT);
5878
5879 #ifdef CONFIG_FAIR_GROUP_SCHED
5880                 root_task_group.se = (struct sched_entity **)ptr;
5881                 ptr += nr_cpu_ids * sizeof(void **);
5882
5883                 root_task_group.cfs_rq = (struct cfs_rq **)ptr;
5884                 ptr += nr_cpu_ids * sizeof(void **);
5885
5886 #endif /* CONFIG_FAIR_GROUP_SCHED */
5887 #ifdef CONFIG_RT_GROUP_SCHED
5888                 root_task_group.rt_se = (struct sched_rt_entity **)ptr;
5889                 ptr += nr_cpu_ids * sizeof(void **);
5890
5891                 root_task_group.rt_rq = (struct rt_rq **)ptr;
5892                 ptr += nr_cpu_ids * sizeof(void **);
5893
5894 #endif /* CONFIG_RT_GROUP_SCHED */
5895         }
5896 #ifdef CONFIG_CPUMASK_OFFSTACK
5897         for_each_possible_cpu(i) {
5898                 per_cpu(load_balance_mask, i) = (cpumask_var_t)kzalloc_node(
5899                         cpumask_size(), GFP_KERNEL, cpu_to_node(i));
5900                 per_cpu(select_idle_mask, i) = (cpumask_var_t)kzalloc_node(
5901                         cpumask_size(), GFP_KERNEL, cpu_to_node(i));
5902         }
5903 #endif /* CONFIG_CPUMASK_OFFSTACK */
5904
5905         init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime());
5906         init_dl_bandwidth(&def_dl_bandwidth, global_rt_period(), global_rt_runtime());
5907
5908 #ifdef CONFIG_SMP
5909         init_defrootdomain();
5910 #endif
5911
5912 #ifdef CONFIG_RT_GROUP_SCHED
5913         init_rt_bandwidth(&root_task_group.rt_bandwidth,
5914                         global_rt_period(), global_rt_runtime());
5915 #endif /* CONFIG_RT_GROUP_SCHED */
5916
5917 #ifdef CONFIG_CGROUP_SCHED
5918         task_group_cache = KMEM_CACHE(task_group, 0);
5919
5920         list_add(&root_task_group.list, &task_groups);
5921         INIT_LIST_HEAD(&root_task_group.children);
5922         INIT_LIST_HEAD(&root_task_group.siblings);
5923         autogroup_init(&init_task);
5924 #endif /* CONFIG_CGROUP_SCHED */
5925
5926         for_each_possible_cpu(i) {
5927                 struct rq *rq;
5928
5929                 rq = cpu_rq(i);
5930                 raw_spin_lock_init(&rq->lock);
5931                 rq->nr_running = 0;
5932                 rq->calc_load_active = 0;
5933                 rq->calc_load_update = jiffies + LOAD_FREQ;
5934                 init_cfs_rq(&rq->cfs);
5935                 init_rt_rq(&rq->rt);
5936                 init_dl_rq(&rq->dl);
5937 #ifdef CONFIG_FAIR_GROUP_SCHED
5938                 root_task_group.shares = ROOT_TASK_GROUP_LOAD;
5939                 INIT_LIST_HEAD(&rq->leaf_cfs_rq_list);
5940                 rq->tmp_alone_branch = &rq->leaf_cfs_rq_list;
5941                 /*
5942                  * How much CPU bandwidth does root_task_group get?
5943                  *
5944                  * In case of task-groups formed thr' the cgroup filesystem, it
5945                  * gets 100% of the CPU resources in the system. This overall
5946                  * system CPU resource is divided among the tasks of
5947                  * root_task_group and its child task-groups in a fair manner,
5948                  * based on each entity's (task or task-group's) weight
5949                  * (se->load.weight).
5950                  *
5951                  * In other words, if root_task_group has 10 tasks of weight
5952                  * 1024) and two child groups A0 and A1 (of weight 1024 each),
5953                  * then A0's share of the CPU resource is:
5954                  *
5955                  *      A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33%
5956                  *
5957                  * We achieve this by letting root_task_group's tasks sit
5958                  * directly in rq->cfs (i.e root_task_group->se[] = NULL).
5959                  */
5960                 init_cfs_bandwidth(&root_task_group.cfs_bandwidth);
5961                 init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL);
5962 #endif /* CONFIG_FAIR_GROUP_SCHED */
5963
5964                 rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime;
5965 #ifdef CONFIG_RT_GROUP_SCHED
5966                 init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL);
5967 #endif
5968
5969                 for (j = 0; j < CPU_LOAD_IDX_MAX; j++)
5970                         rq->cpu_load[j] = 0;
5971
5972 #ifdef CONFIG_SMP
5973                 rq->sd = NULL;
5974                 rq->rd = NULL;
5975                 rq->cpu_capacity = rq->cpu_capacity_orig = SCHED_CAPACITY_SCALE;
5976                 rq->balance_callback = NULL;
5977                 rq->active_balance = 0;
5978                 rq->next_balance = jiffies;
5979                 rq->push_cpu = 0;
5980                 rq->cpu = i;
5981                 rq->online = 0;
5982                 rq->idle_stamp = 0;
5983                 rq->avg_idle = 2*sysctl_sched_migration_cost;
5984                 rq->max_idle_balance_cost = sysctl_sched_migration_cost;
5985 #ifdef CONFIG_SCHED_WALT
5986                 rq->cur_irqload = 0;
5987                 rq->avg_irqload = 0;
5988                 rq->irqload_ts = 0;
5989 #endif
5990
5991                 INIT_LIST_HEAD(&rq->cfs_tasks);
5992
5993                 rq_attach_root(rq, &def_root_domain);
5994 #ifdef CONFIG_NO_HZ_COMMON
5995                 rq->last_load_update_tick = jiffies;
5996                 rq->last_blocked_load_update_tick = jiffies;
5997                 rq->nohz_flags = 0;
5998 #endif
5999 #ifdef CONFIG_NO_HZ_FULL
6000                 rq->last_sched_tick = 0;
6001 #endif
6002 #endif /* CONFIG_SMP */
6003                 init_rq_hrtick(rq);
6004                 atomic_set(&rq->nr_iowait, 0);
6005         }
6006
6007         set_load_weight(&init_task);
6008
6009         /*
6010          * The boot idle thread does lazy MMU switching as well:
6011          */
6012         mmgrab(&init_mm);
6013         enter_lazy_tlb(&init_mm, current);
6014
6015         /*
6016          * Make us the idle thread. Technically, schedule() should not be
6017          * called from this thread, however somewhere below it might be,
6018          * but because we are the idle thread, we just pick up running again
6019          * when this runqueue becomes "idle".
6020          */
6021         init_idle(current, smp_processor_id());
6022
6023         calc_load_update = jiffies + LOAD_FREQ;
6024
6025 #ifdef CONFIG_SMP
6026         /* May be allocated at isolcpus cmdline parse time */
6027         if (cpu_isolated_map == NULL)
6028                 zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
6029         idle_thread_set_boot_cpu();
6030         set_cpu_rq_start_time(smp_processor_id());
6031 #endif
6032         init_sched_fair_class();
6033
6034         init_schedstats();
6035
6036         scheduler_running = 1;
6037 }
6038
6039 #ifdef CONFIG_DEBUG_ATOMIC_SLEEP
6040 static inline int preempt_count_equals(int preempt_offset)
6041 {
6042         int nested = preempt_count() + rcu_preempt_depth();
6043
6044         return (nested == preempt_offset);
6045 }
6046
6047 void __might_sleep(const char *file, int line, int preempt_offset)
6048 {
6049         /*
6050          * Blocking primitives will set (and therefore destroy) current->state,
6051          * since we will exit with TASK_RUNNING make sure we enter with it,
6052          * otherwise we will destroy state.
6053          */
6054         WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change,
6055                         "do not call blocking ops when !TASK_RUNNING; "
6056                         "state=%lx set at [<%p>] %pS\n",
6057                         current->state,
6058                         (void *)current->task_state_change,
6059                         (void *)current->task_state_change);
6060
6061         ___might_sleep(file, line, preempt_offset);
6062 }
6063 EXPORT_SYMBOL(__might_sleep);
6064
6065 void ___might_sleep(const char *file, int line, int preempt_offset)
6066 {
6067         /* Ratelimiting timestamp: */
6068         static unsigned long prev_jiffy;
6069
6070         unsigned long preempt_disable_ip;
6071
6072         /* WARN_ON_ONCE() by default, no rate limit required: */
6073         rcu_sleep_check();
6074
6075         if ((preempt_count_equals(preempt_offset) && !irqs_disabled() &&
6076              !is_idle_task(current)) ||
6077             system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING ||
6078             oops_in_progress)
6079                 return;
6080
6081         if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy)
6082                 return;
6083         prev_jiffy = jiffies;
6084
6085         /* Save this before calling printk(), since that will clobber it: */
6086         preempt_disable_ip = get_preempt_disable_ip(current);
6087
6088         printk(KERN_ERR
6089                 "BUG: sleeping function called from invalid context at %s:%d\n",
6090                         file, line);
6091         printk(KERN_ERR
6092                 "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n",
6093                         in_atomic(), irqs_disabled(),
6094                         current->pid, current->comm);
6095
6096         if (task_stack_end_corrupted(current))
6097                 printk(KERN_EMERG "Thread overran stack, or stack corrupted\n");
6098
6099         debug_show_held_locks(current);
6100         if (irqs_disabled())
6101                 print_irqtrace_events(current);
6102         if (IS_ENABLED(CONFIG_DEBUG_PREEMPT)
6103             && !preempt_count_equals(preempt_offset)) {
6104                 pr_err("Preemption disabled at:");
6105                 print_ip_sym(preempt_disable_ip);
6106                 pr_cont("\n");
6107         }
6108         dump_stack();
6109         add_taint(TAINT_WARN, LOCKDEP_STILL_OK);
6110 }
6111 EXPORT_SYMBOL(___might_sleep);
6112 #endif
6113
6114 #ifdef CONFIG_MAGIC_SYSRQ
6115 void normalize_rt_tasks(void)
6116 {
6117         struct task_struct *g, *p;
6118         struct sched_attr attr = {
6119                 .sched_policy = SCHED_NORMAL,
6120         };
6121
6122         read_lock(&tasklist_lock);
6123         for_each_process_thread(g, p) {
6124                 /*
6125                  * Only normalize user tasks:
6126                  */
6127                 if (p->flags & PF_KTHREAD)
6128                         continue;
6129
6130                 p->se.exec_start = 0;
6131                 schedstat_set(p->se.statistics.wait_start,  0);
6132                 schedstat_set(p->se.statistics.sleep_start, 0);
6133                 schedstat_set(p->se.statistics.block_start, 0);
6134
6135                 if (!dl_task(p) && !rt_task(p)) {
6136                         /*
6137                          * Renice negative nice level userspace
6138                          * tasks back to 0:
6139                          */
6140                         if (task_nice(p) < 0)
6141                                 set_user_nice(p, 0);
6142                         continue;
6143                 }
6144
6145                 __sched_setscheduler(p, &attr, false, false);
6146         }
6147         read_unlock(&tasklist_lock);
6148 }
6149
6150 #endif /* CONFIG_MAGIC_SYSRQ */
6151
6152 #if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB)
6153 /*
6154  * These functions are only useful for the IA64 MCA handling, or kdb.
6155  *
6156  * They can only be called when the whole system has been
6157  * stopped - every CPU needs to be quiescent, and no scheduling
6158  * activity can take place. Using them for anything else would
6159  * be a serious bug, and as a result, they aren't even visible
6160  * under any other configuration.
6161  */
6162
6163 /**
6164  * curr_task - return the current task for a given CPU.
6165  * @cpu: the processor in question.
6166  *
6167  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6168  *
6169  * Return: The current task for @cpu.
6170  */
6171 struct task_struct *curr_task(int cpu)
6172 {
6173         return cpu_curr(cpu);
6174 }
6175
6176 #endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */
6177
6178 #ifdef CONFIG_IA64
6179 /**
6180  * set_curr_task - set the current task for a given CPU.
6181  * @cpu: the processor in question.
6182  * @p: the task pointer to set.
6183  *
6184  * Description: This function must only be used when non-maskable interrupts
6185  * are serviced on a separate stack. It allows the architecture to switch the
6186  * notion of the current task on a CPU in a non-blocking manner. This function
6187  * must be called with all CPU's synchronized, and interrupts disabled, the
6188  * and caller must save the original value of the current task (see
6189  * curr_task() above) and restore that value before reenabling interrupts and
6190  * re-starting the system.
6191  *
6192  * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED!
6193  */
6194 void ia64_set_curr_task(int cpu, struct task_struct *p)
6195 {
6196         cpu_curr(cpu) = p;
6197 }
6198
6199 #endif
6200
6201 #ifdef CONFIG_CGROUP_SCHED
6202 /* task_group_lock serializes the addition/removal of task groups */
6203 static DEFINE_SPINLOCK(task_group_lock);
6204
6205 static void sched_free_group(struct task_group *tg)
6206 {
6207         free_fair_sched_group(tg);
6208         free_rt_sched_group(tg);
6209         autogroup_free(tg);
6210         kmem_cache_free(task_group_cache, tg);
6211 }
6212
6213 /* allocate runqueue etc for a new task group */
6214 struct task_group *sched_create_group(struct task_group *parent)
6215 {
6216         struct task_group *tg;
6217
6218         tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO);
6219         if (!tg)
6220                 return ERR_PTR(-ENOMEM);
6221
6222         if (!alloc_fair_sched_group(tg, parent))
6223                 goto err;
6224
6225         if (!alloc_rt_sched_group(tg, parent))
6226                 goto err;
6227
6228         return tg;
6229
6230 err:
6231         sched_free_group(tg);
6232         return ERR_PTR(-ENOMEM);
6233 }
6234
6235 void sched_online_group(struct task_group *tg, struct task_group *parent)
6236 {
6237         unsigned long flags;
6238
6239         spin_lock_irqsave(&task_group_lock, flags);
6240         list_add_rcu(&tg->list, &task_groups);
6241
6242         /* Root should already exist: */
6243         WARN_ON(!parent);
6244
6245         tg->parent = parent;
6246         INIT_LIST_HEAD(&tg->children);
6247         list_add_rcu(&tg->siblings, &parent->children);
6248         spin_unlock_irqrestore(&task_group_lock, flags);
6249
6250         online_fair_sched_group(tg);
6251 }
6252
6253 /* rcu callback to free various structures associated with a task group */
6254 static void sched_free_group_rcu(struct rcu_head *rhp)
6255 {
6256         /* Now it should be safe to free those cfs_rqs: */
6257         sched_free_group(container_of(rhp, struct task_group, rcu));
6258 }
6259
6260 void sched_destroy_group(struct task_group *tg)
6261 {
6262         /* Wait for possible concurrent references to cfs_rqs complete: */
6263         call_rcu(&tg->rcu, sched_free_group_rcu);
6264 }
6265
6266 void sched_offline_group(struct task_group *tg)
6267 {
6268         unsigned long flags;
6269
6270         /* End participation in shares distribution: */
6271         unregister_fair_sched_group(tg);
6272
6273         spin_lock_irqsave(&task_group_lock, flags);
6274         list_del_rcu(&tg->list);
6275         list_del_rcu(&tg->siblings);
6276         spin_unlock_irqrestore(&task_group_lock, flags);
6277 }
6278
6279 static void sched_change_group(struct task_struct *tsk, int type)
6280 {
6281         struct task_group *tg;
6282
6283         /*
6284          * All callers are synchronized by task_rq_lock(); we do not use RCU
6285          * which is pointless here. Thus, we pass "true" to task_css_check()
6286          * to prevent lockdep warnings.
6287          */
6288         tg = container_of(task_css_check(tsk, cpu_cgrp_id, true),
6289                           struct task_group, css);
6290         tg = autogroup_task_group(tsk, tg);
6291         tsk->sched_task_group = tg;
6292
6293 #ifdef CONFIG_FAIR_GROUP_SCHED
6294         if (tsk->sched_class->task_change_group)
6295                 tsk->sched_class->task_change_group(tsk, type);
6296         else
6297 #endif
6298                 set_task_rq(tsk, task_cpu(tsk));
6299 }
6300
6301 /*
6302  * Change task's runqueue when it moves between groups.
6303  *
6304  * The caller of this function should have put the task in its new group by
6305  * now. This function just updates tsk->se.cfs_rq and tsk->se.parent to reflect
6306  * its new group.
6307  */
6308 void sched_move_task(struct task_struct *tsk)
6309 {
6310         int queued, running, queue_flags =
6311                 DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK;
6312         struct rq_flags rf;
6313         struct rq *rq;
6314
6315         rq = task_rq_lock(tsk, &rf);
6316         update_rq_clock(rq);
6317
6318         running = task_current(rq, tsk);
6319         queued = task_on_rq_queued(tsk);
6320
6321         if (queued)
6322                 dequeue_task(rq, tsk, queue_flags);
6323         if (running)
6324                 put_prev_task(rq, tsk);
6325
6326         sched_change_group(tsk, TASK_MOVE_GROUP);
6327
6328         if (queued)
6329                 enqueue_task(rq, tsk, queue_flags);
6330         if (running)
6331                 set_curr_task(rq, tsk);
6332
6333         task_rq_unlock(rq, tsk, &rf);
6334 }
6335
6336 static inline struct task_group *css_tg(struct cgroup_subsys_state *css)
6337 {
6338         return css ? container_of(css, struct task_group, css) : NULL;
6339 }
6340
6341 static struct cgroup_subsys_state *
6342 cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6343 {
6344         struct task_group *parent = css_tg(parent_css);
6345         struct task_group *tg;
6346
6347         if (!parent) {
6348                 /* This is early initialization for the top cgroup */
6349                 return &root_task_group.css;
6350         }
6351
6352         tg = sched_create_group(parent);
6353         if (IS_ERR(tg))
6354                 return ERR_PTR(-ENOMEM);
6355
6356         return &tg->css;
6357 }
6358
6359 /* Expose task group only after completing cgroup initialization */
6360 static int cpu_cgroup_css_online(struct cgroup_subsys_state *css)
6361 {
6362         struct task_group *tg = css_tg(css);
6363         struct task_group *parent = css_tg(css->parent);
6364
6365         if (parent)
6366                 sched_online_group(tg, parent);
6367         return 0;
6368 }
6369
6370 static void cpu_cgroup_css_released(struct cgroup_subsys_state *css)
6371 {
6372         struct task_group *tg = css_tg(css);
6373
6374         sched_offline_group(tg);
6375 }
6376
6377 static void cpu_cgroup_css_free(struct cgroup_subsys_state *css)
6378 {
6379         struct task_group *tg = css_tg(css);
6380
6381         /*
6382          * Relies on the RCU grace period between css_released() and this.
6383          */
6384         sched_free_group(tg);
6385 }
6386
6387 /*
6388  * This is called before wake_up_new_task(), therefore we really only
6389  * have to set its group bits, all the other stuff does not apply.
6390  */
6391 static void cpu_cgroup_fork(struct task_struct *task)
6392 {
6393         struct rq_flags rf;
6394         struct rq *rq;
6395
6396         rq = task_rq_lock(task, &rf);
6397
6398         update_rq_clock(rq);
6399         sched_change_group(task, TASK_SET_GROUP);
6400
6401         task_rq_unlock(rq, task, &rf);
6402 }
6403
6404 static int cpu_cgroup_can_attach(struct cgroup_taskset *tset)
6405 {
6406         struct task_struct *task;
6407         struct cgroup_subsys_state *css;
6408         int ret = 0;
6409
6410         cgroup_taskset_for_each(task, css, tset) {
6411 #ifdef CONFIG_RT_GROUP_SCHED
6412                 if (!sched_rt_can_attach(css_tg(css), task))
6413                         return -EINVAL;
6414 #else
6415                 /* We don't support RT-tasks being in separate groups */
6416                 if (task->sched_class != &fair_sched_class)
6417                         return -EINVAL;
6418 #endif
6419                 /*
6420                  * Serialize against wake_up_new_task() such that if its
6421                  * running, we're sure to observe its full state.
6422                  */
6423                 raw_spin_lock_irq(&task->pi_lock);
6424                 /*
6425                  * Avoid calling sched_move_task() before wake_up_new_task()
6426                  * has happened. This would lead to problems with PELT, due to
6427                  * move wanting to detach+attach while we're not attached yet.
6428                  */
6429                 if (task->state == TASK_NEW)
6430                         ret = -EINVAL;
6431                 raw_spin_unlock_irq(&task->pi_lock);
6432
6433                 if (ret)
6434                         break;
6435         }
6436         return ret;
6437 }
6438
6439 static void cpu_cgroup_attach(struct cgroup_taskset *tset)
6440 {
6441         struct task_struct *task;
6442         struct cgroup_subsys_state *css;
6443
6444         cgroup_taskset_for_each(task, css, tset)
6445                 sched_move_task(task);
6446 }
6447
6448 #ifdef CONFIG_FAIR_GROUP_SCHED
6449 static int cpu_shares_write_u64(struct cgroup_subsys_state *css,
6450                                 struct cftype *cftype, u64 shareval)
6451 {
6452         return sched_group_set_shares(css_tg(css), scale_load(shareval));
6453 }
6454
6455 static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css,
6456                                struct cftype *cft)
6457 {
6458         struct task_group *tg = css_tg(css);
6459
6460         return (u64) scale_load_down(tg->shares);
6461 }
6462
6463 #ifdef CONFIG_CFS_BANDWIDTH
6464 static DEFINE_MUTEX(cfs_constraints_mutex);
6465
6466 const u64 max_cfs_quota_period = 1 * NSEC_PER_SEC; /* 1s */
6467 const u64 min_cfs_quota_period = 1 * NSEC_PER_MSEC; /* 1ms */
6468
6469 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 runtime);
6470
6471 static int tg_set_cfs_bandwidth(struct task_group *tg, u64 period, u64 quota)
6472 {
6473         int i, ret = 0, runtime_enabled, runtime_was_enabled;
6474         struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6475
6476         if (tg == &root_task_group)
6477                 return -EINVAL;
6478
6479         /*
6480          * Ensure we have at some amount of bandwidth every period.  This is
6481          * to prevent reaching a state of large arrears when throttled via
6482          * entity_tick() resulting in prolonged exit starvation.
6483          */
6484         if (quota < min_cfs_quota_period || period < min_cfs_quota_period)
6485                 return -EINVAL;
6486
6487         /*
6488          * Likewise, bound things on the otherside by preventing insane quota
6489          * periods.  This also allows us to normalize in computing quota
6490          * feasibility.
6491          */
6492         if (period > max_cfs_quota_period)
6493                 return -EINVAL;
6494
6495         /*
6496          * Prevent race between setting of cfs_rq->runtime_enabled and
6497          * unthrottle_offline_cfs_rqs().
6498          */
6499         get_online_cpus();
6500         mutex_lock(&cfs_constraints_mutex);
6501         ret = __cfs_schedulable(tg, period, quota);
6502         if (ret)
6503                 goto out_unlock;
6504
6505         runtime_enabled = quota != RUNTIME_INF;
6506         runtime_was_enabled = cfs_b->quota != RUNTIME_INF;
6507         /*
6508          * If we need to toggle cfs_bandwidth_used, off->on must occur
6509          * before making related changes, and on->off must occur afterwards
6510          */
6511         if (runtime_enabled && !runtime_was_enabled)
6512                 cfs_bandwidth_usage_inc();
6513         raw_spin_lock_irq(&cfs_b->lock);
6514         cfs_b->period = ns_to_ktime(period);
6515         cfs_b->quota = quota;
6516
6517         __refill_cfs_bandwidth_runtime(cfs_b);
6518
6519         /* Restart the period timer (if active) to handle new period expiry: */
6520         if (runtime_enabled)
6521                 start_cfs_bandwidth(cfs_b);
6522
6523         raw_spin_unlock_irq(&cfs_b->lock);
6524
6525         for_each_online_cpu(i) {
6526                 struct cfs_rq *cfs_rq = tg->cfs_rq[i];
6527                 struct rq *rq = cfs_rq->rq;
6528                 struct rq_flags rf;
6529
6530                 rq_lock_irq(rq, &rf);
6531                 cfs_rq->runtime_enabled = runtime_enabled;
6532                 cfs_rq->runtime_remaining = 0;
6533
6534                 if (cfs_rq->throttled)
6535                         unthrottle_cfs_rq(cfs_rq);
6536                 rq_unlock_irq(rq, &rf);
6537         }
6538         if (runtime_was_enabled && !runtime_enabled)
6539                 cfs_bandwidth_usage_dec();
6540 out_unlock:
6541         mutex_unlock(&cfs_constraints_mutex);
6542         put_online_cpus();
6543
6544         return ret;
6545 }
6546
6547 int tg_set_cfs_quota(struct task_group *tg, long cfs_quota_us)
6548 {
6549         u64 quota, period;
6550
6551         period = ktime_to_ns(tg->cfs_bandwidth.period);
6552         if (cfs_quota_us < 0)
6553                 quota = RUNTIME_INF;
6554         else
6555                 quota = (u64)cfs_quota_us * NSEC_PER_USEC;
6556
6557         return tg_set_cfs_bandwidth(tg, period, quota);
6558 }
6559
6560 long tg_get_cfs_quota(struct task_group *tg)
6561 {
6562         u64 quota_us;
6563
6564         if (tg->cfs_bandwidth.quota == RUNTIME_INF)
6565                 return -1;
6566
6567         quota_us = tg->cfs_bandwidth.quota;
6568         do_div(quota_us, NSEC_PER_USEC);
6569
6570         return quota_us;
6571 }
6572
6573 int tg_set_cfs_period(struct task_group *tg, long cfs_period_us)
6574 {
6575         u64 quota, period;
6576
6577         period = (u64)cfs_period_us * NSEC_PER_USEC;
6578         quota = tg->cfs_bandwidth.quota;
6579
6580         return tg_set_cfs_bandwidth(tg, period, quota);
6581 }
6582
6583 long tg_get_cfs_period(struct task_group *tg)
6584 {
6585         u64 cfs_period_us;
6586
6587         cfs_period_us = ktime_to_ns(tg->cfs_bandwidth.period);
6588         do_div(cfs_period_us, NSEC_PER_USEC);
6589
6590         return cfs_period_us;
6591 }
6592
6593 static s64 cpu_cfs_quota_read_s64(struct cgroup_subsys_state *css,
6594                                   struct cftype *cft)
6595 {
6596         return tg_get_cfs_quota(css_tg(css));
6597 }
6598
6599 static int cpu_cfs_quota_write_s64(struct cgroup_subsys_state *css,
6600                                    struct cftype *cftype, s64 cfs_quota_us)
6601 {
6602         return tg_set_cfs_quota(css_tg(css), cfs_quota_us);
6603 }
6604
6605 static u64 cpu_cfs_period_read_u64(struct cgroup_subsys_state *css,
6606                                    struct cftype *cft)
6607 {
6608         return tg_get_cfs_period(css_tg(css));
6609 }
6610
6611 static int cpu_cfs_period_write_u64(struct cgroup_subsys_state *css,
6612                                     struct cftype *cftype, u64 cfs_period_us)
6613 {
6614         return tg_set_cfs_period(css_tg(css), cfs_period_us);
6615 }
6616
6617 struct cfs_schedulable_data {
6618         struct task_group *tg;
6619         u64 period, quota;
6620 };
6621
6622 /*
6623  * normalize group quota/period to be quota/max_period
6624  * note: units are usecs
6625  */
6626 static u64 normalize_cfs_quota(struct task_group *tg,
6627                                struct cfs_schedulable_data *d)
6628 {
6629         u64 quota, period;
6630
6631         if (tg == d->tg) {
6632                 period = d->period;
6633                 quota = d->quota;
6634         } else {
6635                 period = tg_get_cfs_period(tg);
6636                 quota = tg_get_cfs_quota(tg);
6637         }
6638
6639         /* note: these should typically be equivalent */
6640         if (quota == RUNTIME_INF || quota == -1)
6641                 return RUNTIME_INF;
6642
6643         return to_ratio(period, quota);
6644 }
6645
6646 static int tg_cfs_schedulable_down(struct task_group *tg, void *data)
6647 {
6648         struct cfs_schedulable_data *d = data;
6649         struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6650         s64 quota = 0, parent_quota = -1;
6651
6652         if (!tg->parent) {
6653                 quota = RUNTIME_INF;
6654         } else {
6655                 struct cfs_bandwidth *parent_b = &tg->parent->cfs_bandwidth;
6656
6657                 quota = normalize_cfs_quota(tg, d);
6658                 parent_quota = parent_b->hierarchical_quota;
6659
6660                 /*
6661                  * Ensure max(child_quota) <= parent_quota, inherit when no
6662                  * limit is set:
6663                  */
6664                 if (quota == RUNTIME_INF)
6665                         quota = parent_quota;
6666                 else if (parent_quota != RUNTIME_INF && quota > parent_quota)
6667                         return -EINVAL;
6668         }
6669         cfs_b->hierarchical_quota = quota;
6670
6671         return 0;
6672 }
6673
6674 static int __cfs_schedulable(struct task_group *tg, u64 period, u64 quota)
6675 {
6676         int ret;
6677         struct cfs_schedulable_data data = {
6678                 .tg = tg,
6679                 .period = period,
6680                 .quota = quota,
6681         };
6682
6683         if (quota != RUNTIME_INF) {
6684                 do_div(data.period, NSEC_PER_USEC);
6685                 do_div(data.quota, NSEC_PER_USEC);
6686         }
6687
6688         rcu_read_lock();
6689         ret = walk_tg_tree(tg_cfs_schedulable_down, tg_nop, &data);
6690         rcu_read_unlock();
6691
6692         return ret;
6693 }
6694
6695 static int cpu_stats_show(struct seq_file *sf, void *v)
6696 {
6697         struct task_group *tg = css_tg(seq_css(sf));
6698         struct cfs_bandwidth *cfs_b = &tg->cfs_bandwidth;
6699
6700         seq_printf(sf, "nr_periods %d\n", cfs_b->nr_periods);
6701         seq_printf(sf, "nr_throttled %d\n", cfs_b->nr_throttled);
6702         seq_printf(sf, "throttled_time %llu\n", cfs_b->throttled_time);
6703
6704         return 0;
6705 }
6706 #endif /* CONFIG_CFS_BANDWIDTH */
6707 #endif /* CONFIG_FAIR_GROUP_SCHED */
6708
6709 #ifdef CONFIG_RT_GROUP_SCHED
6710 static int cpu_rt_runtime_write(struct cgroup_subsys_state *css,
6711                                 struct cftype *cft, s64 val)
6712 {
6713         return sched_group_set_rt_runtime(css_tg(css), val);
6714 }
6715
6716 static s64 cpu_rt_runtime_read(struct cgroup_subsys_state *css,
6717                                struct cftype *cft)
6718 {
6719         return sched_group_rt_runtime(css_tg(css));
6720 }
6721
6722 static int cpu_rt_period_write_uint(struct cgroup_subsys_state *css,
6723                                     struct cftype *cftype, u64 rt_period_us)
6724 {
6725         return sched_group_set_rt_period(css_tg(css), rt_period_us);
6726 }
6727
6728 static u64 cpu_rt_period_read_uint(struct cgroup_subsys_state *css,
6729                                    struct cftype *cft)
6730 {
6731         return sched_group_rt_period(css_tg(css));
6732 }
6733 #endif /* CONFIG_RT_GROUP_SCHED */
6734
6735 static struct cftype cpu_files[] = {
6736 #ifdef CONFIG_FAIR_GROUP_SCHED
6737         {
6738                 .name = "shares",
6739                 .read_u64 = cpu_shares_read_u64,
6740                 .write_u64 = cpu_shares_write_u64,
6741         },
6742 #endif
6743 #ifdef CONFIG_CFS_BANDWIDTH
6744         {
6745                 .name = "cfs_quota_us",
6746                 .read_s64 = cpu_cfs_quota_read_s64,
6747                 .write_s64 = cpu_cfs_quota_write_s64,
6748         },
6749         {
6750                 .name = "cfs_period_us",
6751                 .read_u64 = cpu_cfs_period_read_u64,
6752                 .write_u64 = cpu_cfs_period_write_u64,
6753         },
6754         {
6755                 .name = "stat",
6756                 .seq_show = cpu_stats_show,
6757         },
6758 #endif
6759 #ifdef CONFIG_RT_GROUP_SCHED
6760         {
6761                 .name = "rt_runtime_us",
6762                 .read_s64 = cpu_rt_runtime_read,
6763                 .write_s64 = cpu_rt_runtime_write,
6764         },
6765         {
6766                 .name = "rt_period_us",
6767                 .read_u64 = cpu_rt_period_read_uint,
6768                 .write_u64 = cpu_rt_period_write_uint,
6769         },
6770 #endif
6771         { }     /* Terminate */
6772 };
6773
6774 struct cgroup_subsys cpu_cgrp_subsys = {
6775         .css_alloc      = cpu_cgroup_css_alloc,
6776         .css_online     = cpu_cgroup_css_online,
6777         .css_released   = cpu_cgroup_css_released,
6778         .css_free       = cpu_cgroup_css_free,
6779         .fork           = cpu_cgroup_fork,
6780         .can_attach     = cpu_cgroup_can_attach,
6781         .attach         = cpu_cgroup_attach,
6782         .legacy_cftypes = cpu_files,
6783         .early_init     = true,
6784 };
6785
6786 #endif  /* CONFIG_CGROUP_SCHED */
6787
6788 void dump_cpu_task(int cpu)
6789 {
6790         pr_info("Task dump for CPU %d:\n", cpu);
6791         sched_show_task(cpu_curr(cpu));
6792 }
6793
6794 /*
6795  * Nice levels are multiplicative, with a gentle 10% change for every
6796  * nice level changed. I.e. when a CPU-bound task goes from nice 0 to
6797  * nice 1, it will get ~10% less CPU time than another CPU-bound task
6798  * that remained on nice 0.
6799  *
6800  * The "10% effect" is relative and cumulative: from _any_ nice level,
6801  * if you go up 1 level, it's -10% CPU usage, if you go down 1 level
6802  * it's +10% CPU usage. (to achieve that we use a multiplier of 1.25.
6803  * If a task goes up by ~10% and another task goes down by ~10% then
6804  * the relative distance between them is ~25%.)
6805  */
6806 const int sched_prio_to_weight[40] = {
6807  /* -20 */     88761,     71755,     56483,     46273,     36291,
6808  /* -15 */     29154,     23254,     18705,     14949,     11916,
6809  /* -10 */      9548,      7620,      6100,      4904,      3906,
6810  /*  -5 */      3121,      2501,      1991,      1586,      1277,
6811  /*   0 */      1024,       820,       655,       526,       423,
6812  /*   5 */       335,       272,       215,       172,       137,
6813  /*  10 */       110,        87,        70,        56,        45,
6814  /*  15 */        36,        29,        23,        18,        15,
6815 };
6816
6817 /*
6818  * Inverse (2^32/x) values of the sched_prio_to_weight[] array, precalculated.
6819  *
6820  * In cases where the weight does not change often, we can use the
6821  * precalculated inverse to speed up arithmetics by turning divisions
6822  * into multiplications:
6823  */
6824 const u32 sched_prio_to_wmult[40] = {
6825  /* -20 */     48388,     59856,     76040,     92818,    118348,
6826  /* -15 */    147320,    184698,    229616,    287308,    360437,
6827  /* -10 */    449829,    563644,    704093,    875809,   1099582,
6828  /*  -5 */   1376151,   1717300,   2157191,   2708050,   3363326,
6829  /*   0 */   4194304,   5237765,   6557202,   8165337,  10153587,
6830  /*   5 */  12820798,  15790321,  19976592,  24970740,  31350126,
6831  /*  10 */  39045157,  49367440,  61356676,  76695844,  95443717,
6832  /*  15 */ 119304647, 148102320, 186737708, 238609294, 286331153,
6833 };