kernel/sched/fair.c

   1 /*
   2  * Completely Fair Scheduling (CFS) Class (SCHED_NORMAL/SCHED_BATCH)
   3  *
   4  *  Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
   5  *
   6  *  Interactivity improvements by Mike Galbraith
   7  *  (C) 2007 Mike Galbraith <efault@gmx.de>
   8  *
   9  *  Various enhancements by Dmitry Adamushko.
  10  *  (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
  11  *
  12  *  Group scheduling enhancements by Srivatsa Vaddagiri
  13  *  Copyright IBM Corporation, 2007
  14  *  Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
  15  *
  16  *  Scaled math optimizations by Thomas Gleixner
  17  *  Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
  18  *
  19  *  Adaptive scheduling granularity, math enhancements by Peter Zijlstra
  20  *  Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
  21  */
  22
  23 #include <linux/latencytop.h>
  24 #include <linux/sched.h>
  25 #include <linux/cpumask.h>
  26 #include <linux/slab.h>
  27 #include <linux/profile.h>
  28 #include <linux/interrupt.h>
  29 #include <linux/mempolicy.h>
  30 #include <linux/migrate.h>
  31 #include <linux/task_work.h>
  32
  33 #include <trace/events/sched.h>
  34 #ifdef CONFIG_HMP_VARIABLE_SCALE
  35 #include <linux/sysfs.h>
  36 #include <linux/vmalloc.h>
  37 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
  38 /* Include cpufreq header to add a notifier so that cpu frequency
  39  * scaling can track the current CPU frequency
  40  */
  41 #include <linux/cpufreq.h>
  42 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
  43 #endif /* CONFIG_HMP_VARIABLE_SCALE */
  44
  45 #include "sched.h"
  46
  47 #include <mtlbprof/mtlbprof.h>
  48
  49
  50 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
  51 #ifdef CONFIG_LOCAL_TIMERS
  52 unsigned long localtimer_get_counter(void);
  53 #endif
  54 #endif
  55
  56 #ifdef CONFIG_HEVTASK_INTERFACE
  57 #include <linux/proc_fs.h>
  58 #include <linux/seq_file.h>
  59 #ifdef CONFIG_KGDB_KDB
  60 #include <linux/kdb.h>
  61 #endif
  62 #endif
  63
  64 /*
  65  * Targeted preemption latency for CPU-bound tasks:
  66  * (default: 6ms * (1 + ilog(ncpus)), units: nanoseconds)
  67  *
  68  * NOTE: this latency value is not the same as the concept of
  69  * 'timeslice length' - timeslices in CFS are of variable length
  70  * and have no persistent notion like in traditional, time-slice
  71  * based scheduling concepts.
  72  *
  73  * (to see the precise effective timeslice length of your workload,
  74  *  run vmstat and monitor the context-switches (cs) field)
  75  */
  76 unsigned int sysctl_sched_latency = 6000000ULL;
  77 unsigned int normalized_sysctl_sched_latency = 6000000ULL;
  78
  79 /*
  80  * The initial- and re-scaling of tunables is configurable
  81  * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus))
  82  *
  83  * Options are:
  84  * SCHED_TUNABLESCALING_NONE - unscaled, always *1
  85  * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus)
  86  * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus
  87  */
  88 enum sched_tunable_scaling sysctl_sched_tunable_scaling
  89         = SCHED_TUNABLESCALING_LOG;
  90
  91 /*
  92  * Minimal preemption granularity for CPU-bound tasks:
  93  * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds)
  94  */
  95 unsigned int sysctl_sched_min_granularity = 750000ULL;
  96 unsigned int normalized_sysctl_sched_min_granularity = 750000ULL;
  97
  98 /*
  99  * is kept at sysctl_sched_latency / sysctl_sched_min_granularity
 100  */
 101 static unsigned int sched_nr_latency = 8;
 102
 103 /*
 104  * After fork, child runs first. If set to 0 (default) then
 105  * parent will (try to) run first.
 106  */
 107 unsigned int sysctl_sched_child_runs_first __read_mostly;
 108
 109 /*
 110  * SCHED_OTHER wake-up granularity.
 111  * (default: 1 msec * (1 + ilog(ncpus)), units: nanoseconds)
 112  *
 113  * This option delays the preemption effects of decoupled workloads
 114  * and reduces their over-scheduling. Synchronous workloads will still
 115  * have immediate wakeup/sleep latencies.
 116  */
 117 unsigned int sysctl_sched_wakeup_granularity = 1000000UL;
 118 unsigned int normalized_sysctl_sched_wakeup_granularity = 1000000UL;
 119
 120 const_debug unsigned int sysctl_sched_migration_cost = 100000UL;
 121
 122 /*
 123  * The exponential sliding  window over which load is averaged for shares
 124  * distribution.
 125  * (default: 10msec)
 126  */
 127 unsigned int __read_mostly sysctl_sched_shares_window = 10000000UL;
 128
 129 #ifdef CONFIG_CFS_BANDWIDTH
 130 /*
 131  * Amount of runtime to allocate from global (tg) to local (per-cfs_rq) pool
 132  * each time a cfs_rq requests quota.
 133  *
 134  * Note: in the case that the slice exceeds the runtime remaining (either due
 135  * to consumption or the quota being specified to be smaller than the slice)
 136  * we will always only issue the remaining available time.
 137  *
 138  * default: 5 msec, units: microseconds
 139   */
 140 unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL;
 141 #endif
 142 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
 143 static int need_lazy_balance(int dst_cpu, int src_cpu, struct task_struct *p);
 144 #endif
 145
 146 /*
 147  * Increase the granularity value when there are more CPUs,
 148  * because with more CPUs the 'effective latency' as visible
 149  * to users decreases. But the relationship is not linear,
 150  * so pick a second-best guess by going with the log2 of the
 151  * number of CPUs.
 152  *
 153  * This idea comes from the SD scheduler of Con Kolivas:
 154  */
 155 static int get_update_sysctl_factor(void)
 156 {
 157         unsigned int cpus = min_t(int, num_online_cpus(), 8);
 158         unsigned int factor;
 159
 160         switch (sysctl_sched_tunable_scaling) {
 161         case SCHED_TUNABLESCALING_NONE:
 162                 factor = 1;
 163                 break;
 164         case SCHED_TUNABLESCALING_LINEAR:
 165                 factor = cpus;
 166                 break;
 167         case SCHED_TUNABLESCALING_LOG:
 168         default:
 169                 factor = 1 + ilog2(cpus);
 170                 break;
 171         }
 172
 173         return factor;
 174 }
 175
 176 static void update_sysctl(void)
 177 {
 178         unsigned int factor = get_update_sysctl_factor();
 179
 180 #define SET_SYSCTL(name) \
 181         (sysctl_##name = (factor) * normalized_sysctl_##name)
 182         SET_SYSCTL(sched_min_granularity);
 183         SET_SYSCTL(sched_latency);
 184         SET_SYSCTL(sched_wakeup_granularity);
 185 #undef SET_SYSCTL
 186 }
 187
 188 void sched_init_granularity(void)
 189 {
 190         update_sysctl();
 191 }
 192 #if defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK) || defined (CONFIG_HMP_PACK_SMALL_TASK)
 193 /*
 194  * Save the id of the optimal CPU that should be used to pack small tasks
 195  * The value -1 is used when no buddy has been found
 196  */
 197 DEFINE_PER_CPU(int, sd_pack_buddy) = {-1};
 198
 199 #ifdef CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK
 200 struct cpumask buddy_cpu_map = {{0}};
 201 #endif
 202
 203 /* Look for the best buddy CPU that can be used to pack small tasks
 204  * We make the assumption that it doesn't wort to pack on CPU that share the
 205  * same powerline. We looks for the 1st sched_domain without the
 206  * SD_SHARE_POWERLINE flag. Then We look for the sched_group witht the lowest
 207  * power per core based on the assumption that their power efficiency is
 208  * better */
 209 void update_packing_domain(int cpu)
 210 {
 211         struct sched_domain *sd;
 212         int id = -1;
 213
 214 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
 215         pr_info("[PACK] update_packing_domain() CPU%d\n", cpu);
 216 #endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */
 217         mt_sched_printf("[PACK] update_packing_domain() CPU%d", cpu);
 218
 219         sd = highest_flag_domain(cpu, SD_SHARE_POWERLINE);
 220         if (!sd)
 221         {
 222                 sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
 223         }
 224         else
 225                 if (cpumask_first(sched_domain_span(sd)) == cpu || !sd->parent)
 226                         sd = sd->parent;
 227
 228         while (sd) {
 229                 struct sched_group *sg = sd->groups;
 230                 struct sched_group *pack = sg;
 231                 struct sched_group *tmp = sg->next;
 232
 233 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
 234                 pr_info("[PACK]  sd = 0x%08x, flags = %d\n", (unsigned int)sd, sd->flags);
 235 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
 236
 237 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
 238                 pr_info("[PACK]  sg = 0x%08x\n", (unsigned int)sg);
 239 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
 240
 241                 /* 1st CPU of the sched domain is a good candidate */
 242                 if (id == -1)
 243                         id = cpumask_first(sched_domain_span(sd));
 244
 245 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
 246                 pr_info("[PACK]  First cpu in this sd id = %d\n", id);
 247 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
 248
 249                 /* Find sched group of candidate */
 250                 tmp = sd->groups;
 251                 do {
 252                         if (cpumask_test_cpu(id, sched_group_cpus(tmp))) {
 253                                 sg = tmp;
 254                                 break;
 255                         }
 256                 } while (tmp = tmp->next, tmp != sd->groups);
 257
 258 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
 259                 pr_info("[PACK]  pack = 0x%08x\n", (unsigned int)sg);
 260 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
 261
 262                 pack = sg;
 263                 tmp = sg->next;
 264
 265                 /* loop the sched groups to find the best one */
 266                 //Stop find the best one in the same Load Balance Domain
 267                 //while (tmp != sg) {
 268                 while (tmp != sg && !(sd->flags & SD_LOAD_BALANCE)) {
 269                         if (tmp->sgp->power * sg->group_weight <
 270                                         sg->sgp->power * tmp->group_weight) {
 271
 272 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
 273                                 pr_info("[PACK]  Now sg power = %u, weight = %u, mask = %lu\n", sg->sgp->power, sg->group_weight, sg->cpumask[0]);
 274                                 pr_info("[PACK]  Better sg power = %u, weight = %u, mask = %lu\n", tmp->sgp->power, tmp->group_weight, tmp->cpumask[0]);
 275 #endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */
 276
 277                                 pack = tmp;
 278                         }
 279                         tmp = tmp->next;
 280                 }
 281
 282                 /* we have found a better group */
 283                 if (pack != sg) {
 284                         id = cpumask_first(sched_group_cpus(pack));
 285
 286 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
 287                         pr_info("[PACK]  Better sg, first cpu id = %d\n", id);
 288 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
 289
 290                 }
 291
 292 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
 293                 if(sd->parent) {
 294                         pr_info("[PACK]  cpu = %d, id = %d, sd->parent = 0x%08x, flags = %d, SD_LOAD_BALANCE = %d\n", cpu, id, (unsigned int)sd->parent, sd->parent->flags, SD_LOAD_BALANCE);
 295                         pr_info("[PACK]  %d\n", (id != cpu));
 296                         pr_info("[PACK]  0x%08x\n", (unsigned int)(sd->parent));
 297                         pr_info("[PACK]  %d\n", (sd->parent->flags & SD_LOAD_BALANCE));
 298                 }
 299                 else {
 300                         pr_info("[PACK]  cpu = %d, id = %d, sd->parent = 0x%08x\n", cpu, id, (unsigned int)sd->parent);
 301                 }
 302 #endif /* CONFIG_HMP_PACK_BUDDY_INFO */
 303
 304
 305                 /* Look for another CPU than itself */
 306                 if ((id != cpu) ||
 307                                 ((sd->parent) && (sd->parent->flags & SD_LOAD_BALANCE))) {
 308
 309 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
 310                         pr_info("[PACK]  Break\n");
 311 #endif /*CONFIG_HMP_PACK_BUDDY_INFO */
 312
 313                         break;
 314                 }
 315                 sd = sd->parent;
 316         }
 317
 318 #ifdef CONFIG_HMP_PACK_BUDDY_INFO
 319         pr_info("[PACK] CPU%d packing on CPU%d\n", cpu, id);
 320 #endif /* CONFIG_MTK_SCHED_CMP_PACK_BUDDY_INFO || CONFIG_HMP_PACK_BUDDY_INFO */
 321         mt_sched_printf("[PACK] CPU%d packing on CPU%d", cpu, id);
 322
 323 #ifdef CONFIG_HMP_PACK_SMALL_TASK
 324         per_cpu(sd_pack_buddy, cpu) = id;
 325 #else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */
 326         if(per_cpu(sd_pack_buddy, cpu) != -1)
 327                 cpu_clear(per_cpu(sd_pack_buddy, cpu), buddy_cpu_map);
 328         per_cpu(sd_pack_buddy, cpu) = id;
 329         if(id != -1)
 330                 cpumask_set_cpu(id, &buddy_cpu_map);
 331 #endif
 332 }
 333
 334 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
 335 DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_USAGE);
 336 DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_PERIOD);
 337 DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_NR);
 338 DEFINE_PER_CPU(u32, TASK_USGAE);
 339 DEFINE_PER_CPU(u32, TASK_PERIOD);
 340 u32 PACK_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
 341 u32 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
 342 u32 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
 343 u32 TASK_PACK_CPU_COUNT[4][NR_CPUS] = {{0}};
 344 u32 PA_ENABLE = 1;
 345 u32 PA_MON_ENABLE = 0;
 346 char PA_MON[4][TASK_COMM_LEN]={{0}};
 347 #endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER */
 348
 349 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
 350 DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_USAGE);
 351 DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_PERIOD);
 352 DEFINE_PER_CPU(u32, BUDDY_CPU_RQ_NR);
 353 DEFINE_PER_CPU(u32, TASK_USGAE);
 354 DEFINE_PER_CPU(u32, TASK_PERIOD);
 355 u32 PACK_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
 356 u32 AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
 357 u32 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
 358 u32 HMP_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
 359 u32 PA_ENABLE = 1;
 360 u32 LB_ENABLE = 1;
 361 u32 PA_MON_ENABLE = 0;
 362 char PA_MON[TASK_COMM_LEN];
 363
 364 #ifdef CONFIG_HMP_TRACER
 365 #define POWER_AWARE_ACTIVE_MODULE_PACK_FORM_CPUX_TO_CPUY (0)
 366 #define POWER_AWARE_ACTIVE_MODULE_AVOID_WAKE_UP_FORM_CPUX_TO_CPUY (1)
 367 #define POWER_AWARE_ACTIVE_MODULE_AVOID_BALANCE_FORM_CPUX_TO_CPUY (2)
 368 #define POWER_AWARE_ACTIVE_MODULE_AVOID_FORCE_UP_FORM_CPUX_TO_CPUY (3)
 369 #endif /* CONFIG_HMP_TRACER */
 370
 371 #endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER */
 372
 373
 374 static inline bool is_buddy_busy(int cpu)
 375 {
 376 #ifdef CONFIG_HMP_PACK_SMALL_TASK
 377         struct rq *rq;
 378
 379     if (cpu < 0)
 380         return 0;
 381
 382     rq = cpu_rq(cpu);
 383 #else  /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */
 384                 struct rq *rq = cpu_rq(cpu);
 385 #endif
 386         /*
 387          * A busy buddy is a CPU with a high load or a small load with a lot of
 388          * running tasks.
 389          */
 390
 391 #if defined (CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER)
 392         per_cpu(BUDDY_CPU_RQ_USAGE, cpu) = rq->avg.usage_avg_sum;
 393         per_cpu(BUDDY_CPU_RQ_PERIOD, cpu) = rq->avg.runnable_avg_period;
 394         per_cpu(BUDDY_CPU_RQ_NR, cpu) = rq->nr_running;
 395 #endif /*(CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER) */
 396
 397         return ((rq->avg.usage_avg_sum << rq->nr_running) >
 398                         rq->avg.runnable_avg_period);
 399
 400 }
 401
 402 static inline bool is_light_task(struct task_struct *p)
 403 {
 404 #if defined (CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER) || defined (CONFIG_HMP_POWER_AWARE_CONTROLLER)
 405         per_cpu(TASK_USGAE, task_cpu(p)) = p->se.avg.usage_avg_sum;
 406         per_cpu(TASK_PERIOD, task_cpu(p)) = p->se.avg.runnable_avg_period;
 407 #endif /* CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER || CONFIG_HMP_POWER_AWARE_CONTROLLER*/
 408
 409         /* A light task runs less than 25% in average */
 410         return ((p->se.avg.usage_avg_sum << 2) < p->se.avg.runnable_avg_period);
 411 }
 412
 413
 414 static int check_pack_buddy(int cpu, struct task_struct *p)
 415 {
 416 #ifdef CONFIG_HMP_PACK_SMALL_TASK
 417         int buddy;
 418
 419         if(cpu >= NR_CPUS || cpu < 0)
 420                 return false;
 421         buddy = per_cpu(sd_pack_buddy, cpu);
 422 #else /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK */
 423         int buddy = cpu;
 424 #endif
 425
 426         /* No pack buddy for this CPU */
 427         if (buddy == -1)
 428                 return false;
 429
 430         /*
 431          * If a task is waiting for running on the CPU which is its own buddy,
 432          * let the default behavior to look for a better CPU if available
 433          * The threshold has been set to 37.5%
 434          */
 435 #ifdef CONFIG_HMP_PACK_SMALL_TASK
 436         if ((buddy == cpu)
 437          && ((p->se.avg.usage_avg_sum << 3) < (p->se.avg.runnable_avg_sum * 5)))
 438                 return false;
 439 #endif
 440
 441         /* buddy is not an allowed CPU */
 442         if (!cpumask_test_cpu(buddy, tsk_cpus_allowed(p)))
 443                 return false;
 444
 445         /*
 446          * If the task is a small one and the buddy is not overloaded,
 447          * we use buddy cpu
 448          */
 449          if (!is_light_task(p) || is_buddy_busy(buddy))
 450                 return false;
 451
 452         return true;
 453 }
 454 #endif /* CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK || CONFIG_HMP_PACK_SMALL_TASK*/
 455
 456 #if BITS_PER_LONG == 32
 457 # define WMULT_CONST    (~0UL)
 458 #else
 459 # define WMULT_CONST    (1UL << 32)
 460 #endif
 461
 462 #define WMULT_SHIFT     32
 463
 464 /*
 465  * Shift right and round:
 466  */
 467 #define SRR(x, y) (((x) + (1UL << ((y) - 1))) >> (y))
 468
 469 /*
 470  * delta *= weight / lw
 471  */
 472 static unsigned long
 473 calc_delta_mine(unsigned long delta_exec, unsigned long weight,
 474                 struct load_weight *lw)
 475 {
 476         u64 tmp;
 477
 478         /*
 479          * weight can be less than 2^SCHED_LOAD_RESOLUTION for task group sched
 480          * entities since MIN_SHARES = 2. Treat weight as 1 if less than
 481          * 2^SCHED_LOAD_RESOLUTION.
 482          */
 483         if (likely(weight > (1UL << SCHED_LOAD_RESOLUTION)))
 484                 tmp = (u64)delta_exec * scale_load_down(weight);
 485         else
 486                 tmp = (u64)delta_exec;
 487
 488         if (!lw->inv_weight) {
 489                 unsigned long w = scale_load_down(lw->weight);
 490
 491                 if (BITS_PER_LONG > 32 && unlikely(w >= WMULT_CONST))
 492                         lw->inv_weight = 1;
 493                 else if (unlikely(!w))
 494                         lw->inv_weight = WMULT_CONST;
 495                 else
 496                         lw->inv_weight = WMULT_CONST / w;
 497         }
 498
 499         /*
 500          * Check whether we'd overflow the 64-bit multiplication:
 501          */
 502         if (unlikely(tmp > WMULT_CONST))
 503                 tmp = SRR(SRR(tmp, WMULT_SHIFT/2) * lw->inv_weight,
 504                         WMULT_SHIFT/2);
 505         else
 506                 tmp = SRR(tmp * lw->inv_weight, WMULT_SHIFT);
 507
 508         return (unsigned long)min(tmp, (u64)(unsigned long)LONG_MAX);
 509 }
 510
 511
 512 const struct sched_class fair_sched_class;
 513
 514 /**************************************************************
 515  * CFS operations on generic schedulable entities:
 516  */
 517
 518 #ifdef CONFIG_FAIR_GROUP_SCHED
 519
 520 /* cpu runqueue to which this cfs_rq is attached */
 521 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 522 {
 523         return cfs_rq->rq;
 524 }
 525
 526 /* An entity is a task if it doesn't "own" a runqueue */
 527 #define entity_is_task(se)      (!se->my_q)
 528
 529 static inline struct task_struct *task_of(struct sched_entity *se)
 530 {
 531 #ifdef CONFIG_SCHED_DEBUG
 532         WARN_ON_ONCE(!entity_is_task(se));
 533 #endif
 534         return container_of(se, struct task_struct, se);
 535 }
 536
 537 /* Walk up scheduling entities hierarchy */
 538 #define for_each_sched_entity(se) \
 539                 for (; se; se = se->parent)
 540
 541 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 542 {
 543         return p->se.cfs_rq;
 544 }
 545
 546 /* runqueue on which this entity is (to be) queued */
 547 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 548 {
 549         return se->cfs_rq;
 550 }
 551
 552 /* runqueue "owned" by this group */
 553 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 554 {
 555         return grp->my_q;
 556 }
 557
 558 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
 559                                        int force_update);
 560
 561 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 562 {
 563         if (!cfs_rq->on_list) {
 564                 /*
 565                  * Ensure we either appear before our parent (if already
 566                  * enqueued) or force our parent to appear after us when it is
 567                  * enqueued.  The fact that we always enqueue bottom-up
 568                  * reduces this to two cases.
 569                  */
 570                 if (cfs_rq->tg->parent &&
 571                     cfs_rq->tg->parent->cfs_rq[cpu_of(rq_of(cfs_rq))]->on_list) {
 572                         list_add_rcu(&cfs_rq->leaf_cfs_rq_list,
 573                                 &rq_of(cfs_rq)->leaf_cfs_rq_list);
 574                 } else {
 575                         list_add_tail_rcu(&cfs_rq->leaf_cfs_rq_list,
 576                                 &rq_of(cfs_rq)->leaf_cfs_rq_list);
 577                 }
 578
 579                 cfs_rq->on_list = 1;
 580                 /* We should have no load, but we need to update last_decay. */
 581                 update_cfs_rq_blocked_load(cfs_rq, 0);
 582         }
 583 }
 584
 585 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 586 {
 587         if (cfs_rq->on_list) {
 588                 list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
 589                 cfs_rq->on_list = 0;
 590         }
 591 }
 592
 593 /* Iterate thr' all leaf cfs_rq's on a runqueue */
 594 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 595         list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
 596
 597 /* Do the two (enqueued) entities belong to the same group ? */
 598 static inline int
 599 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 600 {
 601         if (se && pse)
 602         {
 603                 if (se->cfs_rq == pse->cfs_rq)
 604                         return 1;
 605         }
 606
 607         return 0;
 608 }
 609
 610 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 611 {
 612         return se->parent;
 613 }
 614
 615 /* return depth at which a sched entity is present in the hierarchy */
 616 static inline int depth_se(struct sched_entity *se)
 617 {
 618         int depth = 0;
 619
 620         for_each_sched_entity(se)
 621                 depth++;
 622
 623         return depth;
 624 }
 625
 626 static void
 627 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 628 {
 629         int se_depth, pse_depth;
 630
 631         /*
 632          * preemption test can be made between sibling entities who are in the
 633          * same cfs_rq i.e who have a common parent. Walk up the hierarchy of
 634          * both tasks until we find their ancestors who are siblings of common
 635          * parent.
 636          */
 637
 638         /* First walk up until both entities are at same depth */
 639         se_depth = depth_se(*se);
 640         pse_depth = depth_se(*pse);
 641
 642         while (se_depth > pse_depth) {
 643                 se_depth--;
 644                 *se = parent_entity(*se);
 645         }
 646
 647         while (pse_depth > se_depth) {
 648                 pse_depth--;
 649                 *pse = parent_entity(*pse);
 650         }
 651
 652         while (!is_same_group(*se, *pse)) {
 653                 *se = parent_entity(*se);
 654                 *pse = parent_entity(*pse);
 655         }
 656 }
 657
 658 #else   /* !CONFIG_FAIR_GROUP_SCHED */
 659
 660 static inline struct task_struct *task_of(struct sched_entity *se)
 661 {
 662         return container_of(se, struct task_struct, se);
 663 }
 664
 665 static inline struct rq *rq_of(struct cfs_rq *cfs_rq)
 666 {
 667         return container_of(cfs_rq, struct rq, cfs);
 668 }
 669
 670 #define entity_is_task(se)      1
 671
 672 #define for_each_sched_entity(se) \
 673                 for (; se; se = NULL)
 674
 675 static inline struct cfs_rq *task_cfs_rq(struct task_struct *p)
 676 {
 677         return &task_rq(p)->cfs;
 678 }
 679
 680 static inline struct cfs_rq *cfs_rq_of(struct sched_entity *se)
 681 {
 682         struct task_struct *p = task_of(se);
 683         struct rq *rq = task_rq(p);
 684
 685         return &rq->cfs;
 686 }
 687
 688 /* runqueue "owned" by this group */
 689 static inline struct cfs_rq *group_cfs_rq(struct sched_entity *grp)
 690 {
 691         return NULL;
 692 }
 693
 694 static inline void list_add_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 695 {
 696 }
 697
 698 static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
 699 {
 700 }
 701
 702 #define for_each_leaf_cfs_rq(rq, cfs_rq) \
 703                 for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
 704
 705 static inline int
 706 is_same_group(struct sched_entity *se, struct sched_entity *pse)
 707 {
 708         return 1;
 709 }
 710
 711 static inline struct sched_entity *parent_entity(struct sched_entity *se)
 712 {
 713         return NULL;
 714 }
 715
 716 static inline void
 717 find_matching_se(struct sched_entity **se, struct sched_entity **pse)
 718 {
 719 }
 720
 721 #endif  /* CONFIG_FAIR_GROUP_SCHED */
 722
 723 static __always_inline
 724 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec);
 725
 726 /**************************************************************
 727  * Scheduling class tree data structure manipulation methods:
 728  */
 729
 730 static inline u64 max_vruntime(u64 max_vruntime, u64 vruntime)
 731 {
 732         s64 delta = (s64)(vruntime - max_vruntime);
 733         if (delta > 0)
 734                 max_vruntime = vruntime;
 735
 736         return max_vruntime;
 737 }
 738
 739 static inline u64 min_vruntime(u64 min_vruntime, u64 vruntime)
 740 {
 741         s64 delta = (s64)(vruntime - min_vruntime);
 742         if (delta < 0)
 743                 min_vruntime = vruntime;
 744
 745         return min_vruntime;
 746 }
 747
 748 static inline int entity_before(struct sched_entity *a,
 749                                 struct sched_entity *b)
 750 {
 751         return (s64)(a->vruntime - b->vruntime) < 0;
 752 }
 753
 754 static void update_min_vruntime(struct cfs_rq *cfs_rq)
 755 {
 756         u64 vruntime = cfs_rq->min_vruntime;
 757
 758         if (cfs_rq->curr)
 759                 vruntime = cfs_rq->curr->vruntime;
 760
 761         if (cfs_rq->rb_leftmost) {
 762                 struct sched_entity *se = rb_entry(cfs_rq->rb_leftmost,
 763                                                    struct sched_entity,
 764                                                    run_node);
 765
 766                 if (!cfs_rq->curr)
 767                         vruntime = se->vruntime;
 768                 else
 769                         vruntime = min_vruntime(vruntime, se->vruntime);
 770         }
 771
 772         /* ensure we never gain time by being placed backwards. */
 773         cfs_rq->min_vruntime = max_vruntime(cfs_rq->min_vruntime, vruntime);
 774 #ifndef CONFIG_64BIT
 775         smp_wmb();
 776         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
 777 #endif
 778 }
 779
 780 /*
 781  * Enqueue an entity into the rb-tree:
 782  */
 783 static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 784 {
 785         struct rb_node **link = &cfs_rq->tasks_timeline.rb_node;
 786         struct rb_node *parent = NULL;
 787         struct sched_entity *entry;
 788         int leftmost = 1;
 789
 790         /*
 791          * Find the right place in the rbtree:
 792          */
 793         while (*link) {
 794                 parent = *link;
 795                 entry = rb_entry(parent, struct sched_entity, run_node);
 796                 /*
 797                  * We dont care about collisions. Nodes with
 798                  * the same key stay together.
 799                  */
 800                 if (entity_before(se, entry)) {
 801                         link = &parent->rb_left;
 802                 } else {
 803                         link = &parent->rb_right;
 804                         leftmost = 0;
 805                 }
 806         }
 807
 808         /*
 809          * Maintain a cache of leftmost tree entries (it is frequently
 810          * used):
 811          */
 812         if (leftmost)
 813                 cfs_rq->rb_leftmost = &se->run_node;
 814
 815         rb_link_node(&se->run_node, parent, link);
 816         rb_insert_color(&se->run_node, &cfs_rq->tasks_timeline);
 817 }
 818
 819 static void __dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
 820 {
 821         if (cfs_rq->rb_leftmost == &se->run_node) {
 822                 struct rb_node *next_node;
 823
 824                 next_node = rb_next(&se->run_node);
 825                 cfs_rq->rb_leftmost = next_node;
 826         }
 827
 828         rb_erase(&se->run_node, &cfs_rq->tasks_timeline);
 829 }
 830
 831 struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq)
 832 {
 833         struct rb_node *left = cfs_rq->rb_leftmost;
 834
 835         if (!left)
 836                 return NULL;
 837
 838         return rb_entry(left, struct sched_entity, run_node);
 839 }
 840
 841 static struct sched_entity *__pick_next_entity(struct sched_entity *se)
 842 {
 843         struct rb_node *next = rb_next(&se->run_node);
 844
 845         if (!next)
 846                 return NULL;
 847
 848         return rb_entry(next, struct sched_entity, run_node);
 849 }
 850
 851 #ifdef CONFIG_SCHED_DEBUG
 852 struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq)
 853 {
 854         struct rb_node *last = rb_last(&cfs_rq->tasks_timeline);
 855
 856         if (!last)
 857                 return NULL;
 858
 859         return rb_entry(last, struct sched_entity, run_node);
 860 }
 861
 862 /**************************************************************
 863  * Scheduling class statistics methods:
 864  */
 865
 866 int sched_proc_update_handler(struct ctl_table *table, int write,
 867                 void __user *buffer, size_t *lenp,
 868                 loff_t *ppos)
 869 {
 870         int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
 871         int factor = get_update_sysctl_factor();
 872
 873         if (ret || !write)
 874                 return ret;
 875
 876         sched_nr_latency = DIV_ROUND_UP(sysctl_sched_latency,
 877                                         sysctl_sched_min_granularity);
 878
 879 #define WRT_SYSCTL(name) \
 880         (normalized_sysctl_##name = sysctl_##name / (factor))
 881         WRT_SYSCTL(sched_min_granularity);
 882         WRT_SYSCTL(sched_latency);
 883         WRT_SYSCTL(sched_wakeup_granularity);
 884 #undef WRT_SYSCTL
 885
 886         return 0;
 887 }
 888 #endif
 889
 890 /*
 891  * delta /= w
 892  */
 893 static inline unsigned long
 894 calc_delta_fair(unsigned long delta, struct sched_entity *se)
 895 {
 896         if (unlikely(se->load.weight != NICE_0_LOAD))
 897                 delta = calc_delta_mine(delta, NICE_0_LOAD, &se->load);
 898
 899         return delta;
 900 }
 901
 902 /*
 903  * The idea is to set a period in which each task runs once.
 904  *
 905  * When there are too many tasks (sched_nr_latency) we have to stretch
 906  * this period because otherwise the slices get too small.
 907  *
 908  * p = (nr <= nl) ? l : l*nr/nl
 909  */
 910 static u64 __sched_period(unsigned long nr_running)
 911 {
 912         u64 period = sysctl_sched_latency;
 913         unsigned long nr_latency = sched_nr_latency;
 914
 915         if (unlikely(nr_running > nr_latency)) {
 916                 period = sysctl_sched_min_granularity;
 917                 period *= nr_running;
 918         }
 919
 920         return period;
 921 }
 922
 923 /*
 924  * We calculate the wall-time slice from the period by taking a part
 925  * proportional to the weight.
 926  *
 927  * s = p*P[w/rw]
 928  */
 929 static u64 sched_slice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 930 {
 931         u64 slice = __sched_period(cfs_rq->nr_running + !se->on_rq);
 932
 933         for_each_sched_entity(se) {
 934                 struct load_weight *load;
 935                 struct load_weight lw;
 936
 937                 cfs_rq = cfs_rq_of(se);
 938                 load = &cfs_rq->load;
 939
 940                 if (unlikely(!se->on_rq)) {
 941                         lw = cfs_rq->load;
 942
 943                         update_load_add(&lw, se->load.weight);
 944                         load = &lw;
 945                 }
 946                 slice = calc_delta_mine(slice, se->load.weight, load);
 947         }
 948         return slice;
 949 }
 950
 951 /*
 952  * We calculate the vruntime slice of a to-be-inserted task.
 953  *
 954  * vs = s/w
 955  */
 956 static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
 957 {
 958         return calc_delta_fair(sched_slice(cfs_rq, se), se);
 959 }
 960
 961
 962 #ifdef CONFIG_SMP
 963 static inline void __update_task_entity_contrib(struct sched_entity *se);
 964
 965 static long __update_task_entity_ratio(struct sched_entity *se);
 966
 967 #define LOAD_AVG_PERIOD 32
 968 #define LOAD_AVG_MAX 47742 /* maximum possible load avg */
 969 #define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
 970 #define LOAD_AVG_VARIABLE_PERIOD 512
 971 static unsigned int init_task_load_period = 4000;
 972
 973 /* Give new task start runnable values to heavy its load in infant time */
 974 void init_task_runnable_average(struct task_struct *p)
 975 {
 976         u32 slice;
 977
 978         p->se.avg.decay_count = 0;
 979         slice = sched_slice(task_cfs_rq(p), &p->se) >> 10;
 980         p->se.avg.runnable_avg_sum = (init_task_load_period) ? 0 : slice;
 981         p->se.avg.runnable_avg_period = (init_task_load_period)?(init_task_load_period):slice;
 982         __update_task_entity_contrib(&p->se);
 983
 984 #ifdef CONFIG_MTK_SCHED_CMP
 985         /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */
 986         p->se.avg.usage_avg_sum =  (init_task_load_period) ? 0 : slice;
 987 #endif
 988         __update_task_entity_ratio(&p->se);
 989         trace_sched_task_entity_avg(0, p, &p->se.avg);
 990 }
 991 #else
 992 void init_task_runnable_average(struct task_struct *p)
 993 {
 994 }
 995 #endif
 996
 997 /*
 998  * Update the current task's runtime statistics. Skip current tasks that
 999  * are not in our scheduling class.
1000  */
1001 static inline void
1002 __update_curr(struct cfs_rq *cfs_rq, struct sched_entity *curr,
1003               unsigned long delta_exec)
1004 {
1005         unsigned long delta_exec_weighted;
1006
1007         schedstat_set(curr->statistics.exec_max,
1008                       max((u64)delta_exec, curr->statistics.exec_max));
1009
1010         curr->sum_exec_runtime += delta_exec;
1011         schedstat_add(cfs_rq, exec_clock, delta_exec);
1012         delta_exec_weighted = calc_delta_fair(delta_exec, curr);
1013
1014         curr->vruntime += delta_exec_weighted;
1015         update_min_vruntime(cfs_rq);
1016 }
1017
1018 static void update_curr(struct cfs_rq *cfs_rq)
1019 {
1020         struct sched_entity *curr = cfs_rq->curr;
1021         u64 now = rq_of(cfs_rq)->clock_task;
1022         unsigned long delta_exec;
1023
1024         if (unlikely(!curr))
1025                 return;
1026
1027         /*
1028          * Get the amount of time the current task was running
1029          * since the last time we changed load (this cannot
1030          * overflow on 32 bits):
1031          */
1032         delta_exec = (unsigned long)(now - curr->exec_start);
1033         if (!delta_exec)
1034                 return;
1035
1036         __update_curr(cfs_rq, curr, delta_exec);
1037         curr->exec_start = now;
1038
1039         if (entity_is_task(curr)) {
1040                 struct task_struct *curtask = task_of(curr);
1041
1042                 trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
1043                 cpuacct_charge(curtask, delta_exec);
1044                 account_group_exec_runtime(curtask, delta_exec);
1045         }
1046
1047         account_cfs_rq_runtime(cfs_rq, delta_exec);
1048 }
1049
1050 static inline void
1051 update_stats_wait_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1052 {
1053         schedstat_set(se->statistics.wait_start, rq_of(cfs_rq)->clock);
1054 }
1055
1056 /*
1057  * Task is being enqueued - update stats:
1058  */
1059 static void update_stats_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1060 {
1061         /*
1062          * Are we enqueueing a waiting task? (for current tasks
1063          * a dequeue/enqueue event is a NOP)
1064          */
1065         if (se != cfs_rq->curr)
1066                 update_stats_wait_start(cfs_rq, se);
1067 }
1068
1069 static void
1070 update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
1071 {
1072         schedstat_set(se->statistics.wait_max, max(se->statistics.wait_max,
1073                         rq_of(cfs_rq)->clock - se->statistics.wait_start));
1074         schedstat_set(se->statistics.wait_count, se->statistics.wait_count + 1);
1075         schedstat_set(se->statistics.wait_sum, se->statistics.wait_sum +
1076                         rq_of(cfs_rq)->clock - se->statistics.wait_start);
1077 #ifdef CONFIG_SCHEDSTATS
1078         if (entity_is_task(se)) {
1079                 trace_sched_stat_wait(task_of(se),
1080                         rq_of(cfs_rq)->clock - se->statistics.wait_start);
1081         }
1082 #endif
1083         schedstat_set(se->statistics.wait_start, 0);
1084 }
1085
1086 static inline void
1087 update_stats_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1088 {
1089         /*
1090          * Mark the end of the wait period if dequeueing a
1091          * waiting task:
1092          */
1093         if (se != cfs_rq->curr)
1094                 update_stats_wait_end(cfs_rq, se);
1095 }
1096
1097 /*
1098  * We are picking a new current task - update its stats:
1099  */
1100 static inline void
1101 update_stats_curr_start(struct cfs_rq *cfs_rq, struct sched_entity *se)
1102 {
1103         /*
1104          * We are starting a new run period:
1105          */
1106         se->exec_start = rq_of(cfs_rq)->clock_task;
1107 }
1108
1109 /**************************************************
1110  * Scheduling class queueing methods:
1111  */
1112
1113 #ifdef CONFIG_NUMA_BALANCING
1114 /*
1115  * numa task sample period in ms
1116  */
1117 unsigned int sysctl_numa_balancing_scan_period_min = 100;
1118 unsigned int sysctl_numa_balancing_scan_period_max = 100*50;
1119 unsigned int sysctl_numa_balancing_scan_period_reset = 100*600;
1120
1121 /* Portion of address space to scan in MB */
1122 unsigned int sysctl_numa_balancing_scan_size = 256;
1123
1124 /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
1125 unsigned int sysctl_numa_balancing_scan_delay = 1000;
1126
1127 static void task_numa_placement(struct task_struct *p)
1128 {
1129         int seq;
1130
1131         if (!p->mm)     /* for example, ksmd faulting in a user's mm */
1132                 return;
1133         seq = ACCESS_ONCE(p->mm->numa_scan_seq);
1134         if (p->numa_scan_seq == seq)
1135                 return;
1136         p->numa_scan_seq = seq;
1137
1138         /* FIXME: Scheduling placement policy hints go here */
1139 }
1140
1141 /*
1142  * Got a PROT_NONE fault for a page on @node.
1143  */
1144 void task_numa_fault(int node, int pages, bool migrated)
1145 {
1146         struct task_struct *p = current;
1147
1148         if (!sched_feat_numa(NUMA))
1149                 return;
1150
1151         /* FIXME: Allocate task-specific structure for placement policy here */
1152
1153         /*
1154          * If pages are properly placed (did not migrate) then scan slower.
1155          * This is reset periodically in case of phase changes
1156          */
1157         if (!migrated)
1158                 p->numa_scan_period = min(sysctl_numa_balancing_scan_period_max,
1159                         p->numa_scan_period + jiffies_to_msecs(10));
1160
1161         task_numa_placement(p);
1162 }
1163
1164 static void reset_ptenuma_scan(struct task_struct *p)
1165 {
1166         ACCESS_ONCE(p->mm->numa_scan_seq)++;
1167         p->mm->numa_scan_offset = 0;
1168 }
1169
1170 /*
1171  * The expensive part of numa migration is done from task_work context.
1172  * Triggered from task_tick_numa().
1173  */
1174 void task_numa_work(struct callback_head *work)
1175 {
1176         unsigned long migrate, next_scan, now = jiffies;
1177         struct task_struct *p = current;
1178         struct mm_struct *mm = p->mm;
1179         struct vm_area_struct *vma;
1180         unsigned long start, end;
1181         long pages;
1182
1183         WARN_ON_ONCE(p != container_of(work, struct task_struct, numa_work));
1184
1185         work->next = work; /* protect against double add */
1186         /*
1187          * Who cares about NUMA placement when they're dying.
1188          *
1189          * NOTE: make sure not to dereference p->mm before this check,
1190          * exit_task_work() happens _after_ exit_mm() so we could be called
1191          * without p->mm even though we still had it when we enqueued this
1192          * work.
1193          */
1194         if (p->flags & PF_EXITING)
1195                 return;
1196
1197         /*
1198          * We do not care about task placement until a task runs on a node
1199          * other than the first one used by the address space. This is
1200          * largely because migrations are driven by what CPU the task
1201          * is running on. If it's never scheduled on another node, it'll
1202          * not migrate so why bother trapping the fault.
1203          */
1204         if (mm->first_nid == NUMA_PTE_SCAN_INIT)
1205                 mm->first_nid = numa_node_id();
1206         if (mm->first_nid != NUMA_PTE_SCAN_ACTIVE) {
1207                 /* Are we running on a new node yet? */
1208                 if (numa_node_id() == mm->first_nid &&
1209                     !sched_feat_numa(NUMA_FORCE))
1210                         return;
1211
1212                 mm->first_nid = NUMA_PTE_SCAN_ACTIVE;
1213         }
1214
1215         /*
1216          * Reset the scan period if enough time has gone by. Objective is that
1217          * scanning will be reduced if pages are properly placed. As tasks
1218          * can enter different phases this needs to be re-examined. Lacking
1219          * proper tracking of reference behaviour, this blunt hammer is used.
1220          */
1221         migrate = mm->numa_next_reset;
1222         if (time_after(now, migrate)) {
1223                 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
1224                 next_scan = now + msecs_to_jiffies(sysctl_numa_balancing_scan_period_reset);
1225                 xchg(&mm->numa_next_reset, next_scan);
1226         }
1227
1228         /*
1229          * Enforce maximal scan/migration frequency..
1230          */
1231         migrate = mm->numa_next_scan;
1232         if (time_before(now, migrate))
1233                 return;
1234
1235         if (p->numa_scan_period == 0)
1236                 p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
1237
1238         next_scan = now + msecs_to_jiffies(p->numa_scan_period);
1239         if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
1240                 return;
1241
1242         /*
1243          * Do not set pte_numa if the current running node is rate-limited.
1244          * This loses statistics on the fault but if we are unwilling to
1245          * migrate to this node, it is less likely we can do useful work
1246          */
1247         if (migrate_ratelimited(numa_node_id()))
1248                 return;
1249
1250         start = mm->numa_scan_offset;
1251         pages = sysctl_numa_balancing_scan_size;
1252         pages <<= 20 - PAGE_SHIFT; /* MB in pages */
1253         if (!pages)
1254                 return;
1255
1256         down_read(&mm->mmap_sem);
1257         vma = find_vma(mm, start);
1258         if (!vma) {
1259                 reset_ptenuma_scan(p);
1260                 start = 0;
1261                 vma = mm->mmap;
1262         }
1263         for (; vma; vma = vma->vm_next) {
1264                 if (!vma_migratable(vma))
1265                         continue;
1266
1267                 /* Skip small VMAs. They are not likely to be of relevance */
1268                 if (vma->vm_end - vma->vm_start < HPAGE_SIZE)
1269                         continue;
1270
1271                 /*
1272                  * Skip inaccessible VMAs to avoid any confusion between
1273                  * PROT_NONE and NUMA hinting ptes
1274                  */
1275                 if (!(vma->vm_flags & (VM_READ | VM_EXEC | VM_WRITE)))
1276                         continue;
1277
1278                 do {
1279                         start = max(start, vma->vm_start);
1280                         end = ALIGN(start + (pages << PAGE_SHIFT), HPAGE_SIZE);
1281                         end = min(end, vma->vm_end);
1282                         pages -= change_prot_numa(vma, start, end);
1283
1284                         start = end;
1285                         if (pages <= 0)
1286                                 goto out;
1287                 } while (end != vma->vm_end);
1288         }
1289
1290 out:
1291         /*
1292          * It is possible to reach the end of the VMA list but the last few VMAs are
1293          * not guaranteed to the vma_migratable. If they are not, we would find the
1294          * !migratable VMA on the next scan but not reset the scanner to the start
1295          * so check it now.
1296          */
1297         if (vma)
1298                 mm->numa_scan_offset = start;
1299         else
1300                 reset_ptenuma_scan(p);
1301         up_read(&mm->mmap_sem);
1302 }
1303
1304 /*
1305  * Drive the periodic memory faults..
1306  */
1307 void task_tick_numa(struct rq *rq, struct task_struct *curr)
1308 {
1309         struct callback_head *work = &curr->numa_work;
1310         u64 period, now;
1311
1312         /*
1313          * We don't care about NUMA placement if we don't have memory.
1314          */
1315         if (!curr->mm || (curr->flags & PF_EXITING) || work->next != work)
1316                 return;
1317
1318         /*
1319          * Using runtime rather than walltime has the dual advantage that
1320          * we (mostly) drive the selection from busy threads and that the
1321          * task needs to have done some actual work before we bother with
1322          * NUMA placement.
1323          */
1324         now = curr->se.sum_exec_runtime;
1325         period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
1326
1327         if (now - curr->node_stamp > period) {
1328                 if (!curr->node_stamp)
1329                         curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
1330                 curr->node_stamp = now;
1331
1332                 if (!time_before(jiffies, curr->mm->numa_next_scan)) {
1333                         init_task_work(work, task_numa_work); /* TODO: move this into sched_fork() */
1334                         task_work_add(curr, work, true);
1335                 }
1336         }
1337 }
1338 #else
1339 static void task_tick_numa(struct rq *rq, struct task_struct *curr)
1340 {
1341 }
1342 #endif /* CONFIG_NUMA_BALANCING */
1343
1344 static void
1345 account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1346 {
1347         update_load_add(&cfs_rq->load, se->load.weight);
1348         if (!parent_entity(se))
1349                 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
1350 #ifdef CONFIG_SMP
1351         if (entity_is_task(se))
1352                 list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
1353 #endif
1354         cfs_rq->nr_running++;
1355 }
1356
1357 static void
1358 account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
1359 {
1360         update_load_sub(&cfs_rq->load, se->load.weight);
1361         if (!parent_entity(se))
1362                 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
1363         if (entity_is_task(se))
1364                 list_del_init(&se->group_node);
1365         cfs_rq->nr_running--;
1366 }
1367
1368 #ifdef CONFIG_FAIR_GROUP_SCHED
1369 # ifdef CONFIG_SMP
1370 static inline long calc_tg_weight(struct task_group *tg, struct cfs_rq *cfs_rq)
1371 {
1372         long tg_weight;
1373
1374         /*
1375          * Use this CPU's actual weight instead of the last load_contribution
1376          * to gain a more accurate current total weight. See
1377          * update_cfs_rq_load_contribution().
1378          */
1379         tg_weight = atomic_long_read(&tg->load_avg);
1380         tg_weight -= cfs_rq->tg_load_contrib;
1381         tg_weight += cfs_rq->load.weight;
1382
1383         return tg_weight;
1384 }
1385
1386 static long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1387 {
1388         long tg_weight, load, shares;
1389
1390         tg_weight = calc_tg_weight(tg, cfs_rq);
1391         load = cfs_rq->load.weight;
1392
1393         shares = (tg->shares * load);
1394         if (tg_weight)
1395                 shares /= tg_weight;
1396
1397         if (shares < MIN_SHARES)
1398                 shares = MIN_SHARES;
1399         if (shares > tg->shares)
1400                 shares = tg->shares;
1401
1402         return shares;
1403 }
1404 # else /* CONFIG_SMP */
1405 static inline long calc_cfs_shares(struct cfs_rq *cfs_rq, struct task_group *tg)
1406 {
1407         return tg->shares;
1408 }
1409 # endif /* CONFIG_SMP */
1410 static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se,
1411                             unsigned long weight)
1412 {
1413         if (se->on_rq) {
1414                 /* commit outstanding execution time */
1415                 if (cfs_rq->curr == se)
1416                         update_curr(cfs_rq);
1417                 account_entity_dequeue(cfs_rq, se);
1418         }
1419
1420         update_load_set(&se->load, weight);
1421
1422         if (se->on_rq)
1423                 account_entity_enqueue(cfs_rq, se);
1424 }
1425
1426 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq);
1427
1428 static void update_cfs_shares(struct cfs_rq *cfs_rq)
1429 {
1430         struct task_group *tg;
1431         struct sched_entity *se;
1432         long shares;
1433
1434         tg = cfs_rq->tg;
1435         se = tg->se[cpu_of(rq_of(cfs_rq))];
1436         if (!se || throttled_hierarchy(cfs_rq))
1437                 return;
1438 #ifndef CONFIG_SMP
1439         if (likely(se->load.weight == tg->shares))
1440                 return;
1441 #endif
1442         shares = calc_cfs_shares(cfs_rq, tg);
1443
1444         reweight_entity(cfs_rq_of(se), se, shares);
1445 }
1446 #else /* CONFIG_FAIR_GROUP_SCHED */
1447 static inline void update_cfs_shares(struct cfs_rq *cfs_rq)
1448 {
1449 }
1450 #endif /* CONFIG_FAIR_GROUP_SCHED */
1451
1452 #ifdef CONFIG_SMP
1453 /*
1454  * We choose a half-life close to 1 scheduling period.
1455  * Note: The tables below are dependent on this value.
1456  */
1457 //#define LOAD_AVG_PERIOD 32
1458 //#define LOAD_AVG_MAX 47742 /* maximum possible load avg */
1459 //#define LOAD_AVG_MAX_N 345 /* number of full periods to produce LOAD_MAX_AVG */
1460
1461 /* Precomputed fixed inverse multiplies for multiplication by y^n */
1462 static const u32 runnable_avg_yN_inv[] = {
1463         0xffffffff, 0xfa83b2da, 0xf5257d14, 0xefe4b99a, 0xeac0c6e6, 0xe5b906e6,
1464         0xe0ccdeeb, 0xdbfbb796, 0xd744fcc9, 0xd2a81d91, 0xce248c14, 0xc9b9bd85,
1465         0xc5672a10, 0xc12c4cc9, 0xbd08a39e, 0xb8fbaf46, 0xb504f333, 0xb123f581,
1466         0xad583ee9, 0xa9a15ab4, 0xa5fed6a9, 0xa2704302, 0x9ef5325f, 0x9b8d39b9,
1467         0x9837f050, 0x94f4efa8, 0x91c3d373, 0x8ea4398a, 0x8b95c1e3, 0x88980e80,
1468         0x85aac367, 0x82cd8698,
1469 };
1470
1471 /*
1472  * Precomputed \Sum y^k { 1<=k<=n }.  These are floor(true_value) to prevent
1473  * over-estimates when re-combining.
1474  */
1475 static const u32 runnable_avg_yN_sum[] = {
1476             0, 1002, 1982, 2941, 3880, 4798, 5697, 6576, 7437, 8279, 9103,
1477          9909,10698,11470,12226,12966,13690,14398,15091,15769,16433,17082,
1478         17718,18340,18949,19545,20128,20698,21256,21802,22336,22859,23371,
1479 };
1480
1481 /*
1482  * Approximate:
1483  *   val * y^n,    where y^32 ~= 0.5 (~1 scheduling period)
1484  */
1485 static __always_inline u64 decay_load(u64 val, u64 n)
1486 {
1487         unsigned int local_n;
1488
1489         if (!n)
1490                 return val;
1491         else if (unlikely(n > LOAD_AVG_PERIOD * 63))
1492                 return 0;
1493
1494         /* after bounds checking we can collapse to 32-bit */
1495         local_n = n;
1496
1497         /*
1498          * As y^PERIOD = 1/2, we can combine
1499          *    y^n = 1/2^(n/PERIOD) * k^(n%PERIOD)
1500          * With a look-up table which covers k^n (n<PERIOD)
1501          *
1502          * To achieve constant time decay_load.
1503          */
1504         if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
1505                 val >>= local_n / LOAD_AVG_PERIOD;
1506                 local_n %= LOAD_AVG_PERIOD;
1507         }
1508
1509         val *= runnable_avg_yN_inv[local_n];
1510         /* We don't use SRR here since we always want to round down. */
1511         return val >> 32;
1512 }
1513
1514 /*
1515  * For updates fully spanning n periods, the contribution to runnable
1516  * average will be: \Sum 1024*y^n
1517  *
1518  * We can compute this reasonably efficiently by combining:
1519  *   y^PERIOD = 1/2 with precomputed \Sum 1024*y^n {for  n <PERIOD}
1520  */
1521 static u32 __compute_runnable_contrib(u64 n)
1522 {
1523         u32 contrib = 0;
1524
1525         if (likely(n <= LOAD_AVG_PERIOD))
1526                 return runnable_avg_yN_sum[n];
1527         else if (unlikely(n >= LOAD_AVG_MAX_N))
1528                 return LOAD_AVG_MAX;
1529
1530         /* Compute \Sum k^n combining precomputed values for k^i, \Sum k^j */
1531         do {
1532                 contrib /= 2; /* y^LOAD_AVG_PERIOD = 1/2 */
1533                 contrib += runnable_avg_yN_sum[LOAD_AVG_PERIOD];
1534
1535                 n -= LOAD_AVG_PERIOD;
1536         } while (n > LOAD_AVG_PERIOD);
1537
1538         contrib = decay_load(contrib, n);
1539         return contrib + runnable_avg_yN_sum[n];
1540 }
1541
1542 #ifdef CONFIG_HMP_VARIABLE_SCALE
1543
1544 #define HMP_VARIABLE_SCALE_SHIFT 16ULL
1545 struct hmp_global_attr {
1546         struct attribute attr;
1547         ssize_t (*show)(struct kobject *kobj,
1548                         struct attribute *attr, char *buf);
1549         ssize_t (*store)(struct kobject *a, struct attribute *b,
1550                         const char *c, size_t count);
1551         int *value;
1552         int (*to_sysfs)(int);
1553         int (*from_sysfs)(int);
1554 };
1555
1556 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1557 #define HMP_DATA_SYSFS_MAX 5
1558 #else
1559 #define HMP_DATA_SYSFS_MAX 4
1560 #endif
1561
1562 struct hmp_data_struct {
1563 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1564         int freqinvar_load_scale_enabled;
1565 #endif
1566         int multiplier; /* used to scale the time delta */
1567         struct attribute_group attr_group;
1568         struct attribute *attributes[HMP_DATA_SYSFS_MAX + 1];
1569         struct hmp_global_attr attr[HMP_DATA_SYSFS_MAX];
1570 } hmp_data;
1571
1572 static u64 hmp_variable_scale_convert(u64 delta);
1573 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1574 /* Frequency-Invariant Load Modification:
1575  * Loads are calculated as in PJT's patch however we also scale the current
1576  * contribution in line with the frequency of the CPU that the task was
1577  * executed on.
1578  * In this version, we use a simple linear scale derived from the maximum
1579  * frequency reported by CPUFreq. As an example:
1580  *
1581  * Consider that we ran a task for 100% of the previous interval.
1582  *
1583  * Our CPU was under asynchronous frequency control through one of the
1584  * CPUFreq governors.
1585  *
1586  * The CPUFreq governor reports that it is able to scale the CPU between
1587  * 500MHz and 1GHz.
1588  *
1589  * During the period, the CPU was running at 1GHz.
1590  *
1591  * In this case, our load contribution for that period is calculated as
1592  * 1 * (number_of_active_microseconds)
1593  *
1594  * This results in our task being able to accumulate maximum load as normal.
1595  *
1596  *
1597  * Consider now that our CPU was executing at 500MHz.
1598  *
1599  * We now scale the load contribution such that it is calculated as
1600  * 0.5 * (number_of_active_microseconds)
1601  *
1602  * Our task can only record 50% maximum load during this period.
1603  *
1604  * This represents the task consuming 50% of the CPU's *possible* compute
1605  * capacity. However the task did consume 100% of the CPU's *available*
1606  * compute capacity which is the value seen by the CPUFreq governor and
1607  * user-side CPU Utilization tools.
1608  *
1609  * Restricting tracked load to be scaled by the CPU's frequency accurately
1610  * represents the consumption of possible compute capacity and allows the
1611  * HMP migration's simple threshold migration strategy to interact more
1612  * predictably with CPUFreq's asynchronous compute capacity changes.
1613  */
1614 #define SCHED_FREQSCALE_SHIFT 10
1615 struct cpufreq_extents {
1616         u32 curr_scale;
1617         u32 min;
1618         u32 max;
1619         u32 flags;
1620 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
1621         u32 const_max;
1622         u32 throttling;
1623 #endif
1624 };
1625 /* Flag set when the governor in use only allows one frequency.
1626  * Disables scaling.
1627  */
1628 #define SCHED_LOAD_FREQINVAR_SINGLEFREQ 0x01
1629
1630 static struct cpufreq_extents freq_scale[CONFIG_NR_CPUS];
1631 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1632 #endif /* CONFIG_HMP_VARIABLE_SCALE */
1633
1634 #ifdef CONFIG_MTK_SCHED_CMP
1635 int get_cluster_id(unsigned int cpu)
1636 {
1637         return arch_get_cluster_id(cpu);
1638 }
1639
1640 void get_cluster_cpus(struct cpumask *cpus, int cluster_id,
1641                             bool exclusive_offline)
1642 {
1643         struct cpumask cls_cpus;
1644
1645         arch_get_cluster_cpus(&cls_cpus, cluster_id);
1646         if (exclusive_offline) {
1647                 cpumask_and(cpus, cpu_online_mask, &cls_cpus);
1648         } else
1649                 cpumask_copy(cpus, &cls_cpus);
1650 }
1651
1652 static int nr_cpus_in_cluster(int cluster_id, bool exclusive_offline)
1653 {
1654         struct cpumask cls_cpus;
1655         int nr_cpus;
1656
1657         arch_get_cluster_cpus(&cls_cpus, cluster_id);
1658         if (exclusive_offline) {
1659                 struct cpumask online_cpus;
1660                 cpumask_and(&online_cpus, cpu_online_mask, &cls_cpus);
1661                 nr_cpus = cpumask_weight(&online_cpus);
1662         } else
1663                 nr_cpus = cpumask_weight(&cls_cpus);
1664
1665         return nr_cpus;
1666 }
1667 #endif /* CONFIG_MTK_SCHED_CMP */
1668
1669 void sched_get_big_little_cpus(struct cpumask *big, struct cpumask *little)
1670 {
1671         arch_get_big_little_cpus(big, little);
1672 }
1673 EXPORT_SYMBOL(sched_get_big_little_cpus);
1674
1675 /*
1676  * generic entry point for cpu mask construction, dedicated for
1677  * mediatek scheduler.
1678  */
1679 static __init __inline void cmp_cputopo_domain_setup(void)
1680 {
1681         WARN(smp_processor_id() != 0, "%s is supposed runs on CPU0 "
1682                                       "while kernel init", __func__);
1683 #ifdef CONFIG_MTK_CPU_TOPOLOGY
1684         /*
1685          * sched_init
1686          *   |-> cmp_cputopo_domain_seutp()
1687          * ...
1688          * rest_init
1689          *   ^ fork kernel_init
1690          *       |-> kernel_init_freeable
1691          *        ...
1692          *           |-> arch_build_cpu_topology_domain
1693          *
1694          * here, we focus to build up cpu topology and domain before scheduler runs.
1695          */
1696         pr_debug("[CPUTOPO][%s] build CPU topology and cluster.\n", __func__);
1697         arch_build_cpu_topology_domain();
1698 #endif
1699 }
1700
1701 #ifdef CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY
1702 static u64 __inline variable_scale_convert(u64 delta)
1703 {
1704         u64 high = delta >> 32ULL;
1705         u64 low = delta & 0xffffffffULL;
1706         low *= LOAD_AVG_VARIABLE_PERIOD;
1707         high *= LOAD_AVG_VARIABLE_PERIOD;
1708         return (low >> 16ULL) + (high << (32ULL - 16ULL));
1709 }
1710 #endif
1711
1712 /* We can represent the historical contribution to runnable average as the
1713  * coefficients of a geometric series.  To do this we sub-divide our runnable
1714  * history into segments of approximately 1ms (1024us); label the segment that
1715  * occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
1716  *
1717  * [<- 1024us ->|<- 1024us ->|<- 1024us ->| ...
1718  *      p0            p1           p2
1719  *     (now)       (~1ms ago)  (~2ms ago)
1720  *
1721  * Let u_i denote the fraction of p_i that the entity was runnable.
1722  *
1723  * We then designate the fractions u_i as our co-efficients, yielding the
1724  * following representation of historical load:
1725  *   u_0 + u_1*y + u_2*y^2 + u_3*y^3 + ...
1726  *
1727  * We choose y based on the with of a reasonably scheduling period, fixing:
1728  *   y^32 = 0.5
1729  *
1730  * This means that the contribution to load ~32ms ago (u_32) will be weighted
1731  * approximately half as much as the contribution to load within the last ms
1732  * (u_0).
1733  *
1734  * When a period "rolls over" and we have new u_0`, multiplying the previous
1735  * sum again by y is sufficient to update:
1736  *   load_avg = u_0` + y*(u_0 + u_1*y + u_2*y^2 + ... )
1737  *            = u_0 + u_1*y + u_2*y^2 + ... [re-labeling u_i --> u_{i+1}]
1738  */
1739 static __always_inline int __update_entity_runnable_avg(u64 now,
1740                                                         struct sched_avg *sa,
1741                                                         int runnable,
1742                                                         int running,
1743                                                         int cpu)
1744 {
1745         u64 delta, periods, lru;
1746         u32 runnable_contrib;
1747         int delta_w, decayed = 0;
1748 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1749         u64 scaled_delta;
1750         u32 scaled_runnable_contrib;
1751         int scaled_delta_w;
1752         u32 curr_scale = 1024;
1753 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1754         u64 scaled_delta;
1755         u32 scaled_runnable_contrib;
1756         int scaled_delta_w;
1757         u32 curr_scale = CPUPOWER_FREQSCALE_DEFAULT;
1758 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1759
1760         delta = now - sa->last_runnable_update;
1761         lru = sa->last_runnable_update;
1762         /*
1763          * This should only happen when time goes backwards, which it
1764          * unfortunately does during sched clock init when we swap over to TSC.
1765          */
1766         if ((s64)delta < 0) {
1767                 sa->last_runnable_update = now;
1768                 return 0;
1769         }
1770
1771 #ifdef CONFIG_HMP_VARIABLE_SCALE
1772         delta = hmp_variable_scale_convert(delta);
1773 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1774         delta = variable_scale_convert(delta);
1775 #endif
1776         /*
1777          * Use 1024ns as the unit of measurement since it's a reasonable
1778          * approximation of 1us and fast to compute.
1779          */
1780         delta >>= 10;
1781         if (!delta)
1782                 return 0;
1783         sa->last_runnable_update = now;
1784
1785 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1786         WARN(cpu < 0, "[%s] CPU %d < 0 !!!\n", __func__, cpu);
1787         /* retrieve scale factor for load */
1788         if (cpu >= 0 && cpu < nr_cpu_ids && hmp_data.freqinvar_load_scale_enabled)
1789                 curr_scale = freq_scale[cpu].curr_scale;
1790         mt_sched_printf("[%s] cpu=%d delta=%llu now=%llu last=%llu curr_scale=%u",
1791                                         __func__, cpu, delta, now, lru, curr_scale);
1792 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1793         WARN(cpu < 0, "[%s] CPU %d < 0 !!!\n", __func__, cpu);
1794         /* retrieve scale factor for load */
1795         if (cpu >= 0 && cpu < nr_cpu_ids)
1796                 curr_scale = (topology_cpu_capacity(cpu) << CPUPOWER_FREQSCALE_SHIFT)
1797                         / (topology_max_cpu_capacity(cpu)+1);
1798         mt_sched_printf("[%s] cpu=%d delta=%llu now=%llu last=%llu curr_scale=%u",
1799                                         __func__, cpu, delta, now, lru, curr_scale);
1800 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1801
1802         /* delta_w is the amount already accumulated against our next period */
1803         delta_w = sa->runnable_avg_period % 1024;
1804         if (delta + delta_w >= 1024) {
1805                 /* period roll-over */
1806                 decayed = 1;
1807
1808                 /*
1809                  * Now that we know we're crossing a period boundary, figure
1810                  * out how much from delta we need to complete the current
1811                  * period and accrue it.
1812                  */
1813                 delta_w = 1024 - delta_w;
1814 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1815                 /* scale runnable time if necessary */
1816                 scaled_delta_w = (delta_w * curr_scale)
1817                                 >> SCHED_FREQSCALE_SHIFT;
1818                 if (runnable)
1819                         sa->runnable_avg_sum += scaled_delta_w;
1820                 if (running)
1821                         sa->usage_avg_sum += scaled_delta_w;
1822 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1823                 /* scale runnable time if necessary */
1824                 scaled_delta_w = (delta_w * curr_scale)
1825                                 >> CPUPOWER_FREQSCALE_SHIFT;
1826                 if (runnable)
1827                         sa->runnable_avg_sum += scaled_delta_w;
1828                 if (running)
1829                         sa->usage_avg_sum += scaled_delta_w;
1830 #else
1831                 if (runnable)
1832                         sa->runnable_avg_sum += delta_w;
1833                 if (running)
1834                         sa->usage_avg_sum += delta_w;
1835 #endif /* #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1836                 sa->runnable_avg_period += delta_w;
1837
1838                 delta -= delta_w;
1839
1840                 /* Figure out how many additional periods this update spans */
1841                 periods = delta / 1024;
1842                 delta %= 1024;
1843                 /* decay the load we have accumulated so far */
1844                 sa->runnable_avg_sum = decay_load(sa->runnable_avg_sum,
1845                                                   periods + 1);
1846                 sa->runnable_avg_period = decay_load(sa->runnable_avg_period,
1847                                                      periods + 1);
1848                 sa->usage_avg_sum = decay_load(sa->usage_avg_sum, periods + 1);
1849                 /* add the contribution from this period */
1850                 /* Efficiently calculate \sum (1..n_period) 1024*y^i */
1851                 runnable_contrib = __compute_runnable_contrib(periods);
1852 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1853                 /* Apply load scaling if necessary.
1854                  * Note that multiplying the whole series is same as
1855                  * multiplying all terms
1856                  */
1857                 scaled_runnable_contrib = (runnable_contrib * curr_scale)
1858                                 >> SCHED_FREQSCALE_SHIFT;
1859                 if (runnable)
1860                         sa->runnable_avg_sum += scaled_runnable_contrib;
1861                 if (running)
1862                         sa->usage_avg_sum += scaled_runnable_contrib;
1863 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1864                 /* Apply load scaling if necessary.
1865                  * Note that multiplying the whole series is same as
1866                  * multiplying all terms
1867                  */
1868                 scaled_runnable_contrib = (runnable_contrib * curr_scale)
1869                                 >> CPUPOWER_FREQSCALE_SHIFT;
1870                 if (runnable)
1871                         sa->runnable_avg_sum += scaled_runnable_contrib;
1872                 if (running)
1873                         sa->usage_avg_sum += scaled_runnable_contrib;
1874 #else
1875                 if (runnable)
1876                         sa->runnable_avg_sum += runnable_contrib;
1877                 if (running)
1878                         sa->usage_avg_sum += runnable_contrib;
1879 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1880                 sa->runnable_avg_period += runnable_contrib;
1881         }
1882
1883         /* Remainder of delta accrued against u_0` */
1884 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
1885         /* scale if necessary */
1886         scaled_delta = ((delta * curr_scale) >> SCHED_FREQSCALE_SHIFT);
1887         if (runnable)
1888                 sa->runnable_avg_sum += scaled_delta;
1889         if (running)
1890                 sa->usage_avg_sum += scaled_delta;
1891 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
1892         /* scale if necessary */
1893         scaled_delta = ((delta * curr_scale) >> CPUPOWER_FREQSCALE_SHIFT);
1894         if (runnable)
1895                 sa->runnable_avg_sum += scaled_delta;
1896         if (running)
1897                 sa->usage_avg_sum += scaled_delta;
1898 #else
1899         if (runnable)
1900                 sa->runnable_avg_sum += delta;
1901         if (running)
1902                 sa->usage_avg_sum += delta;
1903 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
1904         sa->runnable_avg_period += delta;
1905
1906         return decayed;
1907 }
1908
1909 /* Synchronize an entity's decay with its parenting cfs_rq.*/
1910 static inline u64 __synchronize_entity_decay(struct sched_entity *se)
1911 {
1912         struct cfs_rq *cfs_rq = cfs_rq_of(se);
1913         u64 decays = atomic64_read(&cfs_rq->decay_counter);
1914
1915         decays -= se->avg.decay_count;
1916         if (!decays)
1917                 return 0;
1918
1919         se->avg.load_avg_contrib = decay_load(se->avg.load_avg_contrib, decays);
1920         se->avg.decay_count = 0;
1921
1922         return decays;
1923 }
1924
1925 #ifdef CONFIG_FAIR_GROUP_SCHED
1926 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
1927                                                  int force_update)
1928 {
1929         struct task_group *tg = cfs_rq->tg;
1930         long tg_contrib;
1931
1932         tg_contrib = cfs_rq->runnable_load_avg + cfs_rq->blocked_load_avg;
1933         tg_contrib -= cfs_rq->tg_load_contrib;
1934
1935         if (force_update || abs(tg_contrib) > cfs_rq->tg_load_contrib / 8) {
1936                 atomic_long_add(tg_contrib, &tg->load_avg);
1937                 cfs_rq->tg_load_contrib += tg_contrib;
1938         }
1939 }
1940
1941 /*
1942  * Aggregate cfs_rq runnable averages into an equivalent task_group
1943  * representation for computing load contributions.
1944  */
1945 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
1946                                                   struct cfs_rq *cfs_rq)
1947 {
1948         struct task_group *tg = cfs_rq->tg;
1949         long contrib, usage_contrib;
1950
1951         /* The fraction of a cpu used by this cfs_rq */
1952         contrib = div_u64(sa->runnable_avg_sum << NICE_0_SHIFT,
1953                           sa->runnable_avg_period + 1);
1954         contrib -= cfs_rq->tg_runnable_contrib;
1955
1956         usage_contrib = div_u64(sa->usage_avg_sum << NICE_0_SHIFT,
1957                                 sa->runnable_avg_period + 1);
1958         usage_contrib -= cfs_rq->tg_usage_contrib;
1959
1960         /*
1961          * contrib/usage at this point represent deltas, only update if they
1962          * are substantive.
1963          */
1964         if ((abs(contrib) > cfs_rq->tg_runnable_contrib / 64) ||
1965             (abs(usage_contrib) > cfs_rq->tg_usage_contrib / 64)) {
1966                 atomic_add(contrib, &tg->runnable_avg);
1967                 cfs_rq->tg_runnable_contrib += contrib;
1968
1969                 atomic_add(usage_contrib, &tg->usage_avg);
1970                 cfs_rq->tg_usage_contrib += usage_contrib;
1971         }
1972 }
1973
1974 static inline void __update_group_entity_contrib(struct sched_entity *se)
1975 {
1976         struct cfs_rq *cfs_rq = group_cfs_rq(se);
1977         struct task_group *tg = cfs_rq->tg;
1978         int runnable_avg;
1979
1980         u64 contrib;
1981
1982         contrib = cfs_rq->tg_load_contrib * tg->shares;
1983         se->avg.load_avg_contrib = div_u64(contrib,
1984                                      atomic_long_read(&tg->load_avg) + 1);
1985
1986         /*
1987          * For group entities we need to compute a correction term in the case
1988          * that they are consuming <1 cpu so that we would contribute the same
1989          * load as a task of equal weight.
1990          *
1991          * Explicitly co-ordinating this measurement would be expensive, but
1992          * fortunately the sum of each cpus contribution forms a usable
1993          * lower-bound on the true value.
1994          *
1995          * Consider the aggregate of 2 contributions.  Either they are disjoint
1996          * (and the sum represents true value) or they are disjoint and we are
1997          * understating by the aggregate of their overlap.
1998          *
1999          * Extending this to N cpus, for a given overlap, the maximum amount we
2000          * understand is then n_i(n_i+1)/2 * w_i where n_i is the number of
2001          * cpus that overlap for this interval and w_i is the interval width.
2002          *
2003          * On a small machine; the first term is well-bounded which bounds the
2004          * total error since w_i is a subset of the period.  Whereas on a
2005          * larger machine, while this first term can be larger, if w_i is the
2006          * of consequential size guaranteed to see n_i*w_i quickly converge to
2007          * our upper bound of 1-cpu.
2008          */
2009         runnable_avg = atomic_read(&tg->runnable_avg);
2010         if (runnable_avg < NICE_0_LOAD) {
2011                 se->avg.load_avg_contrib *= runnable_avg;
2012                 se->avg.load_avg_contrib >>= NICE_0_SHIFT;
2013         }
2014 }
2015 #else
2016 static inline void __update_cfs_rq_tg_load_contrib(struct cfs_rq *cfs_rq,
2017                                                    int force_update) {}
2018 static inline void __update_tg_runnable_avg(struct sched_avg *sa,
2019                                             struct cfs_rq *cfs_rq) {}
2020 static inline void __update_group_entity_contrib(struct sched_entity *se) {}
2021 #endif
2022
2023 static inline void __update_task_entity_contrib(struct sched_entity *se)
2024 {
2025         u32 contrib;
2026
2027         /* avoid overflowing a 32-bit type w/ SCHED_LOAD_SCALE */
2028         contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);
2029         contrib /= (se->avg.runnable_avg_period + 1);
2030         se->avg.load_avg_contrib = scale_load(contrib);
2031 }
2032
2033 /* Compute the current contribution to load_avg by se, return any delta */
2034 static long __update_entity_load_avg_contrib(struct sched_entity *se)
2035 {
2036         long old_contrib = se->avg.load_avg_contrib;
2037
2038         if (entity_is_task(se)) {
2039                 __update_task_entity_contrib(se);
2040         } else {
2041                 __update_tg_runnable_avg(&se->avg, group_cfs_rq(se));
2042                 __update_group_entity_contrib(se);
2043         }
2044
2045         return se->avg.load_avg_contrib - old_contrib;
2046 }
2047
2048 #if defined(CONFIG_MTK_SCHED_CMP) || defined(CONFIG_SCHED_HMP_ENHANCEMENT)
2049 /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */
2050 static long __update_task_entity_ratio(struct sched_entity *se)
2051 {
2052         long old_ratio = se->avg.load_avg_ratio;
2053         u32 ratio;
2054
2055         ratio = se->avg.runnable_avg_sum * scale_load_down(NICE_0_LOAD);
2056         ratio /= (se->avg.runnable_avg_period + 1);
2057         se->avg.load_avg_ratio = scale_load(ratio);
2058
2059         return se->avg.load_avg_ratio - old_ratio;
2060 }
2061 #else
2062 static inline long __update_task_entity_ratio(struct sched_entity *se) { return 0; }
2063 #endif
2064
2065 static inline void subtract_blocked_load_contrib(struct cfs_rq *cfs_rq,
2066                                                  long load_contrib)
2067 {
2068         if (likely(load_contrib < cfs_rq->blocked_load_avg))
2069                 cfs_rq->blocked_load_avg -= load_contrib;
2070         else
2071                 cfs_rq->blocked_load_avg = 0;
2072 }
2073
2074 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2075 unsigned int hmp_up_prio = NICE_TO_PRIO(CONFIG_SCHED_HMP_PRIO_FILTER_VAL);
2076 #endif
2077
2078 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
2079 /* Schedule entity */
2080 #define se_pid(se) ((se != NULL && entity_is_task(se))? \
2081                         container_of(se,struct task_struct,se)->pid:-1)
2082 #define se_load(se) se->avg.load_avg_ratio
2083 #define se_contrib(se) se->avg.load_avg_contrib
2084
2085 /* CPU related : load information */
2086 #define cfs_pending_load(cpu) cpu_rq(cpu)->cfs.avg.pending_load
2087 #define cfs_load(cpu) cpu_rq(cpu)->cfs.avg.load_avg_ratio
2088 #define cfs_contrib(cpu) cpu_rq(cpu)->cfs.avg.load_avg_contrib
2089
2090 /* CPU related : the number of tasks */
2091 #define cfs_nr_normal_prio(cpu) cpu_rq(cpu)->cfs.avg.nr_normal_prio
2092 #define cfs_nr_pending(cpu) cpu_rq(cpu)->cfs.avg.nr_pending
2093 #define cfs_length(cpu) cpu_rq(cpu)->cfs.h_nr_running
2094 #define rq_length(cpu) (cpu_rq(cpu)->nr_running + cfs_nr_pending(cpu))
2095
2096 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2097 #define task_low_priority(prio) ((prio >= hmp_up_prio)?1:0)
2098 #define cfs_nr_dequeuing_low_prio(cpu) \
2099                         cpu_rq(cpu)->cfs.avg.nr_dequeuing_low_prio
2100 #define cfs_reset_nr_dequeuing_low_prio(cpu) \
2101                         (cfs_nr_dequeuing_low_prio(cpu) = 0)
2102 #else
2103 #define task_low_priority(prio) (0)
2104 #define cfs_reset_nr_dequeuing_low_prio(cpu)
2105 #endif /* CONFIG_SCHED_HMP_PRIO_FILTER */
2106 #endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
2107
2108 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq);
2109
2110 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2111 int group_leader_is_empty(struct task_struct *p) {
2112
2113         struct task_struct *tg = p->group_leader;
2114
2115         if (SIGNAL_GROUP_EXIT & p->signal->flags){
2116         //      pr_warn("[%s] (0x%p/0x%p)(#%d/%s) leader: pid(%d) state(%d) exit_state(%d)signal_flags=%x p->signal->flags=%x group_exit_code=%x\n", __func__,
2117         //      p, tg, get_nr_threads(p), thread_group_empty(p) ? "empty" : "not empty",
2118         //      p->tgid, tg->state, tg->exit_state, tg->state, p->signal->flags, p->signal->group_exit_code);
2119                 return 1;
2120         }
2121
2122         // workaround debug codes
2123         if(tg->state == 0x6b6b6b6b){
2124         //      pr_warn("[%s] (0x%p/0x%p)(#%d/%s) leader: state(%d) exit_state(%d)\n", __func__,
2125         //      p, tg, get_nr_threads(p), thread_group_empty(p) ? "empty" : "not empty",
2126         //      tg->state, tg->exit_state);
2127                 return 1;
2128         }
2129
2130         return 0;
2131 }
2132
2133 static inline void update_tg_info(struct cfs_rq *cfs_rq, struct sched_entity *se, long ratio_delta)
2134 {
2135         struct task_struct *p = task_of(se);
2136         struct task_struct *tg = p->group_leader;
2137         int id;
2138         unsigned long flags;
2139
2140         if (group_leader_is_empty(p))
2141                 return;
2142         id = get_cluster_id(cfs_rq->rq->cpu);
2143         if (unlikely(WARN_ON(id < 0)))
2144                 return;
2145
2146         raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags);
2147         tg->thread_group_info[id].load_avg_ratio += ratio_delta;
2148         raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags);
2149
2150 #ifdef CONFIG_MT_SCHED_INFO
2151         mt_sched_printf("update_tg_info %d:%s %d:%s %ld %ld %d %d %lu:%lu:%lu update",
2152            tg->pid, tg->comm, p->pid, p->comm,
2153            se->avg.load_avg_ratio, ratio_delta,
2154            cfs_rq->rq->cpu, id,
2155            tg->thread_group_info[id].nr_running,
2156            tg->thread_group_info[id].cfs_nr_running,
2157            tg->thread_group_info[id].load_avg_ratio);
2158 /*
2159         mt_sched_printf("update %d:%s %d:%s %ld %ld %d %d %lu %lu %lu, %lu %lu %lu",
2160            tg->pid, tg->comm, p->pid, p->comm,
2161            se->avg.load_avg_ratio, ratio_delta,
2162            id, cfs_rq->rq->cpu,
2163            tg->thread_group_info[0].nr_running,
2164            tg->thread_group_info[0].cfs_nr_running,
2165            tg->thread_group_info[0].load_avg_ratio,
2166            tg->thread_group_info[1].nr_running,
2167            tg->thread_group_info[1].cfs_nr_running,
2168            tg->thread_group_info[1].load_avg_ratio);
2169 */
2170 #endif
2171
2172 }
2173 #endif
2174
2175 /* Update a sched_entity's runnable average */
2176 static inline void update_entity_load_avg(struct sched_entity *se,
2177                                           int update_cfs_rq)
2178 {
2179         struct cfs_rq *cfs_rq = cfs_rq_of(se);
2180         long contrib_delta;
2181         u64 now;
2182         long ratio_delta = 0;
2183         int cpu = -1;   /* not used in normal case */
2184
2185 #if defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE)                       \
2186         || defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
2187         cpu = cfs_rq->rq->cpu;
2188 #endif
2189
2190         /*
2191          * For a group entity we need to use their owned cfs_rq_clock_task() in
2192          * case they are the parent of a throttled hierarchy.
2193          */
2194         if (entity_is_task(se))
2195                 now = cfs_rq_clock_task(cfs_rq);
2196         else
2197                 now = cfs_rq_clock_task(group_cfs_rq(se));
2198
2199         if (!__update_entity_runnable_avg(now, &se->avg, se->on_rq,
2200                                    cfs_rq->curr == se, cpu)) {
2201 #if 0
2202                 if (entity_is_task(se)) {
2203                         ratio_delta = __update_task_entity_ratio(se);
2204                         if (update_cfs_rq)
2205                         {
2206                                 cpu = cfs_rq->rq->cpu;
2207                                 cpu_rq(cpu)->cfs.avg.load_avg_ratio += ratio_delta;
2208 #ifdef CONFIG_HMP_TRACER
2209                                 trace_sched_cfs_load_update(task_of(se),se_load(se),ratio_delta, cpu);
2210 #endif /* CONFIG_HMP_TRACER */
2211                         }
2212
2213                         trace_sched_task_entity_avg(2, task_of(se), &se->avg);
2214 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2215                         if (se->on_rq) {
2216                                 update_tg_info(cfs_rq, se, ratio_delta);
2217                         }
2218 #endif
2219                 }
2220 #endif
2221                 return;
2222         }
2223
2224         contrib_delta = __update_entity_load_avg_contrib(se);
2225
2226         /* usage_avg_sum & load_avg_ratio are based on Linaro 12.11. */
2227         if (entity_is_task(se)) {
2228                 ratio_delta = __update_task_entity_ratio(se);
2229                 /*
2230                  * ratio is re-estimated just for entity of task; as
2231                  * for contrib, mark tracer here for task entity while
2232                  * mining tg's at __update_group_entity_contrib().
2233                  *
2234                  * track running usage in passing.
2235                  */
2236                 trace_sched_task_entity_avg(3, task_of(se), &se->avg);
2237         }
2238
2239         if (!update_cfs_rq)
2240                 return;
2241
2242         if (se->on_rq) {
2243                 cfs_rq->runnable_load_avg += contrib_delta;
2244                 if (entity_is_task(se)) {
2245                         cpu = cfs_rq->rq->cpu;
2246                         cpu_rq(cpu)->cfs.avg.load_avg_ratio += ratio_delta;
2247                         cpu_rq(cpu)->cfs.avg.load_avg_contrib += contrib_delta;
2248 #ifdef CONFIG_HMP_TRACER
2249                         trace_sched_cfs_load_update(task_of(se),se_load(se),ratio_delta,cpu);
2250 #endif /* CONFIG_HMP_TRACER */
2251 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2252                         update_tg_info(cfs_rq, se, ratio_delta);
2253 #endif
2254                 }
2255         }
2256         else
2257                 subtract_blocked_load_contrib(cfs_rq, -contrib_delta);
2258 }
2259
2260
2261 /*
2262  * Decay the load contributed by all blocked children and account this so that
2263  * their contribution may appropriately discounted when they wake up.
2264  */
2265 static void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq, int force_update)
2266 {
2267         u64 now = cfs_rq_clock_task(cfs_rq) >> 20;
2268         u64 decays;
2269
2270         decays = now - cfs_rq->last_decay;
2271         if (!decays && !force_update)
2272                 return;
2273
2274         if (atomic_long_read(&cfs_rq->removed_load)) {
2275                 unsigned long removed_load;
2276                 removed_load = atomic_long_xchg(&cfs_rq->removed_load, 0);
2277                 subtract_blocked_load_contrib(cfs_rq, removed_load);
2278         }
2279
2280         if (decays) {
2281                 cfs_rq->blocked_load_avg = decay_load(cfs_rq->blocked_load_avg,
2282                                                       decays);
2283                 atomic64_add(decays, &cfs_rq->decay_counter);
2284                 cfs_rq->last_decay = now;
2285         }
2286
2287         __update_cfs_rq_tg_load_contrib(cfs_rq, force_update);
2288 }
2289
2290 static inline void update_rq_runnable_avg(struct rq *rq, int runnable)
2291 {
2292         u32 contrib;
2293         int cpu = -1;   /* not used in normal case */
2294
2295 #if defined(CONFIG_HMP_FREQUENCY_INVARIANT_SCALE)                       \
2296         || defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
2297         cpu = rq->cpu;
2298 #endif
2299         __update_entity_runnable_avg(rq->clock_task, &rq->avg, runnable,
2300                               runnable, cpu);
2301         __update_tg_runnable_avg(&rq->avg, &rq->cfs);
2302         contrib = rq->avg.runnable_avg_sum * scale_load_down(1024);
2303         contrib /= (rq->avg.runnable_avg_period + 1);
2304         trace_sched_rq_runnable_ratio(cpu_of(rq), scale_load(contrib));
2305         trace_sched_rq_runnable_load(cpu_of(rq), rq->cfs.runnable_load_avg);
2306 }
2307
2308 /* Add the load generated by se into cfs_rq's child load-average */
2309 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2310                                                   struct sched_entity *se,
2311                                                   int wakeup)
2312 {
2313         int cpu = cfs_rq->rq->cpu;
2314
2315         /*
2316          * We track migrations using entity decay_count <= 0, on a wake-up
2317          * migration we use a negative decay count to track the remote decays
2318          * accumulated while sleeping.
2319          *
2320          * Newly forked tasks are enqueued with se->avg.decay_count == 0, they
2321          * are seen by enqueue_entity_load_avg() as a migration with an already
2322          * constructed load_avg_contrib.
2323          */
2324         if (unlikely(se->avg.decay_count <= 0)) {
2325                 se->avg.last_runnable_update = rq_of(cfs_rq)->clock_task;
2326                 if (se->avg.decay_count) {
2327                         /*
2328                          * In a wake-up migration we have to approximate the
2329                          * time sleeping.  This is because we can't synchronize
2330                          * clock_task between the two cpus, and it is not
2331                          * guaranteed to be read-safe.  Instead, we can
2332                          * approximate this using our carried decays, which are
2333                          * explicitly atomically readable.
2334                          */
2335                         se->avg.last_runnable_update -= (-se->avg.decay_count)
2336                                                         << 20;
2337                         update_entity_load_avg(se, 0);
2338                         /* Indicate that we're now synchronized and on-rq */
2339                         se->avg.decay_count = 0;
2340 #ifdef CONFIG_MTK_SCHED_CMP
2341                 } else {
2342                         if (entity_is_task(se))
2343                                 trace_sched_task_entity_avg(1, task_of(se), &se->avg);
2344 #endif
2345                 }
2346                 wakeup = 0;
2347         } else {
2348                 __synchronize_entity_decay(se);
2349         }
2350
2351         /* migrated tasks did not contribute to our blocked load */
2352         if (wakeup) {
2353                 subtract_blocked_load_contrib(cfs_rq, se->avg.load_avg_contrib);
2354                 update_entity_load_avg(se, 0);
2355         }
2356
2357         cfs_rq->runnable_load_avg += se->avg.load_avg_contrib;
2358 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2359         if(entity_is_task(se)){
2360                 update_tg_info(cfs_rq, se, se->avg.load_avg_ratio);
2361         }
2362 #endif
2363
2364         if (entity_is_task(se)) {
2365                 cpu_rq(cpu)->cfs.avg.load_avg_contrib += se->avg.load_avg_contrib;
2366                 cpu_rq(cpu)->cfs.avg.load_avg_ratio += se->avg.load_avg_ratio;
2367 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
2368                 cfs_nr_pending(cpu) = 0;
2369                 cfs_pending_load(cpu) = 0;
2370 #endif
2371 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2372                 if(!task_low_priority(task_of(se)->prio))
2373                         cfs_nr_normal_prio(cpu)++;
2374 #endif
2375 #ifdef CONFIG_HMP_TRACER
2376                 trace_sched_cfs_enqueue_task(task_of(se),se_load(se),cpu);
2377 #endif
2378         }
2379
2380         /* we force update consideration on load-balancer moves */
2381         update_cfs_rq_blocked_load(cfs_rq, !wakeup);
2382 }
2383
2384 /*
2385  * Remove se's load from this cfs_rq child load-average, if the entity is
2386  * transitioning to a blocked state we track its projected decay using
2387  * blocked_load_avg.
2388  */
2389 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2390                                                   struct sched_entity *se,
2391                                                   int sleep)
2392 {
2393         int cpu = cfs_rq->rq->cpu;
2394
2395         update_entity_load_avg(se, 1);
2396         /* we force update consideration on load-balancer moves */
2397         update_cfs_rq_blocked_load(cfs_rq, !sleep);
2398
2399         cfs_rq->runnable_load_avg -= se->avg.load_avg_contrib;
2400 #ifdef CONFIG_MTK_SCHED_CMP_TGS
2401         if(entity_is_task(se)){
2402                 update_tg_info(cfs_rq, se, -se->avg.load_avg_ratio);
2403         }
2404 #endif
2405
2406         if (entity_is_task(se)) {
2407                 cpu_rq(cpu)->cfs.avg.load_avg_contrib -= se->avg.load_avg_contrib;
2408                 cpu_rq(cpu)->cfs.avg.load_avg_ratio -= se->avg.load_avg_ratio;
2409 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
2410                 cfs_reset_nr_dequeuing_low_prio(cpu);
2411                 if(!task_low_priority(task_of(se)->prio))
2412                         cfs_nr_normal_prio(cpu)--;
2413 #endif
2414 #ifdef CONFIG_HMP_TRACER
2415                 trace_sched_cfs_dequeue_task(task_of(se),se_load(se),cpu);
2416 #endif
2417         }
2418
2419         if (sleep) {
2420                 cfs_rq->blocked_load_avg += se->avg.load_avg_contrib;
2421                 se->avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
2422         } /* migrations, e.g. sleep=0 leave decay_count == 0 */
2423 }
2424
2425 /*
2426  * Update the rq's load with the elapsed running time before entering
2427  * idle. if the last scheduled task is not a CFS task, idle_enter will
2428  * be the only way to update the runnable statistic.
2429  */
2430 void idle_enter_fair(struct rq *this_rq)
2431 {
2432         update_rq_runnable_avg(this_rq, 1);
2433 }
2434
2435 /*
2436  * Update the rq's load with the elapsed idle time before a task is
2437  * scheduled. if the newly scheduled task is not a CFS task, idle_exit will
2438  * be the only way to update the runnable statistic.
2439  */
2440 void idle_exit_fair(struct rq *this_rq)
2441 {
2442         update_rq_runnable_avg(this_rq, 0);
2443 }
2444
2445 #else
2446 static inline void update_entity_load_avg(struct sched_entity *se,
2447                                           int update_cfs_rq) {}
2448 static inline void update_rq_runnable_avg(struct rq *rq, int runnable) {}
2449 static inline void enqueue_entity_load_avg(struct cfs_rq *cfs_rq,
2450                                            struct sched_entity *se,
2451                                            int wakeup) {}
2452 static inline void dequeue_entity_load_avg(struct cfs_rq *cfs_rq,
2453                                            struct sched_entity *se,
2454                                            int sleep) {}
2455 static inline void update_cfs_rq_blocked_load(struct cfs_rq *cfs_rq,
2456                                               int force_update) {}
2457 #endif
2458
2459 static void enqueue_sleeper(struct cfs_rq *cfs_rq, struct sched_entity *se)
2460 {
2461 #ifdef CONFIG_SCHEDSTATS
2462         struct task_struct *tsk = NULL;
2463
2464         if (entity_is_task(se))
2465                 tsk = task_of(se);
2466
2467         if (se->statistics.sleep_start) {
2468                 u64 delta = rq_of(cfs_rq)->clock - se->statistics.sleep_start;
2469
2470                 if ((s64)delta < 0)
2471                         delta = 0;
2472
2473                 if (unlikely(delta > se->statistics.sleep_max))
2474                         se->statistics.sleep_max = delta;
2475
2476                 se->statistics.sleep_start = 0;
2477                 se->statistics.sum_sleep_runtime += delta;
2478
2479                 if (tsk) {
2480                         account_scheduler_latency(tsk, delta >> 10, 1);
2481                         trace_sched_stat_sleep(tsk, delta);
2482                 }
2483         }
2484         if (se->statistics.block_start) {
2485                 u64 delta = rq_of(cfs_rq)->clock - se->statistics.block_start;
2486
2487                 if ((s64)delta < 0)
2488                         delta = 0;
2489
2490                 if (unlikely(delta > se->statistics.block_max))
2491                         se->statistics.block_max = delta;
2492
2493                 se->statistics.block_start = 0;
2494                 se->statistics.sum_sleep_runtime += delta;
2495
2496                 if (tsk) {
2497                         if (tsk->in_iowait) {
2498                                 se->statistics.iowait_sum += delta;
2499                                 se->statistics.iowait_count++;
2500                                 trace_sched_stat_iowait(tsk, delta);
2501                         }
2502
2503                         trace_sched_stat_blocked(tsk, delta);
2504
2505                         /*
2506                          * Blocking time is in units of nanosecs, so shift by
2507                          * 20 to get a milliseconds-range estimation of the
2508                          * amount of time that the task spent sleeping:
2509                          */
2510                         if (unlikely(prof_on == SLEEP_PROFILING)) {
2511                                 profile_hits(SLEEP_PROFILING,
2512                                                 (void *)get_wchan(tsk),
2513                                                 delta >> 20);
2514                         }
2515                         account_scheduler_latency(tsk, delta >> 10, 0);
2516                 }
2517         }
2518 #endif
2519 }
2520
2521 static void check_spread(struct cfs_rq *cfs_rq, struct sched_entity *se)
2522 {
2523 #ifdef CONFIG_SCHED_DEBUG
2524         s64 d = se->vruntime - cfs_rq->min_vruntime;
2525
2526         if (d < 0)
2527                 d = -d;
2528
2529         if (d > 3*sysctl_sched_latency)
2530                 schedstat_inc(cfs_rq, nr_spread_over);
2531 #endif
2532 }
2533
2534 static void
2535 place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int initial)
2536 {
2537         u64 vruntime = cfs_rq->min_vruntime;
2538
2539         /*
2540          * The 'current' period is already promised to the current tasks,
2541          * however the extra weight of the new task will slow them down a
2542          * little, place the new task so that it fits in the slot that
2543          * stays open at the end.
2544          */
2545         if (initial && sched_feat(START_DEBIT))
2546                 vruntime += sched_vslice(cfs_rq, se);
2547
2548         /* sleeps up to a single latency don't count. */
2549         if (!initial) {
2550                 unsigned long thresh = sysctl_sched_latency;
2551
2552                 /*
2553                  * Halve their sleep time's effect, to allow
2554                  * for a gentler effect of sleepers:
2555                  */
2556                 if (sched_feat(GENTLE_FAIR_SLEEPERS))
2557                         thresh >>= 1;
2558
2559                 vruntime -= thresh;
2560         }
2561
2562         /* ensure we never gain time by being placed backwards. */
2563         se->vruntime = max_vruntime(se->vruntime, vruntime);
2564 }
2565
2566 static void check_enqueue_throttle(struct cfs_rq *cfs_rq);
2567
2568 static void
2569 enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2570 {
2571         /*
2572          * Update the normalized vruntime before updating min_vruntime
2573          * through calling update_curr().
2574          */
2575         if (!(flags & ENQUEUE_WAKEUP) || (flags & ENQUEUE_WAKING))
2576                 se->vruntime += cfs_rq->min_vruntime;
2577
2578         /*
2579          * Update run-time statistics of the 'current'.
2580          */
2581         update_curr(cfs_rq);
2582         enqueue_entity_load_avg(cfs_rq, se, flags & ENQUEUE_WAKEUP);
2583         account_entity_enqueue(cfs_rq, se);
2584         update_cfs_shares(cfs_rq);
2585
2586         if (flags & ENQUEUE_WAKEUP) {
2587                 place_entity(cfs_rq, se, 0);
2588                 enqueue_sleeper(cfs_rq, se);
2589         }
2590
2591         update_stats_enqueue(cfs_rq, se);
2592         check_spread(cfs_rq, se);
2593         if (se != cfs_rq->curr)
2594                 __enqueue_entity(cfs_rq, se);
2595         se->on_rq = 1;
2596
2597         if (cfs_rq->nr_running == 1) {
2598                 list_add_leaf_cfs_rq(cfs_rq);
2599                 check_enqueue_throttle(cfs_rq);
2600         }
2601 }
2602
2603 static void __clear_buddies_last(struct sched_entity *se)
2604 {
2605         for_each_sched_entity(se) {
2606                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2607                 if (cfs_rq->last == se)
2608                         cfs_rq->last = NULL;
2609                 else
2610                         break;
2611         }
2612 }
2613
2614 static void __clear_buddies_next(struct sched_entity *se)
2615 {
2616         for_each_sched_entity(se) {
2617                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2618                 if (cfs_rq->next == se)
2619                         cfs_rq->next = NULL;
2620                 else
2621                         break;
2622         }
2623 }
2624
2625 static void __clear_buddies_skip(struct sched_entity *se)
2626 {
2627         for_each_sched_entity(se) {
2628                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
2629                 if (cfs_rq->skip == se)
2630                         cfs_rq->skip = NULL;
2631                 else
2632                         break;
2633         }
2634 }
2635
2636 static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
2637 {
2638         if (cfs_rq->last == se)
2639                 __clear_buddies_last(se);
2640
2641         if (cfs_rq->next == se)
2642                 __clear_buddies_next(se);
2643
2644         if (cfs_rq->skip == se)
2645                 __clear_buddies_skip(se);
2646 }
2647
2648 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2649
2650 static void
2651 dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
2652 {
2653         /*
2654          * Update run-time statistics of the 'current'.
2655          */
2656         update_curr(cfs_rq);
2657         dequeue_entity_load_avg(cfs_rq, se, flags & DEQUEUE_SLEEP);
2658
2659         update_stats_dequeue(cfs_rq, se);
2660         if (flags & DEQUEUE_SLEEP) {
2661 #ifdef CONFIG_SCHEDSTATS
2662                 if (entity_is_task(se)) {
2663                         struct task_struct *tsk = task_of(se);
2664
2665                         if (tsk->state & TASK_INTERRUPTIBLE)
2666                                 se->statistics.sleep_start = rq_of(cfs_rq)->clock;
2667                         if (tsk->state & TASK_UNINTERRUPTIBLE)
2668                                 se->statistics.block_start = rq_of(cfs_rq)->clock;
2669                 }
2670 #endif
2671         }
2672
2673         clear_buddies(cfs_rq, se);
2674
2675         if (se != cfs_rq->curr)
2676                 __dequeue_entity(cfs_rq, se);
2677         se->on_rq = 0;
2678         account_entity_dequeue(cfs_rq, se);
2679
2680         /*
2681          * Normalize the entity after updating the min_vruntime because the
2682          * update can refer to the ->curr item and we need to reflect this
2683          * movement in our normalized position.
2684          */
2685         if (!(flags & DEQUEUE_SLEEP))
2686                 se->vruntime -= cfs_rq->min_vruntime;
2687
2688         /* return excess runtime on last dequeue */
2689         return_cfs_rq_runtime(cfs_rq);
2690
2691         update_min_vruntime(cfs_rq);
2692         update_cfs_shares(cfs_rq);
2693 }
2694
2695 /*
2696  * Preempt the current task with a newly woken task if needed:
2697  */
2698 static void
2699 check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
2700 {
2701         unsigned long ideal_runtime, delta_exec;
2702         struct sched_entity *se;
2703         s64 delta;
2704
2705         ideal_runtime = sched_slice(cfs_rq, curr);
2706         delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
2707         if (delta_exec > ideal_runtime) {
2708                 resched_task(rq_of(cfs_rq)->curr);
2709                 /*
2710                  * The current task ran long enough, ensure it doesn't get
2711                  * re-elected due to buddy favours.
2712                  */
2713                 clear_buddies(cfs_rq, curr);
2714                 return;
2715         }
2716
2717         /*
2718          * Ensure that a task that missed wakeup preemption by a
2719          * narrow margin doesn't have to wait for a full slice.
2720          * This also mitigates buddy induced latencies under load.
2721          */
2722         if (delta_exec < sysctl_sched_min_granularity)
2723                 return;
2724
2725         se = __pick_first_entity(cfs_rq);
2726         delta = curr->vruntime - se->vruntime;
2727
2728         if (delta < 0)
2729                 return;
2730
2731         if (delta > ideal_runtime)
2732                 resched_task(rq_of(cfs_rq)->curr);
2733 }
2734
2735 static void
2736 set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
2737 {
2738         /* 'current' is not kept within the tree. */
2739         if (se->on_rq) {
2740                 /*
2741                  * Any task has to be enqueued before it get to execute on
2742                  * a CPU. So account for the time it spent waiting on the
2743                  * runqueue.
2744                  */
2745                 update_stats_wait_end(cfs_rq, se);
2746                 __dequeue_entity(cfs_rq, se);
2747                 update_entity_load_avg(se, 1);
2748         }
2749
2750         update_stats_curr_start(cfs_rq, se);
2751         cfs_rq->curr = se;
2752 #ifdef CONFIG_SCHEDSTATS
2753         /*
2754          * Track our maximum slice length, if the CPU's load is at
2755          * least twice that of our own weight (i.e. dont track it
2756          * when there are only lesser-weight tasks around):
2757          */
2758         if (rq_of(cfs_rq)->load.weight >= 2*se->load.weight) {
2759                 se->statistics.slice_max = max(se->statistics.slice_max,
2760                         se->sum_exec_runtime - se->prev_sum_exec_runtime);
2761         }
2762 #endif
2763         se->prev_sum_exec_runtime = se->sum_exec_runtime;
2764 }
2765
2766 static int
2767 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
2768
2769 /*
2770  * Pick the next process, keeping these things in mind, in this order:
2771  * 1) keep things fair between processes/task groups
2772  * 2) pick the "next" process, since someone really wants that to run
2773  * 3) pick the "last" process, for cache locality
2774  * 4) do not run the "skip" process, if something else is available
2775  */
2776 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
2777 {
2778         struct sched_entity *se = __pick_first_entity(cfs_rq);
2779         struct sched_entity *left = se;
2780
2781         /*
2782          * Avoid running the skip buddy, if running something else can
2783          * be done without getting too unfair.
2784          */
2785         if (cfs_rq->skip == se) {
2786                 struct sched_entity *second = __pick_next_entity(se);
2787                 if (second && wakeup_preempt_entity(second, left) < 1)
2788                         se = second;
2789         }
2790
2791         /*
2792          * Prefer last buddy, try to return the CPU to a preempted task.
2793          */
2794         if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
2795                 se = cfs_rq->last;
2796
2797         /*
2798          * Someone really wants this to run. If it's not unfair, run it.
2799          */
2800         if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
2801                 se = cfs_rq->next;
2802
2803         clear_buddies(cfs_rq, se);
2804
2805         return se;
2806 }
2807
2808 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq);
2809
2810 static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev)
2811 {
2812         /*
2813          * If still on the runqueue then deactivate_task()
2814          * was not called and update_curr() has to be done:
2815          */
2816         if (prev->on_rq)
2817                 update_curr(cfs_rq);
2818
2819         /* throttle cfs_rqs exceeding runtime */
2820         check_cfs_rq_runtime(cfs_rq);
2821
2822         check_spread(cfs_rq, prev);
2823         if (prev->on_rq) {
2824                 update_stats_wait_start(cfs_rq, prev);
2825                 /* Put 'current' back into the tree. */
2826                 __enqueue_entity(cfs_rq, prev);
2827                 /* in !on_rq case, update occurred at dequeue */
2828                 update_entity_load_avg(prev, 1);
2829         }
2830         cfs_rq->curr = NULL;
2831 }
2832
2833 static void
2834 entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued)
2835 {
2836         /*
2837          * Update run-time statistics of the 'current'.
2838          */
2839         update_curr(cfs_rq);
2840
2841         /*
2842          * Ensure that runnable average is periodically updated.
2843          */
2844         update_entity_load_avg(curr, 1);
2845         update_cfs_rq_blocked_load(cfs_rq, 1);
2846         update_cfs_shares(cfs_rq);
2847
2848 #ifdef CONFIG_SCHED_HRTICK
2849         /*
2850          * queued ticks are scheduled to match the slice, so don't bother
2851          * validating it and just reschedule.
2852          */
2853         if (queued) {
2854                 resched_task(rq_of(cfs_rq)->curr);
2855                 return;
2856         }
2857         /*
2858          * don't let the period tick interfere with the hrtick preemption
2859          */
2860         if (!sched_feat(DOUBLE_TICK) &&
2861                         hrtimer_active(&rq_of(cfs_rq)->hrtick_timer))
2862                 return;
2863 #endif
2864
2865         if (cfs_rq->nr_running > 1)
2866                 check_preempt_tick(cfs_rq, curr);
2867 }
2868
2869
2870 /**************************************************
2871  * CFS bandwidth control machinery
2872  */
2873
2874 #ifdef CONFIG_CFS_BANDWIDTH
2875
2876 #ifdef HAVE_JUMP_LABEL
2877 static struct static_key __cfs_bandwidth_used;
2878
2879 static inline bool cfs_bandwidth_used(void)
2880 {
2881         return static_key_false(&__cfs_bandwidth_used);
2882 }
2883
2884 void cfs_bandwidth_usage_inc(void)
2885 {
2886         static_key_slow_inc(&__cfs_bandwidth_used);
2887 }
2888
2889 void cfs_bandwidth_usage_dec(void)
2890 {
2891         static_key_slow_dec(&__cfs_bandwidth_used);
2892 }
2893 #else /* HAVE_JUMP_LABEL */
2894 static bool cfs_bandwidth_used(void)
2895 {
2896         return true;
2897 }
2898
2899 void cfs_bandwidth_usage_inc(void) {}
2900 void cfs_bandwidth_usage_dec(void) {}
2901 #endif /* HAVE_JUMP_LABEL */
2902
2903 /*
2904  * default period for cfs group bandwidth.
2905  * default: 0.1s, units: nanoseconds
2906  */
2907 static inline u64 default_cfs_period(void)
2908 {
2909         return 100000000ULL;
2910 }
2911
2912 static inline u64 sched_cfs_bandwidth_slice(void)
2913 {
2914         return (u64)sysctl_sched_cfs_bandwidth_slice * NSEC_PER_USEC;
2915 }
2916
2917 /*
2918  * Replenish runtime according to assigned quota and update expiration time.
2919  * We use sched_clock_cpu directly instead of rq->clock to avoid adding
2920  * additional synchronization around rq->lock.
2921  *
2922  * requires cfs_b->lock
2923  */
2924 void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
2925 {
2926         u64 now;
2927
2928         if (cfs_b->quota == RUNTIME_INF)
2929                 return;
2930
2931         now = sched_clock_cpu(smp_processor_id());
2932         cfs_b->runtime = cfs_b->quota;
2933         cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
2934 }
2935
2936 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
2937 {
2938         return &tg->cfs_bandwidth;
2939 }
2940
2941 /* rq->task_clock normalized against any time this cfs_rq has spent throttled */
2942 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
2943 {
2944         if (unlikely(cfs_rq->throttle_count))
2945                 return cfs_rq->throttled_clock_task;
2946
2947         return rq_of(cfs_rq)->clock_task - cfs_rq->throttled_clock_task_time;
2948 }
2949
2950 /* returns 0 on failure to allocate runtime */
2951 static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
2952 {
2953         struct task_group *tg = cfs_rq->tg;
2954         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
2955         u64 amount = 0, min_amount, expires;
2956
2957         /* note: this is a positive sum as runtime_remaining <= 0 */
2958         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
2959
2960         raw_spin_lock(&cfs_b->lock);
2961         if (cfs_b->quota == RUNTIME_INF)
2962                 amount = min_amount;
2963         else {
2964                 /*
2965                  * If the bandwidth pool has become inactive, then at least one
2966                  * period must have elapsed since the last consumption.
2967                  * Refresh the global state and ensure bandwidth timer becomes
2968                  * active.
2969                  */
2970                 if (!cfs_b->timer_active) {
2971                         __refill_cfs_bandwidth_runtime(cfs_b);
2972                         __start_cfs_bandwidth(cfs_b);
2973                 }
2974
2975                 if (cfs_b->runtime > 0) {
2976                         amount = min(cfs_b->runtime, min_amount);
2977                         cfs_b->runtime -= amount;
2978                         cfs_b->idle = 0;
2979                 }
2980         }
2981         expires = cfs_b->runtime_expires;
2982         raw_spin_unlock(&cfs_b->lock);
2983
2984         cfs_rq->runtime_remaining += amount;
2985         /*
2986          * we may have advanced our local expiration to account for allowed
2987          * spread between our sched_clock and the one on which runtime was
2988          * issued.
2989          */
2990         if ((s64)(expires - cfs_rq->runtime_expires) > 0)
2991                 cfs_rq->runtime_expires = expires;
2992
2993         return cfs_rq->runtime_remaining > 0;
2994 }
2995
2996 /*
2997  * Note: This depends on the synchronization provided by sched_clock and the
2998  * fact that rq->clock snapshots this value.
2999  */
3000 static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3001 {
3002         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3003         struct rq *rq = rq_of(cfs_rq);
3004
3005         /* if the deadline is ahead of our clock, nothing to do */
3006         if (likely((s64)(rq->clock - cfs_rq->runtime_expires) < 0))
3007                 return;
3008
3009         if (cfs_rq->runtime_remaining < 0)
3010                 return;
3011
3012         /*
3013          * If the local deadline has passed we have to consider the
3014          * possibility that our sched_clock is 'fast' and the global deadline
3015          * has not truly expired.
3016          *
3017          * Fortunately we can check determine whether this the case by checking
3018          * whether the global deadline has advanced.
3019          */
3020
3021         if ((s64)(cfs_rq->runtime_expires - cfs_b->runtime_expires) >= 0) {
3022                 /* extend local deadline, drift is bounded above by 2 ticks */
3023                 cfs_rq->runtime_expires += TICK_NSEC;
3024         } else {
3025                 /* global deadline is ahead, expiration has passed */
3026                 cfs_rq->runtime_remaining = 0;
3027         }
3028 }
3029
3030 static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
3031                                      unsigned long delta_exec)
3032 {
3033         /* dock delta_exec before expiring quota (as it could span periods) */
3034         cfs_rq->runtime_remaining -= delta_exec;
3035         expire_cfs_rq_runtime(cfs_rq);
3036
3037         if (likely(cfs_rq->runtime_remaining > 0))
3038                 return;
3039
3040         /*
3041          * if we're unable to extend our runtime we resched so that the active
3042          * hierarchy can be throttled
3043          */
3044         if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr))
3045                 resched_task(rq_of(cfs_rq)->curr);
3046 }
3047
3048 static __always_inline
3049 void account_cfs_rq_runtime(struct cfs_rq *cfs_rq, unsigned long delta_exec)
3050 {
3051         if (!cfs_bandwidth_used() || !cfs_rq->runtime_enabled)
3052                 return;
3053
3054         __account_cfs_rq_runtime(cfs_rq, delta_exec);
3055 }
3056
3057 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3058 {
3059         return cfs_bandwidth_used() && cfs_rq->throttled;
3060 }
3061
3062 /* check whether cfs_rq, or any parent, is throttled */
3063 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3064 {
3065         return cfs_bandwidth_used() && cfs_rq->throttle_count;
3066 }
3067
3068 /*
3069  * Ensure that neither of the group entities corresponding to src_cpu or
3070  * dest_cpu are members of a throttled hierarchy when performing group
3071  * load-balance operations.
3072  */
3073 static inline int throttled_lb_pair(struct task_group *tg,
3074                                     int src_cpu, int dest_cpu)
3075 {
3076         struct cfs_rq *src_cfs_rq, *dest_cfs_rq;
3077
3078         src_cfs_rq = tg->cfs_rq[src_cpu];
3079         dest_cfs_rq = tg->cfs_rq[dest_cpu];
3080
3081         return throttled_hierarchy(src_cfs_rq) ||
3082                throttled_hierarchy(dest_cfs_rq);
3083 }
3084
3085 /* updated child weight may affect parent so we have to do this bottom up */
3086 static int tg_unthrottle_up(struct task_group *tg, void *data)
3087 {
3088         struct rq *rq = data;
3089         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3090
3091         cfs_rq->throttle_count--;
3092 #ifdef CONFIG_SMP
3093         if (!cfs_rq->throttle_count) {
3094                 /* adjust cfs_rq_clock_task() */
3095                 cfs_rq->throttled_clock_task_time += rq->clock_task -
3096                                              cfs_rq->throttled_clock_task;
3097         }
3098 #endif
3099
3100         return 0;
3101 }
3102
3103 static int tg_throttle_down(struct task_group *tg, void *data)
3104 {
3105         struct rq *rq = data;
3106         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu_of(rq)];
3107
3108         /* group is entering throttled state, stop time */
3109         if (!cfs_rq->throttle_count)
3110                 cfs_rq->throttled_clock_task = rq->clock_task;
3111         cfs_rq->throttle_count++;
3112
3113         return 0;
3114 }
3115
3116 static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
3117 {
3118         struct rq *rq = rq_of(cfs_rq);
3119         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3120         struct sched_entity *se;
3121         long task_delta, dequeue = 1;
3122
3123         se = cfs_rq->tg->se[cpu_of(rq_of(cfs_rq))];
3124
3125         /* freeze hierarchy runnable averages while throttled */
3126         rcu_read_lock();
3127         walk_tg_tree_from(cfs_rq->tg, tg_throttle_down, tg_nop, (void *)rq);
3128         rcu_read_unlock();
3129
3130         task_delta = cfs_rq->h_nr_running;
3131         for_each_sched_entity(se) {
3132                 struct cfs_rq *qcfs_rq = cfs_rq_of(se);
3133                 /* throttled entity or throttle-on-deactivate */
3134                 if (!se->on_rq)
3135                         break;
3136
3137                 if (dequeue)
3138                         dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
3139                 qcfs_rq->h_nr_running -= task_delta;
3140
3141                 if (qcfs_rq->load.weight)
3142                         dequeue = 0;
3143         }
3144
3145         if (!se)
3146                 rq->nr_running -= task_delta;
3147
3148         cfs_rq->throttled = 1;
3149         cfs_rq->throttled_clock = rq->clock;
3150         raw_spin_lock(&cfs_b->lock);
3151         list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
3152         if (!cfs_b->timer_active)
3153                 __start_cfs_bandwidth(cfs_b);
3154         raw_spin_unlock(&cfs_b->lock);
3155 }
3156
3157 void unthrottle_cfs_rq(struct cfs_rq *cfs_rq)
3158 {
3159         struct rq *rq = rq_of(cfs_rq);
3160         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3161         struct sched_entity *se;
3162         int enqueue = 1;
3163         long task_delta;
3164
3165         se = cfs_rq->tg->se[cpu_of(rq)];
3166
3167         cfs_rq->throttled = 0;
3168         raw_spin_lock(&cfs_b->lock);
3169         cfs_b->throttled_time += rq->clock - cfs_rq->throttled_clock;
3170         list_del_rcu(&cfs_rq->throttled_list);
3171         raw_spin_unlock(&cfs_b->lock);
3172
3173         update_rq_clock(rq);
3174         /* update hierarchical throttle state */
3175         walk_tg_tree_from(cfs_rq->tg, tg_nop, tg_unthrottle_up, (void *)rq);
3176
3177         if (!cfs_rq->load.weight)
3178                 return;
3179
3180         task_delta = cfs_rq->h_nr_running;
3181         for_each_sched_entity(se) {
3182                 if (se->on_rq)
3183                         enqueue = 0;
3184
3185                 cfs_rq = cfs_rq_of(se);
3186                 if (enqueue)
3187                         enqueue_entity(cfs_rq, se, ENQUEUE_WAKEUP);
3188                 cfs_rq->h_nr_running += task_delta;
3189
3190                 if (cfs_rq_throttled(cfs_rq))
3191                         break;
3192         }
3193
3194         if (!se)
3195                 rq->nr_running += task_delta;
3196
3197         /* determine whether we need to wake up potentially idle cpu */
3198         if (rq->curr == rq->idle && rq->cfs.nr_running)
3199                 resched_task(rq->curr);
3200 }
3201
3202 static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b,
3203                 u64 remaining, u64 expires)
3204 {
3205         struct cfs_rq *cfs_rq;
3206         u64 runtime = remaining;
3207
3208         rcu_read_lock();
3209         list_for_each_entry_rcu(cfs_rq, &cfs_b->throttled_cfs_rq,
3210                                 throttled_list) {
3211                 struct rq *rq = rq_of(cfs_rq);
3212
3213                 raw_spin_lock(&rq->lock);
3214                 if (!cfs_rq_throttled(cfs_rq))
3215                         goto next;
3216
3217                 runtime = -cfs_rq->runtime_remaining + 1;
3218                 if (runtime > remaining)
3219                         runtime = remaining;
3220                 remaining -= runtime;
3221
3222                 cfs_rq->runtime_remaining += runtime;
3223                 cfs_rq->runtime_expires = expires;
3224
3225                 /* we check whether we're throttled above */
3226                 if (cfs_rq->runtime_remaining > 0)
3227                         unthrottle_cfs_rq(cfs_rq);
3228
3229 next:
3230                 raw_spin_unlock(&rq->lock);
3231
3232                 if (!remaining)
3233                         break;
3234         }
3235         rcu_read_unlock();
3236
3237         return remaining;
3238 }
3239
3240 /*
3241  * Responsible for refilling a task_group's bandwidth and unthrottling its
3242  * cfs_rqs as appropriate. If there has been no activity within the last
3243  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
3244  * used to track this state.
3245  */
3246 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
3247 {
3248         u64 runtime, runtime_expires;
3249         int idle = 1, throttled;
3250
3251         raw_spin_lock(&cfs_b->lock);
3252         /* no need to continue the timer with no bandwidth constraint */
3253         if (cfs_b->quota == RUNTIME_INF)
3254                 goto out_unlock;
3255
3256         throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3257         /* idle depends on !throttled (for the case of a large deficit) */
3258         idle = cfs_b->idle && !throttled;
3259         cfs_b->nr_periods += overrun;
3260
3261         /* if we're going inactive then everything else can be deferred */
3262         if (idle)
3263                 goto out_unlock;
3264
3265         /*
3266          * if we have relooped after returning idle once, we need to update our
3267          * status as actually running, so that other cpus doing
3268          * __start_cfs_bandwidth will stop trying to cancel us.
3269          */
3270         cfs_b->timer_active = 1;
3271
3272         __refill_cfs_bandwidth_runtime(cfs_b);
3273
3274         if (!throttled) {
3275                 /* mark as potentially idle for the upcoming period */
3276                 cfs_b->idle = 1;
3277                 goto out_unlock;
3278         }
3279
3280         /* account preceding periods in which throttling occurred */
3281         cfs_b->nr_throttled += overrun;
3282
3283         /*
3284          * There are throttled entities so we must first use the new bandwidth
3285          * to unthrottle them before making it generally available.  This
3286          * ensures that all existing debts will be paid before a new cfs_rq is
3287          * allowed to run.
3288          */
3289         runtime = cfs_b->runtime;
3290         runtime_expires = cfs_b->runtime_expires;
3291         cfs_b->runtime = 0;
3292
3293         /*
3294          * This check is repeated as we are holding onto the new bandwidth
3295          * while we unthrottle.  This can potentially race with an unthrottled
3296          * group trying to acquire new bandwidth from the global pool.
3297          */
3298         while (throttled && runtime > 0) {
3299                 raw_spin_unlock(&cfs_b->lock);
3300                 /* we can't nest cfs_b->lock while distributing bandwidth */
3301                 runtime = distribute_cfs_runtime(cfs_b, runtime,
3302                                                  runtime_expires);
3303                 raw_spin_lock(&cfs_b->lock);
3304
3305                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
3306         }
3307
3308         /* return (any) remaining runtime */
3309         cfs_b->runtime = runtime;
3310         /*
3311          * While we are ensured activity in the period following an
3312          * unthrottle, this also covers the case in which the new bandwidth is
3313          * insufficient to cover the existing bandwidth deficit.  (Forcing the
3314          * timer to remain active while there are any throttled entities.)
3315          */
3316         cfs_b->idle = 0;
3317 out_unlock:
3318         if (idle)
3319                 cfs_b->timer_active = 0;
3320         raw_spin_unlock(&cfs_b->lock);
3321
3322         return idle;
3323 }
3324
3325 /* a cfs_rq won't donate quota below this amount */
3326 static const u64 min_cfs_rq_runtime = 1 * NSEC_PER_MSEC;
3327 /* minimum remaining period time to redistribute slack quota */
3328 static const u64 min_bandwidth_expiration = 2 * NSEC_PER_MSEC;
3329 /* how long we wait to gather additional slack before distributing */
3330 static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
3331
3332 /*
3333  * Are we near the end of the current quota period?
3334  *
3335  * Requires cfs_b->lock for hrtimer_expires_remaining to be safe against the
3336  * hrtimer base being cleared by __hrtimer_start_range_ns. In the case of
3337  * migrate_hrtimers, base is never cleared, so we are fine.
3338  */
3339 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
3340 {
3341         struct hrtimer *refresh_timer = &cfs_b->period_timer;
3342         u64 remaining;
3343
3344         /* if the call-back is running a quota refresh is already occurring */
3345         if (hrtimer_callback_running(refresh_timer))
3346                 return 1;
3347
3348         /* is a quota refresh about to occur? */
3349         remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
3350         if (remaining < min_expire)
3351                 return 1;
3352
3353         return 0;
3354 }
3355
3356 static void start_cfs_slack_bandwidth(struct cfs_bandwidth *cfs_b)
3357 {
3358         u64 min_left = cfs_bandwidth_slack_period + min_bandwidth_expiration;
3359
3360         /* if there's a quota refresh soon don't bother with slack */
3361         if (runtime_refresh_within(cfs_b, min_left))
3362                 return;
3363
3364         start_bandwidth_timer(&cfs_b->slack_timer,
3365                                 ns_to_ktime(cfs_bandwidth_slack_period));
3366 }
3367
3368 /* we know any runtime found here is valid as update_curr() precedes return */
3369 static void __return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3370 {
3371         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3372         s64 slack_runtime = cfs_rq->runtime_remaining - min_cfs_rq_runtime;
3373
3374         if (slack_runtime <= 0)
3375                 return;
3376
3377         raw_spin_lock(&cfs_b->lock);
3378         if (cfs_b->quota != RUNTIME_INF &&
3379             cfs_rq->runtime_expires == cfs_b->runtime_expires) {
3380                 cfs_b->runtime += slack_runtime;
3381
3382                 /* we are under rq->lock, defer unthrottling using a timer */
3383                 if (cfs_b->runtime > sched_cfs_bandwidth_slice() &&
3384                     !list_empty(&cfs_b->throttled_cfs_rq))
3385                         start_cfs_slack_bandwidth(cfs_b);
3386         }
3387         raw_spin_unlock(&cfs_b->lock);
3388
3389         /* even if it's not valid for return we don't want to try again */
3390         cfs_rq->runtime_remaining -= slack_runtime;
3391 }
3392
3393 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3394 {
3395         if (!cfs_bandwidth_used())
3396                 return;
3397
3398         if (!cfs_rq->runtime_enabled || cfs_rq->nr_running)
3399                 return;
3400
3401         __return_cfs_rq_runtime(cfs_rq);
3402 }
3403
3404 /*
3405  * This is done with a timer (instead of inline with bandwidth return) since
3406  * it's necessary to juggle rq->locks to unthrottle their respective cfs_rqs.
3407  */
3408 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
3409 {
3410         u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
3411         u64 expires;
3412
3413         /* confirm we're still not at a refresh boundary */
3414         raw_spin_lock(&cfs_b->lock);
3415         if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
3416                 raw_spin_unlock(&cfs_b->lock);
3417                 return;
3418         }
3419
3420         if (cfs_b->quota != RUNTIME_INF && cfs_b->runtime > slice) {
3421                 runtime = cfs_b->runtime;
3422                 cfs_b->runtime = 0;
3423         }
3424         expires = cfs_b->runtime_expires;
3425         raw_spin_unlock(&cfs_b->lock);
3426
3427         if (!runtime)
3428                 return;
3429
3430         runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
3431
3432         raw_spin_lock(&cfs_b->lock);
3433         if (expires == cfs_b->runtime_expires)
3434                 cfs_b->runtime = runtime;
3435         raw_spin_unlock(&cfs_b->lock);
3436 }
3437
3438 /*
3439  * When a group wakes up we want to make sure that its quota is not already
3440  * expired/exceeded, otherwise it may be allowed to steal additional ticks of
3441  * runtime as update_curr() throttling can not not trigger until it's on-rq.
3442  */
3443 static void check_enqueue_throttle(struct cfs_rq *cfs_rq)
3444 {
3445         if (!cfs_bandwidth_used())
3446                 return;
3447
3448         /* an active group must be handled by the update_curr()->put() path */
3449         if (!cfs_rq->runtime_enabled || cfs_rq->curr)
3450                 return;
3451
3452         /* ensure the group is not already throttled */
3453         if (cfs_rq_throttled(cfs_rq))
3454                 return;
3455
3456         /* update runtime allocation */
3457         account_cfs_rq_runtime(cfs_rq, 0);
3458         if (cfs_rq->runtime_remaining <= 0)
3459                 throttle_cfs_rq(cfs_rq);
3460 }
3461
3462 /* conditionally throttle active cfs_rq's from put_prev_entity() */
3463 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3464 {
3465         if (!cfs_bandwidth_used())
3466                 return;
3467
3468         if (likely(!cfs_rq->runtime_enabled || cfs_rq->runtime_remaining > 0))
3469                 return;
3470
3471         /*
3472          * it's possible for a throttled entity to be forced into a running
3473          * state (e.g. set_curr_task), in this case we're finished.
3474          */
3475         if (cfs_rq_throttled(cfs_rq))
3476                 return;
3477
3478         throttle_cfs_rq(cfs_rq);
3479 }
3480
3481 static inline u64 default_cfs_period(void);
3482 static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun);
3483 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b);
3484
3485 static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer)
3486 {
3487         struct cfs_bandwidth *cfs_b =
3488                 container_of(timer, struct cfs_bandwidth, slack_timer);
3489         do_sched_cfs_slack_timer(cfs_b);
3490
3491         return HRTIMER_NORESTART;
3492 }
3493
3494 static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer)
3495 {
3496         struct cfs_bandwidth *cfs_b =
3497                 container_of(timer, struct cfs_bandwidth, period_timer);
3498         ktime_t now;
3499         int overrun;
3500         int idle = 0;
3501
3502         for (;;) {
3503                 now = hrtimer_cb_get_time(timer);
3504                 overrun = hrtimer_forward(timer, now, cfs_b->period);
3505
3506                 if (!overrun)
3507                         break;
3508
3509                 idle = do_sched_cfs_period_timer(cfs_b, overrun);
3510         }
3511
3512         return idle ? HRTIMER_NORESTART : HRTIMER_RESTART;
3513 }
3514
3515 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3516 {
3517         raw_spin_lock_init(&cfs_b->lock);
3518         cfs_b->runtime = 0;
3519         cfs_b->quota = RUNTIME_INF;
3520         cfs_b->period = ns_to_ktime(default_cfs_period());
3521
3522         INIT_LIST_HEAD(&cfs_b->throttled_cfs_rq);
3523         hrtimer_init(&cfs_b->period_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3524         cfs_b->period_timer.function = sched_cfs_period_timer;
3525         hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
3526         cfs_b->slack_timer.function = sched_cfs_slack_timer;
3527 }
3528
3529 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
3530 {
3531         cfs_rq->runtime_enabled = 0;
3532         INIT_LIST_HEAD(&cfs_rq->throttled_list);
3533 }
3534
3535 /* requires cfs_b->lock, may release to reprogram timer */
3536 void __start_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3537 {
3538         /*
3539          * The timer may be active because we're trying to set a new bandwidth
3540          * period or because we're racing with the tear-down path
3541          * (timer_active==0 becomes visible before the hrtimer call-back
3542          * terminates).  In either case we ensure that it's re-programmed
3543          */
3544         while (unlikely(hrtimer_active(&cfs_b->period_timer)) &&
3545                hrtimer_try_to_cancel(&cfs_b->period_timer) < 0) {
3546                 /* bounce the lock to allow do_sched_cfs_period_timer to run */
3547                 raw_spin_unlock(&cfs_b->lock);
3548                 cpu_relax();
3549                 raw_spin_lock(&cfs_b->lock);
3550                 /* if someone else restarted the timer then we're done */
3551                 if (cfs_b->timer_active)
3552                         return;
3553         }
3554
3555         cfs_b->timer_active = 1;
3556         start_bandwidth_timer(&cfs_b->period_timer, cfs_b->period);
3557 }
3558
3559 static void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
3560 {
3561         hrtimer_cancel(&cfs_b->period_timer);
3562         hrtimer_cancel(&cfs_b->slack_timer);
3563 }
3564
3565 static void __maybe_unused unthrottle_offline_cfs_rqs(struct rq *rq)
3566 {
3567         struct cfs_rq *cfs_rq;
3568
3569         for_each_leaf_cfs_rq(rq, cfs_rq) {
3570                 struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg);
3571
3572                 if (!cfs_rq->runtime_enabled)
3573                         continue;
3574
3575                 /*
3576                  * clock_task is not advancing so we just need to make sure
3577                  * there's some valid quota amount
3578                  */
3579                 cfs_rq->runtime_remaining = cfs_b->quota;
3580                 if (cfs_rq_throttled(cfs_rq))
3581                         unthrottle_cfs_rq(cfs_rq);
3582         }
3583 }
3584
3585 #else /* CONFIG_CFS_BANDWIDTH */
3586 static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq)
3587 {
3588         return rq_of(cfs_rq)->clock_task;
3589 }
3590
3591 static void account_cfs_rq_runtime(struct cfs_rq *cfs_rq,
3592                                      unsigned long delta_exec) {}
3593 static void check_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3594 static void check_enqueue_throttle(struct cfs_rq *cfs_rq) {}
3595 static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3596
3597 static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq)
3598 {
3599         return 0;
3600 }
3601
3602 static inline int throttled_hierarchy(struct cfs_rq *cfs_rq)
3603 {
3604         return 0;
3605 }
3606
3607 static inline int throttled_lb_pair(struct task_group *tg,
3608                                     int src_cpu, int dest_cpu)
3609 {
3610         return 0;
3611 }
3612
3613 void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3614
3615 #ifdef CONFIG_FAIR_GROUP_SCHED
3616 static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq) {}
3617 #endif
3618
3619 static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
3620 {
3621         return NULL;
3622 }
3623 static inline void destroy_cfs_bandwidth(struct cfs_bandwidth *cfs_b) {}
3624 static inline void unthrottle_offline_cfs_rqs(struct rq *rq) {}
3625
3626 #endif /* CONFIG_CFS_BANDWIDTH */
3627
3628 /**************************************************
3629  * CFS operations on tasks:
3630  */
3631
3632 #ifdef CONFIG_SCHED_HRTICK
3633 static void hrtick_start_fair(struct rq *rq, struct task_struct *p)
3634 {
3635         struct sched_entity *se = &p->se;
3636         struct cfs_rq *cfs_rq = cfs_rq_of(se);
3637
3638         WARN_ON(task_rq(p) != rq);
3639
3640         if (cfs_rq->nr_running > 1) {
3641                 u64 slice = sched_slice(cfs_rq, se);
3642                 u64 ran = se->sum_exec_runtime - se->prev_sum_exec_runtime;
3643                 s64 delta = slice - ran;
3644
3645                 if (delta < 0) {
3646                         if (rq->curr == p)
3647                                 resched_task(p);
3648                         return;
3649                 }
3650
3651                 /*
3652                  * Don't schedule slices shorter than 10000ns, that just
3653                  * doesn't make sense. Rely on vruntime for fairness.
3654                  */
3655                 if (rq->curr != p)
3656                         delta = max_t(s64, 10000LL, delta);
3657
3658                 hrtick_start(rq, delta);
3659         }
3660 }
3661
3662 /*
3663  * called from enqueue/dequeue and updates the hrtick when the
3664  * current task is from our class and nr_running is low enough
3665  * to matter.
3666  */
3667 static void hrtick_update(struct rq *rq)
3668 {
3669         struct task_struct *curr = rq->curr;
3670
3671         if (!hrtick_enabled(rq) || curr->sched_class != &fair_sched_class)
3672                 return;
3673
3674         if (cfs_rq_of(&curr->se)->nr_running < sched_nr_latency)
3675                 hrtick_start_fair(rq, curr);
3676 }
3677 #else /* !CONFIG_SCHED_HRTICK */
3678 static inline void
3679 hrtick_start_fair(struct rq *rq, struct task_struct *p)
3680 {
3681 }
3682
3683 static inline void hrtick_update(struct rq *rq)
3684 {
3685 }
3686 #endif
3687
3688 #if defined(CONFIG_SCHED_HMP) || defined(CONFIG_MTK_SCHED_CMP)
3689
3690 /* CPU cluster statistics for task migration control */
3691 #define HMP_GB (0x1000)
3692 #define HMP_SELECT_RQ (0x2000)
3693 #define HMP_LB (0x4000)
3694 #define HMP_MAX_LOAD (NICE_0_LOAD - 1)
3695
3696
3697 struct clb_env {
3698         struct clb_stats bstats;
3699         struct clb_stats lstats;
3700         int btarget, ltarget;
3701
3702         struct cpumask *bcpus;
3703         struct cpumask *lcpus;
3704
3705         unsigned int flags;
3706         struct mcheck {
3707                 int status; /* Details of this migration check */
3708                 int result; /* Indicate whether we should perform this task migration */
3709         } mcheck;
3710 };
3711
3712 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu);
3713
3714 static void collect_cluster_stats(struct clb_stats *clbs,
3715             struct cpumask *cluster_cpus, int target)
3716 {
3717 #define HMP_RESOLUTION_SCALING (4)
3718 #define hmp_scale_down(w) ((w) >> HMP_RESOLUTION_SCALING)
3719
3720         /* Update cluster informatics */
3721         int cpu;
3722         for_each_cpu(cpu, cluster_cpus) {
3723                 if(cpu_online(cpu)) {
3724                         clbs->ncpu ++;
3725                         clbs->ntask += cpu_rq(cpu)->cfs.h_nr_running;
3726                         clbs->load_avg += cpu_rq(cpu)->cfs.avg.load_avg_ratio;
3727 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
3728                         clbs->nr_normal_prio_task += cfs_nr_normal_prio(cpu);
3729                         clbs->nr_dequeuing_low_prio += cfs_nr_dequeuing_low_prio(cpu);
3730 #endif
3731                 }
3732         }
3733
3734         if(!clbs->ncpu || NR_CPUS == target || !cpumask_test_cpu(target,cluster_cpus))
3735                 return;
3736
3737         clbs->cpu_power = (int) arch_scale_freq_power(NULL, target);
3738
3739         /* Scale current CPU compute capacity in accordance with frequency */
3740         clbs->cpu_capacity = HMP_MAX_LOAD;
3741 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
3742         if (hmp_data.freqinvar_load_scale_enabled) {
3743                 cpu = cpumask_any(cluster_cpus);
3744                 if (freq_scale[cpu].throttling == 1){
3745                         clbs->cpu_capacity *= freq_scale[cpu].curr_scale;
3746                 }else {
3747                 clbs->cpu_capacity *= freq_scale[cpu].max;
3748                 }
3749                 clbs->cpu_capacity >>= SCHED_FREQSCALE_SHIFT;
3750
3751                 if (clbs->cpu_capacity > HMP_MAX_LOAD){
3752                         clbs->cpu_capacity = HMP_MAX_LOAD;
3753                 }
3754         }
3755 #elif defined(CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY)
3756         if (topology_cpu_inv_power_en()) {
3757                 cpu = cpumask_any(cluster_cpus);
3758                 if (topology_cpu_throttling(cpu))
3759                         clbs->cpu_capacity *=
3760                                 (topology_cpu_capacity(cpu) << CPUPOWER_FREQSCALE_SHIFT)
3761                                 / (topology_max_cpu_capacity(cpu)+1);
3762                 else
3763                         clbs->cpu_capacity *= topology_max_cpu_capacity(cpu);
3764                 clbs->cpu_capacity >>= CPUPOWER_FREQSCALE_SHIFT;
3765
3766                 if (clbs->cpu_capacity > HMP_MAX_LOAD){
3767                         clbs->cpu_capacity = HMP_MAX_LOAD;
3768                 }
3769         }
3770 #endif
3771
3772         /*
3773          * Calculate available CPU capacity
3774          * Calculate available task space
3775          *
3776          * Why load ratio should be multiplied by the number of task ?
3777          * The task is the entity of scheduling unit so that we should consider
3778          * it in scheduler. Only considering task load is not enough.
3779          * Thus, multiplying the number of tasks can adjust load ratio to a more
3780          * reasonable value.
3781          */
3782         clbs->load_avg /= clbs->ncpu;
3783         clbs->acap = clbs->cpu_capacity - cpu_rq(target)->cfs.avg.load_avg_ratio;
3784         clbs->scaled_acap = hmp_scale_down(clbs->acap);
3785         clbs->scaled_atask = cpu_rq(target)->cfs.h_nr_running * cpu_rq(target)->cfs.avg.load_avg_ratio;
3786         clbs->scaled_atask = clbs->cpu_capacity - clbs->scaled_atask;
3787         clbs->scaled_atask = hmp_scale_down(clbs->scaled_atask);
3788
3789         mt_sched_printf("[%s] cpu/cluster:%d/%02lx load/len:%lu/%u stats:%d,%d,%d,%d,%d,%d,%d,%d\n", __func__,
3790                                         target, *cpumask_bits(cluster_cpus),
3791                                         cpu_rq(target)->cfs.avg.load_avg_ratio, cpu_rq(target)->cfs.h_nr_running,
3792                                         clbs->ncpu, clbs->ntask, clbs->load_avg, clbs->cpu_capacity,
3793                                         clbs->acap, clbs->scaled_acap, clbs->scaled_atask, clbs->threshold);
3794 }
3795
3796 //#define USE_HMP_DYNAMIC_THRESHOLD
3797 #if defined(CONFIG_SCHED_HMP) && defined(USE_HMP_DYNAMIC_THRESHOLD)
3798 static inline void hmp_dynamic_threshold(struct clb_env *clbenv);
3799 #endif
3800
3801 /*
3802  * Task Dynamic Migration Threshold Adjustment.
3803  *
3804  * If the workload between clusters is not balanced, adjust migration
3805  * threshold in an attempt to move task precisely.
3806  *
3807  * Diff. = Max Threshold - Min Threshold
3808  *
3809  * Dynamic UP-Threshold =
3810  *                               B_nacap               B_natask
3811  * Max Threshold - Diff. x  -----------------  x  -------------------
3812  *                          B_nacap + L_nacap     B_natask + L_natask
3813  *
3814  *
3815  * Dynamic Down-Threshold =
3816  *                               L_nacap               L_natask
3817  * Min Threshold + Diff. x  -----------------  x  -------------------
3818  *                          B_nacap + L_nacap     B_natask + L_natask
3819  */
3820 static void adj_threshold(struct clb_env *clbenv)
3821 {
3822 #define TSKLD_SHIFT (2)
3823 #define POSITIVE(x) ((int)(x) < 0 ? 0 : (x))
3824
3825         int bcpu, lcpu;
3826         unsigned long b_cap=0, l_cap=0;
3827         unsigned long b_load=0, l_load=0;
3828         unsigned long b_task=0, l_task=0;
3829         int b_nacap, l_nacap, b_natask, l_natask;
3830
3831 #if defined(CONFIG_SCHED_HMP) && defined(USE_HMP_DYNAMIC_THRESHOLD)
3832         hmp_dynamic_threshold(clbenv);
3833         return;
3834 #endif
3835
3836         bcpu = clbenv->btarget;
3837         lcpu = clbenv->ltarget;
3838         if (bcpu < nr_cpu_ids) {
3839                 b_load = cpu_rq(bcpu)->cfs.avg.load_avg_ratio;
3840                 b_task = cpu_rq(bcpu)->cfs.h_nr_running;
3841         }
3842         if (lcpu < nr_cpu_ids) {
3843                 l_load = cpu_rq(lcpu)->cfs.avg.load_avg_ratio;
3844                 l_task = cpu_rq(lcpu)->cfs.h_nr_running;
3845         }
3846
3847 #ifdef CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY
3848         if (bcpu < nr_cpu_ids) {
3849                 b_cap = topology_cpu_capacity(bcpu);
3850         }
3851         if (lcpu < nr_cpu_ids) {
3852                 l_cap = topology_cpu_capacity(lcpu);
3853         }
3854
3855         b_nacap = POSITIVE(b_cap - b_load);
3856         b_natask = POSITIVE(b_cap - ((b_task * b_load) >> TSKLD_SHIFT));
3857         l_nacap = POSITIVE(l_cap - l_load);
3858         l_natask = POSITIVE(l_cap - ((l_task * l_load) >> TSKLD_SHIFT));
3859 #else /* !CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY */
3860         b_cap = clbenv->bstats.cpu_power;
3861         l_cap = clbenv->lstats.cpu_power;
3862         b_nacap = POSITIVE(clbenv->bstats.scaled_acap *
3863                                            clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1));
3864         b_natask = POSITIVE(clbenv->bstats.scaled_atask *
3865                                                 clbenv->bstats.cpu_power / (clbenv->lstats.cpu_power+1));
3866         l_nacap = POSITIVE(clbenv->lstats.scaled_acap);
3867         l_natask = POSITIVE(clbenv->bstats.scaled_atask);
3868
3869 #endif /* CONFIG_ARCH_SCALE_INVARIANT_CPU_CAPACITY */
3870
3871         clbenv->bstats.threshold = HMP_MAX_LOAD - HMP_MAX_LOAD * b_nacap * b_natask /
3872                 ((b_nacap + l_nacap) * (b_natask + l_natask)+1);
3873         clbenv->lstats.threshold = HMP_MAX_LOAD * l_nacap * l_natask /
3874                 ((b_nacap + l_nacap) * (b_natask + l_natask)+1);
3875
3876         mt_sched_printf("[%s]\tup/dl:%4d/%4d L(%d:%4lu,%4lu/%4lu) b(%d:%4lu,%4lu/%4lu)\n", __func__,
3877                                         clbenv->bstats.threshold, clbenv->lstats.threshold,
3878                                         lcpu, l_load, l_task, l_cap,
3879                                         bcpu, b_load, b_task, b_cap);
3880 }
3881
3882 static void sched_update_clbstats(struct clb_env *clbenv)
3883 {
3884         collect_cluster_stats(&clbenv->bstats, clbenv->bcpus, clbenv->btarget);
3885         collect_cluster_stats(&clbenv->lstats, clbenv->lcpus, clbenv->ltarget);
3886         adj_threshold(clbenv);
3887 }
3888 #endif /* #if defined(CONFIG_SCHED_HMP) || defined(CONFIG_SCHED_CMP) */
3889
3890
3891 #ifdef CONFIG_SCHED_HMP
3892 /*
3893  * Heterogenous multiprocessor (HMP) optimizations
3894  *
3895  * The cpu types are distinguished using a list of hmp_domains
3896  * which each represent one cpu type using a cpumask.
3897  * The list is assumed ordered by compute capacity with the
3898  * fastest domain first.
3899  */
3900 DEFINE_PER_CPU(struct hmp_domain *, hmp_cpu_domain);
3901 /* We need to know which cpus are fast and slow. */
3902 extern struct cpumask hmp_fast_cpu_mask;
3903 extern struct cpumask hmp_slow_cpu_mask;
3904
3905 extern void __init arch_get_hmp_domains(struct list_head *hmp_domains_list);
3906
3907 /* Setup hmp_domains */
3908 static int __init hmp_cpu_mask_setup(void)
3909 {
3910         char buf[64];
3911         struct hmp_domain *domain;
3912         struct list_head *pos;
3913         int dc, cpu;
3914
3915 #if defined(CONFIG_SCHED_HMP_ENHANCEMENT) || \
3916                 defined(CONFIG_MT_RT_SCHED) || defined(CONFIG_MT_RT_SCHED_LOG)
3917         cpumask_clear(&hmp_fast_cpu_mask);
3918         cpumask_clear(&hmp_slow_cpu_mask);
3919 #endif
3920
3921         pr_debug("Initializing HMP scheduler:\n");
3922
3923         /* Initialize hmp_domains using platform code */
3924         arch_get_hmp_domains(&hmp_domains);
3925         if (list_empty(&hmp_domains)) {
3926                 pr_debug("HMP domain list is empty!\n");
3927                 return 0;
3928         }
3929
3930         /* Print hmp_domains */
3931         dc = 0;
3932         list_for_each(pos, &hmp_domains) {
3933                 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3934                 cpulist_scnprintf(buf, 64, &domain->possible_cpus);
3935                 pr_debug("  HMP domain %d: %s\n", dc, buf);
3936
3937                 /*
3938                  * According to the description in "arch_get_hmp_domains",
3939                  * Fastest domain is at head of list. Thus, the fast-cpu mask should
3940                  * be initialized first, followed by slow-cpu mask.
3941                  */
3942 #if defined(CONFIG_SCHED_HMP_ENHANCEMENT) || \
3943                         defined(CONFIG_MT_RT_SCHED) || defined(CONFIG_MT_RT_SCHED_LOG)
3944                 if(cpumask_empty(&hmp_fast_cpu_mask)) {
3945                         cpumask_copy(&hmp_fast_cpu_mask,&domain->possible_cpus);
3946                         for_each_cpu(cpu, &hmp_fast_cpu_mask)
3947                                 pr_debug("  HMP fast cpu : %d\n",cpu);
3948                 } else if (cpumask_empty(&hmp_slow_cpu_mask)){
3949                         cpumask_copy(&hmp_slow_cpu_mask,&domain->possible_cpus);
3950                         for_each_cpu(cpu, &hmp_slow_cpu_mask)
3951                                 pr_debug("  HMP slow cpu : %d\n",cpu);
3952                 }
3953 #endif
3954
3955                 for_each_cpu_mask(cpu, domain->possible_cpus) {
3956                         per_cpu(hmp_cpu_domain, cpu) = domain;
3957                 }
3958                 dc++;
3959         }
3960
3961         return 1;
3962 }
3963
3964 static struct hmp_domain *hmp_get_hmp_domain_for_cpu(int cpu)
3965 {
3966         struct hmp_domain *domain;
3967         struct list_head *pos;
3968
3969         list_for_each(pos, &hmp_domains) {
3970                 domain = list_entry(pos, struct hmp_domain, hmp_domains);
3971                 if(cpumask_test_cpu(cpu, &domain->possible_cpus))
3972                         return domain;
3973         }
3974         return NULL;
3975 }
3976
3977 static void hmp_online_cpu(int cpu)
3978 {
3979         struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
3980
3981         if(domain)
3982                 cpumask_set_cpu(cpu, &domain->cpus);
3983 }
3984
3985 static void hmp_offline_cpu(int cpu)
3986 {
3987         struct hmp_domain *domain = hmp_get_hmp_domain_for_cpu(cpu);
3988
3989         if(domain)
3990                 cpumask_clear_cpu(cpu, &domain->cpus);
3991 }
3992
3993 /*
3994  * Migration thresholds should be in the range [0..1023]
3995  * hmp_up_threshold: min. load required for migrating tasks to a faster cpu
3996  * hmp_down_threshold: max. load allowed for tasks migrating to a slower cpu
3997  * The default values (512, 256) offer good responsiveness, but may need
3998  * tweaking suit particular needs.
3999  *
4000  * hmp_up_prio: Only up migrate task with high priority (<hmp_up_prio)
4001  * hmp_next_up_threshold: Delay before next up migration (1024 ~= 1 ms)
4002  * hmp_next_down_threshold: Delay before next down migration (1024 ~= 1 ms)
4003  */
4004 #ifdef CONFIG_HMP_DYNAMIC_THRESHOLD
4005 unsigned int hmp_up_threshold = 1023;
4006 unsigned int hmp_down_threshold = 0;
4007 #else
4008 unsigned int hmp_up_threshold = 512;
4009 unsigned int hmp_down_threshold = 256;
4010 #endif
4011
4012 unsigned int hmp_next_up_threshold = 4096;
4013 unsigned int hmp_next_down_threshold = 4096;
4014 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
4015 #define hmp_last_up_migration(cpu) \
4016                         cpu_rq(cpu)->cfs.avg.hmp_last_up_migration
4017 #define hmp_last_down_migration(cpu) \
4018                         cpu_rq(cpu)->cfs.avg.hmp_last_down_migration
4019 static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p,
4020                         int prev_cpu, int new_cpu);
4021 #else
4022 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se);
4023 static unsigned int hmp_down_migration(int cpu, struct sched_entity *se);
4024 #endif
4025 static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
4026                                                 int *min_cpu);
4027
4028 /* Check if cpu is in fastest hmp_domain */
4029 static inline unsigned int hmp_cpu_is_fastest(int cpu)
4030 {
4031         struct list_head *pos;
4032
4033         pos = &hmp_cpu_domain(cpu)->hmp_domains;
4034         return pos == hmp_domains.next;
4035 }
4036
4037 /* Check if cpu is in slowest hmp_domain */
4038 static inline unsigned int hmp_cpu_is_slowest(int cpu)
4039 {
4040         struct list_head *pos;
4041
4042         pos = &hmp_cpu_domain(cpu)->hmp_domains;
4043         return list_is_last(pos, &hmp_domains);
4044 }
4045
4046 /* Next (slower) hmp_domain relative to cpu */
4047 static inline struct hmp_domain *hmp_slower_domain(int cpu)
4048 {
4049         struct list_head *pos;
4050
4051         pos = &hmp_cpu_domain(cpu)->hmp_domains;
4052         return list_entry(pos->next, struct hmp_domain, hmp_domains);
4053 }
4054
4055 /* Previous (faster) hmp_domain relative to cpu */
4056 static inline struct hmp_domain *hmp_faster_domain(int cpu)
4057 {
4058         struct list_head *pos;
4059
4060         pos = &hmp_cpu_domain(cpu)->hmp_domains;
4061         return list_entry(pos->prev, struct hmp_domain, hmp_domains);
4062 }
4063
4064 /*
4065  * Selects a cpu in previous (faster) hmp_domain
4066  * Note that cpumask_any_and() returns the first cpu in the cpumask
4067  */
4068 static inline unsigned int hmp_select_faster_cpu(struct task_struct *tsk,
4069                                                         int cpu)
4070 {
4071         int lowest_cpu=NR_CPUS;
4072         __always_unused int lowest_ratio = hmp_domain_min_load(hmp_faster_domain(cpu), &lowest_cpu);
4073         /*
4074          * If the lowest-loaded CPU in the domain is allowed by the task affinity
4075          * select that one, otherwise select one which is allowed
4076          */
4077         if(lowest_cpu < nr_cpu_ids && cpumask_test_cpu(lowest_cpu,tsk_cpus_allowed(tsk)))
4078                 return lowest_cpu;
4079         else
4080                 return cpumask_any_and(&hmp_faster_domain(cpu)->cpus,
4081                                 tsk_cpus_allowed(tsk));
4082 }
4083
4084 /*
4085  * Selects a cpu in next (slower) hmp_domain
4086  * Note that cpumask_any_and() returns the first cpu in the cpumask
4087  */
4088 static inline unsigned int hmp_select_slower_cpu(struct task_struct *tsk,
4089                                                         int cpu)
4090 {
4091         int lowest_cpu=NR_CPUS;
4092         __always_unused int lowest_ratio = hmp_domain_min_load(hmp_slower_domain(cpu), &lowest_cpu);
4093         /*
4094          * If the lowest-loaded CPU in the domain is allowed by the task affinity
4095          * select that one, otherwise select one which is allowed
4096          */
4097         if(lowest_cpu < nr_cpu_ids && cpumask_test_cpu(lowest_cpu,tsk_cpus_allowed(tsk)))
4098                 return lowest_cpu;
4099         else
4100                 return cpumask_any_and(&hmp_slower_domain(cpu)->cpus,
4101                                 tsk_cpus_allowed(tsk));
4102 }
4103
4104 static inline void hmp_next_up_delay(struct sched_entity *se, int cpu)
4105 {
4106 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
4107         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4108         hmp_last_up_migration(cpu) = cfs_rq_clock_task(cfs_rq);
4109         hmp_last_down_migration(cpu) = 0;
4110 #else
4111         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4112
4113         se->avg.hmp_last_up_migration = cfs_rq_clock_task(cfs_rq);
4114         se->avg.hmp_last_down_migration = 0;
4115 #endif
4116 }
4117
4118 static inline void hmp_next_down_delay(struct sched_entity *se, int cpu)
4119 {
4120 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
4121         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4122         hmp_last_down_migration(cpu) = cfs_rq_clock_task(cfs_rq);
4123         hmp_last_up_migration(cpu) = 0;
4124 #else
4125         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
4126
4127         se->avg.hmp_last_down_migration = cfs_rq_clock_task(cfs_rq);
4128         se->avg.hmp_last_up_migration = 0;
4129 #endif
4130 }
4131
4132 #ifdef CONFIG_HMP_VARIABLE_SCALE
4133 /*
4134  * Heterogenous multiprocessor (HMP) optimizations
4135  *
4136  * These functions allow to change the growing speed of the load_avg_ratio
4137  * by default it goes from 0 to 0.5 in LOAD_AVG_PERIOD = 32ms
4138  * This can now be changed with /sys/kernel/hmp/load_avg_period_ms.
4139  *
4140  * These functions also allow to change the up and down threshold of HMP
4141  * using /sys/kernel/hmp/{up,down}_threshold.
4142  * Both must be between 0 and 1023. The threshold that is compared
4143  * to the load_avg_ratio is up_threshold/1024 and down_threshold/1024.
4144  *
4145  * For instance, if load_avg_period = 64 and up_threshold = 512, an idle
4146  * task with a load of 0 will reach the threshold after 64ms of busy loop.
4147  *
4148  * Changing load_avg_periods_ms has the same effect than changing the
4149  * default scaling factor Y=1002/1024 in the load_avg_ratio computation to
4150  * (1002/1024.0)^(LOAD_AVG_PERIOD/load_avg_period_ms), but the last one
4151  * could trigger overflows.
4152  * For instance, with Y = 1023/1024 in __update_task_entity_contrib()
4153  * "contrib = se->avg.runnable_avg_sum * scale_load_down(se->load.weight);"
4154  * could be overflowed for a weight > 2^12 even is the load_avg_contrib
4155  * should still be a 32bits result. This would not happen by multiplicating
4156  * delta time by 1/22 and setting load_avg_period_ms = 706.
4157  */
4158
4159 /*
4160  * By scaling the delta time it end-up increasing or decrease the
4161  * growing speed of the per entity load_avg_ratio
4162  * The scale factor hmp_data.multiplier is a fixed point
4163  * number: (32-HMP_VARIABLE_SCALE_SHIFT).HMP_VARIABLE_SCALE_SHIFT
4164  */
4165 static u64 hmp_variable_scale_convert(u64 delta)
4166 {
4167         u64 high = delta >> 32ULL;
4168         u64 low = delta & 0xffffffffULL;
4169         low *= hmp_data.multiplier;
4170         high *= hmp_data.multiplier;
4171         return (low >> HMP_VARIABLE_SCALE_SHIFT)
4172                         + (high << (32ULL - HMP_VARIABLE_SCALE_SHIFT));
4173 }
4174
4175 static ssize_t hmp_show(struct kobject *kobj,
4176                                 struct attribute *attr, char *buf)
4177 {
4178         ssize_t ret = 0;
4179         struct hmp_global_attr *hmp_attr =
4180                 container_of(attr, struct hmp_global_attr, attr);
4181         int temp = *(hmp_attr->value);
4182         if (hmp_attr->to_sysfs != NULL)
4183                 temp = hmp_attr->to_sysfs(temp);
4184         ret = sprintf(buf, "%d\n", temp);
4185         return ret;
4186 }
4187
4188 static ssize_t hmp_store(struct kobject *a, struct attribute *attr,
4189                                 const char *buf, size_t count)
4190 {
4191         int temp;
4192         ssize_t ret = count;
4193         struct hmp_global_attr *hmp_attr =
4194                 container_of(attr, struct hmp_global_attr, attr);
4195         char *str = vmalloc(count + 1);
4196         if (str == NULL)
4197                 return -ENOMEM;
4198         memcpy(str, buf, count);
4199         str[count] = 0;
4200         if (sscanf(str, "%d", &temp) < 1)
4201                 ret = -EINVAL;
4202         else {
4203                 if (hmp_attr->from_sysfs != NULL)
4204                         temp = hmp_attr->from_sysfs(temp);
4205                 if (temp < 0)
4206                         ret = -EINVAL;
4207                 else
4208                         *(hmp_attr->value) = temp;
4209         }
4210         vfree(str);
4211         return ret;
4212 }
4213
4214 static int hmp_period_tofrom_sysfs(int value)
4215 {
4216         return (LOAD_AVG_PERIOD << HMP_VARIABLE_SCALE_SHIFT) / value;
4217 }
4218
4219 /* max value for threshold is 1024 */
4220 static int hmp_theshold_from_sysfs(int value)
4221 {
4222         if (value > 1024)
4223                 return -1;
4224         return value;
4225 }
4226 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
4227 /* freqinvar control is only 0,1 off/on */
4228 static int hmp_freqinvar_from_sysfs(int value)
4229 {
4230         if (value < 0 || value > 1)
4231                 return -1;
4232         return value;
4233 }
4234 #endif
4235 static void hmp_attr_add(
4236         const char *name,
4237         int *value,
4238         int (*to_sysfs)(int),
4239         int (*from_sysfs)(int))
4240 {
4241         int i = 0;
4242         while (hmp_data.attributes[i] != NULL) {
4243                 i++;
4244                 if (i >= HMP_DATA_SYSFS_MAX)
4245                         return;
4246         }
4247         hmp_data.attr[i].attr.mode = 0644;
4248         hmp_data.attr[i].show = hmp_show;
4249         hmp_data.attr[i].store = hmp_store;
4250         hmp_data.attr[i].attr.name = name;
4251         hmp_data.attr[i].value = value;
4252         hmp_data.attr[i].to_sysfs = to_sysfs;
4253         hmp_data.attr[i].from_sysfs = from_sysfs;
4254         hmp_data.attributes[i] = &hmp_data.attr[i].attr;
4255         hmp_data.attributes[i + 1] = NULL;
4256 }
4257
4258 static int hmp_attr_init(void)
4259 {
4260         int ret;
4261         memset(&hmp_data, sizeof(hmp_data), 0);
4262         /* by default load_avg_period_ms == LOAD_AVG_PERIOD
4263          * meaning no change
4264          */
4265          /* LOAD_AVG_PERIOD is too short to trigger heavy task indicator
4266                 so we change it to LOAD_AVG_VARIABLE_PERIOD */
4267         hmp_data.multiplier = hmp_period_tofrom_sysfs(LOAD_AVG_VARIABLE_PERIOD);
4268
4269         hmp_attr_add("load_avg_period_ms",
4270                 &hmp_data.multiplier,
4271                 hmp_period_tofrom_sysfs,
4272                 hmp_period_tofrom_sysfs);
4273         hmp_attr_add("up_threshold",
4274                 &hmp_up_threshold,
4275                 NULL,
4276                 hmp_theshold_from_sysfs);
4277         hmp_attr_add("down_threshold",
4278                 &hmp_down_threshold,
4279                 NULL,
4280                 hmp_theshold_from_sysfs);
4281         hmp_attr_add("init_task_load_period",
4282                 &init_task_load_period,
4283                 NULL,
4284                 NULL);
4285 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
4286         /* default frequency-invariant scaling ON */
4287         hmp_data.freqinvar_load_scale_enabled = 1;
4288         hmp_attr_add("frequency_invariant_load_scale",
4289                 &hmp_data.freqinvar_load_scale_enabled,
4290                 NULL,
4291                 hmp_freqinvar_from_sysfs);
4292 #endif
4293         hmp_data.attr_group.name = "hmp";
4294         hmp_data.attr_group.attrs = hmp_data.attributes;
4295         ret = sysfs_create_group(kernel_kobj,
4296                 &hmp_data.attr_group);
4297         return 0;
4298 }
4299 late_initcall(hmp_attr_init);
4300 #endif /* CONFIG_HMP_VARIABLE_SCALE */
4301
4302 static inline unsigned int hmp_domain_min_load(struct hmp_domain *hmpd,
4303                                                 int *min_cpu)
4304 {
4305         int cpu;
4306         int min_cpu_runnable_temp = NR_CPUS;
4307         unsigned long min_runnable_load = INT_MAX;
4308         unsigned long contrib;
4309
4310         for_each_cpu_mask(cpu, hmpd->cpus) {
4311                 /* don't use the divisor in the loop, just at the end */
4312                 contrib = cpu_rq(cpu)->avg.runnable_avg_sum * scale_load_down(1024);
4313                 if (contrib < min_runnable_load) {
4314                         min_runnable_load = contrib;
4315                         min_cpu_runnable_temp = cpu;
4316                 }
4317         }
4318
4319         if (min_cpu)
4320                 *min_cpu = min_cpu_runnable_temp;
4321
4322         /* domain will often have at least one empty CPU */
4323         return min_runnable_load ? min_runnable_load / (LOAD_AVG_MAX + 1) : 0;
4324 }
4325
4326 /*
4327  * Calculate the task starvation
4328  * This is the ratio of actually running time vs. runnable time.
4329  * If the two are equal the task is getting the cpu time it needs or
4330  * it is alone on the cpu and the cpu is fully utilized.
4331  */
4332 static inline unsigned int hmp_task_starvation(struct sched_entity *se)
4333 {
4334         u32 starvation;
4335
4336         starvation = se->avg.usage_avg_sum * scale_load_down(NICE_0_LOAD);
4337         starvation /= (se->avg.runnable_avg_sum + 1);
4338
4339         return scale_load(starvation);
4340 }
4341
4342 static inline unsigned int hmp_offload_down(int cpu, struct sched_entity *se)
4343 {
4344         int min_usage;
4345         int dest_cpu = NR_CPUS;
4346
4347         if (hmp_cpu_is_slowest(cpu))
4348                 return NR_CPUS;
4349
4350         /* Is the current domain fully loaded? */
4351         /* load < ~50% */
4352         min_usage = hmp_domain_min_load(hmp_cpu_domain(cpu), NULL);
4353         if (min_usage < (NICE_0_LOAD>>1))
4354                 return NR_CPUS;
4355
4356         /* Is the task alone on the cpu? */
4357         if (cpu_rq(cpu)->cfs.nr_running < 2)
4358                 return NR_CPUS;
4359
4360         /* Is the task actually starving? */
4361         /* >=25% ratio running/runnable = starving */
4362         if (hmp_task_starvation(se) > 768)
4363                 return NR_CPUS;
4364
4365         /* Does the slower domain have spare cycles? */
4366         min_usage = hmp_domain_min_load(hmp_slower_domain(cpu), &dest_cpu);
4367         /* load > 50% */
4368         if (min_usage > NICE_0_LOAD/2)
4369                 return NR_CPUS;
4370
4371         if (cpumask_test_cpu(dest_cpu, &hmp_slower_domain(cpu)->cpus))
4372                 return dest_cpu;
4373
4374         return NR_CPUS;
4375 }
4376 #endif /* CONFIG_SCHED_HMP */
4377
4378
4379 #ifdef CONFIG_MTK_SCHED_CMP
4380 /* Check if cpu is in fastest hmp_domain */
4381 unsigned int cmp_up_threshold = 512;
4382 unsigned int cmp_down_threshold = 256;
4383 #endif /* CONFIG_MTK_SCHED_CMP */
4384
4385 #ifdef CONFIG_MTK_SCHED_CMP_TGS
4386 static void sched_tg_enqueue_fair(struct rq *rq, struct task_struct *p)
4387 {
4388         int id;
4389         unsigned long flags;
4390         struct task_struct *tg = p->group_leader;
4391
4392         if (group_leader_is_empty(p))
4393                 return;
4394         id = get_cluster_id(rq->cpu);
4395         if (unlikely(WARN_ON(id < 0)))
4396                 return;
4397
4398         raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags);
4399         tg->thread_group_info[id].cfs_nr_running++;
4400         raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags);
4401 }
4402
4403 static void sched_tg_dequeue_fair(struct rq *rq, struct task_struct *p)
4404 {
4405         int id;
4406         unsigned long flags;
4407         struct task_struct *tg = p->group_leader;
4408
4409         if (group_leader_is_empty(p))
4410                 return;
4411         id = get_cluster_id(rq->cpu);
4412         if (unlikely(WARN_ON(id < 0)))
4413                 return;
4414
4415         raw_spin_lock_irqsave(&tg->thread_group_info_lock, flags);
4416         tg->thread_group_info[id].cfs_nr_running--;
4417         raw_spin_unlock_irqrestore(&tg->thread_group_info_lock, flags);
4418 }
4419
4420 #endif
4421 /*
4422  * The enqueue_task method is called before nr_running is
4423  * increased. Here we update the fair scheduling stats and
4424  * then put the task into the rbtree:
4425  */
4426 static void
4427 enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4428 {
4429         struct cfs_rq *cfs_rq;
4430         struct sched_entity *se = &p->se;
4431
4432         for_each_sched_entity(se) {
4433                 if (se->on_rq)
4434                         break;
4435                 cfs_rq = cfs_rq_of(se);
4436                 enqueue_entity(cfs_rq, se, flags);
4437
4438                 /*
4439                  * end evaluation on encountering a throttled cfs_rq
4440                  *
4441                  * note: in the case of encountering a throttled cfs_rq we will
4442                  * post the final h_nr_running increment below.
4443                 */
4444                 if (cfs_rq_throttled(cfs_rq))
4445                         break;
4446                 cfs_rq->h_nr_running++;
4447
4448                 flags = ENQUEUE_WAKEUP;
4449         }
4450
4451         for_each_sched_entity(se) {
4452                 cfs_rq = cfs_rq_of(se);
4453                 cfs_rq->h_nr_running++;
4454
4455                 if (cfs_rq_throttled(cfs_rq))
4456                         break;
4457
4458                 update_cfs_shares(cfs_rq);
4459                 update_entity_load_avg(se, 1);
4460         }
4461
4462         if (!se) {
4463                 update_rq_runnable_avg(rq, rq->nr_running);
4464                 inc_nr_running(rq);
4465 #ifndef CONFIG_CFS_BANDWIDTH
4466                 BUG_ON(rq->cfs.nr_running > rq->cfs.h_nr_running);
4467 #endif
4468         }
4469         hrtick_update(rq);
4470 #ifdef CONFIG_HMP_TRACER
4471         trace_sched_runqueue_length(rq->cpu,rq->nr_running);
4472         trace_sched_cfs_length(rq->cpu,rq->cfs.h_nr_running);
4473 #endif
4474 #ifdef CONFIG_MET_SCHED_HMP
4475         RqLen(rq->cpu,rq->nr_running);
4476         CfsLen(rq->cpu,rq->cfs.h_nr_running);
4477 #endif
4478
4479 #ifdef CONFIG_MTK_SCHED_CMP_TGS
4480         sched_tg_enqueue_fair(rq, p);
4481 #endif
4482 }
4483
4484 static void set_next_buddy(struct sched_entity *se);
4485
4486 /*
4487  * The dequeue_task method is called before nr_running is
4488  * decreased. We remove the task from the rbtree and
4489  * update the fair scheduling stats:
4490  */
4491 static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
4492 {
4493         struct cfs_rq *cfs_rq;
4494         struct sched_entity *se = &p->se;
4495         int task_sleep = flags & DEQUEUE_SLEEP;
4496
4497         for_each_sched_entity(se) {
4498                 cfs_rq = cfs_rq_of(se);
4499                 dequeue_entity(cfs_rq, se, flags);
4500
4501                 /*
4502                  * end evaluation on encountering a throttled cfs_rq
4503                  *
4504                  * note: in the case of encountering a throttled cfs_rq we will
4505                  * post the final h_nr_running decrement below.
4506                 */
4507                 if (cfs_rq_throttled(cfs_rq))
4508                         break;
4509                 cfs_rq->h_nr_running--;
4510
4511                 /* Don't dequeue parent if it has other entities besides us */
4512                 if (cfs_rq->load.weight) {
4513                         /*
4514                          * Bias pick_next to pick a task from this cfs_rq, as
4515                          * p is sleeping when it is within its sched_slice.
4516                          */
4517                         if (task_sleep && parent_entity(se))
4518                                 set_next_buddy(parent_entity(se));
4519
4520                         /* avoid re-evaluating load for this entity */
4521                         se = parent_entity(se);
4522                         break;
4523                 }
4524                 flags |= DEQUEUE_SLEEP;
4525         }
4526
4527         for_each_sched_entity(se) {
4528                 cfs_rq = cfs_rq_of(se);
4529                 cfs_rq->h_nr_running--;
4530
4531                 if (cfs_rq_throttled(cfs_rq))
4532                         break;
4533
4534                 update_cfs_shares(cfs_rq);
4535                 update_entity_load_avg(se, 1);
4536         }
4537
4538         if (!se) {
4539                 dec_nr_running(rq);
4540 #ifndef CONFIG_CFS_BANDWIDTH
4541                 BUG_ON(rq->cfs.nr_running > rq->cfs.h_nr_running);
4542 #endif
4543                 update_rq_runnable_avg(rq, 1);
4544         }
4545         hrtick_update(rq);
4546 #ifdef CONFIG_HMP_TRACER
4547         trace_sched_runqueue_length(rq->cpu,rq->nr_running);
4548         trace_sched_cfs_length(rq->cpu,rq->cfs.h_nr_running);
4549 #endif
4550 #ifdef CONFIG_MET_SCHED_HMP
4551         RqLen(rq->cpu,rq->nr_running);
4552         CfsLen(rq->cpu,rq->cfs.h_nr_running);
4553 #endif
4554
4555 #ifdef CONFIG_MTK_SCHED_CMP_TGS
4556         sched_tg_dequeue_fair(rq, p);
4557 #endif
4558 }
4559
4560 #ifdef CONFIG_SMP
4561 /* Used instead of source_load when we know the type == 0 */
4562 static unsigned long weighted_cpuload(const int cpu)
4563 {
4564         return cpu_rq(cpu)->cfs.runnable_load_avg;
4565 }
4566
4567 /*
4568  * Return a low guess at the load of a migration-source cpu weighted
4569  * according to the scheduling class and "nice" value.
4570  *
4571  * We want to under-estimate the load of migration sources, to
4572  * balance conservatively.
4573  */
4574 static unsigned long source_load(int cpu, int type)
4575 {
4576         struct rq *rq = cpu_rq(cpu);
4577         unsigned long total = weighted_cpuload(cpu);
4578
4579         if (type == 0 || !sched_feat(LB_BIAS))
4580                 return total;
4581
4582         return min(rq->cpu_load[type-1], total);
4583 }
4584
4585 /*
4586  * Return a high guess at the load of a migration-target cpu weighted
4587  * according to the scheduling class and "nice" value.
4588  */
4589 static unsigned long target_load(int cpu, int type)
4590 {
4591         struct rq *rq = cpu_rq(cpu);
4592         unsigned long total = weighted_cpuload(cpu);
4593
4594         if (type == 0 || !sched_feat(LB_BIAS))
4595                 return total;
4596
4597         return max(rq->cpu_load[type-1], total);
4598 }
4599
4600 static unsigned long power_of(int cpu)
4601 {
4602         return cpu_rq(cpu)->cpu_power;
4603 }
4604
4605 static unsigned long cpu_avg_load_per_task(int cpu)
4606 {
4607         struct rq *rq = cpu_rq(cpu);
4608         unsigned long nr_running = ACCESS_ONCE(rq->nr_running);
4609         unsigned long load_avg = rq->cfs.runnable_load_avg;
4610
4611         if (nr_running)
4612                 return load_avg / nr_running;
4613
4614         return 0;
4615 }
4616
4617
4618 static void task_waking_fair(struct task_struct *p)
4619 {
4620         struct sched_entity *se = &p->se;
4621         struct cfs_rq *cfs_rq = cfs_rq_of(se);
4622         u64 min_vruntime;
4623
4624 #ifndef CONFIG_64BIT
4625         u64 min_vruntime_copy;
4626
4627         do {
4628                 min_vruntime_copy = cfs_rq->min_vruntime_copy;
4629                 smp_rmb();
4630                 min_vruntime = cfs_rq->min_vruntime;
4631         } while (min_vruntime != min_vruntime_copy);
4632 #else
4633         min_vruntime = cfs_rq->min_vruntime;
4634 #endif
4635
4636         se->vruntime -= min_vruntime;
4637 }
4638
4639 #ifdef CONFIG_FAIR_GROUP_SCHED
4640 /*
4641  * effective_load() calculates the load change as seen from the root_task_group
4642  *
4643  * Adding load to a group doesn't make a group heavier, but can cause movement
4644  * of group shares between cpus. Assuming the shares were perfectly aligned one
4645  * can calculate the shift in shares.
4646  *
4647  * Calculate the effective load difference if @wl is added (subtracted) to @tg
4648  * on this @cpu and results in a total addition (subtraction) of @wg to the
4649  * total group weight.
4650  *
4651  * Given a runqueue weight distribution (rw_i) we can compute a shares
4652  * distribution (s_i) using:
4653  *
4654  *   s_i = rw_i / \Sum rw_j                                             (1)
4655  *
4656  * Suppose we have 4 CPUs and our @tg is a direct child of the root group and
4657  * has 7 equal weight tasks, distributed as below (rw_i), with the resulting
4658  * shares distribution (s_i):
4659  *
4660  *   rw_i = {   2,   4,   1,   0 }
4661  *   s_i  = { 2/7, 4/7, 1/7,   0 }
4662  *
4663  * As per wake_affine() we're interested in the load of two CPUs (the CPU the
4664  * task used to run on and the CPU the waker is running on), we need to
4665  * compute the effect of waking a task on either CPU and, in case of a sync
4666  * wakeup, compute the effect of the current task going to sleep.
4667  *
4668  * So for a change of @wl to the local @cpu with an overall group weight change
4669  * of @wl we can compute the new shares distribution (s'_i) using:
4670  *
4671  *   s'_i = (rw_i + @wl) / (@wg + \Sum rw_j)                            (2)
4672  *
4673  * Suppose we're interested in CPUs 0 and 1, and want to compute the load
4674  * differences in waking a task to CPU 0. The additional task changes the
4675  * weight and shares distributions like:
4676  *
4677  *   rw'_i = {   3,   4,   1,   0 }
4678  *   s'_i  = { 3/8, 4/8, 1/8,   0 }
4679  *
4680  * We can then compute the difference in effective weight by using:
4681  *
4682  *   dw_i = S * (s'_i - s_i)                                            (3)
4683  *
4684  * Where 'S' is the group weight as seen by its parent.
4685  *
4686  * Therefore the effective change in loads on CPU 0 would be 5/56 (3/8 - 2/7)
4687  * times the weight of the group. The effect on CPU 1 would be -4/56 (4/8 -
4688  * 4/7) times the weight of the group.
4689  */
4690 static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
4691 {
4692         struct sched_entity *se = tg->se[cpu];
4693
4694         if (!tg->parent)        /* the trivial, non-cgroup case */
4695                 return wl;
4696
4697         for_each_sched_entity(se) {
4698                 long w, W;
4699
4700                 tg = se->my_q->tg;
4701
4702                 /*
4703                  * W = @wg + \Sum rw_j
4704                  */
4705                 W = wg + calc_tg_weight(tg, se->my_q);
4706
4707                 /*
4708                  * w = rw_i + @wl
4709                  */
4710                 w = se->my_q->load.weight + wl;
4711
4712                 /*
4713                  * wl = S * s'_i; see (2)
4714                  */
4715                 if (W > 0 && w < W)
4716                         wl = (w * tg->shares) / W;
4717                 else
4718                         wl = tg->shares;
4719
4720                 /*
4721                  * Per the above, wl is the new se->load.weight value; since
4722                  * those are clipped to [MIN_SHARES, ...) do so now. See
4723                  * calc_cfs_shares().
4724                  */
4725                 if (wl < MIN_SHARES)
4726                         wl = MIN_SHARES;
4727
4728                 /*
4729                  * wl = dw_i = S * (s'_i - s_i); see (3)
4730                  */
4731                 wl -= se->load.weight;
4732
4733                 /*
4734                  * Recursively apply this logic to all parent groups to compute
4735                  * the final effective load change on the root group. Since
4736                  * only the @tg group gets extra weight, all parent groups can
4737                  * only redistribute existing shares. @wl is the shift in shares
4738                  * resulting from this level per the above.
4739                  */
4740                 wg = 0;
4741         }
4742
4743         return wl;
4744 }
4745 #else
4746
4747 static inline unsigned long effective_load(struct task_group *tg, int cpu,
4748                 unsigned long wl, unsigned long wg)
4749 {
4750         return wl;
4751 }
4752
4753 #endif
4754
4755 static int wake_affine(struct sched_domain *sd, struct task_struct *p, int sync)
4756 {
4757         s64 this_load, load;
4758         int idx, this_cpu, prev_cpu;
4759         unsigned long tl_per_task;
4760         struct task_group *tg;
4761         unsigned long weight;
4762         int balanced;
4763
4764         idx       = sd->wake_idx;
4765         this_cpu  = smp_processor_id();
4766         prev_cpu  = task_cpu(p);
4767         load      = source_load(prev_cpu, idx);
4768         this_load = target_load(this_cpu, idx);
4769
4770         /*
4771          * If sync wakeup then subtract the (maximum possible)
4772          * effect of the currently running task from the load
4773          * of the current CPU:
4774          */
4775         if (sync) {
4776                 tg = task_group(current);
4777                 weight = current->se.load.weight;
4778
4779                 this_load += effective_load(tg, this_cpu, -weight, -weight);
4780                 load += effective_load(tg, prev_cpu, 0, -weight);
4781         }
4782
4783         tg = task_group(p);
4784         weight = p->se.load.weight;
4785
4786         /*
4787          * In low-load situations, where prev_cpu is idle and this_cpu is idle
4788          * due to the sync cause above having dropped this_load to 0, we'll
4789          * always have an imbalance, but there's really nothing you can do
4790          * about that, so that's good too.
4791          *
4792          * Otherwise check if either cpus are near enough in load to allow this
4793          * task to be woken on this_cpu.
4794          */
4795         if (this_load > 0) {
4796                 s64 this_eff_load, prev_eff_load;
4797
4798                 this_eff_load = 100;
4799                 this_eff_load *= power_of(prev_cpu);
4800                 this_eff_load *= this_load +
4801                         effective_load(tg, this_cpu, weight, weight);
4802
4803                 prev_eff_load = 100 + (sd->imbalance_pct - 100) / 2;
4804                 prev_eff_load *= power_of(this_cpu);
4805                 prev_eff_load *= load + effective_load(tg, prev_cpu, 0, weight);
4806
4807                 balanced = this_eff_load <= prev_eff_load;
4808         } else
4809                 balanced = true;
4810
4811         /*
4812          * If the currently running task will sleep within
4813          * a reasonable amount of time then attract this newly
4814          * woken task:
4815          */
4816         if (sync && balanced)
4817                 return 1;
4818
4819         schedstat_inc(p, se.statistics.nr_wakeups_affine_attempts);
4820         tl_per_task = cpu_avg_load_per_task(this_cpu);
4821
4822         if (balanced ||
4823             (this_load <= load &&
4824              this_load + target_load(prev_cpu, idx) <= tl_per_task)) {
4825                 /*
4826                  * This domain has SD_WAKE_AFFINE and
4827                  * p is cache cold in this domain, and
4828                  * there is no bad imbalance.
4829                  */
4830                 schedstat_inc(sd, ttwu_move_affine);
4831                 schedstat_inc(p, se.statistics.nr_wakeups_affine);
4832
4833                 return 1;
4834         }
4835         return 0;
4836 }
4837
4838 /*
4839  * find_idlest_group finds and returns the least busy CPU group within the
4840  * domain.
4841  */
4842 static struct sched_group *
4843 find_idlest_group(struct sched_domain *sd, struct task_struct *p,
4844                   int this_cpu, int load_idx)
4845 {
4846         struct sched_group *idlest = NULL, *group = sd->groups;
4847         unsigned long min_load = ULONG_MAX, this_load = 0;
4848         int imbalance = 100 + (sd->imbalance_pct-100)/2;
4849
4850         do {
4851                 unsigned long load, avg_load;
4852                 int local_group;
4853                 int i;
4854
4855                 /* Skip over this group if it has no CPUs allowed */
4856                 if (!cpumask_intersects(sched_group_cpus(group),
4857                                         tsk_cpus_allowed(p)))
4858                         continue;
4859
4860                 local_group = cpumask_test_cpu(this_cpu,
4861                                                sched_group_cpus(group));
4862
4863                 /* Tally up the load of all CPUs in the group */
4864                 avg_load = 0;
4865
4866                 for_each_cpu(i, sched_group_cpus(group)) {
4867                         /* Bias balancing toward cpus of our domain */
4868                         if (local_group)
4869                                 load = source_load(i, load_idx);
4870                         else
4871                                 load = target_load(i, load_idx);
4872
4873                         avg_load += load;
4874
4875                         mt_sched_printf("find_idlest_group cpu=%d avg=%lu",
4876                                 i, avg_load);
4877                 }
4878
4879                 /* Adjust by relative CPU power of the group */
4880                 avg_load = (avg_load * SCHED_POWER_SCALE) / group->sgp->power;
4881
4882                 if (local_group) {
4883                         this_load = avg_load;
4884                         mt_sched_printf("find_idlest_group this_load=%lu",
4885                                 this_load);
4886                 } else if (avg_load < min_load) {
4887                         min_load = avg_load;
4888                         idlest = group;
4889                         mt_sched_printf("find_idlest_group min_load=%lu",
4890                                 min_load);
4891                 }
4892         } while (group = group->next, group != sd->groups);
4893
4894         if (!idlest || 100*this_load < imbalance*min_load){
4895                 mt_sched_printf("find_idlest_group fail this_load=%lu min_load=%lu, imbalance=%d",
4896                         this_load, min_load, imbalance);
4897                 return NULL;
4898         }
4899         return idlest;
4900 }
4901
4902 /*
4903  * find_idlest_cpu - find the idlest cpu among the cpus in group.
4904  */
4905 static int
4906 find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu)
4907 {
4908         unsigned long load, min_load = ULONG_MAX;
4909         int idlest = -1;
4910         int i;
4911
4912         /* Traverse only the allowed CPUs */
4913         for_each_cpu_and(i, sched_group_cpus(group), tsk_cpus_allowed(p)) {
4914                 load = weighted_cpuload(i);
4915
4916                 if (load < min_load || (load == min_load && i == this_cpu)) {
4917                         min_load = load;
4918                         idlest = i;
4919                 }
4920         }
4921
4922         return idlest;
4923 }
4924
4925 /*
4926  * Try and locate an idle CPU in the sched_domain.
4927  */
4928 static int select_idle_sibling(struct task_struct *p, int target)
4929 {
4930         struct sched_domain *sd;
4931         struct sched_group *sg;
4932         int i = task_cpu(p);
4933
4934         if (idle_cpu(target))
4935                 return target;
4936
4937         /*
4938          * If the prevous cpu is cache affine and idle, don't be stupid.
4939          */
4940         if (i != target && cpus_share_cache(i, target) && idle_cpu(i))
4941                 return i;
4942
4943         /*
4944          * Otherwise, iterate the domains and find an elegible idle cpu.
4945          */
4946         sd = rcu_dereference(per_cpu(sd_llc, target));
4947         for_each_lower_domain(sd) {
4948                 sg = sd->groups;
4949                 do {
4950                         if (!cpumask_intersects(sched_group_cpus(sg),
4951                                                 tsk_cpus_allowed(p)))
4952                                 goto next;
4953
4954                         for_each_cpu(i, sched_group_cpus(sg)) {
4955                                 if (i == target || !idle_cpu(i))
4956                                         goto next;
4957                         }
4958
4959                         target = cpumask_first_and(sched_group_cpus(sg),
4960                                         tsk_cpus_allowed(p));
4961                         goto done;
4962 next:
4963                         sg = sg->next;
4964                 } while (sg != sd->groups);
4965         }
4966 done:
4967         return target;
4968 }
4969
4970 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
4971 /*
4972  * @p:      the task want to be located at.
4973  * @clid:   the CPU cluster id to be search for the target CPU
4974  * @target: the appropriate CPU for task p, updated by this function.
4975  *
4976  * Return:
4977  *
4978  * 1 on success
4979  * 0 if target CPU is not found in this CPU cluster
4980  */
4981 static int cmp_find_idle_cpu(struct task_struct *p, int clid, int *target)
4982 {
4983         struct cpumask cls_cpus;
4984         int j;
4985
4986         get_cluster_cpus(&cls_cpus, clid, true);
4987         *target = cpumask_any_and(&cls_cpus, tsk_cpus_allowed(p));
4988         for_each_cpu(j, &cls_cpus) {
4989                 if (idle_cpu(j) && cpumask_test_cpu(j, tsk_cpus_allowed(p))) {
4990                         *target = j;
4991                         break;
4992                 }
4993         }
4994         if (*target >= nr_cpu_ids)
4995                 return 0; // task is not allow in this CPU cluster
4996         mt_sched_printf("wakeup %d %s cpu=%d, max_clid/max_idle_clid=%d",
4997                 p->pid, p->comm, *target, clid);
4998
4999         return 1;
5000 }
5001
5002 #if !defined(CONFIG_SCHED_HMP)
5003 #define TGS_WAKEUP_EXPERIMENT
5004 #endif
5005 static int cmp_select_task_rq_fair(struct task_struct *p, int sd_flag, int *cpu)
5006 {
5007         int i, j;
5008         int max_cnt=0, tskcnt;
5009         int tgs_clid=-1;
5010         int idle_cnt, max_idle_cnt=0;
5011         int in_prev=0, prev_cluster=0;
5012         struct cpumask cls_cpus;
5013         int num_cluster;
5014
5015         num_cluster=arch_get_nr_clusters();
5016         for(i=0; i< num_cluster; i++) {
5017                 tskcnt= p->group_leader->thread_group_info[i].nr_running;
5018                 idle_cnt = 0;
5019                 get_cluster_cpus(&cls_cpus, i, true);
5020
5021                 for_each_cpu(j, &cls_cpus) {
5022 #ifdef TGS_WAKEUP_EXPERIMENT
5023                         if (arch_is_big_little()) {
5024                                 int bcpu = arch_cpu_is_big(j);
5025                                 if (bcpu && p->se.avg.load_avg_ratio >= cmp_up_threshold) {
5026                                         in_prev = 0;
5027                                         tgs_clid = i;
5028                                         mt_sched_printf("[heavy task] wakeup load=%ld up_th=%u pid=%d name=%s cpu=%d, tgs_clid=%d in_prev=%d",
5029                                                                         p->se.avg.load_avg_ratio, cmp_up_threshold, p->pid, p->comm, *cpu, tgs_clid, in_prev);
5030                                         goto find_idle_cpu;
5031                                 }
5032                                 if (!bcpu && p->se.avg.load_avg_ratio < cmp_down_threshold) {
5033                                         in_prev = 0;
5034                                         tgs_clid = i;
5035                                         mt_sched_printf("[light task] wakeup load=%ld down_th=%u pid=%d name=%s cpu=%d, tgs_clid=%d in_prev=%d",
5036                                                                         p->se.avg.load_avg_ratio, cmp_down_threshold, p->pid, p->comm, *cpu, tgs_clid, in_prev);
5037                                         goto find_idle_cpu;
5038                                 }
5039                         }
5040 #endif
5041                         if (idle_cpu(j))
5042                                 idle_cnt++;
5043                 }
5044                 mt_sched_printf("wakeup load=%ld pid=%d name=%s clid=%d idle_cnt=%d tskcnt=%d max_cnt=%d, cls_cpus=%02lx, onlineCPU=%02lx",
5045                                                 p->se.avg.load_avg_ratio, p->pid, p->comm, i, idle_cnt, tskcnt, max_cnt,
5046                                                 *cpumask_bits(&cls_cpus), *cpumask_bits(cpu_online_mask));
5047
5048                 if (idle_cnt == 0)
5049                         continue;
5050
5051                 if (i == get_cluster_id(*cpu))
5052                         prev_cluster = 1;
5053
5054                 if (tskcnt > 0) {
5055                         if ( (tskcnt > max_cnt) || ((tskcnt == max_cnt) && prev_cluster)) {
5056                                 in_prev = prev_cluster;
5057                                 tgs_clid = i;
5058                                 max_cnt = tskcnt;
5059                         }
5060                 } else if (0 == max_cnt) {
5061                         if ((idle_cnt > max_idle_cnt) || ((idle_cnt == max_idle_cnt) && prev_cluster)) {
5062                                 in_prev = prev_cluster;
5063                                 tgs_clid = i ;
5064                                 max_idle_cnt = idle_cnt;
5065                         }
5066
5067                 }
5068                 mt_sched_printf("wakeup %d %s i=%d idle_cnt=%d tgs_clid=%d max_cnt=%d max_idle_cnt=%d in_prev=%d",
5069                         p->pid, p->comm, i, idle_cnt, tgs_clid, max_cnt, max_idle_cnt, in_prev);
5070         }
5071
5072 #ifdef TGS_WAKEUP_EXPERIMENT
5073 find_idle_cpu:
5074 #endif
5075         mt_sched_printf("wakeup %d %s cpu=%d, tgs_clid=%d in_prev=%d",
5076                 p->pid, p->comm, *cpu, tgs_clid, in_prev);
5077
5078         if(-1 != tgs_clid && !in_prev && cmp_find_idle_cpu(p, tgs_clid, cpu))
5079                 return 1;
5080
5081         return 0;
5082 }
5083 #endif
5084
5085 #ifdef CONFIG_MTK_SCHED_TRACERS
5086 #define LB_RESET                0
5087 #define LB_AFFINITY             0x10
5088 #define LB_BUDDY                0x20
5089 #define LB_FORK                 0x30
5090 #define LB_CMP_SHIFT    8
5091 #define LB_CMP                  0x4000
5092 #define LB_SMP_SHIFT    16
5093 #define LB_SMP                  0x500000
5094 #define LB_HMP_SHIFT    24
5095 #define LB_HMP                  0x60000000
5096 #endif
5097
5098 /*
5099  * sched_balance_self: balance the current task (running on cpu) in domains
5100  * that have the 'flag' flag set. In practice, this is SD_BALANCE_FORK and
5101  * SD_BALANCE_EXEC.
5102  *
5103  * Balance, ie. select the least loaded group.
5104  *
5105  * Returns the target CPU number, or the same CPU if no balancing is needed.
5106  *
5107  * preempt must be disabled.
5108  */
5109 static int
5110 select_task_rq_fair(struct task_struct *p, int sd_flag, int wake_flags)
5111 {
5112         struct sched_domain *tmp, *affine_sd = NULL, *sd = NULL;
5113         int cpu = smp_processor_id();
5114         int prev_cpu = task_cpu(p);
5115         int new_cpu = cpu;
5116         int want_affine = 0;
5117         int sync = wake_flags & WF_SYNC;
5118 #if defined(CONFIG_SCHED_HMP) && !defined(CONFIG_SCHED_HMP_ENHANCEMENT)
5119         int target_cpu = nr_cpu_ids;
5120 #endif
5121 #ifdef CONFIG_MTK_SCHED_TRACERS
5122         int policy = 0;
5123 #endif
5124 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
5125         int cmp_cpu;
5126         int cmp_cpu_found=0;
5127 #endif
5128 #ifdef CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK
5129         int buddy_cpu = per_cpu(sd_pack_buddy, cpu);
5130 #endif
5131
5132         if (p->nr_cpus_allowed == 1)
5133         {
5134 #ifdef CONFIG_MTK_SCHED_TRACERS
5135                 trace_sched_select_task_rq(p, (LB_AFFINITY | prev_cpu), prev_cpu, prev_cpu);
5136 #endif
5137                 return prev_cpu;
5138         }
5139
5140 #ifdef CONFIG_HMP_PACK_SMALL_TASK
5141 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
5142         if (check_pack_buddy(cpu, p) && PA_ENABLE) {
5143                 PACK_FROM_CPUX_TO_CPUY_COUNT[cpu][per_cpu(sd_pack_buddy, cpu)]++;
5144
5145 #ifdef CONFIG_HMP_TRACER
5146                 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_PACK_FORM_CPUX_TO_CPUY, p->pid, cpu, per_cpu(sd_pack_buddy, cpu));
5147 #endif /* CONFIG_HMP_TRACER */
5148
5149                 if(PA_MON_ENABLE) {
5150                         if(strcmp(p->comm, PA_MON) == 0 && cpu != per_cpu(sd_pack_buddy, cpu)) {
5151                                 printk(KERN_EMERG "[PA] %s PACK From CPU%d to CPU%d\n", p->comm, cpu, per_cpu(sd_pack_buddy, cpu));
5152                                 printk(KERN_EMERG "[PA]   Buddy RQ Usage = %u, Period = %u, NR = %u\n",
5153                                                                                                                 per_cpu(BUDDY_CPU_RQ_USAGE, per_cpu(sd_pack_buddy, cpu)),
5154                                                                                                                 per_cpu(BUDDY_CPU_RQ_PERIOD, per_cpu(sd_pack_buddy, cpu)),
5155                                                                                                                 per_cpu(BUDDY_CPU_RQ_NR, per_cpu(sd_pack_buddy, cpu)));
5156                                 printk(KERN_EMERG "[PA]   Task Usage = %u, Period = %u\n",
5157                                                                                                                 per_cpu(TASK_USGAE, cpu),
5158                                                                                                                 per_cpu(TASK_PERIOD, cpu));
5159                         }
5160                 }
5161 #else /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5162         if (check_pack_buddy(cpu, p)) {
5163 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5164 #ifdef CONFIG_MTK_SCHED_TRACERS
5165                 new_cpu = per_cpu(sd_pack_buddy, cpu);
5166                 trace_sched_select_task_rq(p, (LB_BUDDY | new_cpu), prev_cpu, new_cpu);
5167 #endif
5168                 return per_cpu(sd_pack_buddy, cpu);
5169         }
5170 #elif defined (CONFIG_MTK_SCHED_CMP_PACK_SMALL_TASK)
5171 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
5172         if (PA_ENABLE && (sd_flag & SD_BALANCE_WAKE) && (check_pack_buddy(buddy_cpu, p))) {
5173 #else
5174         if ((sd_flag & SD_BALANCE_WAKE) && (check_pack_buddy(buddy_cpu, p))) {
5175 #endif
5176                 struct thread_group_info_t *src_tginfo, *dst_tginfo;
5177                 src_tginfo = &p->group_leader->thread_group_info[get_cluster_id(prev_cpu)]; //Compare with previous cpu(Not current cpu)
5178                 dst_tginfo = &p->group_leader->thread_group_info[get_cluster_id(buddy_cpu)];
5179                 if((get_cluster_id(prev_cpu) == get_cluster_id(buddy_cpu)) ||
5180                         (src_tginfo->nr_running < dst_tginfo->nr_running))
5181                 {
5182 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
5183                         PACK_FROM_CPUX_TO_CPUY_COUNT[cpu][buddy_cpu]++;
5184                         mt_sched_printf("[PA]pid=%d, Pack to CPU%d(CPU%d's buddy)\n", p->pid,buddy_cpu,cpu);
5185                         if(PA_MON_ENABLE) {
5186                                 u8 i=0;
5187                                 for(i=0;i<4; i++) {
5188                                         if(strcmp(p->comm, &PA_MON[i][0]) == 0) {
5189                                                 TASK_PACK_CPU_COUNT[i][buddy_cpu]++;
5190                                                 printk(KERN_EMERG "[PA] %s PACK to CPU%d(CPU%d's buddy), pre(cpu%d)\n", p->comm, buddy_cpu,cpu, prev_cpu);
5191                                                 printk(KERN_EMERG "[PA]   Buddy RQ Usage = %u, Period = %u, NR = %u\n",
5192                                                                                                                                 per_cpu(BUDDY_CPU_RQ_USAGE, buddy_cpu),
5193                                                                                                                                 per_cpu(BUDDY_CPU_RQ_PERIOD, buddy_cpu),
5194                                                                                                                                 per_cpu(BUDDY_CPU_RQ_NR, buddy_cpu));
5195                                                 printk(KERN_EMERG "[PA]   Task Usage = %u, Period = %u\n",
5196                                                                                                                                 per_cpu(TASK_USGAE, cpu),
5197                                                                                                                                 per_cpu(TASK_PERIOD, cpu));
5198                                                 break;
5199                                         }
5200                                 }
5201                         }
5202 #endif //CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
5203 #ifdef CONFIG_MTK_SCHED_TRACERS
5204                         trace_sched_select_task_rq(p, (LB_BUDDY | buddy_cpu), prev_cpu, buddy_cpu);
5205 #endif
5206                         return buddy_cpu;
5207                 }
5208         }
5209 #endif /* CONFIG_HMP_PACK_SMALL_TASK */
5210
5211 #ifdef CONFIG_SCHED_HMP
5212         /* always put non-kernel forking tasks on a big domain */
5213         if (p->mm && (sd_flag & SD_BALANCE_FORK)) {
5214                 if(hmp_cpu_is_fastest(prev_cpu)) {
5215                         struct hmp_domain *hmpdom = list_entry(&hmp_cpu_domain(prev_cpu)->hmp_domains, struct hmp_domain, hmp_domains);
5216                         __always_unused int lowest_ratio = hmp_domain_min_load(hmpdom, &new_cpu);
5217                         if(new_cpu < nr_cpu_ids && cpumask_test_cpu(new_cpu,tsk_cpus_allowed(p)))
5218                         {
5219 #ifdef CONFIG_MTK_SCHED_TRACERS
5220                                 trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);
5221 #endif
5222                                 return new_cpu;
5223                         }
5224                         else
5225                         {
5226                                 new_cpu = cpumask_any_and(&hmp_faster_domain(cpu)->cpus,
5227                                                 tsk_cpus_allowed(p));
5228                                 if(new_cpu < nr_cpu_ids)
5229                                 {
5230 #ifdef CONFIG_MTK_SCHED_TRACERS
5231                                         trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);
5232 #endif
5233                                         return new_cpu;
5234                                 }
5235                         }
5236                 } else {
5237                         new_cpu = hmp_select_faster_cpu(p, prev_cpu);
5238                         if (new_cpu < nr_cpu_ids)
5239                         {
5240 #ifdef CONFIG_MTK_SCHED_TRACERS
5241                                 trace_sched_select_task_rq(p, (LB_FORK | new_cpu), prev_cpu, new_cpu);
5242 #endif
5243                                 return new_cpu;
5244                         }
5245                 }
5246                 // to recover new_cpu value
5247                 if (new_cpu >= nr_cpu_ids)
5248                         new_cpu = cpu;
5249         }
5250 #endif
5251
5252         if (sd_flag & SD_BALANCE_WAKE) {
5253                 if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
5254                         want_affine = 1;
5255                 new_cpu = prev_cpu;
5256         }
5257
5258 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
5259         cmp_cpu = prev_cpu;
5260         cmp_cpu_found = cmp_select_task_rq_fair(p, sd_flag, &cmp_cpu);
5261         if (cmp_cpu_found && (cmp_cpu < nr_cpu_ids)) {
5262                 cpu = cmp_cpu;
5263                 new_cpu = cmp_cpu;
5264 #ifdef CONFIG_MTK_SCHED_TRACERS
5265                 policy |= (new_cpu << LB_CMP_SHIFT);
5266                 policy |= LB_CMP;
5267 #endif
5268                 mt_sched_printf("wakeup %d %s sd_flag=%x cmp_cpu_found=%d, cpu=%d, want_affine=%d ",
5269                         p->pid, p->comm, sd_flag, cmp_cpu_found, cpu, want_affine);
5270                 goto cmp_found;
5271         }
5272 #endif
5273         rcu_read_lock();
5274         for_each_domain(cpu, tmp) {
5275                 mt_sched_printf("wakeup %d %s tmp->flags=%x, cpu=%d, prev_cpu=%d, new_cpu=%d",
5276                         p->pid, p->comm, tmp->flags, cpu, prev_cpu, new_cpu);
5277
5278                 if (!(tmp->flags & SD_LOAD_BALANCE))
5279                         continue;
5280
5281                 /*
5282                  * If both cpu and prev_cpu are part of this domain,
5283                  * cpu is a valid SD_WAKE_AFFINE target.
5284                  */
5285                 if (want_affine && (tmp->flags & SD_WAKE_AFFINE) &&
5286                     cpumask_test_cpu(prev_cpu, sched_domain_span(tmp))) {
5287                         affine_sd = tmp;
5288                         break;
5289                 }
5290
5291                 if (tmp->flags & sd_flag)
5292                         sd = tmp;
5293         }
5294
5295         if (affine_sd) {
5296                 if (cpu != prev_cpu && wake_affine(affine_sd, p, sync))
5297                         prev_cpu = cpu;
5298
5299                 new_cpu = select_idle_sibling(p, prev_cpu);
5300                 goto unlock;
5301         }
5302
5303         mt_sched_printf("wakeup %d %s sd=%p", p->pid, p->comm, sd);
5304
5305         while (sd) {
5306                 int load_idx = sd->forkexec_idx;
5307                 struct sched_group *group;
5308                 int weight;
5309
5310                 mt_sched_printf("wakeup %d %s find_idlest_group cpu=%d sd->flags=%x sd_flag=%x",
5311                         p->pid, p->comm, cpu, sd->flags, sd_flag);
5312
5313                 if (!(sd->flags & sd_flag)) {
5314                         sd = sd->child;
5315                         continue;
5316                 }
5317
5318                 if (sd_flag & SD_BALANCE_WAKE)
5319                         load_idx = sd->wake_idx;
5320
5321                 mt_sched_printf("wakeup %d %s find_idlest_group cpu=%d",
5322                         p->pid, p->comm, cpu);
5323                 group = find_idlest_group(sd, p, cpu, load_idx);
5324                 if (!group) {
5325                         sd = sd->child;
5326                         mt_sched_printf("wakeup %d %s find_idlest_group child",
5327                                 p->pid, p->comm);
5328                         continue;
5329                 }
5330
5331                 new_cpu = find_idlest_cpu(group, p, cpu);
5332                 if (new_cpu == -1 || new_cpu == cpu) {
5333                         /* Now try balancing at a lower domain level of cpu */
5334                         sd = sd->child;
5335                         mt_sched_printf("wakeup %d %s find_idlest_cpu sd->child=%p",
5336                                 p->pid, p->comm, sd);
5337                         continue;
5338                 }
5339
5340                 /* Now try balancing at a lower domain level of new_cpu */
5341                 mt_sched_printf("wakeup %d %s find_idlest_cpu cpu=%d sd=%p",
5342                                 p->pid, p->comm, new_cpu, sd);
5343                 cpu = new_cpu;
5344                 weight = sd->span_weight;
5345                 sd = NULL;
5346                 for_each_domain(cpu, tmp) {
5347                         if (weight <= tmp->span_weight)
5348                                 break;
5349                         if (tmp->flags & sd_flag)
5350                                 sd = tmp;
5351                         mt_sched_printf("wakeup %d %s sd=%p weight=%d, tmp->span_weight=%d",
5352                                 p->pid, p->comm, sd, weight, tmp->span_weight);
5353                 }
5354                 /* while loop will break here if sd == NULL */
5355         }
5356
5357 #ifdef CONFIG_MTK_SCHED_TRACERS
5358         policy |= (new_cpu << LB_SMP_SHIFT);
5359         policy |= LB_SMP;
5360 #endif
5361
5362 unlock:
5363         rcu_read_unlock();
5364         mt_sched_printf("wakeup %d %s new_cpu=%x", p->pid, p->comm, new_cpu);
5365
5366 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
5367 cmp_found:
5368 #endif
5369
5370 #ifdef CONFIG_SCHED_HMP
5371 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
5372         new_cpu = hmp_select_task_rq_fair(sd_flag, p, prev_cpu, new_cpu);
5373 #ifdef CONFIG_MTK_SCHED_TRACERS
5374         policy |= (new_cpu << LB_HMP_SHIFT);
5375         policy |= LB_HMP;
5376 #endif
5377
5378 #else
5379         if (hmp_up_migration(prev_cpu, &target_cpu, &p->se)) {
5380                 new_cpu = hmp_select_faster_cpu(p, prev_cpu);
5381                 hmp_next_up_delay(&p->se, new_cpu);
5382                 trace_sched_hmp_migrate(p, new_cpu, 0);
5383                 return new_cpu;
5384         }
5385         if (hmp_down_migration(prev_cpu, &p->se)) {
5386                 new_cpu = hmp_select_slower_cpu(p, prev_cpu);
5387                 hmp_next_down_delay(&p->se, new_cpu);
5388                 trace_sched_hmp_migrate(p, new_cpu, 0);
5389                 return new_cpu;
5390         }
5391         /* Make sure that the task stays in its previous hmp domain */
5392         if (!cpumask_test_cpu(new_cpu, &hmp_cpu_domain(prev_cpu)->cpus))
5393                 return prev_cpu;
5394 #endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
5395 #endif /* CONFIG_SCHED_HMP */
5396
5397 #ifdef CONFIG_MTK_SCHED_TRACERS
5398         trace_sched_select_task_rq(p, policy, prev_cpu, new_cpu);
5399 #endif
5400
5401 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
5402         if(PA_MON_ENABLE) {
5403                 if(strcmp(p->comm, PA_MON) == 0 && cpu != new_cpu) {
5404                         printk(KERN_EMERG "[PA] %s Select From CPU%d to CPU%d\n", p->comm, cpu, new_cpu);
5405                 }
5406         }
5407 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5408
5409         return new_cpu;
5410 }
5411
5412 /*
5413  * Called immediately before a task is migrated to a new cpu; task_cpu(p) and
5414  * cfs_rq_of(p) references at time of call are still valid and identify the
5415  * previous cpu.  However, the caller only guarantees p->pi_lock is held; no
5416  * other assumptions, including the state of rq->lock, should be made.
5417  */
5418 static void
5419 migrate_task_rq_fair(struct task_struct *p, int next_cpu)
5420 {
5421         struct sched_entity *se = &p->se;
5422         struct cfs_rq *cfs_rq = cfs_rq_of(se);
5423
5424         /*
5425          * Load tracking: accumulate removed load so that it can be processed
5426          * when we next update owning cfs_rq under rq->lock.  Tasks contribute
5427          * to blocked load iff they have a positive decay-count.  It can never
5428          * be negative here since on-rq tasks have decay-count == 0.
5429          */
5430         if (se->avg.decay_count) {
5431                 se->avg.decay_count = -__synchronize_entity_decay(se);
5432                 atomic_long_add(se->avg.load_avg_contrib,
5433                                                 &cfs_rq->removed_load);
5434         }
5435 }
5436 #endif /* CONFIG_SMP */
5437
5438 static unsigned long
5439 wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
5440 {
5441         unsigned long gran = sysctl_sched_wakeup_granularity;
5442
5443         /*
5444          * Since its curr running now, convert the gran from real-time
5445          * to virtual-time in his units.
5446          *
5447          * By using 'se' instead of 'curr' we penalize light tasks, so
5448          * they get preempted easier. That is, if 'se' < 'curr' then
5449          * the resulting gran will be larger, therefore penalizing the
5450          * lighter, if otoh 'se' > 'curr' then the resulting gran will
5451          * be smaller, again penalizing the lighter task.
5452          *
5453          * This is especially important for buddies when the leftmost
5454          * task is higher priority than the buddy.
5455          */
5456         return calc_delta_fair(gran, se);
5457 }
5458
5459 /*
5460  * Should 'se' preempt 'curr'.
5461  *
5462  *             |s1
5463  *        |s2
5464  *   |s3
5465  *         g
5466  *      |<--->|c
5467  *
5468  *  w(c, s1) = -1
5469  *  w(c, s2) =  0
5470  *  w(c, s3) =  1
5471  *
5472  */
5473 static int
5474 wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
5475 {
5476         s64 gran, vdiff = curr->vruntime - se->vruntime;
5477
5478         if (vdiff <= 0)
5479                 return -1;
5480
5481         gran = wakeup_gran(curr, se);
5482         if (vdiff > gran)
5483                 return 1;
5484
5485         return 0;
5486 }
5487
5488 static void set_last_buddy(struct sched_entity *se)
5489 {
5490         if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5491                 return;
5492
5493         for_each_sched_entity(se)
5494                 cfs_rq_of(se)->last = se;
5495 }
5496
5497 static void set_next_buddy(struct sched_entity *se)
5498 {
5499         if (entity_is_task(se) && unlikely(task_of(se)->policy == SCHED_IDLE))
5500                 return;
5501
5502         for_each_sched_entity(se)
5503                 cfs_rq_of(se)->next = se;
5504 }
5505
5506 static void set_skip_buddy(struct sched_entity *se)
5507 {
5508         for_each_sched_entity(se)
5509                 cfs_rq_of(se)->skip = se;
5510 }
5511
5512 /*
5513  * Preempt the current task with a newly woken task if needed:
5514  */
5515 static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_flags)
5516 {
5517         struct task_struct *curr = rq->curr;
5518         struct sched_entity *se = &curr->se, *pse = &p->se;
5519         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5520         int scale = cfs_rq->nr_running >= sched_nr_latency;
5521         int next_buddy_marked = 0;
5522
5523         if (unlikely(se == pse))
5524                 return;
5525
5526         /*
5527          * This is possible from callers such as move_task(), in which we
5528          * unconditionally check_prempt_curr() after an enqueue (which may have
5529          * lead to a throttle).  This both saves work and prevents false
5530          * next-buddy nomination below.
5531          */
5532         if (unlikely(throttled_hierarchy(cfs_rq_of(pse))))
5533                 return;
5534
5535         if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK)) {
5536                 set_next_buddy(pse);
5537                 next_buddy_marked = 1;
5538         }
5539
5540         /*
5541          * We can come here with TIF_NEED_RESCHED already set from new task
5542          * wake up path.
5543          *
5544          * Note: this also catches the edge-case of curr being in a throttled
5545          * group (e.g. via set_curr_task), since update_curr() (in the
5546          * enqueue of curr) will have resulted in resched being set.  This
5547          * prevents us from potentially nominating it as a false LAST_BUDDY
5548          * below.
5549          */
5550         if (test_tsk_need_resched(curr))
5551                 return;
5552
5553         /* Idle tasks are by definition preempted by non-idle tasks. */
5554         if (unlikely(curr->policy == SCHED_IDLE) &&
5555             likely(p->policy != SCHED_IDLE))
5556                 goto preempt;
5557
5558         /*
5559          * Batch and idle tasks do not preempt non-idle tasks (their preemption
5560          * is driven by the tick):
5561          */
5562         if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION))
5563                 return;
5564
5565         find_matching_se(&se, &pse);
5566         update_curr(cfs_rq_of(se));
5567         BUG_ON(!pse);
5568         if (wakeup_preempt_entity(se, pse) == 1) {
5569                 /*
5570                  * Bias pick_next to pick the sched entity that is
5571                  * triggering this preemption.
5572                  */
5573                 if (!next_buddy_marked)
5574                         set_next_buddy(pse);
5575                 goto preempt;
5576         }
5577
5578         return;
5579
5580 preempt:
5581         resched_task(curr);
5582         /*
5583          * Only set the backward buddy when the current task is still
5584          * on the rq. This can happen when a wakeup gets interleaved
5585          * with schedule on the ->pre_schedule() or idle_balance()
5586          * point, either of which can * drop the rq lock.
5587          *
5588          * Also, during early boot the idle thread is in the fair class,
5589          * for obvious reasons its a bad idea to schedule back to it.
5590          */
5591         if (unlikely(!se->on_rq || curr == rq->idle))
5592                 return;
5593
5594         if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
5595                 set_last_buddy(se);
5596 }
5597
5598 static struct task_struct *pick_next_task_fair(struct rq *rq)
5599 {
5600         struct task_struct *p;
5601         struct cfs_rq *cfs_rq = &rq->cfs;
5602         struct sched_entity *se;
5603
5604         // in case nr_running!=0 but h_nr_running==0
5605         if (!cfs_rq->nr_running || !cfs_rq->h_nr_running)
5606                 return NULL;
5607
5608         do {
5609                 se = pick_next_entity(cfs_rq);
5610                 set_next_entity(cfs_rq, se);
5611                 cfs_rq = group_cfs_rq(se);
5612         } while (cfs_rq);
5613
5614         p = task_of(se);
5615         if (hrtick_enabled(rq))
5616                 hrtick_start_fair(rq, p);
5617
5618         return p;
5619 }
5620
5621 /*
5622  * Account for a descheduled task:
5623  */
5624 static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
5625 {
5626         struct sched_entity *se = &prev->se;
5627         struct cfs_rq *cfs_rq;
5628
5629         for_each_sched_entity(se) {
5630                 cfs_rq = cfs_rq_of(se);
5631                 put_prev_entity(cfs_rq, se);
5632         }
5633 }
5634
5635 /*
5636  * sched_yield() is very simple
5637  *
5638  * The magic of dealing with the ->skip buddy is in pick_next_entity.
5639  */
5640 static void yield_task_fair(struct rq *rq)
5641 {
5642         struct task_struct *curr = rq->curr;
5643         struct cfs_rq *cfs_rq = task_cfs_rq(curr);
5644         struct sched_entity *se = &curr->se;
5645
5646         /*
5647          * Are we the only task in the tree?
5648          */
5649         if (unlikely(rq->nr_running == 1))
5650                 return;
5651
5652         clear_buddies(cfs_rq, se);
5653
5654         if (curr->policy != SCHED_BATCH) {
5655                 update_rq_clock(rq);
5656                 /*
5657                  * Update run-time statistics of the 'current'.
5658                  */
5659                 update_curr(cfs_rq);
5660                 /*
5661                  * Tell update_rq_clock() that we've just updated,
5662                  * so we don't do microscopic update in schedule()
5663                  * and double the fastpath cost.
5664                  */
5665                  rq->skip_clock_update = 1;
5666         }
5667
5668         set_skip_buddy(se);
5669 }
5670
5671 static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preempt)
5672 {
5673         struct sched_entity *se = &p->se;
5674
5675         /* throttled hierarchies are not runnable */
5676         if (!se->on_rq || throttled_hierarchy(cfs_rq_of(se)))
5677                 return false;
5678
5679         /* Tell the scheduler that we'd really like pse to run next. */
5680         set_next_buddy(se);
5681
5682         yield_task_fair(rq);
5683
5684         return true;
5685 }
5686
5687 #ifdef CONFIG_SMP
5688 /**************************************************
5689  * Fair scheduling class load-balancing methods.
5690  *
5691  * BASICS
5692  *
5693  * The purpose of load-balancing is to achieve the same basic fairness the
5694  * per-cpu scheduler provides, namely provide a proportional amount of compute
5695  * time to each task. This is expressed in the following equation:
5696  *
5697  *   W_i,n/P_i == W_j,n/P_j for all i,j                               (1)
5698  *
5699  * Where W_i,n is the n-th weight average for cpu i. The instantaneous weight
5700  * W_i,0 is defined as:
5701  *
5702  *   W_i,0 = \Sum_j w_i,j                                             (2)
5703  *
5704  * Where w_i,j is the weight of the j-th runnable task on cpu i. This weight
5705  * is derived from the nice value as per prio_to_weight[].
5706  *
5707  * The weight average is an exponential decay average of the instantaneous
5708  * weight:
5709  *
5710  *   W'_i,n = (2^n - 1) / 2^n * W_i,n + 1 / 2^n * W_i,0               (3)
5711  *
5712  * P_i is the cpu power (or compute capacity) of cpu i, typically it is the
5713  * fraction of 'recent' time available for SCHED_OTHER task execution. But it
5714  * can also include other factors [XXX].
5715  *
5716  * To achieve this balance we define a measure of imbalance which follows
5717  * directly from (1):
5718  *
5719  *   imb_i,j = max{ avg(W/P), W_i/P_i } - min{ avg(W/P), W_j/P_j }    (4)
5720  *
5721  * We them move tasks around to minimize the imbalance. In the continuous
5722  * function space it is obvious this converges, in the discrete case we get
5723  * a few fun cases generally called infeasible weight scenarios.
5724  *
5725  * [XXX expand on:
5726  *     - infeasible weights;
5727  *     - local vs global optima in the discrete case. ]
5728  *
5729  *
5730  * SCHED DOMAINS
5731  *
5732  * In order to solve the imbalance equation (4), and avoid the obvious O(n^2)
5733  * for all i,j solution, we create a tree of cpus that follows the hardware
5734  * topology where each level pairs two lower groups (or better). This results
5735  * in O(log n) layers. Furthermore we reduce the number of cpus going up the
5736  * tree to only the first of the previous level and we decrease the frequency
5737  * of load-balance at each level inv. proportional to the number of cpus in
5738  * the groups.
5739  *
5740  * This yields:
5741  *
5742  *     log_2 n     1     n
5743  *   \Sum       { --- * --- * 2^i } = O(n)                            (5)
5744  *     i = 0      2^i   2^i
5745  *                               `- size of each group
5746  *         |         |     `- number of cpus doing load-balance
5747  *         |         `- freq
5748  *         `- sum over all levels
5749  *
5750  * Coupled with a limit on how many tasks we can migrate every balance pass,
5751  * this makes (5) the runtime complexity of the balancer.
5752  *
5753  * An important property here is that each CPU is still (indirectly) connected
5754  * to every other cpu in at most O(log n) steps:
5755  *
5756  * The adjacency matrix of the resulting graph is given by:
5757  *
5758  *             log_2 n
5759  *   A_i,j = \Union     (i % 2^k == 0) && i / 2^(k+1) == j / 2^(k+1)  (6)
5760  *             k = 0
5761  *
5762  * And you'll find that:
5763  *
5764  *   A^(log_2 n)_i,j != 0  for all i,j                                (7)
5765  *
5766  * Showing there's indeed a path between every cpu in at most O(log n) steps.
5767  * The task movement gives a factor of O(m), giving a convergence complexity
5768  * of:
5769  *
5770  *   O(nm log n),  n := nr_cpus, m := nr_tasks                        (8)
5771  *
5772  *
5773  * WORK CONSERVING
5774  *
5775  * In order to avoid CPUs going idle while there's still work to do, new idle
5776  * balancing is more aggressive and has the newly idle cpu iterate up the domain
5777  * tree itself instead of relying on other CPUs to bring it work.
5778  *
5779  * This adds some complexity to both (5) and (8) but it reduces the total idle
5780  * time.
5781  *
5782  * [XXX more?]
5783  *
5784  *
5785  * CGROUPS
5786  *
5787  * Cgroups make a horror show out of (2), instead of a simple sum we get:
5788  *
5789  *                                s_k,i
5790  *   W_i,0 = \Sum_j \Prod_k w_k * -----                               (9)
5791  *                                 S_k
5792  *
5793  * Where
5794  *
5795  *   s_k,i = \Sum_j w_i,j,k  and  S_k = \Sum_i s_k,i                 (10)
5796  *
5797  * w_i,j,k is the weight of the j-th runnable task in the k-th cgroup on cpu i.
5798  *
5799  * The big problem is S_k, its a global sum needed to compute a local (W_i)
5800  * property.
5801  *
5802  * [XXX write more on how we solve this.. _after_ merging pjt's patches that
5803  *      rewrite all of this once again.]
5804  */
5805
5806 static unsigned long __read_mostly max_load_balance_interval = HZ/10;
5807
5808 #define LBF_ALL_PINNED  0x01
5809 #define LBF_NEED_BREAK  0x02
5810 #define LBF_SOME_PINNED 0x04
5811
5812 struct lb_env {
5813         struct sched_domain     *sd;
5814
5815         struct rq               *src_rq;
5816         int                     src_cpu;
5817
5818         int                     dst_cpu;
5819         struct rq               *dst_rq;
5820
5821         struct cpumask          *dst_grpmask;
5822         int                     new_dst_cpu;
5823         enum cpu_idle_type      idle;
5824         long                    imbalance;
5825         /* The set of CPUs under consideration for load-balancing */
5826         struct cpumask          *cpus;
5827
5828         unsigned int            flags;
5829
5830         unsigned int            loop;
5831         unsigned int            loop_break;
5832         unsigned int            loop_max;
5833 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
5834         int                     mt_check_cache_in_idle;
5835 #endif
5836 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5837         unsigned int            fail_reason;
5838 #endif
5839 };
5840
5841 /*
5842  * move_task - move a task from one runqueue to another runqueue.
5843  * Both runqueues must be locked.
5844  */
5845 static void move_task(struct task_struct *p, struct lb_env *env)
5846 {
5847         deactivate_task(env->src_rq, p, 0);
5848         set_task_cpu(p, env->dst_cpu);
5849         activate_task(env->dst_rq, p, 0);
5850         check_preempt_curr(env->dst_rq, p, 0);
5851
5852 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
5853         if(PA_MON_ENABLE) {
5854                 if(strcmp(p->comm, PA_MON) == 0) {
5855                         printk(KERN_EMERG "[PA] %s Balance From CPU%d to CPU%d\n", p->comm, env->src_rq->cpu, env->dst_rq->cpu);
5856                 }
5857         }
5858 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
5859
5860 }
5861
5862 /*
5863  * Is this task likely cache-hot:
5864  */
5865 #if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT)
5866 static int
5867 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd, int mt_check_cache_in_idle)
5868 #else
5869 static int
5870 task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
5871 #endif
5872 {
5873         s64 delta;
5874
5875         if (p->sched_class != &fair_sched_class)
5876                 return 0;
5877
5878         if (unlikely(p->policy == SCHED_IDLE))
5879                 return 0;
5880
5881         /*
5882          * Buddy candidates are cache hot:
5883          */
5884 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
5885         if (!mt_check_cache_in_idle){
5886                 if ( !this_rq()->nr_running && (task_rq(p)->nr_running >= 2) )
5887                         return 0;
5888         }
5889 #endif
5890         if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
5891                         (&p->se == cfs_rq_of(&p->se)->next ||
5892                          &p->se == cfs_rq_of(&p->se)->last))
5893                 return 1;
5894
5895         if (sysctl_sched_migration_cost == -1)
5896                 return 1;
5897         if (sysctl_sched_migration_cost == 0)
5898                 return 0;
5899
5900         delta = now - p->se.exec_start;
5901
5902         return delta < (s64)sysctl_sched_migration_cost;
5903 }
5904
5905 /*
5906  * can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
5907  */
5908 static
5909 int can_migrate_task(struct task_struct *p, struct lb_env *env)
5910 {
5911         int tsk_cache_hot = 0;
5912         /*
5913          * We do not migrate tasks that are:
5914          * 1) throttled_lb_pair, or
5915          * 2) cannot be migrated to this CPU due to cpus_allowed, or
5916          * 3) running (obviously), or
5917          * 4) are cache-hot on their current CPU.
5918          */
5919         if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu))
5920                 return 0;
5921
5922         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
5923                 int cpu;
5924
5925                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
5926 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5927                 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_AFFINITY);
5928                 if(mt_lbprof_lt (env->sd->mt_lbprof_nr_balance_failed, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND)){
5929                         char strings[128]="";
5930                         snprintf(strings, 128, "%d:balance fail:affinity:%d:%d:%s:0x%lu"
5931                                 , env->dst_cpu, env->src_cpu, p->pid, p->comm, p->cpus_allowed.bits[0]);
5932                         trace_sched_lbprof_log(strings);
5933                 }
5934 #endif
5935
5936                 /*
5937                  * Remember if this task can be migrated to any other cpu in
5938                  * our sched_group. We may want to revisit it if we couldn't
5939                  * meet load balance goals by pulling other tasks on src_cpu.
5940                  *
5941                  * Also avoid computing new_dst_cpu if we have already computed
5942                  * one in current iteration.
5943                  */
5944                 if (!env->dst_grpmask || (env->flags & LBF_SOME_PINNED))
5945                         return 0;
5946
5947                 /* Prevent to re-select dst_cpu via env's cpus */
5948                 for_each_cpu_and(cpu, env->dst_grpmask, env->cpus) {
5949                         if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) {
5950                                 env->flags |= LBF_SOME_PINNED;
5951                                 env->new_dst_cpu = cpu;
5952                                 break;
5953                         }
5954                 }
5955
5956                 return 0;
5957         }
5958
5959         /* Record that we found atleast one task that could run on dst_cpu */
5960         env->flags &= ~LBF_ALL_PINNED;
5961
5962         if (task_running(env->src_rq, p)) {
5963                 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
5964 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5965                 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_RUNNING);
5966                 if( mt_lbprof_lt (env->sd->mt_lbprof_nr_balance_failed, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND)){
5967                         char strings[128]="";
5968                         snprintf(strings, 128, "%d:balance fail:running:%d:%d:%s"
5969                                 , env->dst_cpu, env->src_cpu, p->pid, p->comm);
5970                         trace_sched_lbprof_log(strings);
5971                 }
5972 #endif
5973                 return 0;
5974         }
5975
5976         /*
5977          * Aggressive migration if:
5978          * 1) task is cache cold, or
5979          * 2) too many balance attempts have failed.
5980          */
5981 #if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT)
5982         tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd, env->mt_check_cache_in_idle);
5983 #else
5984         tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
5985 #endif
5986         if (!tsk_cache_hot ||
5987                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
5988
5989                 if (tsk_cache_hot) {
5990                         schedstat_inc(env->sd, lb_hot_gained[env->idle]);
5991                         schedstat_inc(p, se.statistics.nr_forced_migrations);
5992                 }
5993
5994                 return 1;
5995         }
5996
5997         schedstat_inc(p, se.statistics.nr_failed_migrations_hot);
5998 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
5999         mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_CACHEHOT);
6000         if(mt_lbprof_lt (env->sd->mt_lbprof_nr_balance_failed, MT_LBPROF_NR_BALANCED_FAILED_UPPER_BOUND)){
6001                 char strings[128]="";
6002                 snprintf(strings, 128, "%d:balance fail:cache hot:%d:%d:%s"
6003                         , env->dst_cpu, env->src_cpu, p->pid, p->comm);
6004                 trace_sched_lbprof_log(strings);
6005         }
6006 #endif
6007         return 0;
6008 }
6009
6010 /*
6011  * move_one_task tries to move exactly one task from busiest to this_rq, as
6012  * part of active balancing operations within "domain".
6013  * Returns 1 if successful and 0 otherwise.
6014  *
6015  * Called with both runqueues locked.
6016  */
6017 static int move_one_task(struct lb_env *env)
6018 {
6019         struct task_struct *p, *n;
6020 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
6021         env->mt_check_cache_in_idle = 1;
6022 #endif
6023 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
6024         mt_lbprof_stat_set(env->fail_reason, MT_LBPROF_NO_TRIGGER);
6025 #endif
6026
6027         list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
6028 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6029                 if(need_lazy_balance(env->dst_cpu, env->src_cpu, p))
6030                         continue;
6031 #endif
6032                 if (!can_migrate_task(p, env))
6033                         continue;
6034
6035                 move_task(p, env);
6036                 /*
6037                  * Right now, this is only the second place move_task()
6038                  * is called, so we can safely collect move_task()
6039                  * stats here rather than inside move_task().
6040                  */
6041                 schedstat_inc(env->sd, lb_gained[env->idle]);
6042                 return 1;
6043         }
6044         return 0;
6045 }
6046
6047 static unsigned long task_h_load(struct task_struct *p);
6048
6049 static const unsigned int sched_nr_migrate_break = 32;
6050
6051 /* in second round load balance, we migrate heavy load_weight task
6052      as long as RT tasks exist in busy cpu*/
6053 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
6054         #define over_imbalance(lw, im) \
6055                 (((lw)/2 > (im)) && \
6056         ((env->mt_check_cache_in_idle==1) || \
6057          (env->src_rq->rt.rt_nr_running==0) || \
6058          (pulled>0)))
6059 #else
6060         #define over_imbalance(lw, im) (((lw) / 2) > (im))
6061 #endif
6062
6063 /*
6064  * move_tasks tries to move up to imbalance weighted load from busiest to
6065  * this_rq, as part of a balancing operation within domain "sd".
6066  * Returns 1 if successful and 0 otherwise.
6067  *
6068  * Called with both runqueues locked.
6069  */
6070 static int move_tasks(struct lb_env *env)
6071 {
6072         struct list_head *tasks = &env->src_rq->cfs_tasks;
6073         struct task_struct *p;
6074         unsigned long load;
6075         int pulled = 0;
6076
6077         if (env->imbalance <= 0)
6078                 return 0;
6079
6080         mt_sched_printf("move_tasks start ");
6081
6082         while (!list_empty(tasks)) {
6083                 p = list_first_entry(tasks, struct task_struct, se.group_node);
6084
6085                 env->loop++;
6086                 /* We've more or less seen every task there is, call it quits */
6087                 if (env->loop > env->loop_max)
6088                         break;
6089
6090                 /* take a breather every nr_migrate tasks */
6091                 if (env->loop > env->loop_break) {
6092                         env->loop_break += sched_nr_migrate_break;
6093                         env->flags |= LBF_NEED_BREAK;
6094                         break;
6095                 }
6096 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6097                 if(need_lazy_balance(env->dst_cpu, env->src_cpu, p))
6098                         goto next;
6099 #endif
6100                 if (!can_migrate_task(p, env))
6101                         goto next;
6102
6103                 load = task_h_load(p);
6104
6105                 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed)
6106                         goto next;
6107
6108                 if (over_imbalance(load, env->imbalance))
6109                         {
6110                         goto next;
6111                         }
6112
6113                 move_task(p, env);
6114                 pulled++;
6115                 env->imbalance -= load;
6116
6117 #ifdef CONFIG_PREEMPT
6118                 /*
6119                  * NEWIDLE balancing is a source of latency, so preemptible
6120                  * kernels will stop after the first task is pulled to minimize
6121                  * the critical section.
6122                  */
6123                 if (env->idle == CPU_NEWLY_IDLE)
6124                         break;
6125 #endif
6126
6127                 /*
6128                  * We only want to steal up to the prescribed amount of
6129                  * weighted load.
6130                  */
6131                 if (env->imbalance <= 0)
6132                         break;
6133
6134                 continue;
6135 next:
6136                 list_move_tail(&p->se.group_node, tasks);
6137         }
6138
6139         /*
6140          * Right now, this is one of only two places move_task() is called,
6141          * so we can safely collect move_task() stats here rather than
6142          * inside move_task().
6143          */
6144         schedstat_add(env->sd, lb_gained[env->idle], pulled);
6145
6146         mt_sched_printf("move_tasks end");
6147
6148         return pulled;
6149 }
6150
6151 #ifdef CONFIG_MTK_SCHED_CMP
6152 #ifdef CONFIG_MTK_SCHED_CMP_TGS
6153 static int cmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
6154 {
6155         struct sched_domain *sd = env->sd;
6156
6157         BUG_ON(sd == NULL);
6158
6159         if (!(sd->flags & SD_BALANCE_TG))
6160                 return 0;
6161
6162         if (arch_is_multi_cluster()) {
6163                 int src_clid, dst_clid;
6164                 int src_nr_cpus;
6165                 struct thread_group_info_t *src_tginfo, *dst_tginfo;
6166
6167                 src_clid = get_cluster_id(env->src_cpu);
6168                 dst_clid = get_cluster_id(env->dst_cpu);
6169                 BUG_ON(dst_clid == -1 || src_clid == -1);
6170                 BUG_ON(p == NULL || p->group_leader == NULL);
6171                 src_tginfo = &p->group_leader->thread_group_info[src_clid];
6172                 dst_tginfo = &p->group_leader->thread_group_info[dst_clid];
6173                 src_nr_cpus = nr_cpus_in_cluster(src_clid, false);
6174
6175 #ifdef CONFIG_MT_SCHED_INFO
6176                 mt_sched_printf("check rule0: pid=%d comm=%s load=%ld src:clid=%d tginfo->nr_running=%ld nr_cpus=%d load_avg_ratio=%ld",
6177                         p->pid, p->comm, p->se.avg.load_avg_ratio,
6178                         src_clid, src_tginfo->nr_running, src_nr_cpus,
6179                         src_tginfo->load_avg_ratio);
6180 #endif
6181 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
6182                 if ( (!thread_group_empty(p)) &&
6183                          (src_tginfo->nr_running <= src_nr_cpus) &&
6184                          (src_tginfo->nr_running > dst_tginfo->nr_running)){
6185                         mt_sched_printf("hit ruleA: bypass pid=%d comm=%s src:nr_running=%lu nr_cpus=%d dst:nr_running=%lu",
6186                                 p->pid, p->comm, src_tginfo->nr_running, src_nr_cpus, dst_tginfo->nr_running);
6187                         return 0;
6188                 }
6189 #endif
6190         }
6191         return 1;
6192 }
6193
6194 static int need_migrate_task_immediately(struct task_struct *p,
6195            struct lb_env *env, struct clb_env *clbenv)
6196 {
6197         struct sched_domain *sd = env->sd;
6198
6199         BUG_ON(sd == NULL);
6200
6201         if (arch_is_big_little()) {
6202                 mt_sched_printf("[%s] b.L arch", __func__);
6203 #ifdef CONFIG_MT_SCHED_INFO
6204                 mt_sched_printf("check rule0: pid=%d comm=%s src=%d dst=%d p->prio=%d p->se.avg.load_avg_ratio=%ld",
6205                         p->pid, p->comm, env->src_cpu, env->dst_cpu, p->prio, p->se.avg.load_avg_ratio);
6206 #endif
6207                 /* from LITTLE to big */
6208                 if (arch_cpu_is_little(env->src_cpu) && arch_cpu_is_big(env->dst_cpu)) {
6209                         BUG_ON(env->src_cpu != clbenv->ltarget);
6210                         if (p->se.avg.load_avg_ratio >= clbenv->bstats.threshold)
6211                                 return 1;
6212
6213                 /* from big to LITTLE */
6214                 } else if (arch_cpu_is_big(env->src_cpu) && arch_cpu_is_little(env->dst_cpu)) {
6215                         BUG_ON(env->src_cpu != clbenv->btarget);
6216                         if (p->se.avg.load_avg_ratio < clbenv->lstats.threshold)
6217                                 return 1;
6218                 }
6219                 return 0;
6220         }
6221
6222         if (arch_is_multi_cluster() && (sd->flags & SD_BALANCE_TG)) {
6223                 int src_clid, dst_clid;
6224                 int src_nr_cpus;
6225                 struct thread_group_info_t *src_tginfo, *dst_tginfo;
6226
6227                 src_clid = get_cluster_id(env->src_cpu);
6228                 dst_clid = get_cluster_id(env->dst_cpu);
6229                 BUG_ON(dst_clid == -1 || src_clid == -1);
6230                 BUG_ON(p == NULL || p->group_leader == NULL);
6231                 src_tginfo = &p->group_leader->thread_group_info[src_clid];
6232                 dst_tginfo = &p->group_leader->thread_group_info[dst_clid];
6233                 src_nr_cpus = nr_cpus_in_cluster(src_clid, false);
6234                 mt_sched_printf("[%s] L.L arch", __func__);
6235
6236                 if ((p->se.avg.load_avg_ratio*4  >=  NICE_0_LOAD*3) &&
6237                         src_tginfo->nr_running > src_nr_cpus &&
6238                         src_tginfo->load_avg_ratio*10 > NICE_0_LOAD*src_nr_cpus*9) {
6239                         //pr_warn("[%s] hit rule0, candidate_load_move/load_move (%ld/%ld)\n",
6240                         //      __func__, candidate_load_move, env->imbalance);
6241                         return 1;
6242                 }
6243         }
6244
6245         return 0;
6246 }
6247 #endif
6248
6249 /*
6250  * move_tasks tries to move up to load_move weighted load from busiest to
6251  * this_rq, as part of a balancing operation within domain "sd".
6252  * Returns 1 if successful and 0 otherwise.
6253  *
6254  * Called with both runqueues locked.
6255  */
6256 static int cmp_move_tasks(struct sched_domain *sd, struct lb_env *env)
6257 {
6258         struct list_head *tasks = &env->src_rq->cfs_tasks;
6259         struct task_struct *p;
6260         unsigned long load = 0;
6261         int pulled = 0;
6262
6263         long tg_load_move, other_load_move;
6264         struct list_head tg_tasks, other_tasks;
6265         int src_clid, dst_clid;
6266 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
6267         struct cpumask tmp, *cpus = &tmp;
6268 #endif
6269 #ifdef MTK_QUICK
6270         int flag = 0;
6271 #endif
6272         struct clb_env clbenv;
6273         struct cpumask srcmask, dstmask;
6274
6275         if (env->imbalance <= 0)
6276                 return 0;
6277
6278         other_load_move = env->imbalance;
6279         INIT_LIST_HEAD(&other_tasks);
6280
6281 //      if (sd->flags & SD_BALANCE_TG) {
6282                 tg_load_move = env->imbalance;
6283                 INIT_LIST_HEAD(&tg_tasks);
6284                 src_clid = get_cluster_id(env->src_cpu);
6285                 dst_clid = get_cluster_id(env->dst_cpu);
6286                 BUG_ON(dst_clid == -1 || src_clid == -1);
6287
6288 #ifdef CONFIG_MTK_SCHED_CMP_TGS_WAKEUP
6289                 get_cluster_cpus(cpus, src_clid, true);
6290 #endif
6291                 mt_sched_printf("move_tasks_tg start: src:cpu=%d clid=%d runnable_load=%lu dst:cpu=%d clid=%d runnable_load=%lu imbalance=%ld curr->on_rq=%d",
6292                         env->src_cpu, src_clid, cpu_rq(env->src_cpu)->cfs.runnable_load_avg,
6293                         env->dst_cpu, dst_clid, cpu_rq(env->dst_cpu)->cfs.runnable_load_avg,
6294                         env->imbalance, env->dst_rq->curr->on_rq);
6295 //      }
6296
6297         mt_sched_printf("max=%d busiest->nr_running=%d",
6298                 env->loop_max, cpu_rq(env->src_cpu)->nr_running);
6299
6300         if (arch_is_big_little()) {
6301                 get_cluster_cpus(&srcmask, src_clid, true);
6302                 get_cluster_cpus(&dstmask, dst_clid, true);
6303                 memset(&clbenv, 0, sizeof(clbenv));
6304                 clbenv.flags |= HMP_LB;
6305                 clbenv.ltarget = arch_cpu_is_little(env->src_cpu) ? env->src_cpu : env->dst_cpu;
6306                 clbenv.btarget = arch_cpu_is_big(env->src_cpu) ? env->src_cpu : env->dst_cpu;
6307                 clbenv.lcpus = arch_cpu_is_little(env->src_cpu) ? &srcmask : &dstmask;
6308                 clbenv.bcpus = arch_cpu_is_big(env->src_cpu) ? &srcmask : &dstmask;
6309                 sched_update_clbstats(&clbenv);
6310         }
6311
6312         while (!list_empty(tasks)) {
6313                 struct thread_group_info_t *src_tginfo, *dst_tginfo;
6314
6315                 p = list_first_entry(tasks, struct task_struct, se.group_node);
6316
6317 #ifdef CONFIG_MT_SCHED_INFO
6318                 mt_sched_printf("check: pid=%d comm=%s load_avg_contrib=%lu h_load=%lu runnable_load_avg=%lu loop=%d, env->imbalance=%ld tg_load_move=%ld",
6319                         p->pid, p->comm, p->se.avg.load_avg_contrib,
6320                         task_cfs_rq(p)->h_load, task_cfs_rq(p)->runnable_load_avg,
6321                         env->loop, env->imbalance, tg_load_move);
6322 #endif
6323                 env->loop++;
6324                 /* We've more or less seen every task there is, call it quits */
6325                 if (env->loop > env->loop_max)
6326                         break;
6327
6328 #if 0 // TO check
6329                 /* take a breather every nr_migrate tasks */
6330                 if (env->loop > env->loop_break) {
6331                         env->loop_break += sched_nr_migrate_break;
6332                         env->flags |= LBF_NEED_BREAK;
6333                         break;
6334                 }
6335 #endif
6336                 BUG_ON(p == NULL || p->group_leader == NULL);
6337                 src_tginfo = &p->group_leader->thread_group_info[src_clid];
6338                 dst_tginfo = &p->group_leader->thread_group_info[dst_clid];
6339
6340                 /* rule0 */
6341                 if (!can_migrate_task(p, env)) {
6342                         mt_sched_printf("can not migrate: pid=%d comm=%s",
6343                                 p->pid, p->comm);
6344                         goto next;
6345                 }
6346
6347                 load = task_h_load(p);
6348
6349                 if (sched_feat(LB_MIN) && load < 16 && !env->sd->nr_balance_failed) {
6350                         mt_sched_printf("can not migrate: pid=%d comm=%s sched_feat",
6351                                 p->pid, p->comm );
6352                         goto next;
6353                 }
6354
6355                 if (over_imbalance(load, env->imbalance)) {
6356                         mt_sched_printf("can not migrate: pid=%d comm=%s load=%ld imbalance=%ld",
6357                                 p->pid, p->comm, load, env->imbalance );
6358                         goto next;
6359                 }
6360
6361                 /* meet rule0 , migrate immediately */
6362                 if (need_migrate_task_immediately(p, env, &clbenv)) {
6363                         pulled++;
6364                         env->imbalance -= load;
6365                         tg_load_move -= load;
6366                         other_load_move -= load;
6367                         mt_sched_printf("hit rule0: pid=%d comm=%s load=%ld imbalance=%ld tg_imbalance=%ld other_load_move=%ld",
6368                                 p->pid, p->comm, load, env->imbalance, tg_load_move, other_load_move);
6369                         move_task(p, env);
6370                         if (env->imbalance <= 0)
6371                                 break;
6372                         continue;
6373                 }
6374
6375                 /* for TGS */
6376                 if (!cmp_can_migrate_task(p, env))
6377                         goto next;
6378
6379                 if (sd->flags & SD_BALANCE_TG){
6380                         if (over_imbalance(load, tg_load_move)) {
6381                                 mt_sched_printf("can not migrate: pid=%d comm=%s load=%ld imbalance=%ld",
6382                                         p->pid, p->comm, load, tg_load_move );
6383                                 goto next;
6384                         }
6385
6386 #ifdef MTK_QUICK
6387                         if (candidate_load_move <= 0) {
6388                                 mt_sched_printf("check: pid=%d comm=%s candidate_load_move=%d",
6389                                         p->pid, p->comm, candidate_load_move);
6390                                 goto next;
6391                         }
6392 #endif
6393
6394                         /* rule1, single thread */
6395 #ifdef CONFIG_MT_SCHED_INFO
6396                         mt_sched_printf("check rule1: pid=%d p->comm=%s thread_group_cnt=%lu thread_group_empty(p)=%d",
6397                                 p->pid, p->comm,
6398                                 p->group_leader->thread_group_info[0].nr_running +
6399                                 p->group_leader->thread_group_info[1].nr_running,
6400                                 thread_group_empty(p));
6401 #endif
6402
6403                         if (thread_group_empty(p)) {
6404                                 list_move_tail(&p->se.group_node, &tg_tasks);
6405                                 tg_load_move -= load;
6406                                 other_load_move -= load;
6407                                 mt_sched_printf("hit rule1: pid=%d p->comm=%s load=%ld tg_imbalance=%ld",
6408                                    p->pid, p->comm, load, tg_load_move);
6409                                 continue;
6410                         }
6411
6412                         /* rule2 */
6413 #ifdef CONFIG_MT_SCHED_INFO
6414                         mt_sched_printf("check rule2: pid=%d p->comm=%s %ld, %ld, %ld, %ld, %ld",
6415                                 p->pid, p->comm, src_tginfo->nr_running, src_tginfo->cfs_nr_running, dst_tginfo->nr_running,
6416                                 p->se.avg.load_avg_ratio, src_tginfo->load_avg_ratio);
6417 #endif
6418                         if ((src_tginfo->nr_running < dst_tginfo->nr_running) &&
6419                            ((p->se.avg.load_avg_ratio * src_tginfo->cfs_nr_running) <=
6420                             src_tginfo->load_avg_ratio)) {
6421                                 list_move_tail(&p->se.group_node, &tg_tasks);
6422                                 tg_load_move -= load;
6423                                 other_load_move -= load;
6424                                 mt_sched_printf("hit rule2: pid=%d p->comm=%s load=%ld tg_imbalance=%ld",
6425                                    p->pid, p->comm, load, tg_load_move);
6426                                 continue;
6427                         }
6428
6429                         if (over_imbalance(load, other_load_move))
6430                                 goto next;
6431 /*
6432                         if (other_load_move <= 0)
6433                                 goto next;
6434 */
6435
6436                         list_move_tail(&p->se.group_node, &other_tasks);
6437                         other_load_move -= load;
6438                         continue;
6439                 }else{
6440                         list_move_tail(&p->se.group_node, &other_tasks);
6441                         other_load_move -= load;
6442                         continue;
6443                 }
6444
6445         // ytchang
6446 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6447                 if(need_lazy_balance(env->dst_cpu, env->src_cpu, p))
6448                         goto next;
6449 #endif
6450
6451 next:
6452                 /* original rule */
6453                 list_move_tail(&p->se.group_node, tasks);
6454         } // end of while()
6455
6456         if ( sd->flags & SD_BALANCE_TG){
6457                 while (!list_empty(&tg_tasks)) {
6458                         p = list_first_entry(&tg_tasks, struct task_struct, se.group_node);
6459                         list_move_tail(&p->se.group_node, tasks);
6460
6461                         if (env->imbalance > 0) {
6462                                 load = task_h_load(p);
6463                                 if (over_imbalance(load, env->imbalance)){
6464                                         mt_sched_printf("overload rule1,2: pid=%d p->comm=%s load=%ld imbalance=%ld",
6465                                            p->pid, p->comm, load, env->imbalance);
6466 #ifdef MTK_QUICK
6467
6468                                         flag=1;
6469 #endif
6470                                         continue;
6471                                 }
6472
6473                                 move_task(p, env);
6474                                 env->imbalance -= load;
6475                                 pulled++;
6476
6477                                 mt_sched_printf("migrate hit rule1,2: pid=%d p->comm=%s load=%ld imbalance=%ld",
6478                                    p->pid, p->comm, load, env->imbalance);
6479                         }
6480                 }
6481         }
6482
6483         mt_sched_printf("move_tasks_tg finish rule migrate");
6484
6485         while (!list_empty(&other_tasks)) {
6486                 p = list_first_entry(&other_tasks, struct task_struct, se.group_node);
6487                 list_move_tail(&p->se.group_node, tasks);
6488
6489 #ifdef MTK_QUICK
6490                 if (!flag && (env->imbalance > 0)) {
6491 #else
6492                 if (env->imbalance > 0) {
6493 #endif
6494                         load = task_h_load(p);
6495
6496                         if (over_imbalance(load, env->imbalance)){
6497                                 mt_sched_printf("overload others: pid=%d p->comm=%s load=%ld imbalance=%ld",
6498                                    p->pid, p->comm, load, env->imbalance);
6499                                 continue;
6500                         }
6501
6502                         move_task(p, env);
6503                         env->imbalance -= load;
6504                         pulled++;
6505
6506                         mt_sched_printf("migrate others: pid=%d p->comm=%s load=%ld imbalance=%ld",
6507                            p->pid, p->comm, load, env->imbalance);
6508                 }
6509         }
6510
6511         /*
6512          * Right now, this is one of only two places move_task() is called,
6513          * so we can safely collect move_task() stats here rather than
6514          * inside move_task().
6515          */
6516         schedstat_add(env->sd, lb_gained[env->idle], pulled);
6517
6518         mt_sched_printf("move_tasks_tg finish pulled=%d imbalance=%ld", pulled, env->imbalance);
6519
6520         return pulled;
6521 }
6522
6523 #endif /* CONFIG_MTK_SCHED_CMP */
6524
6525
6526 #if defined (CONFIG_MTK_SCHED_CMP_LAZY_BALANCE) && !defined(CONFIG_HMP_LAZY_BALANCE)
6527 static int need_lazy_balance(int dst_cpu, int src_cpu, struct task_struct *p)
6528 {
6529                 /* Lazy balnace for small task
6530                 1. src cpu is buddy cpu
6531                 2. src cpu is not busy cpu
6532                 3. p is light task
6533                 */
6534 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
6535                 if ( PA_ENABLE && cpumask_test_cpu(src_cpu, &buddy_cpu_map) &&
6536                         !is_buddy_busy(src_cpu) && is_light_task(p)) {
6537 #else
6538                 if (cpumask_test_cpu(src_cpu, &buddy_cpu_map) &&
6539                         !is_buddy_busy(src_cpu) && is_light_task(p)) {
6540 #endif
6541 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
6542                         unsigned int i;
6543                         AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[src_cpu][dst_cpu]++;
6544                         mt_sched_printf("[PA]pid=%d, Lazy balance from CPU%d to CPU%d\n)\n", p->pid, src_cpu, dst_cpu);
6545                         for(i=0;i<4;i++) {
6546                                 if(PA_MON_ENABLE && (strcmp(p->comm, &PA_MON[i][0]) == 0)) {
6547                                         printk(KERN_EMERG "[PA] %s Lazy balance from CPU%d to CPU%d\n", p->comm, src_cpu, dst_cpu);
6548         //                              printk(KERN_EMERG "[PA]   src_cpu RQ Usage = %u, Period = %u, NR = %u\n",
6549         //                                                                                                              per_cpu(BUDDY_CPU_RQ_USAGE, src_cpu),
6550         //                                                                                                              per_cpu(BUDDY_CPU_RQ_PERIOD, src_cpu),
6551         //                                                                                                              per_cpu(BUDDY_CPU_RQ_NR, src_cpu));
6552         //                              printk(KERN_EMERG "[PA]   Task Usage = %u, Period = %u\n",
6553         //                                                                                                              p->se.avg.usage_avg_sum,
6554         //                                                                                                              p->se.avg.runnable_avg_period);
6555                                 }
6556                         }
6557 #endif
6558                         return 1;
6559                 }
6560                 else
6561                         return 0;
6562 }
6563 #endif
6564 #ifdef CONFIG_FAIR_GROUP_SCHED
6565 /*
6566  * update tg->load_weight by folding this cpu's load_avg
6567  */
6568 static void __update_blocked_averages_cpu(struct task_group *tg, int cpu)
6569 {
6570         struct sched_entity *se = tg->se[cpu];
6571         struct cfs_rq *cfs_rq = tg->cfs_rq[cpu];
6572
6573         /* throttled entities do not contribute to load */
6574         if (throttled_hierarchy(cfs_rq))
6575                 return;
6576
6577         update_cfs_rq_blocked_load(cfs_rq, 1);
6578
6579         if (se) {
6580                 update_entity_load_avg(se, 1);
6581                 /*
6582                  * We pivot on our runnable average having decayed to zero for
6583                  * list removal.  This generally implies that all our children
6584                  * have also been removed (modulo rounding error or bandwidth
6585                  * control); however, such cases are rare and we can fix these
6586                  * at enqueue.
6587                  *
6588                  * TODO: fix up out-of-order children on enqueue.
6589                  */
6590                 if (!se->avg.runnable_avg_sum && !cfs_rq->nr_running)
6591                         list_del_leaf_cfs_rq(cfs_rq);
6592         } else {
6593                 struct rq *rq = rq_of(cfs_rq);
6594                 update_rq_runnable_avg(rq, rq->nr_running);
6595         }
6596 }
6597
6598 static void update_blocked_averages(int cpu)
6599 {
6600         struct rq *rq = cpu_rq(cpu);
6601         struct cfs_rq *cfs_rq;
6602         unsigned long flags;
6603
6604         raw_spin_lock_irqsave(&rq->lock, flags);
6605         update_rq_clock(rq);
6606         /*
6607          * Iterates the task_group tree in a bottom up fashion, see
6608          * list_add_leaf_cfs_rq() for details.
6609          */
6610         for_each_leaf_cfs_rq(rq, cfs_rq) {
6611                 /*
6612                  * Note: We may want to consider periodically releasing
6613                  * rq->lock about these updates so that creating many task
6614                  * groups does not result in continually extending hold time.
6615                  */
6616                 __update_blocked_averages_cpu(cfs_rq->tg, rq->cpu);
6617         }
6618
6619         raw_spin_unlock_irqrestore(&rq->lock, flags);
6620 }
6621
6622 /*
6623  * Compute the cpu's hierarchical load factor for each task group.
6624  * This needs to be done in a top-down fashion because the load of a child
6625  * group is a fraction of its parents load.
6626  */
6627 static int tg_load_down(struct task_group *tg, void *data)
6628 {
6629         unsigned long load;
6630         long cpu = (long)data;
6631
6632         if (!tg->parent) {
6633                 /*
6634                  * rq's sched_avg is not updated accordingly. adopt rq's
6635                  * corresponding cfs_rq runnable loading instead.
6636                  *
6637                  * a003a25b sched: Consider runnable load average...
6638                  *
6639
6640                 load = cpu_rq(cpu)->avg.load_avg_contrib;
6641
6642                  */
6643                 load = cpu_rq(cpu)->cfs.runnable_load_avg;
6644         } else {
6645                 load = tg->parent->cfs_rq[cpu]->h_load;
6646                 load = div64_ul(load * tg->se[cpu]->avg.load_avg_contrib,
6647                                 tg->parent->cfs_rq[cpu]->runnable_load_avg + 1);
6648         }
6649
6650         tg->cfs_rq[cpu]->h_load = load;
6651
6652         return 0;
6653 }
6654
6655 static void update_h_load(long cpu)
6656 {
6657         rcu_read_lock();
6658         walk_tg_tree(tg_load_down, tg_nop, (void *)cpu);
6659         rcu_read_unlock();
6660 }
6661
6662 static unsigned long task_h_load(struct task_struct *p)
6663 {
6664         struct cfs_rq *cfs_rq = task_cfs_rq(p);
6665
6666         return div64_ul(p->se.avg.load_avg_contrib * cfs_rq->h_load,
6667                         cfs_rq->runnable_load_avg + 1);
6668 }
6669 #else
6670 static inline void update_blocked_averages(int cpu)
6671 {
6672 }
6673
6674 static inline void update_h_load(long cpu)
6675 {
6676 }
6677
6678 static unsigned long task_h_load(struct task_struct *p)
6679 {
6680         return p->se.avg.load_avg_contrib;
6681 }
6682 #endif
6683
6684 /********** Helpers for find_busiest_group ************************/
6685 /*
6686  * sd_lb_stats - Structure to store the statistics of a sched_domain
6687  *              during load balancing.
6688  */
6689 struct sd_lb_stats {
6690         struct sched_group *busiest; /* Busiest group in this sd */
6691         struct sched_group *this;  /* Local group in this sd */
6692         unsigned long total_load;  /* Total load of all groups in sd */
6693         unsigned long total_pwr;   /*   Total power of all groups in sd */
6694         unsigned long avg_load;    /* Average load across all groups in sd */
6695
6696         /** Statistics of this group */
6697         unsigned long this_load;
6698         unsigned long this_load_per_task;
6699         unsigned long this_nr_running;
6700         unsigned long this_has_capacity;
6701         unsigned int  this_idle_cpus;
6702
6703         /* Statistics of the busiest group */
6704         unsigned int  busiest_idle_cpus;
6705         unsigned long max_load;
6706         unsigned long busiest_load_per_task;
6707         unsigned long busiest_nr_running;
6708         unsigned long busiest_group_capacity;
6709         unsigned long busiest_has_capacity;
6710         unsigned int  busiest_group_weight;
6711
6712         int group_imb; /* Is there imbalance in this sd */
6713 };
6714
6715 /*
6716  * sg_lb_stats - stats of a sched_group required for load_balancing
6717  */
6718 struct sg_lb_stats {
6719         unsigned long avg_load; /*Avg load across the CPUs of the group */
6720         unsigned long group_load; /* Total load over the CPUs of the group */
6721         unsigned long sum_nr_running; /* Nr tasks running in the group */
6722         unsigned long sum_weighted_load; /* Weighted load of group's tasks */
6723         unsigned long group_capacity;
6724         unsigned long idle_cpus;
6725         unsigned long group_weight;
6726         int group_imb; /* Is there an imbalance in the group ? */
6727         int group_has_capacity; /* Is there extra capacity in the group? */
6728 };
6729
6730 /**
6731  * get_sd_load_idx - Obtain the load index for a given sched domain.
6732  * @sd: The sched_domain whose load_idx is to be obtained.
6733  * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
6734  */
6735 static inline int get_sd_load_idx(struct sched_domain *sd,
6736                                         enum cpu_idle_type idle)
6737 {
6738         int load_idx;
6739
6740         switch (idle) {
6741         case CPU_NOT_IDLE:
6742                 load_idx = sd->busy_idx;
6743                 break;
6744
6745         case CPU_NEWLY_IDLE:
6746                 load_idx = sd->newidle_idx;
6747                 break;
6748         default:
6749                 load_idx = sd->idle_idx;
6750                 break;
6751         }
6752
6753         return load_idx;
6754 }
6755
6756 static unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu)
6757 {
6758         return SCHED_POWER_SCALE;
6759 }
6760
6761 unsigned long __weak arch_scale_freq_power(struct sched_domain *sd, int cpu)
6762 {
6763         return default_scale_freq_power(sd, cpu);
6764 }
6765
6766 static unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu)
6767 {
6768         unsigned long weight = sd->span_weight;
6769         unsigned long smt_gain = sd->smt_gain;
6770
6771         smt_gain /= weight;
6772
6773         return smt_gain;
6774 }
6775
6776 unsigned long __weak arch_scale_smt_power(struct sched_domain *sd, int cpu)
6777 {
6778         return default_scale_smt_power(sd, cpu);
6779 }
6780
6781 static unsigned long scale_rt_power(int cpu)
6782 {
6783         struct rq *rq = cpu_rq(cpu);
6784         u64 total, available, age_stamp, avg;
6785
6786         /*
6787          * Since we're reading these variables without serialization make sure
6788          * we read them once before doing sanity checks on them.
6789          */
6790         age_stamp = ACCESS_ONCE(rq->age_stamp);
6791         avg = ACCESS_ONCE(rq->rt_avg);
6792
6793         total = sched_avg_period() + (rq->clock - age_stamp);
6794
6795         if (unlikely(total < avg)) {
6796                 /* Ensures that power won't end up being negative */
6797                 available = 0;
6798         } else {
6799                 available = total - avg;
6800         }
6801
6802         if (unlikely((s64)total < SCHED_POWER_SCALE))
6803                 total = SCHED_POWER_SCALE;
6804
6805         total >>= SCHED_POWER_SHIFT;
6806
6807         return div_u64(available, total);
6808 }
6809
6810 static void update_cpu_power(struct sched_domain *sd, int cpu)
6811 {
6812         unsigned long weight = sd->span_weight;
6813         unsigned long power = SCHED_POWER_SCALE;
6814         struct sched_group *sdg = sd->groups;
6815
6816         if ((sd->flags & SD_SHARE_CPUPOWER) && weight > 1) {
6817                 if (sched_feat(ARCH_POWER))
6818                         power *= arch_scale_smt_power(sd, cpu);
6819                 else
6820                         power *= default_scale_smt_power(sd, cpu);
6821
6822                 power >>= SCHED_POWER_SHIFT;
6823         }
6824
6825         sdg->sgp->power_orig = power;
6826
6827         if (sched_feat(ARCH_POWER))
6828                 power *= arch_scale_freq_power(sd, cpu);
6829         else
6830                 power *= default_scale_freq_power(sd, cpu);
6831
6832         power >>= SCHED_POWER_SHIFT;
6833
6834         power *= scale_rt_power(cpu);
6835         power >>= SCHED_POWER_SHIFT;
6836
6837         if (!power)
6838                 power = 1;
6839
6840         cpu_rq(cpu)->cpu_power = power;
6841         sdg->sgp->power = power;
6842 }
6843
6844 void update_group_power(struct sched_domain *sd, int cpu)
6845 {
6846         struct sched_domain *child = sd->child;
6847         struct sched_group *group, *sdg = sd->groups;
6848         unsigned long power;
6849         unsigned long interval;
6850
6851         interval = msecs_to_jiffies(sd->balance_interval);
6852         interval = clamp(interval, 1UL, max_load_balance_interval);
6853         sdg->sgp->next_update = jiffies + interval;
6854
6855         if (!child) {
6856                 update_cpu_power(sd, cpu);
6857                 return;
6858         }
6859
6860         power = 0;
6861
6862         if (child->flags & SD_OVERLAP) {
6863                 /*
6864                  * SD_OVERLAP domains cannot assume that child groups
6865                  * span the current group.
6866                  */
6867
6868                 for_each_cpu(cpu, sched_group_cpus(sdg))
6869                         power += power_of(cpu);
6870         } else  {
6871                 /*
6872                  * !SD_OVERLAP domains can assume that child groups
6873                  * span the current group.
6874                  */
6875
6876                 group = child->groups;
6877                 do {
6878                         power += group->sgp->power;
6879                         group = group->next;
6880                 } while (group != child->groups);
6881         }
6882
6883         sdg->sgp->power_orig = sdg->sgp->power = power;
6884 }
6885
6886 /*
6887  * Try and fix up capacity for tiny siblings, this is needed when
6888  * things like SD_ASYM_PACKING need f_b_g to select another sibling
6889  * which on its own isn't powerful enough.
6890  *
6891  * See update_sd_pick_busiest() and check_asym_packing().
6892  */
6893 static inline int
6894 fix_small_capacity(struct sched_domain *sd, struct sched_group *group)
6895 {
6896         /*
6897          * Only siblings can have significantly less than SCHED_POWER_SCALE
6898          */
6899         if (!(sd->flags & SD_SHARE_CPUPOWER))
6900                 return 0;
6901
6902         /*
6903          * If ~90% of the cpu_power is still there, we're good.
6904          */
6905         if (group->sgp->power * 32 > group->sgp->power_orig * 29)
6906                 return 1;
6907
6908         return 0;
6909 }
6910
6911 /**
6912  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
6913  * @env: The load balancing environment.
6914  * @group: sched_group whose statistics are to be updated.
6915  * @load_idx: Load index of sched_domain of this_cpu for load calc.
6916  * @local_group: Does group contain this_cpu.
6917  * @balance: Should we balance.
6918  * @sgs: variable to hold the statistics for this group.
6919  */
6920 static inline void update_sg_lb_stats(struct lb_env *env,
6921                         struct sched_group *group, int load_idx,
6922                         int local_group, int *balance, struct sg_lb_stats *sgs)
6923 {
6924         unsigned long nr_running, max_nr_running, min_nr_running;
6925         unsigned long load, max_cpu_load, min_cpu_load;
6926         unsigned int balance_cpu = -1, first_idle_cpu = 0;
6927         unsigned long avg_load_per_task = 0;
6928         int i;
6929
6930         if (local_group)
6931                 balance_cpu = group_balance_cpu(group);
6932
6933         /* Tally up the load of all CPUs in the group */
6934         max_cpu_load = 0;
6935         min_cpu_load = ~0UL;
6936         max_nr_running = 0;
6937         min_nr_running = ~0UL;
6938
6939         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
6940                 struct rq *rq = cpu_rq(i);
6941
6942                 nr_running = rq->nr_running;
6943
6944                 /* Bias balancing toward cpus of our domain */
6945                 if (local_group) {
6946                         if (idle_cpu(i) && !first_idle_cpu &&
6947                                         cpumask_test_cpu(i, sched_group_mask(group))) {
6948                                 first_idle_cpu = 1;
6949                                 balance_cpu = i;
6950                         }
6951
6952                         load = target_load(i, load_idx);
6953                 } else {
6954                         load = source_load(i, load_idx);
6955                         if (load > max_cpu_load)
6956                                 max_cpu_load = load;
6957                         if (min_cpu_load > load)
6958                                 min_cpu_load = load;
6959
6960                         if (nr_running > max_nr_running)
6961                                 max_nr_running = nr_running;
6962                         if (min_nr_running > nr_running)
6963                                 min_nr_running = nr_running;
6964
6965 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
6966                         if((load_idx > 0) && (load == cpu_rq(i)->cpu_load[load_idx-1]))
6967                                 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_HISTORY);
6968 #endif
6969                 }
6970
6971                 sgs->group_load += load;
6972                 sgs->sum_nr_running += nr_running;
6973                 sgs->sum_weighted_load += weighted_cpuload(i);
6974                 if (idle_cpu(i))
6975                         sgs->idle_cpus++;
6976         }
6977
6978         /*
6979          * First idle cpu or the first cpu(busiest) in this sched group
6980          * is eligible for doing load balancing at this and above
6981          * domains. In the newly idle case, we will allow all the cpu's
6982          * to do the newly idle load balance.
6983          */
6984         if (local_group) {
6985                 if (env->idle != CPU_NEWLY_IDLE) {
6986                         if (balance_cpu != env->dst_cpu) {
6987                                 *balance = 0;
6988                                 return;
6989                         }
6990                         update_group_power(env->sd, env->dst_cpu);
6991                 } else if (time_after_eq(jiffies, group->sgp->next_update))
6992                         update_group_power(env->sd, env->dst_cpu);
6993         }
6994
6995         /* Adjust by relative CPU power of the group */
6996         sgs->avg_load = (sgs->group_load*SCHED_POWER_SCALE) / group->sgp->power;
6997
6998         /*
6999          * Consider the group unbalanced when the imbalance is larger
7000          * than the average weight of a task.
7001          *
7002          * APZ: with cgroup the avg task weight can vary wildly and
7003          *      might not be a suitable number - should we keep a
7004          *      normalized nr_running number somewhere that negates
7005          *      the hierarchy?
7006          */
7007         if (sgs->sum_nr_running)
7008                 avg_load_per_task = sgs->sum_weighted_load / sgs->sum_nr_running;
7009
7010         if ((max_cpu_load - min_cpu_load) >= avg_load_per_task &&
7011             (max_nr_running - min_nr_running) > 1)
7012                 sgs->group_imb = 1;
7013
7014         sgs->group_capacity = DIV_ROUND_CLOSEST(group->sgp->power,
7015                                                 SCHED_POWER_SCALE);
7016         if (!sgs->group_capacity)
7017                 sgs->group_capacity = fix_small_capacity(env->sd, group);
7018         sgs->group_weight = group->group_weight;
7019
7020         if (sgs->group_capacity > sgs->sum_nr_running)
7021                 sgs->group_has_capacity = 1;
7022 }
7023
7024 /**
7025  * update_sd_pick_busiest - return 1 on busiest group
7026  * @env: The load balancing environment.
7027  * @sds: sched_domain statistics
7028  * @sg: sched_group candidate to be checked for being the busiest
7029  * @sgs: sched_group statistics
7030  *
7031  * Determine if @sg is a busier group than the previously selected
7032  * busiest group.
7033  */
7034 static bool update_sd_pick_busiest(struct lb_env *env,
7035                                    struct sd_lb_stats *sds,
7036                                    struct sched_group *sg,
7037                                    struct sg_lb_stats *sgs)
7038 {
7039         if (sgs->avg_load <= sds->max_load) {
7040                 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_PICK_BUSIEST_FAIL_1);
7041                 return false;
7042         }
7043
7044         if (sgs->sum_nr_running > sgs->group_capacity)
7045                 return true;
7046
7047         if (sgs->group_imb)
7048                 return true;
7049
7050         /*
7051          * ASYM_PACKING needs to move all the work to the lowest
7052          * numbered CPUs in the group, therefore mark all groups
7053          * higher than ourself as busy.
7054          */
7055         if ((env->sd->flags & SD_ASYM_PACKING) && sgs->sum_nr_running &&
7056             env->dst_cpu < group_first_cpu(sg)) {
7057                 if (!sds->busiest)
7058                         return true;
7059
7060                 if (group_first_cpu(sds->busiest) > group_first_cpu(sg))
7061                         return true;
7062         }
7063
7064         mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_PICK_BUSIEST_FAIL_2);
7065         return false;
7066 }
7067
7068 /**
7069  * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
7070  * @env: The load balancing environment.
7071  * @balance: Should we balance.
7072  * @sds: variable to hold the statistics for this sched_domain.
7073  */
7074 static inline void update_sd_lb_stats(struct lb_env *env,
7075                                         int *balance, struct sd_lb_stats *sds)
7076 {
7077         struct sched_domain *child = env->sd->child;
7078         struct sched_group *sg = env->sd->groups;
7079         struct sg_lb_stats sgs;
7080         int load_idx, prefer_sibling = 0;
7081
7082         if (child && child->flags & SD_PREFER_SIBLING)
7083                 prefer_sibling = 1;
7084
7085         load_idx = get_sd_load_idx(env->sd, env->idle);
7086
7087         do {
7088                 int local_group;
7089
7090                 local_group = cpumask_test_cpu(env->dst_cpu, sched_group_cpus(sg));
7091                 memset(&sgs, 0, sizeof(sgs));
7092                 update_sg_lb_stats(env, sg, load_idx, local_group, balance, &sgs);
7093
7094                 if (local_group && !(*balance))
7095                         return;
7096
7097                 sds->total_load += sgs.group_load;
7098                 sds->total_pwr += sg->sgp->power;
7099
7100                 /*
7101                  * In case the child domain prefers tasks go to siblings
7102                  * first, lower the sg capacity to one so that we'll try
7103                  * and move all the excess tasks away. We lower the capacity
7104                  * of a group only if the local group has the capacity to fit
7105                  * these excess tasks, i.e. nr_running < group_capacity. The
7106                  * extra check prevents the case where you always pull from the
7107                  * heaviest group when it is already under-utilized (possible
7108                  * with a large weight task outweighs the tasks on the system).
7109                  */
7110                 if (prefer_sibling && !local_group && sds->this_has_capacity)
7111                         sgs.group_capacity = min(sgs.group_capacity, 1UL);
7112
7113                 if (local_group) {
7114                         sds->this_load = sgs.avg_load;
7115                         sds->this = sg;
7116                         sds->this_nr_running = sgs.sum_nr_running;
7117                         sds->this_load_per_task = sgs.sum_weighted_load;
7118                         sds->this_has_capacity = sgs.group_has_capacity;
7119                         sds->this_idle_cpus = sgs.idle_cpus;
7120                 } else if (update_sd_pick_busiest(env, sds, sg, &sgs)) {
7121                         sds->max_load = sgs.avg_load;
7122                         sds->busiest = sg;
7123                         sds->busiest_nr_running = sgs.sum_nr_running;
7124                         sds->busiest_idle_cpus = sgs.idle_cpus;
7125                         sds->busiest_group_capacity = sgs.group_capacity;
7126                         sds->busiest_load_per_task = sgs.sum_weighted_load;
7127                         sds->busiest_has_capacity = sgs.group_has_capacity;
7128                         sds->busiest_group_weight = sgs.group_weight;
7129                         sds->group_imb = sgs.group_imb;
7130                 }
7131
7132                 sg = sg->next;
7133         } while (sg != env->sd->groups);
7134 }
7135
7136 /**
7137  * check_asym_packing - Check to see if the group is packed into the
7138  *                      sched doman.
7139  *
7140  * This is primarily intended to used at the sibling level.  Some
7141  * cores like POWER7 prefer to use lower numbered SMT threads.  In the
7142  * case of POWER7, it can move to lower SMT modes only when higher
7143  * threads are idle.  When in lower SMT modes, the threads will
7144  * perform better since they share less core resources.  Hence when we
7145  * have idle threads, we want them to be the higher ones.
7146  *
7147  * This packing function is run on idle threads.  It checks to see if
7148  * the busiest CPU in this domain (core in the P7 case) has a higher
7149  * CPU number than the packing function is being run on.  Here we are
7150  * assuming lower CPU number will be equivalent to lower a SMT thread
7151  * number.
7152  *
7153  * Returns 1 when packing is required and a task should be moved to
7154  * this CPU.  The amount of the imbalance is returned in *imbalance.
7155  *
7156  * @env: The load balancing environment.
7157  * @sds: Statistics of the sched_domain which is to be packed
7158  */
7159 static int check_asym_packing(struct lb_env *env, struct sd_lb_stats *sds)
7160 {
7161         int busiest_cpu;
7162
7163         if (!(env->sd->flags & SD_ASYM_PACKING))
7164                 return 0;
7165
7166         if (!sds->busiest)
7167                 return 0;
7168
7169         busiest_cpu = group_first_cpu(sds->busiest);
7170         if (env->dst_cpu > busiest_cpu)
7171                 return 0;
7172
7173         env->imbalance = DIV_ROUND_CLOSEST(
7174                 sds->max_load * sds->busiest->sgp->power, SCHED_POWER_SCALE);
7175
7176         return 1;
7177 }
7178
7179 /**
7180  * fix_small_imbalance - Calculate the minor imbalance that exists
7181  *                      amongst the groups of a sched_domain, during
7182  *                      load balancing.
7183  * @env: The load balancing environment.
7184  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
7185  */
7186 static inline
7187 void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7188 {
7189         unsigned long tmp, pwr_now = 0, pwr_move = 0;
7190         unsigned int imbn = 2;
7191         unsigned long scaled_busy_load_per_task;
7192
7193         if (sds->this_nr_running) {
7194                 sds->this_load_per_task /= sds->this_nr_running;
7195                 if (sds->busiest_load_per_task >
7196                                 sds->this_load_per_task)
7197                         imbn = 1;
7198         } else {
7199                 sds->this_load_per_task =
7200                         cpu_avg_load_per_task(env->dst_cpu);
7201         }
7202
7203         scaled_busy_load_per_task = sds->busiest_load_per_task
7204                                          * SCHED_POWER_SCALE;
7205         scaled_busy_load_per_task /= sds->busiest->sgp->power;
7206
7207         if (sds->max_load - sds->this_load + scaled_busy_load_per_task >=
7208                         (scaled_busy_load_per_task * imbn)) {
7209                 env->imbalance = sds->busiest_load_per_task;
7210                 return;
7211         }
7212
7213         /*
7214          * OK, we don't have enough imbalance to justify moving tasks,
7215          * however we may be able to increase total CPU power used by
7216          * moving them.
7217          */
7218
7219         pwr_now += sds->busiest->sgp->power *
7220                         min(sds->busiest_load_per_task, sds->max_load);
7221         pwr_now += sds->this->sgp->power *
7222                         min(sds->this_load_per_task, sds->this_load);
7223         pwr_now /= SCHED_POWER_SCALE;
7224
7225         /* Amount of load we'd subtract */
7226         tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
7227                 sds->busiest->sgp->power;
7228         if (sds->max_load > tmp)
7229                 pwr_move += sds->busiest->sgp->power *
7230                         min(sds->busiest_load_per_task, sds->max_load - tmp);
7231
7232         /* Amount of load we'd add */
7233         if (sds->max_load * sds->busiest->sgp->power <
7234                 sds->busiest_load_per_task * SCHED_POWER_SCALE)
7235                 tmp = (sds->max_load * sds->busiest->sgp->power) /
7236                         sds->this->sgp->power;
7237         else
7238                 tmp = (sds->busiest_load_per_task * SCHED_POWER_SCALE) /
7239                         sds->this->sgp->power;
7240         pwr_move += sds->this->sgp->power *
7241                         min(sds->this_load_per_task, sds->this_load + tmp);
7242         pwr_move /= SCHED_POWER_SCALE;
7243
7244         /* Move if we gain throughput */
7245         if (pwr_move > pwr_now)
7246                 env->imbalance = sds->busiest_load_per_task;
7247 }
7248
7249 /**
7250  * calculate_imbalance - Calculate the amount of imbalance present within the
7251  *                       groups of a given sched_domain during load balance.
7252  * @env: load balance environment
7253  * @sds: statistics of the sched_domain whose imbalance is to be calculated.
7254  */
7255 static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
7256 {
7257         unsigned long max_pull, load_above_capacity = ~0UL;
7258
7259         sds->busiest_load_per_task /= sds->busiest_nr_running;
7260         if (sds->group_imb) {
7261                 sds->busiest_load_per_task =
7262                         min(sds->busiest_load_per_task, sds->avg_load);
7263         }
7264
7265         /*
7266          * In the presence of smp nice balancing, certain scenarios can have
7267          * max load less than avg load(as we skip the groups at or below
7268          * its cpu_power, while calculating max_load..)
7269          */
7270         if (sds->max_load < sds->avg_load) {
7271                 env->imbalance = 0;
7272                 return fix_small_imbalance(env, sds);
7273         }
7274
7275         if (!sds->group_imb) {
7276                 /*
7277                  * Don't want to pull so many tasks that a group would go idle.
7278                  */
7279                 load_above_capacity = (sds->busiest_nr_running -
7280                                                 sds->busiest_group_capacity);
7281
7282                 load_above_capacity *= (SCHED_LOAD_SCALE * SCHED_POWER_SCALE);
7283
7284                 load_above_capacity /= sds->busiest->sgp->power;
7285         }
7286
7287         /*
7288          * We're trying to get all the cpus to the average_load, so we don't
7289          * want to push ourselves above the average load, nor do we wish to
7290          * reduce the max loaded cpu below the average load. At the same time,
7291          * we also don't want to reduce the group load below the group capacity
7292          * (so that we can implement power-savings policies etc). Thus we look
7293          * for the minimum possible imbalance.
7294          * Be careful of negative numbers as they'll appear as very large values
7295          * with unsigned longs.
7296          */
7297         max_pull = min(sds->max_load - sds->avg_load, load_above_capacity);
7298
7299         /* How much load to actually move to equalise the imbalance */
7300         env->imbalance = min(max_pull * sds->busiest->sgp->power,
7301                 (sds->avg_load - sds->this_load) * sds->this->sgp->power)
7302                         / SCHED_POWER_SCALE;
7303
7304         /*
7305          * if *imbalance is less than the average load per runnable task
7306          * there is no guarantee that any tasks will be moved so we'll have
7307          * a think about bumping its value to force at least one task to be
7308          * moved
7309          */
7310         if (env->imbalance < sds->busiest_load_per_task)
7311                 return fix_small_imbalance(env, sds);
7312
7313 }
7314
7315 /******* find_busiest_group() helpers end here *********************/
7316
7317 /**
7318  * find_busiest_group - Returns the busiest group within the sched_domain
7319  * if there is an imbalance. If there isn't an imbalance, and
7320  * the user has opted for power-savings, it returns a group whose
7321  * CPUs can be put to idle by rebalancing those tasks elsewhere, if
7322  * such a group exists.
7323  *
7324  * Also calculates the amount of weighted load which should be moved
7325  * to restore balance.
7326  *
7327  * @env: The load balancing environment.
7328  * @balance: Pointer to a variable indicating if this_cpu
7329  *      is the appropriate cpu to perform load balancing at this_level.
7330  *
7331  * Returns:     - the busiest group if imbalance exists.
7332  *              - If no imbalance and user has opted for power-savings balance,
7333  *                 return the least loaded group whose CPUs can be
7334  *                 put to idle by rebalancing its tasks onto our group.
7335  */
7336 static struct sched_group *
7337 find_busiest_group(struct lb_env *env, int *balance)
7338 {
7339         struct sd_lb_stats sds;
7340
7341         memset(&sds, 0, sizeof(sds));
7342
7343         /*
7344          * Compute the various statistics relavent for load balancing at
7345          * this level.
7346          */
7347         update_sd_lb_stats(env, balance, &sds);
7348
7349         /*
7350          * this_cpu is not the appropriate cpu to perform load balancing at
7351          * this level.
7352          */
7353         if (!(*balance)){
7354                 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_BALANCE);
7355                 goto ret;
7356         }
7357
7358         if ((env->idle == CPU_IDLE || env->idle == CPU_NEWLY_IDLE) &&
7359             check_asym_packing(env, &sds))
7360                 return sds.busiest;
7361
7362         /* There is no busy sibling group to pull tasks from */
7363         if (!sds.busiest || sds.busiest_nr_running == 0){
7364                 if(!sds.busiest){
7365                         mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_BUSIEST_NO_TASK);
7366                 }else{
7367                         mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_NO_BUSIEST);
7368                 }
7369                 goto out_balanced;
7370         }
7371
7372         sds.avg_load = (SCHED_POWER_SCALE * sds.total_load) / sds.total_pwr;
7373
7374         /*
7375          * If the busiest group is imbalanced the below checks don't
7376          * work because they assumes all things are equal, which typically
7377          * isn't true due to cpus_allowed constraints and the like.
7378          */
7379         if (sds.group_imb)
7380                 goto force_balance;
7381
7382         /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */
7383         if (env->idle == CPU_NEWLY_IDLE && sds.this_has_capacity &&
7384                         !sds.busiest_has_capacity)
7385                 goto force_balance;
7386
7387         /*
7388          * If the local group is more busy than the selected busiest group
7389          * don't try and pull any tasks.
7390          */
7391         if (sds.this_load >= sds.max_load){
7392                 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_NO_LARGER_THAN);
7393                 goto out_balanced;
7394         }
7395
7396         /*
7397          * Don't pull any tasks if this group is already above the domain
7398          * average load.
7399          */
7400         if (sds.this_load >= sds.avg_load){
7401                 mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_NO_LARGER_THAN);
7402                 goto out_balanced;
7403         }
7404
7405         if (env->idle == CPU_IDLE) {
7406                 /*
7407                  * This cpu is idle. If the busiest group load doesn't
7408                  * have more tasks than the number of available cpu's and
7409                  * there is no imbalance between this and busiest group
7410                  * wrt to idle cpu's, it is balanced.
7411                  */
7412                 if ((sds.this_idle_cpus <= sds.busiest_idle_cpus + 1) &&
7413                     sds.busiest_nr_running <= sds.busiest_group_weight)
7414                         goto out_balanced;
7415         } else {
7416                 /*
7417                  * In the CPU_NEWLY_IDLE, CPU_NOT_IDLE cases, use
7418                  * imbalance_pct to be conservative.
7419                  */
7420                 if (100 * sds.max_load <= env->sd->imbalance_pct * sds.this_load){
7421                         mt_lbprof_stat_or(env->fail_reason, MT_LBPROF_NOBUSYG_CHECK_FAIL);
7422                         goto out_balanced;
7423                 }
7424         }
7425
7426 force_balance:
7427         /* Looks like there is an imbalance. Compute it */
7428         calculate_imbalance(env, &sds);
7429         return sds.busiest;
7430
7431 out_balanced:
7432 ret:
7433         env->imbalance = 0;
7434         return NULL;
7435 }
7436
7437 /*
7438  * find_busiest_queue - find the busiest runqueue among the cpus in group.
7439  */
7440 static struct rq *find_busiest_queue(struct lb_env *env,
7441                                      struct sched_group *group)
7442 {
7443         struct rq *busiest = NULL, *rq;
7444         unsigned long max_load = 0;
7445         int i;
7446
7447         for_each_cpu(i, sched_group_cpus(group)) {
7448                 unsigned long power = power_of(i);
7449                 unsigned long capacity = DIV_ROUND_CLOSEST(power,
7450                                                            SCHED_POWER_SCALE);
7451                 unsigned long wl;
7452
7453                 if (!capacity)
7454                         capacity = fix_small_capacity(env->sd, group);
7455
7456                 if (!cpumask_test_cpu(i, env->cpus))
7457                         continue;
7458
7459                 rq = cpu_rq(i);
7460                 wl = weighted_cpuload(i);
7461
7462                 /*
7463                  * When comparing with imbalance, use weighted_cpuload()
7464                  * which is not scaled with the cpu power.
7465                  */
7466                 if (capacity && rq->nr_running == 1 && wl > env->imbalance)
7467                         continue;
7468
7469                 /*
7470                  * For the load comparisons with the other cpu's, consider
7471                  * the weighted_cpuload() scaled with the cpu power, so that
7472                  * the load can be moved away from the cpu that is potentially
7473                  * running at a lower capacity.
7474                  */
7475                 wl = (wl * SCHED_POWER_SCALE) / power;
7476
7477                 if (wl > max_load) {
7478                         max_load = wl;
7479                         busiest = rq;
7480                 }
7481         }
7482
7483         return busiest;
7484 }
7485
7486 /*
7487  * Max backoff if we encounter pinned tasks. Pretty arbitrary value, but
7488  * so long as it is large enough.
7489  */
7490 #define MAX_PINNED_INTERVAL     512
7491
7492 /* Working cpumask for load_balance and load_balance_newidle. */
7493 DEFINE_PER_CPU(cpumask_var_t, load_balance_mask);
7494
7495 static int need_active_balance(struct lb_env *env)
7496 {
7497         struct sched_domain *sd = env->sd;
7498
7499         if (env->idle == CPU_NEWLY_IDLE) {
7500
7501                 /*
7502                  * ASYM_PACKING needs to force migrate tasks from busy but
7503                  * higher numbered CPUs in order to pack all tasks in the
7504                  * lowest numbered CPUs.
7505                  */
7506                 if ((sd->flags & SD_ASYM_PACKING) && env->src_cpu > env->dst_cpu)
7507                         return 1;
7508         }
7509
7510         return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
7511 }
7512
7513 static int active_load_balance_cpu_stop(void *data);
7514
7515 /*
7516  * Check this_cpu to ensure it is balanced within domain. Attempt to move
7517  * tasks if there is an imbalance.
7518  */
7519 static int load_balance(int this_cpu, struct rq *this_rq,
7520                         struct sched_domain *sd, enum cpu_idle_type idle,
7521                         int *balance)
7522 {
7523         int ld_moved, cur_ld_moved, active_balance = 0;
7524         struct sched_group *group;
7525         struct rq *busiest;
7526         unsigned long flags;
7527         struct cpumask *cpus = __get_cpu_var(load_balance_mask);
7528
7529         struct lb_env env = {
7530                 .sd             = sd,
7531                 .dst_cpu        = this_cpu,
7532                 .dst_rq         = this_rq,
7533                 .dst_grpmask    = sched_group_cpus(sd->groups),
7534                 .idle           = idle,
7535                 .loop_break     = sched_nr_migrate_break,
7536                 .cpus           = cpus,
7537 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
7538                 .fail_reason= MT_LBPROF_NO_TRIGGER,
7539 #endif
7540         };
7541
7542         /*
7543          * For NEWLY_IDLE load_balancing, we don't need to consider
7544          * other cpus in our group
7545          */
7546         if (idle == CPU_NEWLY_IDLE)
7547                 env.dst_grpmask = NULL;
7548
7549         cpumask_copy(cpus, cpu_active_mask);
7550
7551         schedstat_inc(sd, lb_count[idle]);
7552
7553 redo:
7554         group = find_busiest_group(&env, balance);
7555
7556         if (*balance == 0)
7557                 goto out_balanced;
7558
7559         if (!group) {
7560                 schedstat_inc(sd, lb_nobusyg[idle]);
7561                 if(mt_lbprof_test(env.fail_reason, MT_LBPROF_HISTORY)){
7562                         int tmp_cpu;
7563                         for_each_cpu(tmp_cpu, cpu_possible_mask){
7564                                 if (tmp_cpu == this_rq->cpu)
7565                                         continue;
7566                                 mt_lbprof_update_state(tmp_cpu, MT_LBPROF_BALANCE_FAIL_STATE);
7567                         }
7568                 }
7569                 goto out_balanced;
7570         }
7571
7572         busiest = find_busiest_queue(&env, group);
7573         if (!busiest) {
7574                 schedstat_inc(sd, lb_nobusyq[idle]);
7575                 mt_lbprof_stat_or(env.fail_reason, MT_LBPROF_NOBUSYQ);
7576                 goto out_balanced;
7577         }
7578
7579 #ifdef CONFIG_HMP_LAZY_BALANCE
7580
7581 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
7582         if (PA_ENABLE && LB_ENABLE) {
7583 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
7584
7585                 if (per_cpu(sd_pack_buddy, this_cpu) == busiest->cpu && !is_buddy_busy(per_cpu(sd_pack_buddy, this_cpu))) {
7586
7587 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
7588                         AVOID_LOAD_BALANCE_FROM_CPUX_TO_CPUY_COUNT[this_cpu][busiest->cpu]++;
7589
7590 #ifdef CONFIG_HMP_TRACER
7591                         trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_BALANCE_FORM_CPUX_TO_CPUY, 0, this_cpu, busiest->cpu);
7592 #endif /* CONFIG_HMP_TRACER */
7593
7594 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
7595
7596                         schedstat_inc(sd, lb_nobusyq[idle]);
7597                         goto out_balanced;
7598                 }
7599
7600 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
7601         }
7602 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
7603
7604 #endif /* CONFIG_HMP_LAZY_BALANCE */
7605
7606         BUG_ON(busiest == env.dst_rq);
7607
7608         schedstat_add(sd, lb_imbalance[idle], env.imbalance);
7609
7610         ld_moved = 0;
7611         if (busiest->nr_running > 1) {
7612                 /*
7613                  * Attempt to move tasks. If find_busiest_group has found
7614                  * an imbalance but busiest->nr_running <= 1, the group is
7615                  * still unbalanced. ld_moved simply stays zero, so it is
7616                  * correctly treated as an imbalance.
7617                  */
7618                 env.flags |= LBF_ALL_PINNED;
7619                 env.src_cpu   = busiest->cpu;
7620                 env.src_rq    = busiest;
7621                 env.loop_max  = min(sysctl_sched_nr_migrate, busiest->nr_running);
7622 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7623                 env.mt_check_cache_in_idle = 1;
7624 #endif
7625
7626                 update_h_load(env.src_cpu);
7627 more_balance:
7628                 local_irq_save(flags);
7629                 double_rq_lock(env.dst_rq, busiest);
7630 #ifdef CONFIG_MTK_SCHED_CMP
7631                 env.loop_max    = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
7632                 mt_sched_printf("1 env.loop_max=%d, busiest->nr_running=%d src=%d, dst=%d, cpus_share_cache=%d",
7633                         env.loop_max, busiest->nr_running, env.src_cpu, env.dst_cpu, cpus_share_cache(env.src_cpu, env.dst_cpu));
7634 #endif /* CONFIG_MTK_SCHED_CMP */
7635                 /*
7636                  * cur_ld_moved - load moved in current iteration
7637                  * ld_moved     - cumulative load moved across iterations
7638                  */
7639 #ifdef CONFIG_MTK_SCHED_CMP
7640                 if (!cpus_share_cache(env.src_cpu, env.dst_cpu))
7641                         cur_ld_moved = cmp_move_tasks(sd, &env);
7642                 else
7643                         cur_ld_moved = move_tasks(&env);
7644 #else /* !CONFIG_MTK_SCHED_CMP */
7645                 cur_ld_moved = move_tasks(&env);
7646 #endif /* CONFIG_MTK_SCHED_CMP */
7647                 ld_moved += cur_ld_moved;
7648                 double_rq_unlock(env.dst_rq, busiest);
7649                 local_irq_restore(flags);
7650
7651                 /*
7652                  * some other cpu did the load balance for us.
7653                  */
7654                 if (cur_ld_moved && env.dst_cpu != smp_processor_id())
7655                         resched_cpu(env.dst_cpu);
7656
7657                 if (env.flags & LBF_NEED_BREAK) {
7658                         env.flags &= ~LBF_NEED_BREAK;
7659                         goto more_balance;
7660                 }
7661
7662                 /*
7663                  * Revisit (affine) tasks on src_cpu that couldn't be moved to
7664                  * us and move them to an alternate dst_cpu in our sched_group
7665                  * where they can run. The upper limit on how many times we
7666                  * iterate on same src_cpu is dependent on number of cpus in our
7667                  * sched_group.
7668                  *
7669                  * This changes load balance semantics a bit on who can move
7670                  * load to a given_cpu. In addition to the given_cpu itself
7671                  * (or a ilb_cpu acting on its behalf where given_cpu is
7672                  * nohz-idle), we now have balance_cpu in a position to move
7673                  * load to given_cpu. In rare situations, this may cause
7674                  * conflicts (balance_cpu and given_cpu/ilb_cpu deciding
7675                  * _independently_ and at _same_ time to move some load to
7676                  * given_cpu) causing exceess load to be moved to given_cpu.
7677                  * This however should not happen so much in practice and
7678                  * moreover subsequent load balance cycles should correct the
7679                  * excess load moved.
7680                  */
7681                 if ((env.flags & LBF_SOME_PINNED) && env.imbalance > 0) {
7682
7683                         env.dst_rq       = cpu_rq(env.new_dst_cpu);
7684                         env.dst_cpu      = env.new_dst_cpu;
7685                         env.flags       &= ~LBF_SOME_PINNED;
7686                         env.loop         = 0;
7687                         env.loop_break   = sched_nr_migrate_break;
7688
7689                         /* Prevent to re-select dst_cpu via env's cpus */
7690                         cpumask_clear_cpu(env.dst_cpu, env.cpus);
7691
7692                         /*
7693                          * Go back to "more_balance" rather than "redo" since we
7694                          * need to continue with same src_cpu.
7695                          */
7696                         goto more_balance;
7697                 }
7698
7699                 /* All tasks on this runqueue were pinned by CPU affinity */
7700                 if (unlikely(env.flags & LBF_ALL_PINNED)) {
7701                         mt_lbprof_update_state(busiest->cpu, MT_LBPROF_ALLPINNED);
7702                         cpumask_clear_cpu(cpu_of(busiest), cpus);
7703                         if (!cpumask_empty(cpus)) {
7704                                 env.loop = 0;
7705                                 env.loop_break = sched_nr_migrate_break;
7706                                 goto redo;
7707                         }
7708                         goto out_balanced;
7709                 }
7710
7711 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7712                 /* when move tasks fil, force migration no matter cache-hot */
7713                 /*  use mt_check_cache_in_idle */
7714                 if (!ld_moved && ((CPU_NEWLY_IDLE == idle) || (CPU_IDLE == idle) ) ) {
7715 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
7716                         mt_lbprof_stat_set(env.fail_reason, MT_LBPROF_DO_LB);
7717 #endif
7718                         env.mt_check_cache_in_idle = 0;
7719                         env.loop = 0;
7720                         local_irq_save(flags);
7721                         double_rq_lock(env.dst_rq, busiest);
7722 #ifdef CONFIG_MTK_SCHED_CMP
7723                         env.loop_max    = min_t(unsigned long, sysctl_sched_nr_migrate, busiest->nr_running);
7724                         mt_sched_printf("2 env.loop_max=%d, busiest->nr_running=%d",
7725                         env.loop_max, busiest->nr_running);
7726 #endif /* CONFIG_MTK_SCHED_CMP */
7727                         if (!env.loop)
7728                                 update_h_load(env.src_cpu);
7729 #ifdef CONFIG_MTK_SCHED_CMP_TGS
7730                         if (!cpus_share_cache(env.src_cpu, env.dst_cpu))
7731                                 ld_moved = cmp_move_tasks(sd, &env);
7732                         else{
7733                                 ld_moved = move_tasks(&env);
7734                         }
7735 #else /* !CONFIG_MTK_SCHED_CMP_TGS */
7736                         ld_moved = move_tasks(&env);
7737 #endif /* CONFIG_MTK_SCHED_CMP_TGS */
7738                         double_rq_unlock(env.dst_rq, busiest);
7739                         local_irq_restore(flags);
7740
7741                         /*
7742                          * some other cpu did the load balance for us.
7743                         */
7744                         if (ld_moved && this_cpu != smp_processor_id())
7745                                 resched_cpu(this_cpu);
7746                 }
7747 #endif
7748         }
7749
7750         if (!ld_moved) {
7751                 schedstat_inc(sd, lb_failed[idle]);
7752                 mt_lbprof_stat_or(env.fail_reason, MT_LBPROF_FAILED);
7753                 if ( mt_lbprof_test(env.fail_reason, MT_LBPROF_AFFINITY) ) {
7754                         mt_lbprof_update_state(busiest->cpu, MT_LBPROF_FAILURE_STATE);
7755                 }else if ( mt_lbprof_test(env.fail_reason, MT_LBPROF_CACHEHOT) ) {
7756                         mt_lbprof_update_state(busiest->cpu, MT_LBPROF_FAILURE_STATE);
7757                 }
7758
7759                 /*
7760                  * Increment the failure counter only on periodic balance.
7761                  * We do not want newidle balance, which can be very
7762                  * frequent, pollute the failure counter causing
7763                  * excessive cache_hot migrations and active balances.
7764                  */
7765                 if (idle != CPU_NEWLY_IDLE)
7766                         sd->nr_balance_failed++;
7767                 mt_lbprof_stat_inc(sd, mt_lbprof_nr_balance_failed);
7768
7769                 if (need_active_balance(&env)) {
7770                         raw_spin_lock_irqsave(&busiest->lock, flags);
7771
7772                         /* don't kick the active_load_balance_cpu_stop,
7773                          * if the curr task on busiest cpu can't be
7774                          * moved to this_cpu
7775                          */
7776                         if (!cpumask_test_cpu(this_cpu,
7777                                         tsk_cpus_allowed(busiest->curr))) {
7778                                 raw_spin_unlock_irqrestore(&busiest->lock,
7779                                                             flags);
7780                                 env.flags |= LBF_ALL_PINNED;
7781                                 goto out_one_pinned;
7782                         }
7783
7784                         /*
7785                          * ->active_balance synchronizes accesses to
7786                          * ->active_balance_work.  Once set, it's cleared
7787                          * only after active load balance is finished.
7788                          */
7789                         if (!busiest->active_balance) {
7790                                 busiest->active_balance = 1;
7791                                 busiest->push_cpu = this_cpu;
7792                                 active_balance = 1;
7793                         }
7794                         raw_spin_unlock_irqrestore(&busiest->lock, flags);
7795
7796                         if (active_balance) {
7797                                 stop_one_cpu_nowait(cpu_of(busiest),
7798                                         active_load_balance_cpu_stop, busiest,
7799                                         &busiest->active_balance_work);
7800                         }
7801
7802                         /*
7803                          * We've kicked active balancing, reset the failure
7804                          * counter.
7805                          */
7806                         sd->nr_balance_failed = sd->cache_nice_tries+1;
7807                 }
7808         } else
7809                 sd->nr_balance_failed = 0;
7810
7811         if (likely(!active_balance)) {
7812                 /* We were unbalanced, so reset the balancing interval */
7813                 sd->balance_interval = sd->min_interval;
7814         } else {
7815                 /*
7816                  * If we've begun active balancing, start to back off. This
7817                  * case may not be covered by the all_pinned logic if there
7818                  * is only 1 task on the busy runqueue (because we don't call
7819                  * move_tasks).
7820                  */
7821                 if (sd->balance_interval < sd->max_interval)
7822                         sd->balance_interval *= 2;
7823         }
7824
7825         goto out;
7826
7827 out_balanced:
7828         schedstat_inc(sd, lb_balanced[idle]);
7829
7830         sd->nr_balance_failed = 0;
7831         mt_lbprof_stat_set(sd->mt_lbprof_nr_balance_failed, 0);
7832
7833 out_one_pinned:
7834         /* tune up the balancing interval */
7835         if (((env.flags & LBF_ALL_PINNED) &&
7836                         sd->balance_interval < MAX_PINNED_INTERVAL) ||
7837                         (sd->balance_interval < sd->max_interval))
7838                 sd->balance_interval *= 2;
7839
7840         ld_moved = 0;
7841 out:
7842         if (ld_moved){
7843                 mt_lbprof_stat_or(env.fail_reason, MT_LBPROF_SUCCESS);
7844                 mt_lbprof_stat_set(sd->mt_lbprof_nr_balance_failed, 0);
7845         }
7846
7847 #ifdef CONFIG_MT_LOAD_BALANCE_PROFILER
7848         if( CPU_NEWLY_IDLE == idle){
7849                 char strings[128]="";
7850                 snprintf(strings, 128, "%d:idle balance:%d:0x%x ", this_cpu, ld_moved, env.fail_reason);
7851                 mt_lbprof_rqinfo(strings);
7852                 trace_sched_lbprof_log(strings);
7853         }else{
7854                 char strings[128]="";
7855                 snprintf(strings, 128, "%d:periodic balance:%d:0x%x ", this_cpu, ld_moved, env.fail_reason);
7856                 mt_lbprof_rqinfo(strings);
7857                 trace_sched_lbprof_log(strings);
7858         }
7859 #endif
7860
7861         return ld_moved;
7862 }
7863
7864 /*
7865  * idle_balance is called by schedule() if this_cpu is about to become
7866  * idle. Attempts to pull tasks from other CPUs.
7867  */
7868 void idle_balance(int this_cpu, struct rq *this_rq)
7869 {
7870         struct sched_domain *sd;
7871         int pulled_task = 0;
7872         unsigned long next_balance = jiffies + HZ;
7873 #if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT) || defined(CONFIG_MT_LOAD_BALANCE_PROFILER)
7874         unsigned long counter = 0;
7875 #endif
7876
7877         this_rq->idle_stamp = this_rq->clock;
7878
7879         mt_lbprof_update_state_has_lock(this_cpu, MT_LBPROF_UPDATE_STATE);
7880 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7881         #ifdef CONFIG_LOCAL_TIMERS
7882                 counter = localtimer_get_counter();
7883                 if ( counter >= 260000 )  // 20ms
7884                         goto must_do;
7885                 if ( time_before(jiffies + 2, this_rq->next_balance) )  // 20ms
7886                         goto must_do;
7887         #endif
7888 #endif
7889
7890         if (this_rq->avg_idle < sysctl_sched_migration_cost){
7891 #if defined(CONFIG_MT_LOAD_BALANCE_PROFILER)
7892                 char strings[128]="";
7893                 mt_lbprof_update_state_has_lock(this_cpu, MT_LBPROF_ALLOW_UNBLANCE_STATE);
7894                 snprintf(strings, 128, "%d:idle balance bypass: %llu %lu ", this_cpu, this_rq->avg_idle, counter);
7895                 mt_lbprof_rqinfo(strings);
7896                 trace_sched_lbprof_log(strings);
7897 #endif
7898                 return;
7899         }
7900
7901 #ifdef CONFIG_MT_LOAD_BALANCE_ENHANCEMENT
7902         must_do:
7903 #endif
7904
7905         /*
7906          * Drop the rq->lock, but keep IRQ/preempt disabled.
7907          */
7908         raw_spin_unlock(&this_rq->lock);
7909
7910         mt_lbprof_update_status();
7911         update_blocked_averages(this_cpu);
7912         rcu_read_lock();
7913         for_each_domain(this_cpu, sd) {
7914                 unsigned long interval;
7915                 int balance = 1;
7916
7917                 if (!(sd->flags & SD_LOAD_BALANCE))
7918                         continue;
7919
7920                 if (sd->flags & SD_BALANCE_NEWIDLE) {
7921                         /* If we've pulled tasks over stop searching: */
7922                         pulled_task = load_balance(this_cpu, this_rq,
7923                                                    sd, CPU_NEWLY_IDLE, &balance);
7924                 }
7925
7926                 interval = msecs_to_jiffies(sd->balance_interval);
7927                 if (time_after(next_balance, sd->last_balance + interval))
7928                         next_balance = sd->last_balance + interval;
7929                 if (pulled_task) {
7930                         this_rq->idle_stamp = 0;
7931                         break;
7932                 }
7933         }
7934         rcu_read_unlock();
7935
7936         raw_spin_lock(&this_rq->lock);
7937
7938         if (pulled_task || time_after(jiffies, this_rq->next_balance)) {
7939                 /*
7940                  * We are going idle. next_balance may be set based on
7941                  * a busy processor. So reset next_balance.
7942                  */
7943                 this_rq->next_balance = next_balance;
7944         }
7945 }
7946
7947 /*
7948  * active_load_balance_cpu_stop is run by cpu stopper. It pushes
7949  * running tasks off the busiest CPU onto idle CPUs. It requires at
7950  * least 1 task to be running on each physical CPU where possible, and
7951  * avoids physical / logical imbalances.
7952  */
7953 static int active_load_balance_cpu_stop(void *data)
7954 {
7955         struct rq *busiest_rq = data;
7956         int busiest_cpu = cpu_of(busiest_rq);
7957         int target_cpu = busiest_rq->push_cpu;
7958         struct rq *target_rq = cpu_rq(target_cpu);
7959         struct sched_domain *sd;
7960
7961         raw_spin_lock_irq(&busiest_rq->lock);
7962
7963         /* make sure the requested cpu hasn't gone down in the meantime */
7964         if (unlikely(busiest_cpu != smp_processor_id() ||
7965                      !busiest_rq->active_balance))
7966                 goto out_unlock;
7967
7968         /* Is there any task to move? */
7969         if (busiest_rq->nr_running <= 1)
7970                 goto out_unlock;
7971
7972         /*
7973          * This condition is "impossible", if it occurs
7974          * we need to fix it. Originally reported by
7975          * Bjorn Helgaas on a 128-cpu setup.
7976          */
7977         BUG_ON(busiest_rq == target_rq);
7978
7979         /* move a task from busiest_rq to target_rq */
7980         double_lock_balance(busiest_rq, target_rq);
7981
7982         /* Search for an sd spanning us and the target CPU. */
7983         rcu_read_lock();
7984         for_each_domain(target_cpu, sd) {
7985                 if ((sd->flags & SD_LOAD_BALANCE) &&
7986                     cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
7987                                 break;
7988         }
7989
7990         if (likely(sd)) {
7991                 struct lb_env env = {
7992                         .sd             = sd,
7993                         .dst_cpu        = target_cpu,
7994                         .dst_rq         = target_rq,
7995                         .src_cpu        = busiest_rq->cpu,
7996                         .src_rq         = busiest_rq,
7997                         .idle           = CPU_IDLE,
7998                 };
7999
8000                 schedstat_inc(sd, alb_count);
8001
8002                 if (move_one_task(&env))
8003                         schedstat_inc(sd, alb_pushed);
8004                 else
8005                         schedstat_inc(sd, alb_failed);
8006         }
8007         rcu_read_unlock();
8008         double_unlock_balance(busiest_rq, target_rq);
8009 out_unlock:
8010         busiest_rq->active_balance = 0;
8011         raw_spin_unlock_irq(&busiest_rq->lock);
8012         return 0;
8013 }
8014
8015 #ifdef CONFIG_NO_HZ_COMMON
8016 /*
8017  * idle load balancing details
8018  * - When one of the busy CPUs notice that there may be an idle rebalancing
8019  *   needed, they will kick the idle load balancer, which then does idle
8020  *   load balancing for all the idle CPUs.
8021  */
8022 static struct {
8023         cpumask_var_t idle_cpus_mask;
8024         atomic_t nr_cpus;
8025         unsigned long next_balance;     /* in jiffy units */
8026 } nohz ____cacheline_aligned;
8027
8028
8029 static inline int find_new_ilb(int call_cpu)
8030 {
8031 #ifdef CONFIG_HMP_PACK_SMALL_TASK
8032
8033 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
8034
8035         struct sched_domain *sd;
8036
8037         int ilb_new = nr_cpu_ids;
8038
8039         int ilb_return = 0;
8040
8041         int ilb = cpumask_first(nohz.idle_cpus_mask);
8042
8043
8044         if(PA_ENABLE)
8045         {
8046                 int buddy = per_cpu(sd_pack_buddy, call_cpu);
8047
8048                 /*
8049                 * If we have a pack buddy CPU, we try to run load balance on a CPU
8050                 * that is close to the buddy.
8051                 */
8052                 if (buddy != -1)
8053                         for_each_domain(buddy, sd) {
8054                                 if (sd->flags & SD_SHARE_CPUPOWER)
8055                                         continue;
8056
8057                                 ilb_new = cpumask_first_and(sched_domain_span(sd),
8058                                                 nohz.idle_cpus_mask);
8059
8060                                 if (ilb_new < nr_cpu_ids)
8061                                         break;
8062
8063                         }
8064         }
8065
8066         if (ilb < nr_cpu_ids && idle_cpu(ilb)) {
8067                 ilb_return = 1;
8068         }
8069
8070         if (ilb_new < nr_cpu_ids) {
8071                 if (idle_cpu(ilb_new)) {
8072                         if(PA_ENABLE && ilb_return && ilb_new != ilb) {
8073                                 AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[call_cpu][ilb]++;
8074
8075 #ifdef CONFIG_HMP_TRACER
8076                                 trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_WAKE_UP_FORM_CPUX_TO_CPUY, 0, call_cpu, ilb);
8077 #endif /* CONFIG_HMP_TRACER */
8078
8079                         }
8080                         return ilb_new;
8081                 }
8082         }
8083
8084         if(ilb_return) {
8085                 return ilb;
8086         }
8087
8088         return nr_cpu_ids;
8089
8090 #else  /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
8091
8092         struct sched_domain *sd;
8093         int ilb = cpumask_first(nohz.idle_cpus_mask);
8094         int buddy = per_cpu(sd_pack_buddy, call_cpu);
8095
8096         /*
8097          * If we have a pack buddy CPU, we try to run load balance on a CPU
8098          * that is close to the buddy.
8099          */
8100         if (buddy != -1)
8101                 for_each_domain(buddy, sd) {
8102                         if (sd->flags & SD_SHARE_CPUPOWER)
8103                                 continue;
8104
8105                         ilb = cpumask_first_and(sched_domain_span(sd),
8106                                         nohz.idle_cpus_mask);
8107
8108                         if (ilb < nr_cpu_ids)
8109                                 break;
8110                 }
8111
8112         if (ilb < nr_cpu_ids && idle_cpu(ilb))
8113                 return ilb;
8114
8115         return nr_cpu_ids;
8116
8117 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
8118
8119 #else /* CONFIG_HMP_PACK_SMALL_TASK */
8120
8121         int ilb = cpumask_first(nohz.idle_cpus_mask);
8122 #ifdef CONFIG_MTK_SCHED_CMP_TGS
8123         /* Find nohz balancing to occur in the same cluster firstly */
8124         int new_ilb;
8125         struct cpumask tmp;
8126         //Find idle cpu with online one
8127         get_cluster_cpus(&tmp, get_cluster_id(call_cpu), true);
8128         new_ilb = cpumask_first_and(nohz.idle_cpus_mask, &tmp);
8129         if (new_ilb < nr_cpu_ids && idle_cpu(new_ilb))
8130         {
8131 #ifdef CONFIG_MTK_SCHED_CMP_POWER_AWARE_CONTROLLER
8132                 if(new_ilb != ilb)
8133                 {
8134                         mt_sched_printf("[PA]find_new_ilb(cpu%x), new_ilb = %d, ilb = %d\n", call_cpu, new_ilb, ilb);
8135                         AVOID_WAKE_UP_FROM_CPUX_TO_CPUY_COUNT[call_cpu][ilb]++;
8136                 }
8137 #endif
8138                 return new_ilb;
8139         }
8140 #endif /* CONFIG_MTK_SCHED_CMP_TGS */
8141
8142         if (ilb < nr_cpu_ids && idle_cpu(ilb))
8143                 return ilb;
8144
8145         return nr_cpu_ids;
8146
8147 #endif /* CONFIG_HMP_PACK_SMALL_TASK */
8148
8149 }
8150
8151
8152 /*
8153  * Kick a CPU to do the nohz balancing, if it is time for it. We pick the
8154  * nohz_load_balancer CPU (if there is one) otherwise fallback to any idle
8155  * CPU (if there is one).
8156  */
8157 static void nohz_balancer_kick(int cpu)
8158 {
8159         int ilb_cpu;
8160
8161         nohz.next_balance++;
8162
8163         ilb_cpu = find_new_ilb(cpu);
8164
8165         if (ilb_cpu >= nr_cpu_ids)
8166                 return;
8167
8168         if (test_and_set_bit(NOHZ_BALANCE_KICK, nohz_flags(ilb_cpu)))
8169                 return;
8170         /*
8171          * Use smp_send_reschedule() instead of resched_cpu().
8172          * This way we generate a sched IPI on the target cpu which
8173          * is idle. And the softirq performing nohz idle load balance
8174          * will be run before returning from the IPI.
8175          */
8176         smp_send_reschedule(ilb_cpu);
8177         return;
8178 }
8179
8180 static inline void nohz_balance_exit_idle(int cpu)
8181 {
8182         if (unlikely(test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))) {
8183                 cpumask_clear_cpu(cpu, nohz.idle_cpus_mask);
8184                 atomic_dec(&nohz.nr_cpus);
8185                 clear_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
8186         }
8187 }
8188
8189 static inline void set_cpu_sd_state_busy(void)
8190 {
8191         struct sched_domain *sd;
8192         int cpu = smp_processor_id();
8193
8194         rcu_read_lock();
8195         sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
8196
8197         if (!sd || !sd->nohz_idle)
8198                 goto unlock;
8199         sd->nohz_idle = 0;
8200
8201         for (; sd; sd = sd->parent)
8202                 atomic_inc(&sd->groups->sgp->nr_busy_cpus);
8203 unlock:
8204         rcu_read_unlock();
8205 }
8206
8207 void set_cpu_sd_state_idle(void)
8208 {
8209         struct sched_domain *sd;
8210         int cpu = smp_processor_id();
8211
8212         rcu_read_lock();
8213         sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd);
8214
8215         if (!sd || sd->nohz_idle)
8216                 goto unlock;
8217         sd->nohz_idle = 1;
8218
8219         for (; sd; sd = sd->parent)
8220                 atomic_dec(&sd->groups->sgp->nr_busy_cpus);
8221 unlock:
8222         rcu_read_unlock();
8223 }
8224
8225 /*
8226  * This routine will record that the cpu is going idle with tick stopped.
8227  * This info will be used in performing idle load balancing in the future.
8228  */
8229 void nohz_balance_enter_idle(int cpu)
8230 {
8231         /*
8232          * If this cpu is going down, then nothing needs to be done.
8233          */
8234         if (!cpu_active(cpu))
8235                 return;
8236
8237         if (test_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu)))
8238                 return;
8239
8240         cpumask_set_cpu(cpu, nohz.idle_cpus_mask);
8241         atomic_inc(&nohz.nr_cpus);
8242         set_bit(NOHZ_TICK_STOPPED, nohz_flags(cpu));
8243 }
8244
8245 static int __cpuinit sched_ilb_notifier(struct notifier_block *nfb,
8246                                         unsigned long action, void *hcpu)
8247 {
8248         switch (action & ~CPU_TASKS_FROZEN) {
8249         case CPU_DYING:
8250                 nohz_balance_exit_idle(smp_processor_id());
8251                 return NOTIFY_OK;
8252         default:
8253                 return NOTIFY_DONE;
8254         }
8255 }
8256 #endif
8257
8258 static DEFINE_SPINLOCK(balancing);
8259
8260 /*
8261  * Scale the max load_balance interval with the number of CPUs in the system.
8262  * This trades load-balance latency on larger machines for less cross talk.
8263  */
8264 void update_max_interval(void)
8265 {
8266         max_load_balance_interval = HZ*num_online_cpus()/10;
8267 }
8268
8269 /*
8270  * It checks each scheduling domain to see if it is due to be balanced,
8271  * and initiates a balancing operation if so.
8272  *
8273  * Balancing parameters are set up in init_sched_domains.
8274  */
8275 static void rebalance_domains(int cpu, enum cpu_idle_type idle)
8276 {
8277         int balance = 1;
8278         struct rq *rq = cpu_rq(cpu);
8279         unsigned long interval;
8280         struct sched_domain *sd;
8281         /* Earliest time when we have to do rebalance again */
8282         unsigned long next_balance = jiffies + 60*HZ;
8283         int update_next_balance = 0;
8284         int need_serialize;
8285
8286         update_blocked_averages(cpu);
8287
8288         rcu_read_lock();
8289         for_each_domain(cpu, sd) {
8290                 if (!(sd->flags & SD_LOAD_BALANCE))
8291                         continue;
8292
8293                 interval = sd->balance_interval;
8294                 if (idle != CPU_IDLE)
8295                         interval *= sd->busy_factor;
8296
8297                 /* scale ms to jiffies */
8298                 interval = msecs_to_jiffies(interval);
8299                 interval = clamp(interval, 1UL, max_load_balance_interval);
8300
8301                 need_serialize = sd->flags & SD_SERIALIZE;
8302
8303                 if (need_serialize) {
8304                         if (!spin_trylock(&balancing))
8305                                 goto out;
8306                 }
8307
8308                 if (time_after_eq(jiffies, sd->last_balance + interval)) {
8309                         if (load_balance(cpu, rq, sd, idle, &balance)) {
8310                                 /*
8311                                  * The LBF_SOME_PINNED logic could have changed
8312                                  * env->dst_cpu, so we can't know our idle
8313                                  * state even if we migrated tasks. Update it.
8314                                  */
8315                                 idle = idle_cpu(cpu) ? CPU_IDLE : CPU_NOT_IDLE;
8316                         }
8317                         sd->last_balance = jiffies;
8318                 }
8319                 if (need_serialize)
8320                         spin_unlock(&balancing);
8321 out:
8322                 if (time_after(next_balance, sd->last_balance + interval)) {
8323                         next_balance = sd->last_balance + interval;
8324                         update_next_balance = 1;
8325                 }
8326
8327                 /*
8328                  * Stop the load balance at this level. There is another
8329                  * CPU in our sched group which is doing load balancing more
8330                  * actively.
8331                  */
8332                 if (!balance)
8333                         break;
8334         }
8335         rcu_read_unlock();
8336
8337         /*
8338          * next_balance will be updated only when there is a need.
8339          * When the cpu is attached to null domain for ex, it will not be
8340          * updated.
8341          */
8342         if (likely(update_next_balance))
8343                 rq->next_balance = next_balance;
8344 }
8345
8346 #ifdef CONFIG_NO_HZ_COMMON
8347 /*
8348  * In CONFIG_NO_HZ_COMMON case, the idle balance kickee will do the
8349  * rebalancing for all the cpus for whom scheduler ticks are stopped.
8350  */
8351 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle)
8352 {
8353         struct rq *this_rq = cpu_rq(this_cpu);
8354         struct rq *rq;
8355         int balance_cpu;
8356
8357         if (idle != CPU_IDLE ||
8358             !test_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu)))
8359                 goto end;
8360
8361         for_each_cpu(balance_cpu, nohz.idle_cpus_mask) {
8362                 if (balance_cpu == this_cpu || !idle_cpu(balance_cpu))
8363                         continue;
8364
8365                 /*
8366                  * If this cpu gets work to do, stop the load balancing
8367                  * work being done for other cpus. Next load
8368                  * balancing owner will pick it up.
8369                  */
8370                 if (need_resched())
8371                         break;
8372
8373                 rq = cpu_rq(balance_cpu);
8374
8375                 raw_spin_lock_irq(&rq->lock);
8376                 update_rq_clock(rq);
8377                 update_idle_cpu_load(rq);
8378                 raw_spin_unlock_irq(&rq->lock);
8379
8380                 rebalance_domains(balance_cpu, CPU_IDLE);
8381
8382                 if (time_after(this_rq->next_balance, rq->next_balance))
8383                         this_rq->next_balance = rq->next_balance;
8384         }
8385         nohz.next_balance = this_rq->next_balance;
8386 end:
8387         clear_bit(NOHZ_BALANCE_KICK, nohz_flags(this_cpu));
8388 }
8389
8390 /*
8391  * Current heuristic for kicking the idle load balancer in the presence
8392  * of an idle cpu is the system.
8393  *   - This rq has more than one task.
8394  *   - At any scheduler domain level, this cpu's scheduler group has multiple
8395  *     busy cpu's exceeding the group's power.
8396  *   - For SD_ASYM_PACKING, if the lower numbered cpu's in the scheduler
8397  *     domain span are idle.
8398  */
8399 static inline int nohz_kick_needed(struct rq *rq, int cpu)
8400 {
8401         unsigned long now = jiffies;
8402         struct sched_domain *sd;
8403
8404         if (unlikely(idle_cpu(cpu)))
8405                 return 0;
8406
8407        /*
8408         * We may be recently in ticked or tickless idle mode. At the first
8409         * busy tick after returning from idle, we will update the busy stats.
8410         */
8411         set_cpu_sd_state_busy();
8412         nohz_balance_exit_idle(cpu);
8413
8414         /*
8415          * None are in tickless mode and hence no need for NOHZ idle load
8416          * balancing.
8417          */
8418         if (likely(!atomic_read(&nohz.nr_cpus)))
8419                 return 0;
8420
8421         if (time_before(now, nohz.next_balance))
8422                 return 0;
8423
8424 #ifdef CONFIG_SCHED_HMP
8425         /*
8426          * Bail out if there are no nohz CPUs in our
8427          * HMP domain, since we will move tasks between
8428          * domains through wakeup and force balancing
8429          * as necessary based upon task load.
8430          */
8431         if (cpumask_first_and(nohz.idle_cpus_mask,
8432                         &((struct hmp_domain *)hmp_cpu_domain(cpu))->cpus) >= nr_cpu_ids)
8433                 return 0;
8434 #endif
8435
8436         if (rq->nr_running >= 2)
8437                 goto need_kick;
8438
8439         rcu_read_lock();
8440         for_each_domain(cpu, sd) {
8441                 struct sched_group *sg = sd->groups;
8442                 struct sched_group_power *sgp = sg->sgp;
8443                 int nr_busy = atomic_read(&sgp->nr_busy_cpus);
8444
8445                 if (sd->flags & SD_SHARE_PKG_RESOURCES && nr_busy > 1)
8446                         goto need_kick_unlock;
8447
8448                 if (sd->flags & SD_ASYM_PACKING && nr_busy != sg->group_weight
8449                     && (cpumask_first_and(nohz.idle_cpus_mask,
8450                                           sched_domain_span(sd)) < cpu))
8451                         goto need_kick_unlock;
8452
8453                 if (!(sd->flags & (SD_SHARE_PKG_RESOURCES | SD_ASYM_PACKING)))
8454                         break;
8455         }
8456         rcu_read_unlock();
8457         return 0;
8458
8459 need_kick_unlock:
8460         rcu_read_unlock();
8461 need_kick:
8462         return 1;
8463 }
8464 #else
8465 static void nohz_idle_balance(int this_cpu, enum cpu_idle_type idle) { }
8466 #endif
8467
8468 #ifdef CONFIG_SCHED_HMP
8469 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
8470
8471 /*
8472  * Heterogenous Multi-Processor (HMP) - Declaration and Useful Macro
8473  */
8474
8475 /* Function Declaration */
8476 static int hmp_up_stable(int cpu);
8477 static int hmp_down_stable(int cpu);
8478 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se,
8479                     struct clb_env *clbenv);
8480 static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se,
8481                     struct clb_env *clbenv);
8482
8483 #define hmp_caller_is_gb(caller) ((HMP_GB == caller)?1:0)
8484
8485 #define hmp_cpu_is_fast(cpu) cpumask_test_cpu(cpu,&hmp_fast_cpu_mask)
8486 #define hmp_cpu_is_slow(cpu) cpumask_test_cpu(cpu,&hmp_slow_cpu_mask)
8487 #define hmp_cpu_stable(cpu) (hmp_cpu_is_fast(cpu)? \
8488                         hmp_up_stable(cpu):hmp_down_stable(cpu))
8489
8490 #define hmp_inc(v) ((v) + 1)
8491 #define hmp_dec(v) ((v) - 1)
8492 #define hmp_pos(v) ((v) < (0) ? (0) : (v))
8493
8494 #define task_created(f) ((SD_BALANCE_EXEC == f || SD_BALANCE_FORK == f)?1:0)
8495 #define task_cpus_allowed(mask,p) cpumask_intersects(mask,tsk_cpus_allowed(p))
8496 #define task_slow_cpu_allowed(p) task_cpus_allowed(&hmp_slow_cpu_mask,p)
8497 #define task_fast_cpu_allowed(p) task_cpus_allowed(&hmp_fast_cpu_mask,p)
8498
8499 /*
8500  * Heterogenous Multi-Processor (HMP) - Utility Function
8501  */
8502
8503 /*
8504  * These functions add next up/down migration delay that prevents the task from
8505  * doing another migration in the same direction until the delay has expired.
8506  */
8507 static int hmp_up_stable(int cpu)
8508 {
8509         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
8510         u64 now = cfs_rq_clock_task(cfs_rq);
8511         if (((now - hmp_last_up_migration(cpu)) >> 10) < hmp_next_up_threshold)
8512                 return 0;
8513         return 1;
8514 }
8515
8516 static int hmp_down_stable(int cpu)
8517 {
8518         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
8519         u64 now = cfs_rq_clock_task(cfs_rq);
8520         if (((now - hmp_last_down_migration(cpu)) >> 10) < hmp_next_down_threshold)
8521                 return 0;
8522         return 1;
8523 }
8524
8525 /* Select the most appropriate CPU from hmp cluster */
8526 static unsigned int hmp_select_cpu(unsigned int caller, struct task_struct *p,
8527                         struct cpumask *mask, int prev)
8528 {
8529         int curr = 0;
8530         int target = NR_CPUS;
8531         unsigned long curr_wload = 0;
8532         unsigned long target_wload = 0;
8533         struct cpumask srcp;
8534         cpumask_and(&srcp, cpu_online_mask, mask);
8535         target = cpumask_any_and(&srcp, tsk_cpus_allowed(p));
8536         if (NR_CPUS == target)
8537                 goto out;
8538
8539         /*
8540          * RT class is taken into account because CPU load is multiplied
8541          * by the total number of CPU runnable tasks that includes RT tasks.
8542          */
8543         target_wload = hmp_inc(cfs_load(target));
8544         target_wload += cfs_pending_load(target);
8545         target_wload *= rq_length(target);
8546         for_each_cpu(curr, mask) {
8547                 /* Check CPU status and task affinity */
8548                 if(!cpu_online(curr) || !cpumask_test_cpu(curr, tsk_cpus_allowed(p)))
8549                         continue;
8550
8551                 /* For global load balancing, unstable CPU will be bypassed */
8552                 if(hmp_caller_is_gb(caller) && !hmp_cpu_stable(curr))
8553                         continue;
8554
8555                 curr_wload = hmp_inc(cfs_load(curr));
8556                 curr_wload += cfs_pending_load(curr);
8557                 curr_wload *= rq_length(curr);
8558                 if(curr_wload < target_wload) {
8559                         target_wload = curr_wload;
8560                         target = curr;
8561                 } else if(curr_wload == target_wload && curr == prev) {
8562                         target = curr;
8563                 }
8564         }
8565
8566 out:
8567         return target;
8568 }
8569
8570 /*
8571  * Heterogenous Multi-Processor (HMP) - Task Runqueue Selection
8572  */
8573
8574 /* This function enhances the original task selection function */
8575 static int hmp_select_task_rq_fair(int sd_flag, struct task_struct *p,
8576                         int prev_cpu, int new_cpu)
8577 {
8578 #ifdef CONFIG_HMP_TASK_ASSIGNMENT
8579         int step = 0;
8580         struct sched_entity *se = &p->se;
8581         int B_target = NR_CPUS;
8582         int L_target = NR_CPUS;
8583         struct clb_env clbenv;
8584
8585 #ifdef CONFIG_HMP_TRACER
8586         int cpu = 0;
8587         for_each_online_cpu(cpu)
8588                 trace_sched_cfs_runnable_load(cpu,cfs_load(cpu),cfs_length(cpu));
8589 #endif
8590
8591         // error handling
8592         if (prev_cpu >= NR_CPUS)
8593                 return new_cpu;
8594
8595         /*
8596          * Skip all the checks if only one CPU is online.
8597          * Otherwise, select the most appropriate CPU from cluster.
8598          */
8599         if (num_online_cpus() == 1)
8600                 goto out;
8601         B_target = hmp_select_cpu(HMP_SELECT_RQ,p,&hmp_fast_cpu_mask,prev_cpu);
8602         L_target = hmp_select_cpu(HMP_SELECT_RQ,p,&hmp_slow_cpu_mask,prev_cpu);
8603
8604         /*
8605          * Only one cluster exists or only one cluster is allowed for this task
8606          * Case 1: return the runqueue whose load is minimum
8607          * Case 2: return original CFS runqueue selection result
8608          */
8609 #ifdef CONFIG_HMP_DISCARD_CFS_SELECTION_RESULT
8610         if(NR_CPUS == B_target && NR_CPUS == L_target)
8611                 goto out;
8612         if(NR_CPUS == B_target)
8613                 goto select_slow;
8614         if(NR_CPUS == L_target)
8615                 goto select_fast;
8616 #else
8617         if(NR_CPUS == B_target || NR_CPUS == L_target)
8618                 goto out;
8619 #endif
8620
8621         /*
8622          * Two clusters exist and both clusters are allowed for this task
8623          * Step 1: Move newly created task to the cpu where no tasks are running
8624          * Step 2: Migrate heavy-load task to big
8625          * Step 3: Migrate light-load task to LITTLE
8626          * Step 4: Make sure the task stays in its previous hmp domain
8627          */
8628         step = 1;
8629         if (task_created(sd_flag) && !task_low_priority(p->prio)) {
8630                 if (!rq_length(B_target))
8631                         goto select_fast;
8632                 if (!rq_length(L_target))
8633                         goto select_slow;
8634         }
8635         memset(&clbenv, 0, sizeof(clbenv));
8636         clbenv.flags |= HMP_SELECT_RQ;
8637         clbenv.lcpus = &hmp_slow_cpu_mask;
8638         clbenv.bcpus = &hmp_fast_cpu_mask;
8639         clbenv.ltarget = L_target;
8640         clbenv.btarget = B_target;
8641         sched_update_clbstats(&clbenv);
8642         step = 2;
8643         if (hmp_up_migration(L_target, &B_target, se, &clbenv))
8644                 goto select_fast;
8645         step = 3;
8646         if (hmp_down_migration(B_target, &L_target, se, &clbenv))
8647                 goto select_slow;
8648         step = 4;
8649         if (hmp_cpu_is_slow(prev_cpu))
8650                 goto select_slow;
8651         goto select_fast;
8652
8653 select_fast:
8654         new_cpu = B_target;
8655         goto out;
8656 select_slow:
8657         new_cpu = L_target;
8658         goto out;
8659
8660 out:
8661
8662         // it happens when num_online_cpus=1
8663         if (new_cpu >= nr_cpu_ids)
8664         {
8665                 //BUG_ON(1);
8666                 new_cpu = prev_cpu;
8667         }
8668
8669         cfs_nr_pending(new_cpu)++;
8670         cfs_pending_load(new_cpu) += se_load(se);
8671 #ifdef CONFIG_HMP_TRACER
8672         trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
8673         trace_sched_hmp_select_task_rq(p,step,sd_flag,prev_cpu,new_cpu,
8674                         se_load(se),&clbenv.bstats,&clbenv.lstats);
8675 #endif
8676 #ifdef CONFIG_MET_SCHED_HMP
8677         HmpLoad(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
8678 #endif
8679 #endif /* CONFIG_HMP_TASK_ASSIGNMENT */
8680         return new_cpu;
8681 }
8682
8683 /*
8684  * Heterogenous Multi-Processor (HMP) - Task Dynamic Migration Threshold
8685  */
8686
8687 /*
8688  * If the workload between clusters is not balanced, adjust migration
8689  * threshold in an attempt to move task to the cluster where the workload
8690  * is not heavy
8691  */
8692
8693 /*
8694  * According to ARM's cpu_efficiency table, the computing power of CA15 and
8695  * CA7 are 3891 and 2048 respectively. Thus, we assume big has twice the
8696  * computing power of LITTLE
8697  */
8698
8699 #define HMP_RATIO(v) ((v)*17/10)
8700
8701 #define hmp_fast_cpu_has_spare_cycles(B,cpu_load) (cpu_load < \
8702                         (HMP_RATIO(B->cpu_capacity) - (B->cpu_capacity >> 2)))
8703
8704 #define hmp_task_fast_cpu_afford(B,se,cpu) (B->acap > 0 \
8705                         && hmp_fast_cpu_has_spare_cycles(B,se_load(se) + cfs_load(cpu)))
8706
8707 #define hmp_fast_cpu_oversubscribed(caller,B,se,cpu) \
8708                         (hmp_caller_is_gb(caller)? \
8709                         !hmp_fast_cpu_has_spare_cycles(B,cfs_load(cpu)): \
8710                         !hmp_task_fast_cpu_afford(B,se,cpu))
8711
8712 #define hmp_task_slow_cpu_afford(L,se) \
8713                         (L->acap > 0 && L->acap >= se_load(se))
8714
8715 /* Macro used by low-priorty task filter */
8716 #define hmp_low_prio_task_up_rejected(p,B,L) \
8717                         (task_low_priority(p->prio) && \
8718                         (B->ntask >= B->ncpu || 0 != L->nr_normal_prio_task) && \
8719                          (p->se.avg.load_avg_ratio < 800))
8720
8721 #define hmp_low_prio_task_down_allowed(p,B,L) \
8722                         (task_low_priority(p->prio) && !B->nr_dequeuing_low_prio && \
8723                         B->ntask >= B->ncpu && 0 != L->nr_normal_prio_task && \
8724                         (p->se.avg.load_avg_ratio < 800))
8725
8726 /* Migration check result */
8727 #define HMP_BIG_NOT_OVERSUBSCRIBED           (0x01)
8728 #define HMP_BIG_CAPACITY_INSUFFICIENT        (0x02)
8729 #define HMP_LITTLE_CAPACITY_INSUFFICIENT     (0x04)
8730 #define HMP_LOW_PRIORITY_FILTER              (0x08)
8731 #define HMP_BIG_BUSY_LITTLE_IDLE             (0x10)
8732 #define HMP_BIG_IDLE                         (0x20)
8733 #define HMP_MIGRATION_APPROVED              (0x100)
8734 #define HMP_TASK_UP_MIGRATION               (0x200)
8735 #define HMP_TASK_DOWN_MIGRATION             (0x400)
8736
8737 /* Migration statistics */
8738 #ifdef CONFIG_HMP_TRACER
8739 struct hmp_statisic hmp_stats;
8740 #endif
8741
8742 static inline void hmp_dynamic_threshold(struct clb_env *clbenv)
8743 {
8744         struct clb_stats *L = &clbenv->lstats;
8745         struct clb_stats *B = &clbenv->bstats;
8746         unsigned int hmp_threshold_diff = hmp_up_threshold - hmp_down_threshold;
8747         unsigned int B_normalized_acap = hmp_pos(HMP_RATIO(B->scaled_acap));
8748         unsigned int B_normalized_atask = hmp_pos(HMP_RATIO(B->scaled_atask));
8749         unsigned int L_normalized_acap = hmp_pos(L->scaled_acap);
8750         unsigned int L_normalized_atask = hmp_pos(L->scaled_atask);
8751
8752 #ifdef CONFIG_HMP_DYNAMIC_THRESHOLD
8753         L->threshold = hmp_threshold_diff;
8754         L->threshold *= hmp_inc(L_normalized_acap) * hmp_inc(L_normalized_atask);
8755         L->threshold /= hmp_inc(B_normalized_acap + L_normalized_acap);
8756         L->threshold /= hmp_inc(B_normalized_atask + L_normalized_atask);
8757         L->threshold = hmp_down_threshold + L->threshold;
8758
8759         B->threshold = hmp_threshold_diff;
8760         B->threshold *= hmp_inc(B_normalized_acap) * hmp_inc(B_normalized_atask);
8761         B->threshold /= hmp_inc(B_normalized_acap + L_normalized_acap);
8762         B->threshold /= hmp_inc(B_normalized_atask + L_normalized_atask);
8763         B->threshold = hmp_up_threshold - B->threshold;
8764 #else /* !CONFIG_HMP_DYNAMIC_THRESHOLD */
8765         clbenv->lstats.threshold = hmp_down_threshold; // down threshold
8766         clbenv->bstats.threshold = hmp_up_threshold; // up threshold
8767 #endif /* CONFIG_HMP_DYNAMIC_THRESHOLD */
8768
8769         mt_sched_printf("[%s]\tup/dl:%4d/%4d bcpu(%d):%d/%d, lcpu(%d):%d/%d\n", __func__,
8770                                         B->threshold, L->threshold,
8771                                         clbenv->btarget, clbenv->bstats.cpu_capacity, clbenv->bstats.cpu_power,
8772                                         clbenv->ltarget, clbenv->lstats.cpu_capacity, clbenv->lstats.cpu_power);
8773 }
8774
8775 /*
8776  * Check whether this task should be migrated to big
8777  * Briefly summarize the flow as below;
8778  * 1) Migration stabilizing
8779  * 1.5) Keep all cpu busy
8780  * 2) Filter low-priorty task
8781  * 3) Check CPU capacity
8782  * 4) Check dynamic migration threshold
8783  */
8784 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se,
8785                                      struct clb_env *clbenv)
8786 {
8787         struct task_struct *p = task_of(se);
8788         struct clb_stats *L, *B;
8789         struct mcheck *check;
8790         int curr_cpu = cpu;
8791         unsigned int caller = clbenv->flags;
8792
8793         L = &clbenv->lstats;
8794         B = &clbenv->bstats;
8795         check = &clbenv->mcheck;
8796
8797         check->status = clbenv->flags;
8798         check->status |= HMP_TASK_UP_MIGRATION;
8799         check->result = 0;
8800
8801         /*
8802          * No migration is needed if
8803          * 1) There is only one cluster
8804          * 2) Task is already in big cluster
8805          * 3) It violates task affinity
8806          */
8807         if (!L->ncpu || !B->ncpu
8808                 || cpumask_test_cpu(curr_cpu, clbenv->bcpus)
8809                 || !cpumask_intersects(clbenv->bcpus, tsk_cpus_allowed(p)))
8810                 goto out;
8811
8812         /*
8813          * [1] Migration stabilizing
8814          * Let the task load settle before doing another up migration.
8815          * It can prevent a bunch of tasks from migrating to a unstable CPU.
8816          */
8817         if (!hmp_up_stable(*target_cpu))
8818                 goto out;
8819
8820         /* [2] Filter low-priorty task */
8821 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
8822         if (hmp_low_prio_task_up_rejected(p,B,L)) {
8823                 check->status |= HMP_LOW_PRIORITY_FILTER;
8824                 goto trace;
8825         }
8826 #endif
8827
8828         // [2.5]if big is idle, just go to big
8829         if (rq_length(*target_cpu)==0)
8830         {
8831                 check->status |= HMP_BIG_IDLE;
8832                 check->status |= HMP_MIGRATION_APPROVED;
8833                 check->result = 1;
8834                 goto trace;
8835         }
8836
8837         /*
8838          * [3] Check CPU capacity
8839          * Forbid up-migration if big CPU can't handle this task
8840          */
8841         if (!hmp_task_fast_cpu_afford(B,se,*target_cpu)) {
8842                 check->status |= HMP_BIG_CAPACITY_INSUFFICIENT;
8843                 goto trace;
8844         }
8845
8846         /*
8847          * [4] Check dynamic migration threshold
8848          * Migrate task from LITTLE to big if load is greater than up-threshold
8849          */
8850         if (se_load(se) > B->threshold) {
8851                 check->status |= HMP_MIGRATION_APPROVED;
8852                 check->result = 1;
8853         }
8854
8855 trace:
8856 #ifdef CONFIG_HMP_TRACER
8857         if(check->result && hmp_caller_is_gb(caller))
8858                 hmp_stats.nr_force_up++;
8859         trace_sched_hmp_stats(&hmp_stats);
8860         trace_sched_dynamic_threshold(task_of(se),B->threshold,check->status,
8861                         curr_cpu,*target_cpu,se_load(se),B,L);
8862 #endif
8863 #ifdef CONFIG_MET_SCHED_HMP
8864         TaskTh(B->threshold,L->threshold);
8865         HmpStat(&hmp_stats);
8866 #endif
8867 out:
8868         return check->result;
8869 }
8870
8871 /*
8872  * Check whether this task should be migrated to LITTLE
8873  * Briefly summarize the flow as below;
8874  * 1) Migration stabilizing
8875  * 1.5) Keep all cpu busy
8876  * 2) Filter low-priorty task
8877  * 3) Check CPU capacity
8878  * 4) Check dynamic migration threshold
8879  */
8880 static unsigned int hmp_down_migration(int cpu, int *target_cpu, struct sched_entity *se,
8881                                        struct clb_env *clbenv)
8882 {
8883         struct task_struct *p = task_of(se);
8884         struct clb_stats *L, *B;
8885         struct mcheck *check;
8886         int curr_cpu = cpu;
8887         unsigned int caller = clbenv->flags;
8888
8889         L = &clbenv->lstats;
8890         B = &clbenv->bstats;
8891         check = &clbenv->mcheck;
8892
8893         check->status = caller;
8894         check->status |= HMP_TASK_DOWN_MIGRATION;
8895         check->result = 0;
8896
8897         /*
8898          * No migration is needed if
8899          * 1) There is only one cluster
8900          * 2) Task is already in LITTLE cluster
8901          * 3) It violates task affinity
8902          */
8903         if (!L->ncpu || !B->ncpu
8904                 || cpumask_test_cpu(curr_cpu, clbenv->lcpus)
8905                 || !cpumask_intersects(clbenv->lcpus, tsk_cpus_allowed(p)))
8906                 goto out;
8907
8908         /*
8909          * [1] Migration stabilizing
8910          * Let the task load settle before doing another down migration.
8911          * It can prevent a bunch of tasks from migrating to a unstable CPU.
8912          */
8913         if (!hmp_down_stable(*target_cpu))
8914                 goto out;
8915
8916         // [1.5]if big is busy and little is idle, just go to little
8917         if (rq_length(*target_cpu)==0 && caller == HMP_SELECT_RQ && rq_length(curr_cpu)>0)
8918         {
8919                 check->status |= HMP_BIG_BUSY_LITTLE_IDLE;
8920                 check->status |= HMP_MIGRATION_APPROVED;
8921                 check->result = 1;
8922                 goto trace;
8923         }
8924
8925         /* [2] Filter low-priorty task */
8926 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
8927         if (hmp_low_prio_task_down_allowed(p,B,L)) {
8928                 cfs_nr_dequeuing_low_prio(curr_cpu)++;
8929                 check->status |= HMP_LOW_PRIORITY_FILTER;
8930                 check->status |= HMP_MIGRATION_APPROVED;
8931                 check->result = 1;
8932                 goto trace;
8933         }
8934 #endif
8935
8936         /*
8937          * [3] Check CPU capacity
8938          * Forbid down-migration if either of the following conditions is true
8939          * 1) big cpu is not oversubscribed (if big CPU seems to have spare
8940          *    cycles, do not force this task to run on LITTLE CPU, but
8941          *    keep it staying in its previous cluster instead)
8942          * 2) LITTLE cpu doesn't have available capacity for this new task
8943          */
8944         if (!hmp_fast_cpu_oversubscribed(caller,B,se,curr_cpu)) {
8945                 check->status |= HMP_BIG_NOT_OVERSUBSCRIBED;
8946                 goto trace;
8947         }
8948
8949         if (!hmp_task_slow_cpu_afford(L,se)) {
8950                 check->status |= HMP_LITTLE_CAPACITY_INSUFFICIENT;
8951                 goto trace;
8952         }
8953
8954         /*
8955          * [4] Check dynamic migration threshold
8956          * Migrate task from big to LITTLE if load ratio is less than
8957          * or equal to down-threshold
8958          */
8959         if (L->threshold >= se_load(se)) {
8960                 check->status |= HMP_MIGRATION_APPROVED;
8961                 check->result = 1;
8962         }
8963
8964 trace:
8965 #ifdef CONFIG_HMP_TRACER
8966         if (check->result && hmp_caller_is_gb(caller))
8967                 hmp_stats.nr_force_down++;
8968         trace_sched_hmp_stats(&hmp_stats);
8969         trace_sched_dynamic_threshold(task_of(se),L->threshold,check->status,
8970                         curr_cpu,*target_cpu,se_load(se),B,L);
8971 #endif
8972 #ifdef CONFIG_MET_SCHED_HMP
8973         TaskTh(B->threshold,L->threshold);
8974         HmpStat(&hmp_stats);
8975 #endif
8976 out:
8977         return check->result;
8978 }
8979 #else /* CONFIG_SCHED_HMP_ENHANCEMENT */
8980 /* Check if task should migrate to a faster cpu */
8981 static unsigned int hmp_up_migration(int cpu, int *target_cpu, struct sched_entity *se)
8982 {
8983         struct task_struct *p = task_of(se);
8984         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
8985         u64 now;
8986
8987         if (target_cpu)
8988                 *target_cpu = NR_CPUS;
8989
8990         if (hmp_cpu_is_fastest(cpu))
8991                 return 0;
8992
8993 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
8994         /* Filter by task priority */
8995         if (p->prio >= hmp_up_prio)
8996                 return 0;
8997 #endif
8998         if (se->avg.load_avg_ratio < hmp_up_threshold)
8999                 return 0;
9000
9001         /* Let the task load settle before doing another up migration */
9002         now = cfs_rq_clock_task(cfs_rq);
9003         if (((now - se->avg.hmp_last_up_migration) >> 10)
9004                                         < hmp_next_up_threshold)
9005                 return 0;
9006
9007         /* Target domain load < 94% */
9008         if (hmp_domain_min_load(hmp_faster_domain(cpu), target_cpu)
9009                         > NICE_0_LOAD-64)
9010                 return 0;
9011
9012         if (cpumask_intersects(&hmp_faster_domain(cpu)->cpus,
9013                         tsk_cpus_allowed(p)))
9014                 return 1;
9015
9016         return 0;
9017 }
9018
9019 /* Check if task should migrate to a slower cpu */
9020 static unsigned int hmp_down_migration(int cpu, struct sched_entity *se)
9021 {
9022         struct task_struct *p = task_of(se);
9023         struct cfs_rq *cfs_rq = &cpu_rq(cpu)->cfs;
9024         u64 now;
9025
9026         if (hmp_cpu_is_slowest(cpu))
9027                 return 0;
9028
9029 #ifdef CONFIG_SCHED_HMP_PRIO_FILTER
9030         /* Filter by task priority */
9031         if ((p->prio >= hmp_up_prio) &&
9032                 cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
9033                                         tsk_cpus_allowed(p))) {
9034                 return 1;
9035         }
9036 #endif
9037
9038         /* Let the task load settle before doing another down migration */
9039         now = cfs_rq_clock_task(cfs_rq);
9040         if (((now - se->avg.hmp_last_down_migration) >> 10)
9041                                         < hmp_next_down_threshold)
9042                 return 0;
9043
9044         if (cpumask_intersects(&hmp_slower_domain(cpu)->cpus,
9045                                         tsk_cpus_allowed(p))
9046                 && se->avg.load_avg_ratio < hmp_down_threshold) {
9047                 return 1;
9048         }
9049         return 0;
9050 }
9051 #endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
9052
9053 /*
9054  * hmp_can_migrate_task - may task p from runqueue rq be migrated to this_cpu?
9055  * Ideally this function should be merged with can_migrate_task() to avoid
9056  * redundant code.
9057  */
9058 static int hmp_can_migrate_task(struct task_struct *p, struct lb_env *env)
9059 {
9060         int tsk_cache_hot = 0;
9061
9062         /*
9063          * We do not migrate tasks that are:
9064          * 1) running (obviously), or
9065          * 2) cannot be migrated to this CPU due to cpus_allowed
9066          */
9067         if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) {
9068                 schedstat_inc(p, se.statistics.nr_failed_migrations_affine);
9069                 return 0;
9070         }
9071         env->flags &= ~LBF_ALL_PINNED;
9072
9073         if (task_running(env->src_rq, p)) {
9074                 schedstat_inc(p, se.statistics.nr_failed_migrations_running);
9075                 return 0;
9076         }
9077
9078         /*
9079          * Aggressive migration if:
9080          * 1) task is cache cold, or
9081          * 2) too many balance attempts have failed.
9082          */
9083
9084 #if defined(CONFIG_MT_LOAD_BALANCE_ENHANCEMENT)
9085         tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd, env->mt_check_cache_in_idle);
9086 #else
9087         tsk_cache_hot = task_hot(p, env->src_rq->clock_task, env->sd);
9088 #endif
9089         if (!tsk_cache_hot ||
9090                 env->sd->nr_balance_failed > env->sd->cache_nice_tries) {
9091 #ifdef CONFIG_SCHEDSTATS
9092                 if (tsk_cache_hot) {
9093                         schedstat_inc(env->sd, lb_hot_gained[env->idle]);
9094                         schedstat_inc(p, se.statistics.nr_forced_migrations);
9095                 }
9096 #endif
9097                 return 1;
9098         }
9099
9100         return 1;
9101 }
9102
9103 /*
9104  * move_specific_task tries to move a specific task.
9105  * Returns 1 if successful and 0 otherwise.
9106  * Called with both runqueues locked.
9107  */
9108 static int move_specific_task(struct lb_env *env, struct task_struct *pm)
9109 {
9110         struct task_struct *p, *n;
9111
9112         list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) {
9113         if (throttled_lb_pair(task_group(p), env->src_rq->cpu,
9114                                 env->dst_cpu))
9115                 continue;
9116
9117                 if (!hmp_can_migrate_task(p, env))
9118                         continue;
9119                 /* Check if we found the right task */
9120                 if (p != pm)
9121                         continue;
9122
9123                 move_task(p, env);
9124                 /*
9125                  * Right now, this is only the third place move_task()
9126                  * is called, so we can safely collect move_task()
9127                  * stats here rather than inside move_task().
9128                  */
9129                 schedstat_inc(env->sd, lb_gained[env->idle]);
9130                 return 1;
9131         }
9132         return 0;
9133 }
9134
9135 /*
9136  * hmp_active_task_migration_cpu_stop is run by cpu stopper and used to
9137  * migrate a specific task from one runqueue to another.
9138  * hmp_force_up_migration uses this to push a currently running task
9139  * off a runqueue.
9140  * Based on active_load_balance_stop_cpu and can potentially be merged.
9141  */
9142 static int hmp_active_task_migration_cpu_stop(void *data)
9143 {
9144         struct rq *busiest_rq = data;
9145         struct task_struct *p = busiest_rq->migrate_task;
9146         int busiest_cpu = cpu_of(busiest_rq);
9147         int target_cpu = busiest_rq->push_cpu;
9148         struct rq *target_rq = cpu_rq(target_cpu);
9149         struct sched_domain *sd;
9150
9151         raw_spin_lock_irq(&busiest_rq->lock);
9152         /* make sure the requested cpu hasn't gone down in the meantime */
9153         if (unlikely(busiest_cpu != smp_processor_id() ||
9154                 !busiest_rq->active_balance)) {
9155                 goto out_unlock;
9156         }
9157         /* Is there any task to move? */
9158         if (busiest_rq->nr_running <= 1)
9159                 goto out_unlock;
9160         /* Task has migrated meanwhile, abort forced migration */
9161         if (task_rq(p) != busiest_rq)
9162                 goto out_unlock;
9163         /*
9164          * This condition is "impossible", if it occurs
9165          * we need to fix it. Originally reported by
9166          * Bjorn Helgaas on a 128-cpu setup.
9167          */
9168         BUG_ON(busiest_rq == target_rq);
9169
9170         /* move a task from busiest_rq to target_rq */
9171         double_lock_balance(busiest_rq, target_rq);
9172
9173         /* Search for an sd spanning us and the target CPU. */
9174         rcu_read_lock();
9175         for_each_domain(target_cpu, sd) {
9176                 if (cpumask_test_cpu(busiest_cpu, sched_domain_span(sd)))
9177                         break;
9178         }
9179
9180         if (likely(sd)) {
9181                 struct lb_env env = {
9182                         .sd             = sd,
9183                         .dst_cpu        = target_cpu,
9184                         .dst_rq         = target_rq,
9185                         .src_cpu        = busiest_rq->cpu,
9186                         .src_rq         = busiest_rq,
9187                         .idle           = CPU_IDLE,
9188                 };
9189
9190                 schedstat_inc(sd, alb_count);
9191
9192                 if (move_specific_task(&env, p))
9193                         schedstat_inc(sd, alb_pushed);
9194                 else
9195                         schedstat_inc(sd, alb_failed);
9196         }
9197         rcu_read_unlock();
9198         double_unlock_balance(busiest_rq, target_rq);
9199 out_unlock:
9200         busiest_rq->active_balance = 0;
9201         raw_spin_unlock_irq(&busiest_rq->lock);
9202         return 0;
9203 }
9204
9205 static DEFINE_SPINLOCK(hmp_force_migration);
9206 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
9207 /*
9208  * Heterogenous Multi-Processor (HMP) Global Load Balance
9209  */
9210
9211 /*
9212  * According to Linaro's comment, we should only check the currently running
9213  * tasks because selecting other tasks for migration will require extensive
9214  * book keeping.
9215  */
9216 #ifdef CONFIG_HMP_GLOBAL_BALANCE
9217 static void hmp_force_down_migration(int this_cpu)
9218 {
9219         int curr_cpu, target_cpu;
9220         struct sched_entity *se;
9221         struct rq *target;
9222         unsigned long flags;
9223         unsigned int force;
9224         struct task_struct *p;
9225         struct clb_env clbenv;
9226
9227         /* Migrate light task from big to LITTLE */
9228         for_each_cpu(curr_cpu, &hmp_fast_cpu_mask) {
9229                 /* Check whether CPU is online */
9230                 if(!cpu_online(curr_cpu))
9231                         continue;
9232
9233                 force = 0;
9234                 target = cpu_rq(curr_cpu);
9235                 raw_spin_lock_irqsave(&target->lock, flags);
9236                 se = target->cfs.curr;
9237                 if (!se) {
9238                         raw_spin_unlock_irqrestore(&target->lock, flags);
9239                         continue;
9240                 }
9241
9242                 /* Find task entity */
9243                 if (!entity_is_task(se)) {
9244                         struct cfs_rq *cfs_rq;
9245                         cfs_rq = group_cfs_rq(se);
9246                         while (cfs_rq) {
9247                                 se = cfs_rq->curr;
9248                                 cfs_rq = group_cfs_rq(se);
9249                         }
9250                 }
9251
9252                 p = task_of(se);
9253                 target_cpu = hmp_select_cpu(HMP_GB,p,&hmp_slow_cpu_mask,-1);
9254                 if(NR_CPUS == target_cpu) {
9255                         raw_spin_unlock_irqrestore(&target->lock, flags);
9256                         continue;
9257                 }
9258
9259                 /* Collect cluster information */
9260                 memset(&clbenv, 0, sizeof(clbenv));
9261                 clbenv.flags |= HMP_GB;
9262                 clbenv.btarget = curr_cpu;
9263                 clbenv.ltarget = target_cpu;
9264                 clbenv.lcpus = &hmp_slow_cpu_mask;
9265                 clbenv.bcpus = &hmp_fast_cpu_mask;
9266                 sched_update_clbstats(&clbenv);
9267
9268                 /* Check migration threshold */
9269                 if (!target->active_balance &&
9270                         hmp_down_migration(curr_cpu, &target_cpu, se, &clbenv)) {
9271                         target->active_balance = 1;
9272                         target->push_cpu = target_cpu;
9273                         target->migrate_task = p;
9274                         force = 1;
9275                         trace_sched_hmp_migrate(p, target->push_cpu, 1);
9276                         hmp_next_down_delay(&p->se, target->push_cpu);
9277                 }
9278                 raw_spin_unlock_irqrestore(&target->lock, flags);
9279                 if (force) {
9280                         stop_one_cpu_nowait(cpu_of(target),
9281                                 hmp_active_task_migration_cpu_stop,
9282                                 target, &target->active_balance_work);
9283                 }
9284         }
9285 }
9286 #endif /* CONFIG_HMP_GLOBAL_BALANCE */
9287 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9288 u32 AVOID_FORCE_UP_MIGRATION_FROM_CPUX_TO_CPUY_COUNT[NR_CPUS][NR_CPUS];
9289 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9290
9291 static void hmp_force_up_migration(int this_cpu)
9292 {
9293         int curr_cpu, target_cpu;
9294         struct sched_entity *se;
9295         struct rq *target;
9296         unsigned long flags;
9297         unsigned int force;
9298         struct task_struct *p;
9299         struct clb_env clbenv;
9300 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9301         int push_cpu;
9302 #endif
9303
9304         if (!spin_trylock(&hmp_force_migration))
9305                 return;
9306
9307 #ifdef CONFIG_HMP_TRACER
9308         for_each_online_cpu(curr_cpu)
9309                 trace_sched_cfs_runnable_load(curr_cpu,cfs_load(curr_cpu),
9310                         cfs_length(curr_cpu));
9311 #endif
9312
9313         /* Migrate heavy task from LITTLE to big */
9314         for_each_cpu(curr_cpu, &hmp_slow_cpu_mask) {
9315                 /* Check whether CPU is online */
9316                 if(!cpu_online(curr_cpu))
9317                         continue;
9318
9319                 force = 0;
9320                 target = cpu_rq(curr_cpu);
9321                 raw_spin_lock_irqsave(&target->lock, flags);
9322                 se = target->cfs.curr;
9323                 if (!se) {
9324                         raw_spin_unlock_irqrestore(&target->lock, flags);
9325                         continue;
9326                 }
9327
9328                 /* Find task entity */
9329                 if (!entity_is_task(se)) {
9330                         struct cfs_rq *cfs_rq;
9331                         cfs_rq = group_cfs_rq(se);
9332                         while (cfs_rq) {
9333                                 se = cfs_rq->curr;
9334                                 cfs_rq = group_cfs_rq(se);
9335                         }
9336                 }
9337
9338                 p = task_of(se);
9339                 target_cpu = hmp_select_cpu(HMP_GB,p,&hmp_fast_cpu_mask,-1);
9340                 if(NR_CPUS == target_cpu) {
9341                         raw_spin_unlock_irqrestore(&target->lock, flags);
9342                         continue;
9343                 }
9344
9345                 /* Collect cluster information */
9346                 memset(&clbenv, 0, sizeof(clbenv));
9347                 clbenv.flags |= HMP_GB;
9348                 clbenv.ltarget = curr_cpu;
9349                 clbenv.btarget = target_cpu;
9350                 clbenv.lcpus = &hmp_slow_cpu_mask;
9351                 clbenv.bcpus = &hmp_fast_cpu_mask;
9352                 sched_update_clbstats(&clbenv);
9353
9354 #ifdef CONFIG_HMP_LAZY_BALANCE
9355 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9356                 if (PA_ENABLE && LB_ENABLE) {
9357 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9358                         if (is_light_task(p) && !is_buddy_busy(per_cpu(sd_pack_buddy, curr_cpu))) {
9359 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9360                                 push_cpu = hmp_select_cpu(HMP_GB,p,&hmp_fast_cpu_mask,-1);
9361                                 if (hmp_cpu_is_fast(push_cpu)) {
9362                                         AVOID_FORCE_UP_MIGRATION_FROM_CPUX_TO_CPUY_COUNT[curr_cpu][push_cpu]++;
9363 #ifdef CONFIG_HMP_TRACER
9364                                         trace_sched_power_aware_active(POWER_AWARE_ACTIVE_MODULE_AVOID_FORCE_UP_FORM_CPUX_TO_CPUY, p->pid, curr_cpu, push_cpu);
9365 #endif /* CONFIG_HMP_TRACER */
9366                                 }
9367 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9368                                 goto out_force_up;
9369                         }
9370 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
9371                 }
9372 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
9373 #endif /* CONFIG_HMP_LAZY_BALANCE */
9374
9375                 /* Check migration threshold */
9376                 if (!target->active_balance &&
9377                         hmp_up_migration(curr_cpu, &target_cpu, se, &clbenv)) {
9378                         target->active_balance = 1;
9379                         target->push_cpu = target_cpu;
9380                         target->migrate_task = p;
9381                         force = 1;
9382                         trace_sched_hmp_migrate(p, target->push_cpu, 1);
9383                         hmp_next_up_delay(&p->se, target->push_cpu);
9384                 }
9385
9386 #ifdef CONFIG_HMP_LAZY_BALANCE
9387 out_force_up:
9388 #endif /* CONFIG_HMP_LAZY_BALANCE */
9389
9390                 raw_spin_unlock_irqrestore(&target->lock, flags);
9391                 if (force) {
9392                         stop_one_cpu_nowait(cpu_of(target),
9393                                 hmp_active_task_migration_cpu_stop,
9394                                 target, &target->active_balance_work);
9395                 }
9396         }
9397
9398 #ifdef CONFIG_HMP_GLOBAL_BALANCE
9399         hmp_force_down_migration(this_cpu);
9400 #endif
9401 #ifdef CONFIG_HMP_TRACER
9402         trace_sched_hmp_load(clbenv.bstats.load_avg, clbenv.lstats.load_avg);
9403 #endif
9404         spin_unlock(&hmp_force_migration);
9405 }
9406 #else /* CONFIG_SCHED_HMP_ENHANCEMENT */
9407 /*
9408  * hmp_force_up_migration checks runqueues for tasks that need to
9409  * be actively migrated to a faster cpu.
9410  */
9411 static void hmp_force_up_migration(int this_cpu)
9412 {
9413         int cpu, target_cpu;
9414         struct sched_entity *curr;
9415         struct rq *target;
9416         unsigned long flags;
9417         unsigned int force;
9418         struct task_struct *p;
9419
9420         if (!spin_trylock(&hmp_force_migration))
9421                 return;
9422         for_each_online_cpu(cpu) {
9423                 force = 0;
9424                 target = cpu_rq(cpu);
9425                 raw_spin_lock_irqsave(&target->lock, flags);
9426                 curr = target->cfs.curr;
9427                 if (!curr) {
9428                         raw_spin_unlock_irqrestore(&target->lock, flags);
9429                         continue;
9430                 }
9431                 if (!entity_is_task(curr)) {
9432                         struct cfs_rq *cfs_rq;
9433
9434                         cfs_rq = group_cfs_rq(curr);
9435                         while (cfs_rq) {
9436                                 curr = cfs_rq->curr;
9437                                 cfs_rq = group_cfs_rq(curr);
9438                         }
9439                 }
9440                 p = task_of(curr);
9441                 if (hmp_up_migration(cpu, &target_cpu, curr)) {
9442                         if (!target->active_balance) {
9443                                 target->active_balance = 1;
9444                                 target->push_cpu = target_cpu;
9445                                 target->migrate_task = p;
9446                                 force = 1;
9447                                 trace_sched_hmp_migrate(p, target->push_cpu, 1);
9448                                 hmp_next_up_delay(&p->se, target->push_cpu);
9449                         }
9450                 }
9451                 if (!force && !target->active_balance) {
9452                         /*
9453                          * For now we just check the currently running task.
9454                          * Selecting the lightest task for offloading will
9455                          * require extensive book keeping.
9456                          */
9457                         target->push_cpu = hmp_offload_down(cpu, curr);
9458                         if (target->push_cpu < NR_CPUS) {
9459                                 target->active_balance = 1;
9460                                 target->migrate_task = p;
9461                                 force = 1;
9462                                 trace_sched_hmp_migrate(p, target->push_cpu, 2);
9463                                 hmp_next_down_delay(&p->se, target->push_cpu);
9464                         }
9465                 }
9466                 raw_spin_unlock_irqrestore(&target->lock, flags);
9467                 if (force)
9468                         stop_one_cpu_nowait(cpu_of(target),
9469                                 hmp_active_task_migration_cpu_stop,
9470                                 target, &target->active_balance_work);
9471         }
9472         spin_unlock(&hmp_force_migration);
9473 }
9474 #endif /* CONFIG_SCHED_HMP_ENHANCEMENT */
9475 #else
9476 static void hmp_force_up_migration(int this_cpu) { }
9477 #endif /* CONFIG_SCHED_HMP */
9478
9479 /*
9480  * run_rebalance_domains is triggered when needed from the scheduler tick.
9481  * Also triggered for nohz idle balancing (with nohz_balancing_kick set).
9482  */
9483 static void run_rebalance_domains(struct softirq_action *h)
9484 {
9485         int this_cpu = smp_processor_id();
9486         struct rq *this_rq = cpu_rq(this_cpu);
9487         enum cpu_idle_type idle = this_rq->idle_balance ?
9488                                                 CPU_IDLE : CPU_NOT_IDLE;
9489
9490         hmp_force_up_migration(this_cpu);
9491
9492         rebalance_domains(this_cpu, idle);
9493
9494         /*
9495          * If this cpu has a pending nohz_balance_kick, then do the
9496          * balancing on behalf of the other idle cpus whose ticks are
9497          * stopped.
9498          */
9499         nohz_idle_balance(this_cpu, idle);
9500 }
9501
9502 static inline int on_null_domain(int cpu)
9503 {
9504         return !rcu_dereference_sched(cpu_rq(cpu)->sd);
9505 }
9506
9507 /*
9508  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
9509  */
9510 void trigger_load_balance(struct rq *rq, int cpu)
9511 {
9512         /* Don't need to rebalance while attached to NULL domain */
9513         if (time_after_eq(jiffies, rq->next_balance) &&
9514             likely(!on_null_domain(cpu)))
9515                 raise_softirq(SCHED_SOFTIRQ);
9516 #ifdef CONFIG_NO_HZ_COMMON
9517         if (nohz_kick_needed(rq, cpu) && likely(!on_null_domain(cpu)))
9518                 nohz_balancer_kick(cpu);
9519 #endif
9520 }
9521
9522 static void rq_online_fair(struct rq *rq)
9523 {
9524 #ifdef CONFIG_SCHED_HMP
9525         hmp_online_cpu(rq->cpu);
9526 #endif
9527         update_sysctl();
9528 }
9529
9530 static void rq_offline_fair(struct rq *rq)
9531 {
9532 #ifdef CONFIG_SCHED_HMP
9533         hmp_offline_cpu(rq->cpu);
9534 #endif
9535         update_sysctl();
9536
9537         /* Ensure any throttled groups are reachable by pick_next_task */
9538         unthrottle_offline_cfs_rqs(rq);
9539 }
9540
9541 #endif /* CONFIG_SMP */
9542
9543 /*
9544  * scheduler tick hitting a task of our scheduling class:
9545  */
9546 static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
9547 {
9548         struct cfs_rq *cfs_rq;
9549         struct sched_entity *se = &curr->se;
9550
9551         for_each_sched_entity(se) {
9552                 cfs_rq = cfs_rq_of(se);
9553                 entity_tick(cfs_rq, se, queued);
9554         }
9555
9556         if (sched_feat_numa(NUMA))
9557                 task_tick_numa(rq, curr);
9558
9559         update_rq_runnable_avg(rq, 1);
9560 }
9561
9562 /*
9563  * called on fork with the child task as argument from the parent's context
9564  *  - child not yet on the tasklist
9565  *  - preemption disabled
9566  */
9567 static void task_fork_fair(struct task_struct *p)
9568 {
9569         struct cfs_rq *cfs_rq;
9570         struct sched_entity *se = &p->se, *curr;
9571         int this_cpu = smp_processor_id();
9572         struct rq *rq = this_rq();
9573         unsigned long flags;
9574
9575         raw_spin_lock_irqsave(&rq->lock, flags);
9576
9577         update_rq_clock(rq);
9578
9579         cfs_rq = task_cfs_rq(current);
9580         curr = cfs_rq->curr;
9581
9582         /*
9583          * Not only the cpu but also the task_group of the parent might have
9584          * been changed after parent->se.parent,cfs_rq were copied to
9585          * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
9586          * of child point to valid ones.
9587          */
9588         rcu_read_lock();
9589         __set_task_cpu(p, this_cpu);
9590         rcu_read_unlock();
9591
9592         update_curr(cfs_rq);
9593
9594         if (curr)
9595                 se->vruntime = curr->vruntime;
9596         place_entity(cfs_rq, se, 1);
9597
9598         if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
9599                 /*
9600                  * Upon rescheduling, sched_class::put_prev_task() will place
9601                  * 'current' within the tree based on its new key value.
9602                  */
9603                 swap(curr->vruntime, se->vruntime);
9604                 resched_task(rq->curr);
9605         }
9606
9607         se->vruntime -= cfs_rq->min_vruntime;
9608
9609         raw_spin_unlock_irqrestore(&rq->lock, flags);
9610 }
9611
9612 /*
9613  * Priority of the task has changed. Check to see if we preempt
9614  * the current task.
9615  */
9616 static void
9617 prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio)
9618 {
9619         if (!p->se.on_rq)
9620                 return;
9621
9622         /*
9623          * Reschedule if we are currently running on this runqueue and
9624          * our priority decreased, or if we are not currently running on
9625          * this runqueue and our priority is higher than the current's
9626          */
9627         if (rq->curr == p) {
9628                 if (p->prio > oldprio)
9629                         resched_task(rq->curr);
9630         } else
9631                 check_preempt_curr(rq, p, 0);
9632 }
9633
9634 static void switched_from_fair(struct rq *rq, struct task_struct *p)
9635 {
9636         struct sched_entity *se = &p->se;
9637         struct cfs_rq *cfs_rq = cfs_rq_of(se);
9638
9639         /*
9640          * Ensure the task's vruntime is normalized, so that when it's
9641          * switched back to the fair class the enqueue_entity(.flags=0) will
9642          * do the right thing.
9643          *
9644          * If it's on_rq, then the dequeue_entity(.flags=0) will already
9645          * have normalized the vruntime, if it's !on_rq, then only when
9646          * the task is sleeping will it still have non-normalized vruntime.
9647          */
9648         if (!p->on_rq && p->state != TASK_RUNNING) {
9649                 /*
9650                  * Fix up our vruntime so that the current sleep doesn't
9651                  * cause 'unlimited' sleep bonus.
9652                  */
9653                 place_entity(cfs_rq, se, 0);
9654                 se->vruntime -= cfs_rq->min_vruntime;
9655         }
9656
9657 #ifdef CONFIG_SMP
9658         /*
9659         * Remove our load from contribution when we leave sched_fair
9660         * and ensure we don't carry in an old decay_count if we
9661         * switch back.
9662         */
9663         if (p->se.avg.decay_count) {
9664                 struct cfs_rq *cfs_rq = cfs_rq_of(&p->se);
9665                 __synchronize_entity_decay(&p->se);
9666                 subtract_blocked_load_contrib(cfs_rq,
9667                                 p->se.avg.load_avg_contrib);
9668         }
9669 #endif
9670 }
9671
9672 /*
9673  * We switched to the sched_fair class.
9674  */
9675 static void switched_to_fair(struct rq *rq, struct task_struct *p)
9676 {
9677         if (!p->se.on_rq)
9678                 return;
9679
9680         /*
9681          * We were most likely switched from sched_rt, so
9682          * kick off the schedule if running, otherwise just see
9683          * if we can still preempt the current task.
9684          */
9685         if (rq->curr == p)
9686                 resched_task(rq->curr);
9687         else{
9688                 /*
9689                 When task p change priority form RT to normal priority
9690                 in switch_from_rt(), it might call pull_rt_task
9691                 and potentially double_lock_balance will unlock rq.
9692                 Task p might migrate to other CPU and result in task p is NOT at rq.
9693                 In this case, it is not necessary to check preempt for rq.
9694                 (Because task p is NOT at rq anymore)
9695                 and the migrate flow for task p will check preempt in enqueue flow.
9696                 So bypass the check_preempt_curr.
9697                  */
9698                 if (rq == task_rq(p)) {
9699                         check_preempt_curr(rq, p, 0);
9700                 }
9701         }
9702 }
9703
9704 /* Account for a task changing its policy or group.
9705  *
9706  * This routine is mostly called to set cfs_rq->curr field when a task
9707  * migrates between groups/classes.
9708  */
9709 static void set_curr_task_fair(struct rq *rq)
9710 {
9711         struct sched_entity *se = &rq->curr->se;
9712
9713         for_each_sched_entity(se) {
9714                 struct cfs_rq *cfs_rq = cfs_rq_of(se);
9715
9716                 set_next_entity(cfs_rq, se);
9717                 /* ensure bandwidth has been allocated on our new cfs_rq */
9718                 account_cfs_rq_runtime(cfs_rq, 0);
9719         }
9720 }
9721
9722 void init_cfs_rq(struct cfs_rq *cfs_rq)
9723 {
9724         cfs_rq->tasks_timeline = RB_ROOT;
9725         cfs_rq->min_vruntime = (u64)(-(1LL << 20));
9726 #ifndef CONFIG_64BIT
9727         cfs_rq->min_vruntime_copy = cfs_rq->min_vruntime;
9728 #endif
9729 #ifdef CONFIG_SMP
9730         atomic64_set(&cfs_rq->decay_counter, 1);
9731         atomic_long_set(&cfs_rq->removed_load, 0);
9732 #endif
9733 }
9734
9735 #ifdef CONFIG_FAIR_GROUP_SCHED
9736 static void task_move_group_fair(struct task_struct *p, int on_rq)
9737 {
9738         struct cfs_rq *cfs_rq;
9739         /*
9740          * If the task was not on the rq at the time of this cgroup movement
9741          * it must have been asleep, sleeping tasks keep their ->vruntime
9742          * absolute on their old rq until wakeup (needed for the fair sleeper
9743          * bonus in place_entity()).
9744          *
9745          * If it was on the rq, we've just 'preempted' it, which does convert
9746          * ->vruntime to a relative base.
9747          *
9748          * Make sure both cases convert their relative position when migrating
9749          * to another cgroup's rq. This does somewhat interfere with the
9750          * fair sleeper stuff for the first placement, but who cares.
9751          */
9752         /*
9753          * When !on_rq, vruntime of the task has usually NOT been normalized.
9754          * But there are some cases where it has already been normalized:
9755          *
9756          * - Moving a forked child which is waiting for being woken up by
9757          *   wake_up_new_task().
9758          * - Moving a task which has been woken up by try_to_wake_up() and
9759          *   waiting for actually being woken up by sched_ttwu_pending().
9760          *
9761          * To prevent boost or penalty in the new cfs_rq caused by delta
9762          * min_vruntime between the two cfs_rqs, we skip vruntime adjustment.
9763          */
9764         if (!on_rq && (!p->se.sum_exec_runtime || p->state == TASK_WAKING))
9765                 on_rq = 1;
9766
9767         if (!on_rq)
9768                 p->se.vruntime -= cfs_rq_of(&p->se)->min_vruntime;
9769         set_task_rq(p, task_cpu(p));
9770         if (!on_rq) {
9771                 cfs_rq = cfs_rq_of(&p->se);
9772                 p->se.vruntime += cfs_rq->min_vruntime;
9773 #ifdef CONFIG_SMP
9774                 /*
9775                  * migrate_task_rq_fair() will have removed our previous
9776                  * contribution, but we must synchronize for ongoing future
9777                  * decay.
9778                  */
9779                 p->se.avg.decay_count = atomic64_read(&cfs_rq->decay_counter);
9780                 cfs_rq->blocked_load_avg += p->se.avg.load_avg_contrib;
9781 #endif
9782         }
9783 }
9784
9785 void free_fair_sched_group(struct task_group *tg)
9786 {
9787         int i;
9788
9789         destroy_cfs_bandwidth(tg_cfs_bandwidth(tg));
9790
9791         for_each_possible_cpu(i) {
9792                 if (tg->cfs_rq)
9793                         kfree(tg->cfs_rq[i]);
9794                 if (tg->se)
9795                         kfree(tg->se[i]);
9796         }
9797
9798         kfree(tg->cfs_rq);
9799         kfree(tg->se);
9800 }
9801
9802 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9803 {
9804         struct cfs_rq *cfs_rq;
9805         struct sched_entity *se;
9806         int i;
9807
9808         tg->cfs_rq = kzalloc(sizeof(cfs_rq) * nr_cpu_ids, GFP_KERNEL);
9809         if (!tg->cfs_rq)
9810                 goto err;
9811         tg->se = kzalloc(sizeof(se) * nr_cpu_ids, GFP_KERNEL);
9812         if (!tg->se)
9813                 goto err;
9814
9815         tg->shares = NICE_0_LOAD;
9816
9817         init_cfs_bandwidth(tg_cfs_bandwidth(tg));
9818
9819         for_each_possible_cpu(i) {
9820                 cfs_rq = kzalloc_node(sizeof(struct cfs_rq),
9821                                       GFP_KERNEL, cpu_to_node(i));
9822                 if (!cfs_rq)
9823                         goto err;
9824
9825                 se = kzalloc_node(sizeof(struct sched_entity),
9826                                   GFP_KERNEL, cpu_to_node(i));
9827                 if (!se)
9828                         goto err_free_rq;
9829
9830                 init_cfs_rq(cfs_rq);
9831                 init_tg_cfs_entry(tg, cfs_rq, se, i, parent->se[i]);
9832         }
9833
9834         return 1;
9835
9836 err_free_rq:
9837         kfree(cfs_rq);
9838 err:
9839         return 0;
9840 }
9841
9842 void unregister_fair_sched_group(struct task_group *tg, int cpu)
9843 {
9844         struct rq *rq = cpu_rq(cpu);
9845         unsigned long flags;
9846
9847         /*
9848         * Only empty task groups can be destroyed; so we can speculatively
9849         * check on_list without danger of it being re-added.
9850         */
9851         if (!tg->cfs_rq[cpu]->on_list)
9852                 return;
9853
9854         raw_spin_lock_irqsave(&rq->lock, flags);
9855         list_del_leaf_cfs_rq(tg->cfs_rq[cpu]);
9856         raw_spin_unlock_irqrestore(&rq->lock, flags);
9857 }
9858
9859 void init_tg_cfs_entry(struct task_group *tg, struct cfs_rq *cfs_rq,
9860                         struct sched_entity *se, int cpu,
9861                         struct sched_entity *parent)
9862 {
9863         struct rq *rq = cpu_rq(cpu);
9864
9865         cfs_rq->tg = tg;
9866         cfs_rq->rq = rq;
9867         init_cfs_rq_runtime(cfs_rq);
9868
9869         tg->cfs_rq[cpu] = cfs_rq;
9870         tg->se[cpu] = se;
9871
9872         /* se could be NULL for root_task_group */
9873         if (!se)
9874                 return;
9875
9876         if (!parent)
9877                 se->cfs_rq = &rq->cfs;
9878         else
9879                 se->cfs_rq = parent->my_q;
9880
9881         se->my_q = cfs_rq;
9882         /* guarantee group entities always have weight */
9883         update_load_set(&se->load, NICE_0_LOAD);
9884         se->parent = parent;
9885 }
9886
9887 static DEFINE_MUTEX(shares_mutex);
9888
9889 int sched_group_set_shares(struct task_group *tg, unsigned long shares)
9890 {
9891         int i;
9892         unsigned long flags;
9893
9894         /*
9895          * We can't change the weight of the root cgroup.
9896          */
9897         if (!tg->se[0])
9898                 return -EINVAL;
9899
9900         shares = clamp(shares, scale_load(MIN_SHARES), scale_load(MAX_SHARES));
9901
9902         mutex_lock(&shares_mutex);
9903         if (tg->shares == shares)
9904                 goto done;
9905
9906         tg->shares = shares;
9907         for_each_possible_cpu(i) {
9908                 struct rq *rq = cpu_rq(i);
9909                 struct sched_entity *se;
9910
9911                 se = tg->se[i];
9912                 /* Propagate contribution to hierarchy */
9913                 raw_spin_lock_irqsave(&rq->lock, flags);
9914                 for_each_sched_entity(se)
9915                         update_cfs_shares(group_cfs_rq(se));
9916                 raw_spin_unlock_irqrestore(&rq->lock, flags);
9917         }
9918
9919 done:
9920         mutex_unlock(&shares_mutex);
9921         return 0;
9922 }
9923 #else /* CONFIG_FAIR_GROUP_SCHED */
9924
9925 void free_fair_sched_group(struct task_group *tg) { }
9926
9927 int alloc_fair_sched_group(struct task_group *tg, struct task_group *parent)
9928 {
9929         return 1;
9930 }
9931
9932 void unregister_fair_sched_group(struct task_group *tg, int cpu) { }
9933
9934 #endif /* CONFIG_FAIR_GROUP_SCHED */
9935
9936
9937 static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task)
9938 {
9939         struct sched_entity *se = &task->se;
9940         unsigned int rr_interval = 0;
9941
9942         /*
9943          * Time slice is 0 for SCHED_OTHER tasks that are on an otherwise
9944          * idle runqueue:
9945          */
9946         if (rq->cfs.load.weight)
9947                 rr_interval = NS_TO_JIFFIES(sched_slice(cfs_rq_of(se), se));
9948
9949         return rr_interval;
9950 }
9951
9952 /*
9953  * All the scheduling class methods:
9954  */
9955 const struct sched_class fair_sched_class = {
9956         .next                   = &idle_sched_class,
9957         .enqueue_task           = enqueue_task_fair,
9958         .dequeue_task           = dequeue_task_fair,
9959         .yield_task             = yield_task_fair,
9960         .yield_to_task          = yield_to_task_fair,
9961
9962         .check_preempt_curr     = check_preempt_wakeup,
9963
9964         .pick_next_task         = pick_next_task_fair,
9965         .put_prev_task          = put_prev_task_fair,
9966
9967 #ifdef CONFIG_SMP
9968         .select_task_rq         = select_task_rq_fair,
9969         .migrate_task_rq        = migrate_task_rq_fair,
9970
9971         .rq_online              = rq_online_fair,
9972         .rq_offline             = rq_offline_fair,
9973
9974         .task_waking            = task_waking_fair,
9975 #endif
9976
9977         .set_curr_task          = set_curr_task_fair,
9978         .task_tick              = task_tick_fair,
9979         .task_fork              = task_fork_fair,
9980
9981         .prio_changed           = prio_changed_fair,
9982         .switched_from          = switched_from_fair,
9983         .switched_to            = switched_to_fair,
9984
9985         .get_rr_interval        = get_rr_interval_fair,
9986
9987 #ifdef CONFIG_FAIR_GROUP_SCHED
9988         .task_move_group        = task_move_group_fair,
9989 #endif
9990 };
9991
9992 #ifdef CONFIG_SCHED_DEBUG
9993 void print_cfs_stats(struct seq_file *m, int cpu)
9994 {
9995         struct cfs_rq *cfs_rq;
9996
9997         rcu_read_lock();
9998         for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
9999                 print_cfs_rq(m, cpu, cfs_rq);
10000         rcu_read_unlock();
10001 }
10002 #endif
10003
10004 __init void init_sched_fair_class(void)
10005 {
10006 #ifdef CONFIG_SMP
10007         open_softirq(SCHED_SOFTIRQ, run_rebalance_domains);
10008
10009 #ifdef CONFIG_NO_HZ_COMMON
10010         nohz.next_balance = jiffies;
10011         zalloc_cpumask_var(&nohz.idle_cpus_mask, GFP_NOWAIT);
10012         cpu_notifier(sched_ilb_notifier, 0);
10013 #endif
10014
10015         cmp_cputopo_domain_setup();
10016 #ifdef CONFIG_SCHED_HMP
10017         hmp_cpu_mask_setup();
10018 #endif
10019 #endif /* SMP */
10020 }
10021
10022 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
10023 static u32 cpufreq_calc_scale(u32 min, u32 max, u32 curr)
10024 {
10025         u32 result = curr / max;
10026         return result;
10027 }
10028
10029 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
10030 DEFINE_PER_CPU(u32, FREQ_CPU);
10031 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
10032
10033 /* Called when the CPU Frequency is changed.
10034  * Once for each CPU.
10035  */
10036 static int cpufreq_callback(struct notifier_block *nb,
10037                                         unsigned long val, void *data)
10038 {
10039         struct cpufreq_freqs *freq = data;
10040         int cpu = freq->cpu;
10041         struct cpufreq_extents *extents;
10042 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10043         struct cpumask* mask;
10044         int id;
10045 #endif
10046
10047         if (freq->flags & CPUFREQ_CONST_LOOPS)
10048                 return NOTIFY_OK;
10049
10050         if (val != CPUFREQ_POSTCHANGE)
10051                 return NOTIFY_OK;
10052
10053         /* if dynamic load scale is disabled, set the load scale to 1.0 */
10054         if (!hmp_data.freqinvar_load_scale_enabled) {
10055                 freq_scale[cpu].curr_scale = 1024;
10056                 return NOTIFY_OK;
10057         }
10058
10059         extents = &freq_scale[cpu];
10060 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10061         if (extents->max < extents->const_max){
10062                 extents->throttling=1;
10063         }
10064         else {
10065                 extents->throttling=0;
10066         }
10067 #endif
10068         if (extents->flags & SCHED_LOAD_FREQINVAR_SINGLEFREQ) {
10069                 /* If our governor was recognised as a single-freq governor,
10070                  * use 1.0
10071                  */
10072                 extents->curr_scale = 1024;
10073         } else {
10074 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10075                 extents->curr_scale = cpufreq_calc_scale(extents->min,
10076                                 extents->const_max, freq->new);
10077 #else
10078                 extents->curr_scale = cpufreq_calc_scale(extents->min,
10079                                 extents->max, freq->new);
10080 #endif
10081         }
10082
10083 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10084         mask = arch_cpu_is_big(cpu)?&hmp_fast_cpu_mask:&hmp_slow_cpu_mask;
10085         for_each_cpu(id, mask)
10086                 freq_scale[id].curr_scale = extents->curr_scale;
10087 #endif
10088
10089 #if NR_CPUS == 4
10090 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10091         switch (cpu) {
10092         case 0:
10093         case 2:
10094                 (extents + 1)->curr_scale = extents->curr_scale;
10095                 break;
10096
10097         case 1:
10098         case 3:
10099                 (extents - 1)->curr_scale = extents->curr_scale;
10100                 break;
10101
10102         default:
10103
10104                 break;
10105         }
10106 #endif
10107 #endif
10108
10109 #ifdef CONFIG_HMP_POWER_AWARE_CONTROLLER
10110         per_cpu(FREQ_CPU, cpu) = freq->new;
10111 #endif /* CONFIG_HMP_POWER_AWARE_CONTROLLER */
10112         return NOTIFY_OK;
10113 }
10114
10115 /* Called when the CPUFreq governor is changed.
10116  * Only called for the CPUs which are actually changed by the
10117  * userspace.
10118  */
10119 static int cpufreq_policy_callback(struct notifier_block *nb,
10120                                        unsigned long event, void *data)
10121 {
10122         struct cpufreq_policy *policy = data;
10123         struct cpufreq_extents *extents;
10124         int cpu, singleFreq = 0;
10125         static const char performance_governor[] = "performance";
10126         static const char powersave_governor[] = "powersave";
10127
10128         if (event == CPUFREQ_START)
10129                 return 0;
10130
10131         if (event != CPUFREQ_INCOMPATIBLE)
10132                 return 0;
10133
10134         /* CPUFreq governors do not accurately report the range of
10135          * CPU Frequencies they will choose from.
10136          * We recognise performance and powersave governors as
10137          * single-frequency only.
10138          */
10139         if (!strncmp(policy->governor->name, performance_governor,
10140                         strlen(performance_governor)) ||
10141                 !strncmp(policy->governor->name, powersave_governor,
10142                                 strlen(powersave_governor)))
10143                 singleFreq = 1;
10144
10145         /* Make sure that all CPUs impacted by this policy are
10146          * updated since we will only get a notification when the
10147          * user explicitly changes the policy on a CPU.
10148          */
10149         for_each_cpu(cpu, policy->cpus) {
10150                 extents = &freq_scale[cpu];
10151                 extents->max = policy->max >> SCHED_FREQSCALE_SHIFT;
10152                 extents->min = policy->min >> SCHED_FREQSCALE_SHIFT;
10153 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10154                 extents->const_max = policy->cpuinfo.max_freq >> SCHED_FREQSCALE_SHIFT;
10155 #endif
10156                 if (!hmp_data.freqinvar_load_scale_enabled) {
10157                         extents->curr_scale = 1024;
10158                 } else if (singleFreq) {
10159                         extents->flags |= SCHED_LOAD_FREQINVAR_SINGLEFREQ;
10160                         extents->curr_scale = 1024;
10161                 } else {
10162                         extents->flags &= ~SCHED_LOAD_FREQINVAR_SINGLEFREQ;
10163 #ifdef CONFIG_SCHED_HMP_ENHANCEMENT
10164                         extents->curr_scale = cpufreq_calc_scale(extents->min,
10165                                         extents->const_max, policy->cur);
10166 #else
10167                         extents->curr_scale = cpufreq_calc_scale(extents->min,
10168                                         extents->max, policy->cur);
10169 #endif
10170                 }
10171         }
10172
10173         return 0;
10174 }
10175
10176 static struct notifier_block cpufreq_notifier = {
10177         .notifier_call  = cpufreq_callback,
10178 };
10179 static struct notifier_block cpufreq_policy_notifier = {
10180         .notifier_call  = cpufreq_policy_callback,
10181 };
10182
10183 static int __init register_sched_cpufreq_notifier(void)
10184 {
10185         int ret = 0;
10186
10187         /* init safe defaults since there are no policies at registration */
10188         for (ret = 0; ret < CONFIG_NR_CPUS; ret++) {
10189                 /* safe defaults */
10190                 freq_scale[ret].max = 1024;
10191                 freq_scale[ret].min = 1024;
10192                 freq_scale[ret].curr_scale = 1024;
10193         }
10194
10195         pr_info("sched: registering cpufreq notifiers for scale-invariant loads\n");
10196         ret = cpufreq_register_notifier(&cpufreq_policy_notifier,
10197                         CPUFREQ_POLICY_NOTIFIER);
10198
10199         if (ret != -EINVAL)
10200                 ret = cpufreq_register_notifier(&cpufreq_notifier,
10201                         CPUFREQ_TRANSITION_NOTIFIER);
10202
10203         return ret;
10204 }
10205
10206 core_initcall(register_sched_cpufreq_notifier);
10207 #endif /* CONFIG_HMP_FREQUENCY_INVARIANT_SCALE */
10208
10209 #ifdef CONFIG_HEVTASK_INTERFACE
10210 /*
10211  *  * This allows printing both to /proc/task_detect and
10212  *   * to the console
10213  *    */
10214 #ifndef CONFIG_KGDB_KDB
10215 #define SEQ_printf(m, x...)                     \
10216         do {                                            \
10217                 if (m)                                  \
10218                 seq_printf(m, x);               \
10219                 else                                    \
10220                 printk(x);                      \
10221         } while (0)
10222 #else
10223 #define SEQ_printf(m, x...)                     \
10224         do {                                            \
10225                 if (m)                                  \
10226                 seq_printf(m, x);               \
10227                 else if (__get_cpu_var(kdb_in_use) == 1)                \
10228                 kdb_printf(x);                  \
10229                 else                                            \
10230                 printk(x);                              \
10231         } while (0)
10232 #endif
10233
10234 static int task_detect_show(struct seq_file *m, void *v)
10235 {
10236         struct task_struct *p;
10237         unsigned long flags;
10238         unsigned int i;
10239
10240 #ifdef CONFIG_HMP_FREQUENCY_INVARIANT_SCALE
10241         for(i=0;i<NR_CPUS;i++){
10242                 SEQ_printf(m,"%5d ",freq_scale[i].curr_scale);
10243         }
10244 #endif
10245
10246         SEQ_printf(m, "\n%lu\n ",jiffies_to_cputime(jiffies));
10247
10248         for(i=0;i<NR_CPUS;i++){
10249                 raw_spin_lock_irqsave(&cpu_rq(i)->lock,flags);
10250                 if(cpu_online(i)){
10251                         list_for_each_entry(p,&cpu_rq(i)->cfs_tasks,se.group_node){
10252                                 SEQ_printf(m, "%lu %5d %5d %lu (%15s)\n ",
10253                                                 p->se.avg.load_avg_ratio,p->pid,task_cpu(p),
10254                                                 (p->utime+p->stime),p->comm);
10255                         }
10256                 }
10257                 raw_spin_unlock_irqrestore(&cpu_rq(i)->lock,flags);
10258
10259         }
10260
10261         return 0;
10262 }
10263
10264 static int task_detect_open(struct inode *inode, struct file *filp)
10265 {
10266         return single_open(filp, task_detect_show, NULL);
10267 }
10268
10269 static const struct file_operations task_detect_fops = {
10270         .open           = task_detect_open,
10271         .read           = seq_read,
10272         .llseek         = seq_lseek,
10273         .release        = single_release,
10274 };
10275
10276 static int __init init_task_detect_procfs(void)
10277 {
10278         struct proc_dir_entry *pe;
10279
10280         pe = proc_create("task_detect", 0444, NULL, &task_detect_fops);
10281         if (!pe)
10282                 return -ENOMEM;
10283         return 0;
10284 }
10285
10286 __initcall(init_task_detect_procfs);
10287 #endif /* CONFIG_HEVTASK_INTERFACE */