Merge 4.14.95 into android-4.14-p

[GitHub/moto-9609/android_kernel_motorola_exynos9610.git] / kernel / sched / fair.c
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 5e828914bf9172aaaed611e7f495ffed898541ca..08d90c69d123d31fc05f316592399fa454603e78 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -387,10 +387,9 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
         }
  }
  
-/* Iterate thr' all leaf cfs_rq's on a runqueue */
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)                     \
-       list_for_each_entry_safe(cfs_rq, pos, &rq->leaf_cfs_rq_list,    \
-                                leaf_cfs_rq_list)
+/* Iterate through all leaf cfs_rq's on a runqueue: */
+#define for_each_leaf_cfs_rq(rq, cfs_rq) \
+       list_for_each_entry_rcu(cfs_rq, &rq->leaf_cfs_rq_list, leaf_cfs_rq_list)
  
  /* Do the two (enqueued) entities belong to the same group ? */
  static inline struct cfs_rq *
@@ -483,8 +482,8 @@ static inline void list_del_leaf_cfs_rq(struct cfs_rq *cfs_rq)
  {
  }
  
-#define for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos)     \
-               for (cfs_rq = &rq->cfs, pos = NULL; cfs_rq; cfs_rq = pos)
+#define for_each_leaf_cfs_rq(rq, cfs_rq)       \
+               for (cfs_rq = &rq->cfs; cfs_rq; cfs_rq = NULL)
  
  static inline struct sched_entity *parent_entity(struct sched_entity *se)
  {
@@ -776,11 +775,12 @@ static void attach_entity_cfs_rq(struct sched_entity *se);
   * To solve this problem, we also cap the util_avg of successive tasks to
   * only 1/2 of the left utilization budget:
   *
- *   util_avg_cap = (1024 - cfs_rq->avg.util_avg) / 2^n
+ *   util_avg_cap = (cpu_scale - cfs_rq->avg.util_avg) / 2^n
   *
- * where n denotes the nth task.
+ * where n denotes the nth task and cpu_scale the CPU capacity.
   *
- * For example, a simplest series from the beginning would be like:
+ * For example, for a CPU with a capacity of 1024, the simplest series from
+ * the beginning would look like:
   *
   *  task  util_avg: 512, 256, 128,  64,  32,   16,    8, ...
   * cfs_rq util_avg: 512, 768, 896, 960, 992, 1008, 1016, ...
@@ -792,7 +792,8 @@ void post_init_entity_util_avg(struct sched_entity *se)
  {
         struct cfs_rq *cfs_rq = cfs_rq_of(se);
         struct sched_avg *sa = &se->avg;
-       long cap = (long)(SCHED_CAPACITY_SCALE - cfs_rq->avg.util_avg) / 2;
+       long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq)));
+       long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2;
  
         if (cap > 0) {
                 if (cfs_rq->avg.util_avg != 0) {
@@ -4094,7 +4095,7 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags)
          * put back on, and if we advance min_vruntime, we'll be placed back
          * further than we started -- ie. we'll be penalized.
          */
-       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE)
                 update_min_vruntime(cfs_rq);
  }
  
@@ -4309,12 +4310,12 @@ static inline bool cfs_bandwidth_used(void)
  
  void cfs_bandwidth_usage_inc(void)
  {
-       static_key_slow_inc(&__cfs_bandwidth_used);
+       static_key_slow_inc_cpuslocked(&__cfs_bandwidth_used);
  }
  
  void cfs_bandwidth_usage_dec(void)
  {
-       static_key_slow_dec(&__cfs_bandwidth_used);
+       static_key_slow_dec_cpuslocked(&__cfs_bandwidth_used);
  }
  #else /* HAVE_JUMP_LABEL */
  static bool cfs_bandwidth_used(void)
@@ -4357,6 +4358,7 @@ void __refill_cfs_bandwidth_runtime(struct cfs_bandwidth *cfs_b)
         now = sched_clock_cpu(smp_processor_id());
         cfs_b->runtime = cfs_b->quota;
         cfs_b->runtime_expires = now + ktime_to_ns(cfs_b->period);
+       cfs_b->expires_seq++;
  }
  
  static inline struct cfs_bandwidth *tg_cfs_bandwidth(struct task_group *tg)
@@ -4379,6 +4381,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
         struct task_group *tg = cfs_rq->tg;
         struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(tg);
         u64 amount = 0, min_amount, expires;
+       int expires_seq;
  
         /* note: this is a positive sum as runtime_remaining <= 0 */
         min_amount = sched_cfs_bandwidth_slice() - cfs_rq->runtime_remaining;
@@ -4395,6 +4398,7 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
                         cfs_b->idle = 0;
                 }
         }
+       expires_seq = cfs_b->expires_seq;
         expires = cfs_b->runtime_expires;
         raw_spin_unlock(&cfs_b->lock);
  
@@ -4404,8 +4408,10 @@ static int assign_cfs_rq_runtime(struct cfs_rq *cfs_rq)
          * spread between our sched_clock and the one on which runtime was
          * issued.
          */
-       if ((s64)(expires - cfs_rq->runtime_expires) > 0)
+       if (cfs_rq->expires_seq != expires_seq) {
+               cfs_rq->expires_seq = expires_seq;
                 cfs_rq->runtime_expires = expires;
+       }
  
         return cfs_rq->runtime_remaining > 0;
  }
@@ -4431,12 +4437,9 @@ static void expire_cfs_rq_runtime(struct cfs_rq *cfs_rq)
          * has not truly expired.
          *
          * Fortunately we can check determine whether this the case by checking
-        * whether the global deadline has advanced. It is valid to compare
-        * cfs_b->runtime_expires without any locks since we only care about
-        * exact equality, so a partial write will still work.
+        * whether the global deadline (cfs_b->expires_seq) has advanced.
          */
-
-       if (cfs_rq->runtime_expires != cfs_b->runtime_expires) {
+       if (cfs_rq->expires_seq == cfs_b->expires_seq) {
                 /* extend local deadline, drift is bounded above by 2 ticks */
                 cfs_rq->runtime_expires += TICK_NSEC;
         } else {
@@ -4568,9 +4571,13 @@ static void throttle_cfs_rq(struct cfs_rq *cfs_rq)
  
         /*
          * Add to the _head_ of the list, so that an already-started
-        * distribute_cfs_runtime will not see us
+        * distribute_cfs_runtime will not see us. If distribute_cfs_runtime is
+        * not running, add to the tail so that later runqueues don't get starved.
          */
-       list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       if (cfs_b->distribute_running)
+               list_add_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
+       else
+               list_add_tail_rcu(&cfs_rq->throttled_list, &cfs_b->throttled_cfs_rq);
  
         /*
          * If we're the first throttled task, make sure the bandwidth
@@ -4714,14 +4721,16 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
          * in us over-using our runtime if it is all used during this loop, but
          * only by limited amounts in that extreme case.
          */
-       while (throttled && cfs_b->runtime > 0) {
+       while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
                 runtime = cfs_b->runtime;
+               cfs_b->distribute_running = 1;
                 raw_spin_unlock(&cfs_b->lock);
                 /* we can't nest cfs_b->lock while distributing bandwidth */
                 runtime = distribute_cfs_runtime(cfs_b, runtime,
                                                  runtime_expires);
                 raw_spin_lock(&cfs_b->lock);
  
+               cfs_b->distribute_running = 0;
                 throttled = !list_empty(&cfs_b->throttled_cfs_rq);
  
                 cfs_b->runtime -= min(runtime, cfs_b->runtime);
@@ -4832,6 +4841,11 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
  
         /* confirm we're still not at a refresh boundary */
         raw_spin_lock(&cfs_b->lock);
+       if (cfs_b->distribute_running) {
+               raw_spin_unlock(&cfs_b->lock);
+               return;
+       }
+
         if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
                 raw_spin_unlock(&cfs_b->lock);
                 return;
@@ -4841,6 +4855,9 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
                 runtime = cfs_b->runtime;
  
         expires = cfs_b->runtime_expires;
+       if (runtime)
+               cfs_b->distribute_running = 1;
+
         raw_spin_unlock(&cfs_b->lock);
  
         if (!runtime)
@@ -4851,6 +4868,7 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
         raw_spin_lock(&cfs_b->lock);
         if (expires == cfs_b->runtime_expires)
                 cfs_b->runtime -= min(runtime, cfs_b->runtime);
+       cfs_b->distribute_running = 0;
         raw_spin_unlock(&cfs_b->lock);
  }
  
@@ -4959,6 +4977,7 @@ void init_cfs_bandwidth(struct cfs_bandwidth *cfs_b)
         cfs_b->period_timer.function = sched_cfs_period_timer;
         hrtimer_init(&cfs_b->slack_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         cfs_b->slack_timer.function = sched_cfs_slack_timer;
+       cfs_b->distribute_running = 0;
  }
  
  static void init_cfs_rq_runtime(struct cfs_rq *cfs_rq)
@@ -6905,6 +6924,7 @@ static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p
  }
  
  #ifdef CONFIG_SCHED_SMT
+DEFINE_STATIC_KEY_FALSE(sched_smt_present);
  
  static inline void set_idle_cores(int cpu, int val)
  {
@@ -8890,27 +8910,10 @@ static void attach_tasks(struct lb_env *env)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
-static inline bool cfs_rq_is_decayed(struct cfs_rq *cfs_rq)
-{
-       if (cfs_rq->load.weight)
-               return false;
-
-       if (cfs_rq->avg.load_sum)
-               return false;
-
-       if (cfs_rq->avg.util_sum)
-               return false;
-
-       if (cfs_rq->runnable_load_sum)
-               return false;
-
-       return true;
-}
-
  static void update_blocked_averages(int cpu)
  {
         struct rq *rq = cpu_rq(cpu);
-       struct cfs_rq *cfs_rq, *pos;
+       struct cfs_rq *cfs_rq;
         struct rq_flags rf;
  
         rq_lock_irqsave(rq, &rf);
@@ -8920,7 +8923,7 @@ static void update_blocked_averages(int cpu)
          * Iterates the task_group tree in a bottom up fashion, see
          * list_add_leaf_cfs_rq() for details.
          */
-       for_each_leaf_cfs_rq_safe(rq, cfs_rq, pos) {
+       for_each_leaf_cfs_rq(rq, cfs_rq) {
                 struct sched_entity *se;
  
                 /* throttled entities do not contribute to load */
@@ -8934,13 +8937,6 @@ static void update_blocked_averages(int cpu)
                 se = cfs_rq->tg->se[cpu];
                 if (se && !skip_blocked_update(se))
                         update_load_avg(se, 0);
-
-               /*
-                * There can be a lot of idle CPU cgroups.  Don't let fully
-                * decayed cfs_rqs linger on the list.
-                */
-               if (cfs_rq_is_decayed(cfs_rq))
-                       list_del_leaf_cfs_rq(cfs_rq);
         }
         update_rt_rq_load_avg(rq_clock_task(rq), cpu, &rq->rt, 0);
  #ifdef CONFIG_NO_HZ_COMMON
@@ -11425,7 +11421,8 @@ static inline bool vruntime_normalized(struct task_struct *p)
          * - A task which has been woken up by try_to_wake_up() and
          *   waiting for actually being woken up by sched_ttwu_pending().
          */
-       if (!se->sum_exec_runtime || p->state == TASK_WAKING)
+       if (!se->sum_exec_runtime ||
+           (p->state == TASK_WAKING && p->sched_remote_wakeup))
                 return true;
  
         return false;
@@ -11851,10 +11848,10 @@ const struct sched_class fair_sched_class = {
  #ifdef CONFIG_SCHED_DEBUG
  void print_cfs_stats(struct seq_file *m, int cpu)
  {
-       struct cfs_rq *cfs_rq, *pos;
+       struct cfs_rq *cfs_rq;
  
         rcu_read_lock();
-       for_each_leaf_cfs_rq_safe(cpu_rq(cpu), cfs_rq, pos)
+       for_each_leaf_cfs_rq(cpu_rq(cpu), cfs_rq)
                 print_cfs_rq(m, cpu, cfs_rq);
         rcu_read_unlock();
  }