[COMMON] sched/rt: add the sync flag into RT
[GitHub/LineageOS/android_kernel_motorola_exynos9610.git] / kernel / sched / rt.c
index cc561cab3b9b9997c11a665b7c257b3e8bca53d7..a223e4f7249381dd5e6ed62450ccc228a705c680 100644 (file)
 
 #include "walt.h"
 
+#include <trace/events/sched.h>
+
 int sched_rr_timeslice = RR_TIMESLICE;
 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
 
+
+void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se);
+
 static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
 
 struct rt_bandwidth def_rt_bandwidth;
@@ -101,6 +106,8 @@ void init_rt_rq(struct rt_rq *rt_rq)
        rt_rq->rt_nr_migratory = 0;
        rt_rq->overloaded = 0;
        plist_head_init(&rt_rq->pushable_tasks);
+       atomic_long_set(&rt_rq->removed_util_avg, 0);
+       atomic_long_set(&rt_rq->removed_load_avg, 0);
 #endif /* CONFIG_SMP */
        /* We start is dequeued state, because no RT tasks are queued */
        rt_rq->rt_queued = 0;
@@ -219,6 +226,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
                init_rt_rq(rt_rq);
                rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
                init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
+               init_rt_entity_runnable_average(rt_se);
        }
 
        return 1;
@@ -267,6 +275,94 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent)
 
 #ifdef CONFIG_SMP
 
+#include "sched-pelt.h"
+#define entity_is_task(se)     (!se->my_q)
+
+extern u64 decay_load(u64 val, u64 n);
+
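+/*
+ * Split the elapsed time into three segments, mirroring
+ * __accumulate_pelt_segments() in the fair class:
+ *   c1 - the remainder of the period the last update fell in, decayed by
+ *        the number of whole periods elapsed since then,
+ *   c2 - the contribution of the fully elapsed periods in between
+ *        (a geometric series),
+ *   c3 - the still-open current period (d3).
+ */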
+static u32 __accumulate_pelt_segments_rt(u64 periods, u32 d1, u32 d3)
+{
+       u32 c1, c2, c3 = d3;
+
+       c1 = decay_load((u64)d1, periods);
+
+       c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
+
+       return c1 + c2 + c3;
+}
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
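+/*
+ * Accumulate the time since the last update into load_sum/util_sum, scaled
+ * by the current frequency (and, for utilization, by CPU capacity). If one
+ * or more full 1024us periods have elapsed, the old sums are decayed first.
+ * Returns the number of full periods so the caller knows whether the
+ * averages need to be recomputed.
+ */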
+static __always_inline u32
+accumulate_sum_rt(u64 delta, int cpu, struct sched_avg *sa,
+              unsigned long weight, int running)
+{
+       unsigned long scale_freq, scale_cpu;
+       u32 contrib = (u32)delta;
+       u64 periods;
+
+       scale_freq = arch_scale_freq_capacity(NULL, cpu);
+       scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+       delta += sa->period_contrib;
+       periods = delta / 1024;
+
+       if (periods) {
+               sa->load_sum = decay_load(sa->load_sum, periods);
+               sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+
+               delta %= 1024;
+               contrib = __accumulate_pelt_segments_rt(periods,
+                               1024 - sa->period_contrib, delta);
+       }
+       sa->period_contrib = delta;
+
+       contrib = cap_scale(contrib, scale_freq);
+       if (weight) {
+               sa->load_sum += weight * contrib;
+       }
+       if (running)
+               sa->util_sum += contrib * scale_cpu;
+
+       return periods;
+}
+
+/*
+ * We can represent the historical contribution to the runnable average as
+ * the coefficients of a geometric series, exactly like fair-class load.
+ * See ___update_load_avg() in the fair scheduling class for details.
+ */
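+/*
+ * With 1024us periods and a decay factor y chosen so that y^32 = 1/2, the
+ * series 1024 * (1 + y + y^2 + ...) converges to LOAD_AVG_MAX, which is why
+ * the averages below are normalized by LOAD_AVG_MAX - 1024 + period_contrib.
+ */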
+static __always_inline int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+       unsigned long weight, int running, struct rt_rq *rt_rq)
+{
+       u64 delta;
+
+       delta = now - sa->last_update_time;
+
+       if ((s64)delta < 0) {
+               sa->last_update_time = now;
+               return 0;
+       }
+
+       delta >>= 10;
+       if (!delta)
+               return 0;
+
+       sa->last_update_time += delta << 10;
+
+       if (!weight)
+               running = 0;
+
+       if (!accumulate_sum_rt(delta, cpu, sa, weight, running))
+               return 0;
+
+       sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
+       sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
+
+       return 1;
+}
+
 static void pull_rt_task(struct rq *this_rq);
 
 static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
@@ -1211,6 +1307,40 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
        dec_rt_group(rt_se, rt_rq);
 }
 
+#ifdef CONFIG_SMP
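+/*
+ * Fold a task's accumulated PELT signal into the rt_rq it is joining, or
+ * subtract it from the rt_rq it is leaving, and poke cpufreq so the per-CPU
+ * RT utilization reflects the change immediately.
+ */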
+static void
+attach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+       rt_se->avg.last_update_time = rt_rq->avg.last_update_time;
+       rt_rq->avg.util_avg += rt_se->avg.util_avg;
+       rt_rq->avg.util_sum += rt_se->avg.util_sum;
+       rt_rq->avg.load_avg += rt_se->avg.load_avg;
+       rt_rq->avg.load_sum += rt_se->avg.load_sum;
+#ifdef CONFIG_RT_GROUP_SCHED
+       rt_rq->propagate_avg = 1;
+#endif
+       rt_rq_util_change(rt_rq);
+}
+
+static void
+detach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+       sub_positive(&rt_rq->avg.util_avg, rt_se->avg.util_avg);
+       sub_positive(&rt_rq->avg.util_sum, rt_se->avg.util_sum);
+       sub_positive(&rt_rq->avg.load_avg, rt_se->avg.load_avg);
+       sub_positive(&rt_rq->avg.load_sum, rt_se->avg.load_sum);
+#ifdef CONFIG_RT_GROUP_SCHED
+       rt_rq->propagate_avg = 1;
+#endif
+       rt_rq_util_change(rt_rq);
+}
+#else
+static inline void
+attach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) {}
+static inline void
+detach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) {}
+#endif
+
 /*
  * Change rt_se->run_list location unless SAVE && !MOVE
  *
@@ -1265,6 +1395,11 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
        }
        rt_se->on_rq = 1;
 
+       update_rt_load_avg(rq_clock_task(rq_of_rt_rq(rt_rq)), rt_se);
+
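+       /*
+        * A zero last_update_time means the task is new or has just migrated
+        * here: attach its load to this rt_rq instead of decaying it.
+        */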
+       if (rt_entity_is_task(rt_se) && !rt_se->avg.last_update_time)
+               attach_rt_entity_load_avg(rt_rq, rt_se);
+
        inc_rt_tasks(rt_se, rt_rq);
 }
 
@@ -1279,6 +1414,8 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flag
        }
        rt_se->on_rq = 0;
 
+       update_rt_load_avg(rq_clock_task(rq_of_rt_rq(rt_rq)), rt_se);
+
        dec_rt_tasks(rt_se, rt_rq);
 }
 
@@ -1396,8 +1533,16 @@ static void yield_task_rt(struct rq *rq)
 }
 
 #ifdef CONFIG_SMP
-static int find_lowest_rq(struct task_struct *task);
 
+/* TODO:
+ * attach/detach/migrate_task_rt_rq() for load tracking
+ */
+
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+static int find_lowest_rq(struct task_struct *task, int wake_flags);
+#else
+static int find_lowest_rq(struct task_struct *task);
+#endif
 static int
 select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
                  int sibling_count_hint)
@@ -1414,6 +1559,25 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
        rcu_read_lock();
        curr = READ_ONCE(rq->curr); /* unlocked access */
 
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+       if (curr) {
+               int target = find_lowest_rq(p, flags);
+               /*
+                * Even when the destination CPU is running a higher-priority
+                * task, Fluid RT may still move p there if that task's
+                * utilization is very small and every other CPU is too busy,
+                * in terms of both priority and utilization, to accommodate p.
+                *
+                * If curr has a higher priority than p, Fluid RT tries the
+                * other CPUs first; in the worst case curr becomes the victim
+                * if its utilization is very small.
+                */
+               if (likely(target != -1)) {
+                       cpu = target;
+               }
+       }
+#else
+
        /*
         * If the current task on @p's runqueue is an RT task, then
         * try to see if we can wake this RT task up on another
@@ -1440,7 +1604,6 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
            (curr->nr_cpus_allowed < 2 ||
             curr->prio <= p->prio)) {
                int target = find_lowest_rq(p);
-
                /*
                 * Don't bother moving it if the destination CPU is
                 * not running a lower priority task.
@@ -1449,12 +1612,201 @@ select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
                    p->prio < cpu_rq(target)->rt.highest_prio.curr)
                        cpu = target;
        }
+#endif
        rcu_read_unlock();
 
 out:
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+       if (cpu >= 6)
+               trace_sched_fluid_stat(p, &p->se.avg, cpu, "BIG_ASSIGNED");
+#endif
        return cpu;
 }
 
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_rt(struct sched_rt_entity *rt_se,
+                                   struct rt_rq *prev, struct rt_rq *next)
+{
+       u64 p_last_update_time;
+       u64 n_last_update_time;
+
+       if (!sched_feat(ATTACH_AGE_LOAD))
+               return;
+       /*
+        * We are supposed to update the task to "current" time so that it is
+        * up to date and ready to go to the new CPU/rt_rq, but we have no
+        * reliable way of knowing what the current time is, so simply throw
+        * away the out-of-date time. This leaves the wakee task less decayed;
+        * giving the wakee a bit more load is not a bad trade-off.
+        */
+       if (!(rt_se->avg.last_update_time && prev))
+               return;
+#ifndef CONFIG_64BIT
+       {
+               u64 p_last_update_time_copy;
+               u64 n_last_update_time_copy;
+
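+               /*
+                * On 32-bit, the 64-bit last_update_time cannot be read
+                * atomically: re-read until the value and its _copy agree,
+                * as the fair class does for cfs_rq.
+                */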
+               do {
+                       p_last_update_time_copy = prev->load_last_update_time_copy;
+                       n_last_update_time_copy = next->load_last_update_time_copy;
+
+                       smp_rmb();
+
+                       p_last_update_time = prev->avg.last_update_time;
+                       n_last_update_time = next->avg.last_update_time;
+
+               } while (p_last_update_time != p_last_update_time_copy ||
+                        n_last_update_time != n_last_update_time_copy);
+       }
+#else
+       p_last_update_time = prev->avg.last_update_time;
+       n_last_update_time = next->avg.last_update_time;
+#endif
+       __update_load_avg(p_last_update_time, cpu_of(rq_of_rt_rq(prev)),
+               &rt_se->avg, 0, 0, NULL);
+
+       rt_se->avg.last_update_time = n_last_update_time;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+#ifndef CONFIG_64BIT
+static inline u64 rt_rq_last_update_time(struct rt_rq *rt_rq)
+{
+       u64 last_update_time_copy;
+       u64 last_update_time;
+
+       do {
+               last_update_time_copy = rt_rq->load_last_update_time_copy;
+               smp_rmb();
+               last_update_time = rt_rq->avg.last_update_time;
+       } while (last_update_time != last_update_time_copy);
+
+       return last_update_time;
+}
+#else
+static inline u64 rt_rq_last_update_time(struct rt_rq *rt_rq)
+{
+       return rt_rq->avg.last_update_time;
+}
+#endif
+
+/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_rt_entity_load_avg(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+       u64 last_update_time;
+
+       last_update_time = rt_rq_last_update_time(rt_rq);
+       __update_load_avg(last_update_time, cpu_of(rq_of_rt_rq(rt_rq)),
+                               &rt_se->avg, 0, 0, NULL);
+}
+
+/*
+ * Task first catches up with rt_rq, and then subtract
+ * itself from the rt_rq (task must be off the queue now).
+ */
+static void remove_rt_entity_load_avg(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+       /*
+        * tasks cannot exit without having gone through wake_up_new_task() ->
+        * post_init_entity_util_avg() which will have added things to the
+        * rt_rq, so we can remove unconditionally.
+        *
+        * Similarly for groups, they will have passed through
+        * post_init_entity_util_avg() before unregister_sched_fair_group()
+        * calls this.
+        */
+
+       sync_rt_entity_load_avg(rt_se);
+       atomic_long_add(rt_se->avg.load_avg, &rt_rq->removed_load_avg);
+       atomic_long_add(rt_se->avg.util_avg, &rt_rq->removed_util_avg);
+}
+
+static void attach_task_rt_rq(struct task_struct *p)
+{
+       struct sched_rt_entity *rt_se = &p->rt;
+       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+       u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
+
+       update_rt_load_avg(now, rt_se);
+       attach_rt_entity_load_avg(rt_rq, rt_se);
+}
+
+static void detach_task_rt_rq(struct task_struct *p)
+{
+       struct sched_rt_entity *rt_se = &p->rt;
+       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+       u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
+
+       update_rt_load_avg(now, rt_se);
+       detach_rt_entity_load_avg(rt_rq, rt_se);
+}
+
+static void migrate_task_rq_rt(struct task_struct *p)
+{
+       /*
+        * We are supposed to update the task to "current" time so that it is
+        * up to date and ready to go to the new CPU/rt_rq, but we have no
+        * reliable way of knowing what the current time is, so simply throw
+        * away the out-of-date time. This leaves the wakee task less decayed;
+        * giving the wakee a bit more load is not a bad trade-off.
+        */
+       remove_rt_entity_load_avg(&p->rt);
+
+       /* Tell new CPU we are migrated */
+       p->rt.avg.last_update_time = 0;
+
+       /* We have migrated, no longer consider this task hot */
+       p->se.exec_start = 0;
+}
+
+static void task_dead_rt(struct task_struct *p)
+{
+       remove_rt_entity_load_avg(&p->rt);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static void task_set_group_rt(struct task_struct *p)
+{
+       set_task_rq(p, task_cpu(p));
+}
+
+static void task_move_group_rt(struct task_struct *p)
+{
+       detach_task_rt_rq(p);
+       set_task_rq(p, task_cpu(p));
+
+#ifdef CONFIG_SMP
+       /* Tell the se's cfs_rq that it has changed -- migrated */
+       p->se.avg.last_update_time = 0;
+#endif
+       attach_task_rt_rq(p);
+}
+
+static void task_change_group_rt(struct task_struct *p, int type)
+{
+       switch (type) {
+       case TASK_SET_GROUP:
+               task_set_group_rt(p);
+               break;
+
+       case TASK_MOVE_GROUP:
+               task_move_group_rt(p);
+               break;
+       }
+}
+#endif
+
 static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
 {
        /*
@@ -1482,8 +1834,54 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
        resched_curr(rq);
 }
 
+/* Give a new sched_rt_entity its initial runnable-average values */
+void init_rt_entity_runnable_average(struct sched_rt_entity *rt_se)
+{
+       struct sched_avg *sa = &rt_se->avg;
+
+       sa->last_update_time = 0;
+
+       sa->period_contrib = 1023;
+
+       /*
+        * Tasks are initialized with zero load. Load is not actually used by
+        * the RT class, but it can be inherited when the task later runs as a
+        * fair task.
+        */
+       sa->load_avg = 0;
+       sa->load_sum = 0;
+       /*
+        * At this point, util_avg won't be used in select_task_rq_rt anyway
+        */
+       sa->util_avg = 0;
+       sa->util_sum = 0;
+       /* When this task is enqueued, it will contribute to its rt_rq's load_avg */
+}
+#else
+void init_rt_entity_runnable_average(struct sched_rt_entity *rt_se) { }
 #endif /* CONFIG_SMP */
 
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+static inline void set_victim_flag(struct task_struct *p)
+{
+       p->victim_flag = 1;
+}
+
+static inline void clear_victim_flag(struct task_struct *p)
+{
+       p->victim_flag = 0;
+}
+
+static inline bool test_victim_flag(struct task_struct *p)
+{
+       if (p->victim_flag)
+               return true;
+       else
+               return false;
+}
+#else
+static inline bool test_victim_flag(struct task_struct *p) { return false; }
+static inline void clear_victim_flag(struct task_struct *p) {}
+#endif
 /*
  * Preempt the current task with a newly woken task if needed:
  */
@@ -1492,6 +1890,10 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag
        if (p->prio < rq->curr->prio) {
                resched_curr(rq);
                return;
+       } else if (test_victim_flag(p)) {
+               requeue_task_rt(rq, p, 1);
+               resched_curr(rq);
+               return;
        }
 
 #ifdef CONFIG_SMP
@@ -1534,15 +1936,18 @@ static struct task_struct *_pick_next_task_rt(struct rq *rq)
        struct sched_rt_entity *rt_se;
        struct task_struct *p;
        struct rt_rq *rt_rq  = &rq->rt;
+       u64 now = rq_clock_task(rq);
 
        do {
                rt_se = pick_next_rt_entity(rq, rt_rq);
                BUG_ON(!rt_se);
+               update_rt_load_avg(now, rt_se);
+               rt_rq->curr = rt_se;
                rt_rq = group_rt_rq(rt_se);
        } while (rt_rq);
 
        p = rt_task_of(rt_se);
-       p->se.exec_start = rq_clock_task(rq);
+       p->se.exec_start = now;
 
        return p;
 }
@@ -1598,14 +2003,17 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
                update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), rt_rq,
                                        rq->curr->sched_class == &rt_sched_class);
 
+       clear_victim_flag(p);
+
        return p;
 }
 
 static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
-       update_curr_rt(rq);
+       struct sched_rt_entity *rt_se = &p->rt;
+       u64 now = rq_clock_task(rq);
 
-       update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
+       update_curr_rt(rq);
 
        /*
         * The previous task needs to be made eligible for pushing
@@ -1613,10 +2021,130 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
         */
        if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
+
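+       /*
+        * @p is no longer running: age the PELT signal of every level of its
+        * RT hierarchy and clear the rt_rq->curr markers that were set when
+        * it was picked.
+        */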
+       for_each_sched_rt_entity(rt_se) {
+               struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+               if (rt_se->on_rq)
+                       update_rt_load_avg(now, rt_se);
+
+               rt_rq->curr = NULL;
+       }
 }
 
 #ifdef CONFIG_SMP
 
+void rt_rq_util_change(struct rt_rq *rt_rq)
+{
+       if (&this_rq()->rt == rt_rq)
+               cpufreq_update_util(rt_rq->rq, SCHED_CPUFREQ_RT);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_rt_util(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *grt_rq = rt_se->my_q;
+       long delta = grt_rq->avg.util_avg - rt_se->avg.util_avg;
+
+       /* Nothing to update */
+       if (!delta)
+               return;
+
+       /* Set new sched_rt_entity's utilization */
+       rt_se->avg.util_avg = grt_rq->avg.util_avg;
+       rt_se->avg.util_sum = rt_se->avg.util_avg * LOAD_AVG_MAX;
+
+       /* Update parent rt_rq utilization */
+       add_positive(&rt_rq->avg.util_avg, delta);
+       rt_rq->avg.util_sum = rt_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_rt_load(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *grt_rq = rt_se->my_q;
+       long delta = grt_rq->avg.load_avg - rt_se->avg.load_avg;
+
+       /*
+        * TODO: Need to consider the TG group update
+        * for RT RQ
+        */
+
+       /* Nothing to update */
+       if (!delta)
+               return;
+
+       /* Set new sched_rt_entity's load */
+       rt_se->avg.load_avg = grt_rq->avg.load_avg;
+       rt_se->avg.load_sum = rt_se->avg.load_avg * LOAD_AVG_MAX;
+
+       /* Update parent rt_rq load */
+       add_positive(&rt_rq->avg.load_avg, delta);
+       rt_rq->avg.load_sum = rt_rq->avg.load_avg * LOAD_AVG_MAX;
+
+       /*
+        * TODO: If the sched_entity is already enqueued, should we have to update the
+        * runnable load avg.
+        */
+}
+
+static inline int test_and_clear_tg_rt_propagate(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *rt_rq = rt_se->my_q;
+
+       if (!rt_rq->propagate_avg)
+               return 0;
+
+       rt_rq->propagate_avg = 0;
+       return 1;
+}
+
+/* Update task and its cfs_rq load average */
+static inline int propagate_entity_rt_load_avg(struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *rt_rq;
+
+       if (rt_entity_is_task(rt_se))
+               return 0;
+
+       if (!test_and_clear_tg_rt_propagate(rt_se))
+               return 0;
+
+       rt_rq = rt_rq_of_se(rt_se);
+
+       rt_rq->propagate_avg = 1;
+
+       update_tg_rt_util(rt_rq, rt_se);
+       update_tg_rt_load(rt_rq, rt_se);
+
+       return 1;
+}
+#else
+static inline int propagate_entity_rt_load_avg(struct sched_rt_entity *rt_se) { return 0; }
+#endif
+
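+/*
+ * Per-entity PELT update for RT: decay/accumulate the entity's own signal
+ * (skipped right after a migration, while last_update_time is zero), refresh
+ * the rt_rq average and propagate any pending group contribution upwards.
+ */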
+void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se)
+{
+       struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+       struct rq *rq = rq_of_rt_rq(rt_rq);
+       int cpu = cpu_of(rq);
+       /*
+        * Track task load average for carrying it to new CPU after migrated.
+        */
+       if (rt_se->avg.last_update_time)
+               __update_load_avg(now, cpu, &rt_se->avg, scale_load_down(NICE_0_LOAD),
+                       rt_rq->curr == rt_se, NULL);
+
+       update_rt_rq_load_avg(now, cpu, rt_rq, true);
+       propagate_entity_rt_load_avg(rt_se);
+
+       if (entity_is_task(rt_se))
+               trace_sched_rt_load_avg_task(rt_task_of(rt_se), &rt_se->avg);
+}
+
 /* Only try algorithms three times */
 #define RT_MAX_TRIES 3
 
@@ -1650,8 +2178,295 @@ static struct task_struct *pick_highest_pushable_task(struct rq *rq, int cpu)
 
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+static unsigned int sched_rt_boost_threshold = 60;
+
+static inline struct cpumask *sched_group_cpus_rt(struct sched_group *sg)
+{
+       return to_cpumask(sg->cpumask);
+}
+
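+/*
+ * Map a priority to a load weight. rtprio_to_weight[] holds one entry per
+ * two RT priority levels, so odd RT priorities take the average of the two
+ * neighbouring entries; non-RT priorities fall back to the fair class's
+ * sched_prio_to_weight[] table.
+ */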
+static inline int weight_from_rtprio(int prio)
+{
+       int idx = (prio >> 1);
+
+       if (!rt_prio(prio))
+               return sched_prio_to_weight[prio - MAX_RT_PRIO];
+
+       if ((idx << 1) == prio)
+               return rtprio_to_weight[idx];
+       else
+               return ((rtprio_to_weight[idx] + rtprio_to_weight[idx+1]) >> 1);
+}
+
+/* Affordable CPU:
+ * find the CPU most likely to still hold the task's data cache-hot.
+ *
+ * Most of the time an RT task is invoked because either
+ *  Case I : it was already scheduled some time ago, or
+ *  Case II: it is requested by some other task with no time delay.
+ *
+ * In case I it is hard to find a cache-hot CPU if much time has passed,
+ * but in case II the waker's CPU is likely to still hold data that is
+ * useful to the wakee RT task.
+ */
+static inline int affordable_cpu(int cpu, unsigned long task_load)
+{
+       /*
+        * If the current task's state is TASK_INTERRUPTIBLE, it is probably
+        * about to call schedule() explicitly after waking the RT task, and
+        * is therefore likely to share data with it.
+        */
+       if (cpu_curr(cpu)->state != TASK_INTERRUPTIBLE)
+               return 0;
+
+       /*
+        * Waker CPU must accommodate the target RT task.
+        */
+       if (capacity_of(cpu) <= task_load)
+               return 0;
+
+       /*
+        * Future work (more concerns if needed):
+        * - Minimize the opportunity cost between evicting the running tasks
+        *   and dismissing the target RT task: if eviction would hurt the
+        *   running tasks too much, this CPU should not take the target RT,
+        *   e.g. load(RT) >= Capa(CPU)/3 && load(evicted tasks) >= Capa(CPU)/3
+        * - Identify relationships between tasks (such as mutex owner and
+        *   waiter) when picking the CPU.
+        */
+
+       return 1;
+}
+
+extern unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+extern unsigned long task_util(struct task_struct *p);
+
+/*
+ * Must find a victim or recessive CPU (one that is not in lowest_mask).
+ */
+/* Future-safe accessor for struct task_struct's cpus_allowed. */
+#define rttsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
+
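+/*
+ * Pick a victim CPU within @sg: the CPU whose current RT task carries the
+ * smallest capacity-adjusted (utilization * rt-priority-weight) product, so
+ * preempting it does the least damage. A CPU whose current task is not RT is
+ * taken immediately. When the chosen victim is an RT task, its victim_flag
+ * is set to mark it for preemption by the incoming task.
+ */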
+static int find_victim_rt_rq(struct task_struct *task, struct sched_group *sg, int *best_cpu)
+{
+       struct cpumask *sg_cpus = sched_group_cpus_rt(sg);
+       int i;
+       unsigned long victim_rtweight, target_rtweight, min_rtweight;
+       unsigned int victim_cpu_cap, min_cpu_cap = arch_scale_cpu_capacity(NULL, task_cpu(task));
+       bool victim_rt = true;
+
+       if (!rt_task(task))
+               return *best_cpu;
+
+       target_rtweight = task->rt.avg.util_avg * weight_from_rtprio(task->prio);
+       min_rtweight = target_rtweight;
+
+       for_each_cpu_and(i, sg_cpus, rttsk_cpus_allowed(task)) {
+               struct task_struct *victim = cpu_rq(i)->curr;
+
+               if (victim->nr_cpus_allowed < 2)
+                       continue;
+
+               if (rt_task(victim)) {
+                       victim_cpu_cap = arch_scale_cpu_capacity(NULL, i);
+                       victim_rtweight = victim->rt.avg.util_avg * weight_from_rtprio(victim->prio);
+
+                       if (min_cpu_cap == victim_cpu_cap) {
+                               if (victim_rtweight < min_rtweight) {
+                                       min_rtweight = victim_rtweight;
+                                       *best_cpu = i;
+                                       min_cpu_cap = victim_cpu_cap;
+                               }
+                       } else {
+                               /*
+                                * It's necessary to un-cap the cpu capacity when comparing
+                                * utilization of each CPU. This is why the Fluid RT tries to give
+                                * the green light on big CPU to the long-run RT task
+                                * in accordance with the priority.
+                                */
+                               if (victim_rtweight * min_cpu_cap < min_rtweight * victim_cpu_cap) {
+                                       min_rtweight = victim_rtweight;
+                                       *best_cpu = i;
+                                       min_cpu_cap = victim_cpu_cap;
+                               }
+                       }
+               } else {
+                       /* If a CPU running a non-RT task exists, select it first. */
+                       *best_cpu = i;
+                       victim_rt = false;
+                       break;
+               }
+       }
+
+       if (*best_cpu >= 0 && victim_rt) {
+               set_victim_flag(cpu_rq(*best_cpu)->curr);
+       }
+
+       if (victim_rt)
+               trace_sched_fluid_stat(task, &task->se.avg, *best_cpu, "VICTIM-RT");
+       else
+               trace_sched_fluid_stat(task, &task->se.avg, *best_cpu, "VICTIM-FAIR");
+
+       return *best_cpu;
+}
+
+static int find_lowest_rq_fluid(struct task_struct *task, int wake_flags)
+{
+       int cpu, best_cpu = -1;
+       int prefer_cpu = smp_processor_id();    /* Cache-hot with itself or waker (default). */
+       int boosted = 0;
+       struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
+       struct sched_domain *sd;
+       struct sched_group *sg;
+       u64 cpu_load = ULLONG_MAX, min_load = ULLONG_MAX, min_rt_load = ULLONG_MAX;
+       int min_cpu = -1, min_rt_cpu = -1;
+
+       /* Make sure the mask is initialized first */
+       if (unlikely(!lowest_mask)) {
+               trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "NA LOWESTMSK");
+               goto out;
+       }
+
+       if (task->nr_cpus_allowed == 1) {
+               trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "NA ALLOWED");
+               goto out; /* No other targets possible */
+       }
+
+       /* update the per-cpu local_cpu_mask (lowest_mask) */
+       cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask);
+
+       /*
+        * Fluid Sched Core selection procedure:
+        *
+        * 1. Cache hot : this cpu (waker if wake_list is null)
+        * 2. idle CPU selection (prev_cpu first)
+        * 3. recessive task first (prev_cpu first)
+        * 4. victim task first (prev_cpu first)
+        */
+
+       /*
+        * 1. Cache hot : packing the callee and caller,
+        *      when there is nothing to run except callee
+        */
+       if ((wake_flags || affordable_cpu(prefer_cpu, task_util(task))) &&
+               cpumask_test_cpu(prefer_cpu, cpu_online_mask)) {
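+               /*
+                * Remember this was a synchronous (cache-hot) placement:
+                * find_lock_lowest_rq() uses the flag to accept a runqueue
+                * whose highest priority merely equals the task's, and
+                * task_woken_rt() clears it, pushing the task away if the
+                * placement lost to a higher-priority current task.
+                */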
+               task->rt.sync_flag = 1;
+               best_cpu = prefer_cpu;
+               trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "CACHE-HOT");
+               goto out;
+       }
+
+       prefer_cpu = task_cpu(task);
+
+       /*
+        * 2. idle CPU selection
+        */
+       boosted = (task->rt.avg.util_avg > sched_rt_boost_threshold) ? (1) : (0);
+
+       /* TODO: need to refer to the scheduling status of eHMP */
+       for_each_cpu_and(cpu, rttsk_cpus_allowed(task), cpu_online_mask) {
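+               /*
+                * A boosted (heavily utilized) task skips CPUs that sit in a
+                * lower-numbered cluster than the one it currently runs on.
+                */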
+               if (boosted && cpu < cpumask_first(cpu_coregroup_mask(prefer_cpu)))
+                       continue;
+
+               if (idle_cpu(cpu)) {
+                       best_cpu = cpu;
+                       trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "IDLE-FIRST");
+                       goto out;
+               }
+       }
+
+       rcu_read_lock();
+
+       sd = boosted ?
+               rcu_dereference(per_cpu(sd_ea, 0)) :
+               rcu_dereference(per_cpu(sd_ea, prefer_cpu));
+
+       if (!sd)
+               goto unlock;
+
+       sg = sd->groups;
+
+       /*
+        * 3. recessive task first
+        */
+       do {
+               for_each_cpu_and(cpu, sched_group_span(sg), lowest_mask) {
+
+                       cpu_load = cpu_util_wake(cpu, task) + task_util(task);
+
+                       if (rt_task(cpu_rq(cpu)->curr)) {
+                               if (cpu_load < min_rt_load ||
+                                       (cpu_load == min_rt_load && cpu == prefer_cpu)) {
+                                       min_rt_load = cpu_load;
+                                       min_rt_cpu = cpu;
+                               }
+
+                               continue;
+                       }
+                       if (cpu_load < min_load ||
+                               (cpu_load == min_load && cpu == prefer_cpu)) {
+                               min_load = cpu_load;
+                               min_cpu = cpu;
+                       }
+
+               }
+
+               /* Fair recessive task: does the least-loaded non-RT CPU fit the load? */
+               if (min_cpu >= 0 &&
+                       ((capacity_of(min_cpu) >= min_load) || (min_cpu == prefer_cpu))) {
+                       best_cpu = min_cpu;
+                       trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "FAIR-RECESS");
+                       goto unlock;
+               }
+
+               /* RT recessive task: does the least-loaded RT CPU fit the load? */
+               if (min_rt_cpu >= 0 &&
+                       ((capacity_of(min_rt_cpu) >= min_rt_load) || (min_rt_cpu == prefer_cpu))) {
+                       best_cpu = min_rt_cpu;
+                       trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "RT-RECESS");
+                       goto unlock;
+               }
+
+       } while (sg = sg->next, sg != sd->groups);
+       /* TODO: double-check the way the sched groups are traversed */
+
+       sg = sd->groups;
+
+       /*
+        * 4. victim task first
+        */
+       do {
+               if (find_victim_rt_rq(task, sg, &best_cpu) != -1)
+                       break;
+       } while (sg = sg->next, sg != sd->groups);
+
+       if (best_cpu < 0)
+               best_cpu = prefer_cpu;
+unlock:
+       rcu_read_unlock();
+out:
+
+       if (best_cpu < 0 || !cpumask_test_cpu(best_cpu, cpu_online_mask)) {
+               trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "NOTHING_VALID");
+               best_cpu = -1;
+       }
+
+       return best_cpu;
+}
+#endif /* CONFIG_SCHED_USE_FLUID_RT */
+
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+static int find_lowest_rq(struct task_struct *task, int wake_flags)
+#else
 static int find_lowest_rq(struct task_struct *task)
+#endif
 {
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+       return find_lowest_rq_fluid(task, wake_flags);
+#else
        struct sched_domain *sd;
        struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
        int this_cpu = smp_processor_id();
@@ -1722,6 +2537,7 @@ static int find_lowest_rq(struct task_struct *task)
        if (cpu < nr_cpu_ids)
                return cpu;
        return -1;
+#endif /* CONFIG_SCHED_USE_FLUID_RT */
 }
 
 /* Will lock the rq it finds */
@@ -1732,14 +2548,26 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
        int cpu;
 
        for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+               cpu = find_lowest_rq(task, 0);
+#else
                cpu = find_lowest_rq(task);
-
+#endif
                if ((cpu == -1) || (cpu == rq->cpu))
                        break;
 
                lowest_rq = cpu_rq(cpu);
-
-               if (lowest_rq->rt.highest_prio.curr <= task->prio) {
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+               if (task->rt.sync_flag == 1 && lowest_rq->rt.highest_prio.curr == task->prio) {
+                       /*
+                        * If the sync flag is set, let the task go even
+                        * though its priority is only equal to that of the
+                        * current task.
+                        */
+                       trace_sched_fluid_stat(task, &task->se.avg, cpu, "SYNC AGAIN");
+               } else
+#else
+               if (lowest_rq->rt.highest_prio.curr <= task->prio)
+               {
                        /*
                         * Target rq has tasks of equal or higher priority,
                         * retrying does not release any lock and is unlikely
@@ -1748,6 +2576,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
                        lowest_rq = NULL;
                        break;
                }
+#endif
 
                /* if the prio of this runqueue changed, try again */
                if (double_lock_balance(rq, lowest_rq)) {
@@ -2178,8 +3007,19 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
            p->nr_cpus_allowed > 1 &&
            (dl_task(rq->curr) || rt_task(rq->curr)) &&
            (rq->curr->nr_cpus_allowed < 2 ||
-            rq->curr->prio <= p->prio))
+            rq->curr->prio <= p->prio)) {
+#ifdef CONFIG_SCHED_USE_FLUID_RT
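+               /*
+                * A synchronously placed task is normally left where it is,
+                * but if the current task still has strictly higher priority
+                * the sync placement failed: drop the flag and try to push
+                * @p to another CPU.
+                */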
+               if (p->rt.sync_flag && rq->curr->prio < p->prio) {
+                       p->rt.sync_flag = 0;
+                       push_rt_tasks(rq);
+               }
+#else
                push_rt_tasks(rq);
+#endif
+       }
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+       p->rt.sync_flag = 0;
+#endif
 }
 
 /* Assumes rq->lock is held */
@@ -2210,6 +3050,7 @@ static void rq_offline_rt(struct rq *rq)
  */
 static void switched_from_rt(struct rq *rq, struct task_struct *p)
 {
+       detach_task_rt_rq(p);
        /*
         * If there are other RT tasks then we will reschedule
         * and the scheduling of the other RT tasks will handle
@@ -2232,8 +3073,15 @@ void __init init_sched_rt_class(void)
                                        GFP_KERNEL, cpu_to_node(i));
        }
 }
+#else
+void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se)
+{
+}
 #endif /* CONFIG_SMP */
 
+extern void
+copy_sched_avg(struct sched_avg *from, struct sched_avg *to, unsigned int ratio);
+
 /*
  * When switching a task to RT, we may overload the runqueue
  * with RT tasks. In this case we try to push them off to
@@ -2241,6 +3089,8 @@ void __init init_sched_rt_class(void)
  */
 static void switched_to_rt(struct rq *rq, struct task_struct *p)
 {
+       /* Copy fair sched avg into rt sched avg */
+       copy_sched_avg(&p->se.avg, &p->rt.avg, 100);
        /*
         * If we are already running, then there's nothing
         * that needs to be done. But if we are not running
@@ -2328,9 +3178,12 @@ static inline void watchdog(struct rq *rq, struct task_struct *p) { }
 static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 {
        struct sched_rt_entity *rt_se = &p->rt;
+       u64 now = rq_clock_task(rq);
 
        update_curr_rt(rq);
-       update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
+
+       for_each_sched_rt_entity(rt_se)
+               update_rt_load_avg(now, rt_se);
 
        watchdog(rq, p);
 
@@ -2362,9 +3215,15 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
 static void set_curr_task_rt(struct rq *rq)
 {
        struct task_struct *p = rq->curr;
+       struct sched_rt_entity *rt_se = &p->rt;
 
        p->se.exec_start = rq_clock_task(rq);
 
+       for_each_sched_rt_entity(rt_se) {
+               struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+               rt_rq->curr = rt_se;
+       }
+
        /* The running task is never eligible for pushing */
        dequeue_pushable_task(rq, p);
 }
@@ -2394,6 +3253,8 @@ const struct sched_class rt_sched_class = {
 #ifdef CONFIG_SMP
        .select_task_rq         = select_task_rq_rt,
 
+       .migrate_task_rq                = migrate_task_rq_rt,
+       .task_dead                              = task_dead_rt,
        .set_cpus_allowed       = set_cpus_allowed_common,
        .rq_online              = rq_online_rt,
        .rq_offline             = rq_offline_rt,
@@ -2410,6 +3271,9 @@ const struct sched_class rt_sched_class = {
        .switched_to            = switched_to_rt,
 
        .update_curr            = update_curr_rt,
+#ifdef CONFIG_RT_GROUP_SCHED
+       .task_change_group      = task_change_group_rt,
+#endif
 };
 
 #ifdef CONFIG_RT_GROUP_SCHED