#include "walt.h"
+#include <trace/events/sched.h>
+
int sched_rr_timeslice = RR_TIMESLICE;
int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
+
+void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se);
+
static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun);
struct rt_bandwidth def_rt_bandwidth;
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
+ atomic_long_set(&rt_rq->removed_util_avg, 0);
+ atomic_long_set(&rt_rq->removed_load_avg, 0);
#endif /* CONFIG_SMP */
/* We start in dequeued state, because no RT tasks are queued */
rt_rq->rt_queued = 0;
init_rt_rq(rt_rq);
rt_rq->rt_runtime = tg->rt_bandwidth.rt_runtime;
init_tg_rt_entry(tg, rt_rq, rt_se, i, parent->rt_se[i]);
+ init_rt_entity_runnable_average(rt_se);
}
return 1;
#ifdef CONFIG_SMP
+#include "sched-pelt.h"
+#define entity_is_task(se) (!(se)->my_q)
+
+extern u64 decay_load(u64 val, u64 n);
+
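+/*
+ * Accumulate the three separate parts of the sum: d1 is the remainder of
+ * the last (incomplete) period, d2 the span of full periods and d3 the
+ * remainder of the (incomplete) current period -- the same segmentation
+ * used by __accumulate_pelt_segments() in the fair class. c1, c2 and c3
+ * below are the decayed contributions of those three segments.
+ */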
+static u32 __accumulate_pelt_segments_rt(u64 periods, u32 d1, u32 d3)
+{
+ u32 c1, c2, c3 = d3;
+
+ c1 = decay_load((u64)d1, periods);
+
+ c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024;
+
+ return c1 + c2 + c3;
+}
+
+#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT)
+
+static __always_inline u32
+accumulate_sum_rt(u64 delta, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running)
+{
+ unsigned long scale_freq, scale_cpu;
+ u32 contrib = (u32)delta;
+ u64 periods;
+
+ scale_freq = arch_scale_freq_capacity(NULL, cpu);
+ scale_cpu = arch_scale_cpu_capacity(NULL, cpu);
+
+ delta += sa->period_contrib;
+ periods = delta / 1024;
+
+ if (periods) {
+ sa->load_sum = decay_load(sa->load_sum, periods);
+ sa->util_sum = decay_load((u64)(sa->util_sum), periods);
+
+ delta %= 1024;
+ contrib = __accumulate_pelt_segments_rt(periods,
+ 1024 - sa->period_contrib, delta);
+ }
+ sa->period_contrib = delta;
+
+ contrib = cap_scale(contrib, scale_freq);
+ if (weight)
+ sa->load_sum += weight * contrib;
+ if (running)
+ sa->util_sum += contrib * scale_cpu;
+
+ return periods;
+}
+
+/*
+ * We can represent the historical contribution to the runnable average as
+ * the coefficients of a geometric series, exactly like fair task load.
+ * See ___update_load_avg() in the fair sched class.
+ */
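+/*
+ * As a rough worked example (assuming the usual PELT constants from
+ * sched-pelt.h): each 1024us period is decayed by a factor y per period,
+ * with y^32 ~= 0.5, so a burst of runtime contributes about half as much
+ * after ~32ms, a quarter after ~64ms, and so on. LOAD_AVG_MAX is the sum
+ * of the full (infinite) series and bounds load_sum/util_sum.
+ */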
+static __always_inline int
+__update_load_avg(u64 now, int cpu, struct sched_avg *sa,
+ unsigned long weight, int running, struct rt_rq *rt_rq)
+{
+ u64 delta;
+
+ delta = now - sa->last_update_time;
+
+ if ((s64)delta < 0) {
+ sa->last_update_time = now;
+ return 0;
+ }
+
+ delta >>= 10;
+ if (!delta)
+ return 0;
+
+ sa->last_update_time += delta << 10;
+
+ if (!weight)
+ running = 0;
+
+ if (!accumulate_sum_rt(delta, cpu, sa, weight, running))
+ return 0;
+
+ sa->load_avg = div_u64(sa->load_sum, LOAD_AVG_MAX - 1024 + sa->period_contrib);
+ sa->util_avg = sa->util_sum / (LOAD_AVG_MAX - 1024 + sa->period_contrib);
+
+ return 1;
+}
+
static void pull_rt_task(struct rq *this_rq);
static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev)
dec_rt_group(rt_se, rt_rq);
}
+#ifdef CONFIG_SMP
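+/*
+ * attach/detach mirror their fair-class counterparts: attach folds an
+ * entity's accumulated averages into its rt_rq when it first appears
+ * there (rt_se->avg.last_update_time == 0 after fork or migration),
+ * while detach removes them again when the task leaves the RT class or
+ * moves to another group.
+ */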
+static void
+attach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ rt_se->avg.last_update_time = rt_rq->avg.last_update_time;
+ rt_rq->avg.util_avg += rt_se->avg.util_avg;
+ rt_rq->avg.util_sum += rt_se->avg.util_sum;
+ rt_rq->avg.load_avg += rt_se->avg.load_avg;
+ rt_rq->avg.load_sum += rt_se->avg.load_sum;
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq->propagate_avg = 1;
+#endif
+ rt_rq_util_change(rt_rq);
+}
+
+static void
+detach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ sub_positive(&rt_rq->avg.util_avg, rt_se->avg.util_avg);
+ sub_positive(&rt_rq->avg.util_sum, rt_se->avg.util_sum);
+ sub_positive(&rt_rq->avg.load_avg, rt_se->avg.load_avg);
+ sub_positive(&rt_rq->avg.load_sum, rt_se->avg.load_sum);
+#ifdef CONFIG_RT_GROUP_SCHED
+ rt_rq->propagate_avg = 1;
+#endif
+ rt_rq_util_change(rt_rq);
+}
+#else
+static inline void
+attach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) {}
+static inline void
+detach_rt_entity_load_avg(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se) {}
+#endif
+
/*
* Change rt_se->run_list location unless SAVE && !MOVE
*
}
rt_se->on_rq = 1;
+ update_rt_load_avg(rq_clock_task(rq_of_rt_rq(rt_rq)), rt_se);
+
+ if (rt_entity_is_task(rt_se) && !rt_se->avg.last_update_time)
+ attach_rt_entity_load_avg(rt_rq, rt_se);
+
inc_rt_tasks(rt_se, rt_rq);
}
}
rt_se->on_rq = 0;
+ update_rt_load_avg(rq_clock_task(rq_of_rt_rq(rt_rq)), rt_se);
+
dec_rt_tasks(rt_se, rt_rq);
}
}
#ifdef CONFIG_SMP
-static int find_lowest_rq(struct task_struct *task);
+/* TODO:
+ * attach/detach/migrate_task_rt_rq() for load tracking
+ */
+
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+static int find_lowest_rq(struct task_struct *task, int wake_flags);
+#else
+static int find_lowest_rq(struct task_struct *task);
+#endif
static int
select_task_rq_rt(struct task_struct *p, int cpu, int sd_flag, int flags,
int sibling_count_hint)
rcu_read_lock();
curr = READ_ONCE(rq->curr); /* unlocked access */
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+ if (curr) {
+ int target = find_lowest_rq(p, flags);
+ /*
+ * Even though the destination CPU is running a higher-priority task,
+ * FluidRT may still move p there when p's utilization is very small
+ * and the other CPUs are too busy to accommodate it, considering both
+ * priority and utilization.
+ *
+ * Note that if curr has a higher priority than p, FluidRT tries the
+ * other CPUs first. In the worst case, curr can become the victim if
+ * it has very small utilization.
+ */
+ if (likely(target != -1)) {
+ cpu = target;
+ }
+ }
+#else
+
/*
* If the current task on @p's runqueue is an RT task, then
* try to see if we can wake this RT task up on another
(curr->nr_cpus_allowed < 2 ||
curr->prio <= p->prio)) {
int target = find_lowest_rq(p);
-
/*
* Don't bother moving it if the destination CPU is
* not running a lower priority task.
p->prio < cpu_rq(target)->rt.highest_prio.curr)
cpu = target;
}
+#endif
rcu_read_unlock();
out:
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+ if (cpu >= 6)
+ trace_sched_fluid_stat(p, &p->se.avg, cpu, "BIG_ASSIGNED");
+#endif
return cpu;
}
+#ifdef CONFIG_RT_GROUP_SCHED
+/*
+ * Called within set_task_rq() right before setting a task's cpu. The
+ * caller only guarantees p->pi_lock is held; no other assumptions,
+ * including the state of rq->lock, should be made.
+ */
+void set_task_rq_rt(struct sched_rt_entity *rt_se,
+ struct rt_rq *prev, struct rt_rq *next)
+{
+ u64 p_last_update_time;
+ u64 n_last_update_time;
+
+ if (!sched_feat(ATTACH_AGE_LOAD))
+ return;
+ /*
+ * We are supposed to update the task to "current" time, then it is up to
+ * date and ready to go to the new CPU/rt_rq. But we have difficulty in
+ * getting what the current time is, so simply throw away the out-of-date
+ * time. This results in the wakee task being less decayed, but giving
+ * the wakee more load does not sound bad.
+ */
+ if (!(rt_se->avg.last_update_time && prev))
+ return;
+#ifndef CONFIG_64BIT
+ {
+ u64 p_last_update_time_copy;
+ u64 n_last_update_time_copy;
+
+ do {
+ p_last_update_time_copy = prev->load_last_update_time_copy;
+ n_last_update_time_copy = next->load_last_update_time_copy;
+
+ smp_rmb();
+
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+
+ } while (p_last_update_time != p_last_update_time_copy ||
+ n_last_update_time != n_last_update_time_copy);
+ }
+#else
+ p_last_update_time = prev->avg.last_update_time;
+ n_last_update_time = next->avg.last_update_time;
+#endif
+ __update_load_avg(p_last_update_time, cpu_of(rq_of_rt_rq(prev)),
+ &rt_se->avg, 0, 0, NULL);
+
+ rt_se->avg.last_update_time = n_last_update_time;
+}
+#endif /* CONFIG_RT_GROUP_SCHED */
+
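+/*
+ * On 32-bit, the 64-bit last_update_time cannot be read atomically, so
+ * (as in the fair class) a separate copy plus smp_rmb() is used to detect
+ * a torn read and retry.
+ */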
+#ifndef CONFIG_64BIT
+static inline u64 rt_rq_last_update_time(struct rt_rq *rt_rq)
+{
+ u64 last_update_time_copy;
+ u64 last_update_time;
+
+ do {
+ last_update_time_copy = rt_rq->load_last_update_time_copy;
+ smp_rmb();
+ last_update_time = rt_rq->avg.last_update_time;
+ } while (last_update_time != last_update_time_copy);
+
+ return last_update_time;
+}
+#else
+static inline u64 rt_rq_last_update_time(struct rt_rq *rt_rq)
+{
+ return rt_rq->avg.last_update_time;
+}
+#endif
+
+/*
+ * Synchronize entity load avg of dequeued entity without locking
+ * the previous rq.
+ */
+void sync_rt_entity_load_avg(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ u64 last_update_time;
+
+ last_update_time = rt_rq_last_update_time(rt_rq);
+ __update_load_avg(last_update_time, cpu_of(rq_of_rt_rq(rt_rq)),
+ &rt_se->avg, 0, 0, NULL);
+}
+
+/*
+ * Task first catches up with rt_rq, and then subtract
+ * itself from the rt_rq (task must be off the queue now).
+ */
+static void remove_rt_entity_load_avg(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+
+ /*
+ * tasks cannot exit without having gone through wake_up_new_task() ->
+ * post_init_entity_util_avg() which will have added things to the
+ * rt_rq, so we can remove unconditionally.
+ *
+ * Similarly for groups, they will have passed through
+ * post_init_entity_util_avg() before unregister_fair_sched_group()
+ * calls this.
+ */
+
+ sync_rt_entity_load_avg(rt_se);
+ atomic_long_add(rt_se->avg.load_avg, &rt_rq->removed_load_avg);
+ atomic_long_add(rt_se->avg.util_avg, &rt_rq->removed_util_avg);
+}
+
+static void attach_task_rt_rq(struct task_struct *p)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
+
+ update_rt_load_avg(now, rt_se);
+ attach_rt_entity_load_avg(rt_rq, rt_se);
+}
+
+static void detach_task_rt_rq(struct task_struct *p)
+{
+ struct sched_rt_entity *rt_se = &p->rt;
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ u64 now = rq_clock_task(rq_of_rt_rq(rt_rq));
+
+ update_rt_load_avg(now, rt_se);
+ detach_rt_entity_load_avg(rt_rq, rt_se);
+}
+
+static void migrate_task_rq_rt(struct task_struct *p)
+{
+ /*
+ * We are supposed to update the task to "current" time, then it is up to
+ * date and ready to go to the new CPU/rt_rq. But we have difficulty in
+ * getting what the current time is, so simply throw away the out-of-date
+ * time. This results in the wakee task being less decayed, but giving the
+ * wakee more load does not sound bad.
+ */
+ remove_rt_entity_load_avg(&p->rt);
+
+ /* Tell new CPU we are migrated */
+ p->rt.avg.last_update_time = 0;
+
+ /* We have migrated, no longer consider this task hot */
+ p->se.exec_start = 0;
+}
+
+static void task_dead_rt(struct task_struct *p)
+{
+ remove_rt_entity_load_avg(&p->rt);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+static void task_set_group_rt(struct task_struct *p)
+{
+ set_task_rq(p, task_cpu(p));
+}
+
+static void task_move_group_rt(struct task_struct *p)
+{
+ detach_task_rt_rq(p);
+ set_task_rq(p, task_cpu(p));
+
+#ifdef CONFIG_SMP
+ /* Tell the rt entity's new rt_rq that it has migrated */
+ p->rt.avg.last_update_time = 0;
+#endif
+ attach_task_rt_rq(p);
+}
+
+static void task_change_group_rt(struct task_struct *p, int type)
+{
+ switch (type) {
+ case TASK_SET_GROUP:
+ task_set_group_rt(p);
+ break;
+
+ case TASK_MOVE_GROUP:
+ task_move_group_rt(p);
+ break;
+ }
+}
+#endif
+
static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
{
/*
resched_curr(rq);
}
+/* Give a new sched_rt_entity its initial runnable-average values */
+void init_rt_entity_runnable_average(struct sched_rt_entity *rt_se)
+{
+ struct sched_avg *sa = &rt_se->avg;
+
+ sa->last_update_time = 0;
+
+ sa->period_contrib = 1023;
+
+ /*
+ * Tasks are initialized with zero load.
+ * Load is not actually used by RT, but can be inherited by a fair task.
+ */
+ sa->load_avg = 0;
+ sa->load_sum = 0;
+ /*
+ * At this point, util_avg won't be used in select_task_rq_rt anyway
+ */
+ sa->util_avg = 0;
+ sa->util_sum = 0;
+ /* when this task is enqueued, it will contribute to its rt_rq's load_avg */
+}
+#else
+void init_rt_entity_runnable_average(struct sched_rt_entity *rt_se) { }
#endif /* CONFIG_SMP */
+#ifdef CONFIG_SCHED_USE_FLUID_RT
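+/*
+ * Victim marking used by Fluid RT: find_victim_rt_rq() sets the flag on a
+ * running task chosen as a victim, check_preempt_curr_rt() consults it when
+ * deciding whether to requeue and reschedule, and pick_next_task_rt() clears
+ * it on the task it picks.
+ */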
+static inline void set_victim_flag(struct task_struct *p)
+{
+ p->victim_flag = 1;
+}
+
+static inline void clear_victim_flag(struct task_struct *p)
+{
+ p->victim_flag = 0;
+}
+
+static inline bool test_victim_flag(struct task_struct *p)
+{
+ return p->victim_flag;
+}
+#else
+static inline bool test_victim_flag(struct task_struct *p) { return false; }
+static inline void clear_victim_flag(struct task_struct *p) {}
+#endif
/*
* Preempt the current task with a newly woken task if needed:
*/
if (p->prio < rq->curr->prio) {
resched_curr(rq);
return;
+ } else if (test_victim_flag(p)) {
+ requeue_task_rt(rq, p, 1);
+ resched_curr(rq);
+ return;
}
#ifdef CONFIG_SMP
struct sched_rt_entity *rt_se;
struct task_struct *p;
struct rt_rq *rt_rq = &rq->rt;
+ u64 now = rq_clock_task(rq);
do {
rt_se = pick_next_rt_entity(rq, rt_rq);
BUG_ON(!rt_se);
+ update_rt_load_avg(now, rt_se);
+ rt_rq->curr = rt_se;
rt_rq = group_rt_rq(rt_se);
} while (rt_rq);
p = rt_task_of(rt_se);
- p->se.exec_start = rq_clock_task(rq);
+ p->se.exec_start = now;
return p;
}
update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), rt_rq,
rq->curr->sched_class == &rt_sched_class);
+ clear_victim_flag(p);
+
return p;
}
static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
{
- update_curr_rt(rq);
+ struct sched_rt_entity *rt_se = &p->rt;
+ u64 now = rq_clock_task(rq);
- update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
+ update_curr_rt(rq);
/*
* The previous task needs to be made eligible for pushing
*/
if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+ for_each_sched_rt_entity(rt_se) {
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ if (rt_se->on_rq)
+ update_rt_load_avg(now, rt_se);
+
+ rt_rq->curr = NULL;
+ }
}
#ifdef CONFIG_SMP
+void rt_rq_util_change(struct rt_rq *rt_rq)
+{
+ if (&this_rq()->rt == rt_rq)
+ cpufreq_update_util(rq_of_rt_rq(rt_rq), SCHED_CPUFREQ_RT);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+/* Take into account change of utilization of a child task group */
+static inline void
+update_tg_rt_util(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *grt_rq = rt_se->my_q;
+ long delta = grt_rq->avg.util_avg - rt_se->avg.util_avg;
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_rt_entity's utilization */
+ rt_se->avg.util_avg = grt_rq->avg.util_avg;
+ rt_se->avg.util_sum = rt_se->avg.util_avg * LOAD_AVG_MAX;
+
+ /* Update parent rt_rq utilization */
+ add_positive(&rt_rq->avg.util_avg, delta);
+ rt_rq->avg.util_sum = rt_rq->avg.util_avg * LOAD_AVG_MAX;
+}
+
+
+/* Take into account change of load of a child task group */
+static inline void
+update_tg_rt_load(struct rt_rq *rt_rq, struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *grt_rq = rt_se->my_q;
+ long delta = grt_rq->avg.load_avg - rt_se->avg.load_avg;
+
+ /*
+ * TODO: Need to consider the TG group update
+ * for RT RQ
+ */
+
+ /* Nothing to update */
+ if (!delta)
+ return;
+
+ /* Set new sched_rt_entity's load */
+ rt_se->avg.load_avg = grt_rq->avg.load_avg;
+ rt_se->avg.load_sum = rt_se->avg.load_avg * LOAD_AVG_MAX;
+
+ /* Update parent rt_rq load */
+ add_positive(&rt_rq->avg.load_avg, delta);
+ rt_rq->avg.load_sum = rt_rq->avg.load_avg * LOAD_AVG_MAX;
+
+ /*
+ * TODO: If the sched_entity is already enqueued, do we also have to
+ * update the runnable load average?
+ */
+}
+
+static inline int test_and_clear_tg_rt_propagate(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_se->my_q;
+
+ if (!rt_rq->propagate_avg)
+ return 0;
+
+ rt_rq->propagate_avg = 0;
+ return 1;
+}
+
+/* Update the rt entity and its rt_rq load average */
+static inline int propagate_entity_rt_load_avg(struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq;
+
+ if (rt_entity_is_task(rt_se))
+ return 0;
+
+ if (!test_and_clear_tg_rt_propagate(rt_se))
+ return 0;
+
+ rt_rq = rt_rq_of_se(rt_se);
+
+ rt_rq->propagate_avg = 1;
+
+ update_tg_rt_util(rt_rq, rt_se);
+ update_tg_rt_load(rt_rq, rt_se);
+
+ return 1;
+}
+#else
+static inline int propagate_entity_rt_load_avg(struct sched_rt_entity *rt_se) { return 0; }
+#endif
+
+void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se)
+{
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ struct rq *rq = rq_of_rt_rq(rt_rq);
+ int cpu = cpu_of(rq);
+ /*
+ * Track task load average for carrying it to new CPU after migrated.
+ */
+ if (rt_se->avg.last_update_time)
+ __update_load_avg(now, cpu, &rt_se->avg, scale_load_down(NICE_0_LOAD),
+ rt_rq->curr == rt_se, NULL);
+
+ update_rt_rq_load_avg(now, cpu, rt_rq, true);
+ propagate_entity_rt_load_avg(rt_se);
+
+ if (rt_entity_is_task(rt_se))
+ trace_sched_rt_load_avg_task(rt_task_of(rt_se), &rt_se->avg);
+}
+
/* Only try algorithms three times */
#define RT_MAX_TRIES 3
static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+static unsigned int sched_rt_boost_threshold = 60;
+
+static inline struct cpumask *sched_group_cpus_rt(struct sched_group *sg)
+{
+ return to_cpumask(sg->cpumask);
+}
+
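+/*
+ * Map a priority to a load weight: RT priorities index rtprio_to_weight[]
+ * (averaging neighbouring entries for odd priorities), while non-RT
+ * priorities fall back to sched_prio_to_weight[]; e.g. a nice-0 task
+ * (prio 120) maps to sched_prio_to_weight[20] == 1024.
+ */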
+static inline int weight_from_rtprio(int prio)
+{
+ int idx = (prio >> 1);
+
+ if (!rt_prio(prio))
+ return sched_prio_to_weight[prio - MAX_RT_PRIO];
+
+ if ((idx << 1) == prio)
+ return rtprio_to_weight[idx];
+ else
+ return ((rtprio_to_weight[idx] + rtprio_to_weight[idx+1]) >> 1);
+}
+
+/* Affordable CPU:
+ * find the best CPU on which the data is still cache-hot.
+ *
+ * Most of the time, an RT task is invoked because
+ * Case - I : it was already scheduled some time ago, or
+ * Case - II: it is requested by some task without delay.
+ *
+ * In case I, it is hard to find the best cache-hot CPU if the interval is
+ * relatively long. But in case II, the waker CPU is likely to keep cache-hot
+ * data useful to the wakee RT task.
+ */
+static inline int affordable_cpu(int cpu, unsigned long task_load)
+{
+ /*
+ * If the task state is 'TASK_INTERRUPTIBLE',
+ * it is likely to have called 'schedule()' explicitly to wake up the RT task,
+ * and to have something in common with it.
+ */
+ if (cpu_curr(cpu)->state != TASK_INTERRUPTIBLE)
+ return 0;
+
+ /*
+ * Waker CPU must accommodate the target RT task.
+ */
+ if (capacity_of(cpu) <= task_load)
+ return 0;
+
+ /*
+ * Future work (more concerns if needed):
+ * - Minimum opportunity cost between evicting tasks and dismissing the target RT:
+ * if the evicted tasks would suffer too much damage to their execution,
+ * the target RT should not go to this CPU.
+ * load(RT) >= Capa(CPU)/3 && load(evicted tasks) >= Capa(CPU)/3
+ * - Identifying the relation:
+ * is it possible to identify the relation (such as mutex owner and waiter)?
+ */
+
+ return 1;
+}
+
+extern unsigned long cpu_util_wake(int cpu, struct task_struct *p);
+extern unsigned long task_util(struct task_struct *p);
+
+/*
+ * Must find either a victim or a recessive CPU (one not in lowest_mask).
+ */
+/* Future-safe accessor for struct task_struct's cpus_allowed. */
+#define rttsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
+
+static int find_victim_rt_rq(struct task_struct *task, struct sched_group *sg,
+ int *best_cpu)
+{
+ struct cpumask *sg_cpus = sched_group_cpus_rt(sg);
+ int i;
+ unsigned long victim_rtweight, target_rtweight, min_rtweight;
+ unsigned int victim_cpu_cap, min_cpu_cap = arch_scale_cpu_capacity(NULL, task_cpu(task));
+ bool victim_rt = true;
+
+ if (!rt_task(task))
+ return *best_cpu;
+
+ target_rtweight = task->rt.avg.util_avg * weight_from_rtprio(task->prio);
+ min_rtweight = target_rtweight;
+
+ for_each_cpu_and(i, sg_cpus, rttsk_cpus_allowed(task)) {
+ struct task_struct *victim = cpu_rq(i)->curr;
+
+ if (victim->nr_cpus_allowed < 2)
+ continue;
+
+ if (rt_task(victim)) {
+ victim_cpu_cap = arch_scale_cpu_capacity(NULL, i);
+ victim_rtweight = victim->rt.avg.util_avg * weight_from_rtprio(victim->prio);
+
+ if (min_cpu_cap == victim_cpu_cap) {
+ if (victim_rtweight < min_rtweight) {
+ min_rtweight = victim_rtweight;
+ *best_cpu = i;
+ min_cpu_cap = victim_cpu_cap;
+ }
+ } else {
+ /*
+ * Normalize each CPU's RT weight by its capacity ("un-cap" it)
+ * when comparing utilization across CPUs. This is how Fluid RT
+ * gives the green light on a big CPU to a long-running RT task,
+ * in accordance with its priority.
+ */
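+ /*
+ * i.e. compare victim_rtweight / victim_cpu_cap against
+ * min_rtweight / min_cpu_cap without dividing, by
+ * cross-multiplying the two sides.
+ */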
+ if (victim_rtweight * min_cpu_cap < min_rtweight * victim_cpu_cap) {
+ min_rtweight = victim_rtweight;
+ *best_cpu = i;
+ min_cpu_cap = victim_cpu_cap;
+ }
+ }
+ } else {
+ /* If a non-RT CPU exists, select it first. */
+ *best_cpu = i;
+ victim_rt = false;
+ break;
+ }
+ }
+
+ if (*best_cpu >= 0 && victim_rt) {
+ set_victim_flag(cpu_rq(*best_cpu)->curr);
+ }
+
+ if (victim_rt)
+ trace_sched_fluid_stat(task, &task->se.avg, *best_cpu, "VICTIM-RT");
+ else
+ trace_sched_fluid_stat(task, &task->se.avg, *best_cpu, "VICTIM-FAIR");
+
+ return *best_cpu;
+}
+
+static int find_lowest_rq_fluid(struct task_struct *task, int wake_flags)
+{
+ int cpu, best_cpu = -1;
+ int prefer_cpu = smp_processor_id(); /* Cache-hot with itself or waker (default). */
+ int boosted = 0;
+ struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
+ struct sched_domain *sd;
+ struct sched_group *sg;
+ u64 cpu_load = ULLONG_MAX, min_load = ULLONG_MAX, min_rt_load = ULLONG_MAX;
+ int min_cpu = -1, min_rt_cpu = -1;
+
+ /* Make sure the mask is initialized first */
+ if (unlikely(!lowest_mask)) {
+ trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "NA LOWESTMSK");
+ goto out;
+ }
+
+ if (task->nr_cpus_allowed == 1) {
+ trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "NA ALLOWED");
+ goto out; /* No other targets possible */
+ }
+
+ /* update the per-cpu local_cpu_mask (lowest_mask) */
+ cpupri_find(&task_rq(task)->rd->cpupri, task, lowest_mask);
+
+ /*
+ *
+ * Fluid Sched Core selection procedure:
+ *
+ * 1. Cache hot : this cpu (waker if wake_list is null)
+ * 2. idle CPU selection (prev_cpu first)
+ * 3. recessive task first (prev_cpu first)
+ * 4. victim task first (prev_cpu first)
+ */
+
+ /*
+ * 1. Cache hot : packing the callee and caller,
+ * when there is nothing to run except callee
+ */
+ if ((wake_flags || affordable_cpu(prefer_cpu, task_util(task))) &&
+ cpumask_test_cpu(prefer_cpu, cpu_online_mask)) {
+ task->rt.sync_flag = 1;
+ best_cpu = prefer_cpu;
+ trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "CACHE-HOT");
+ goto out;
+ }
+
+ prefer_cpu = task_cpu(task);
+
+ /*
+ * 2. idle CPU selection
+ */
+ boosted = (task->rt.avg.util_avg > sched_rt_boost_threshold) ? (1) : (0);
+
+ /* TODO: Need to refer to the scheduling status of eHMP */
+ for_each_cpu_and(cpu, rttsk_cpus_allowed(task), cpu_online_mask) {
+ if (boosted && cpu < cpumask_first(cpu_coregroup_mask(prefer_cpu)))
+ continue;
+
+ if (idle_cpu(cpu)) {
+ best_cpu = cpu;
+ trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "IDLE-FIRST");
+ goto out;
+ }
+ }
+
+ rcu_read_lock();
+
+ sd = boosted ?
+ rcu_dereference(per_cpu(sd_ea, 0)) :
+ rcu_dereference(per_cpu(sd_ea, prefer_cpu));
+
+ if (!sd)
+ goto unlock;
+
+ sg = sd->groups;
+
+ /*
+ * 3. recessive task first
+ */
+ do {
+ for_each_cpu_and(cpu, sched_group_span(sg), lowest_mask) {
+
+ cpu_load = cpu_util_wake(cpu, task) + task_util(task);
+
+ if (rt_task(cpu_rq(cpu)->curr)) {
+ if (cpu_load < min_rt_load ||
+ (cpu_load == min_rt_load && cpu == prefer_cpu)) {
+ min_rt_load = cpu_load;
+ min_rt_cpu = cpu;
+ }
+
+ continue;
+ }
+ if (cpu_load < min_load ||
+ (cpu_load == min_load && cpu == prefer_cpu)) {
+ min_load = cpu_load;
+ min_cpu = cpu;
+ }
+
+ }
+
+ /* Fair recessive task: does a non-RT CPU with the best min-load exist? */
+ if (min_cpu >= 0 &&
+ ((capacity_of(min_cpu) >= min_load) || (min_cpu == prefer_cpu))) {
+ best_cpu = min_cpu;
+ trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "FAIR-RECESS");
+ goto unlock;
+ }
+
+ /* RT recessive task: does an RT CPU with the best min-load exist? */
+ if (min_rt_cpu >= 0 &&
+ ((capacity_of(min_rt_cpu) >= min_rt_load) || (min_rt_cpu == prefer_cpu))) {
+ best_cpu = min_rt_cpu;
+ trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "RT-RECESS");
+ goto unlock;
+ }
+
+ } while (sg = sg->next, sg != sd->groups);
+ /* need to check the method for traversing the sg */
+
+ sg = sd->groups;
+
+ /*
+ * 4. victim task first
+ */
+ do {
+ if (find_victim_rt_rq(task, sg, &best_cpu) != -1)
+ break;
+ } while (sg = sg->next, sg != sd->groups);
+
+ if (best_cpu < 0)
+ best_cpu = prefer_cpu;
+unlock:
+ rcu_read_unlock();
+out:
+
+ if (best_cpu < 0 || !cpumask_test_cpu(best_cpu, cpu_online_mask)) {
+ trace_sched_fluid_stat(task, &task->se.avg, best_cpu, "NOTHING_VALID");
+ best_cpu = -1;
+ }
+
+ return best_cpu;
+}
+#endif /* CONFIG_SCHED_USE_FLUID_RT */
+
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+static int find_lowest_rq(struct task_struct *task, int wake_flags)
+#else
static int find_lowest_rq(struct task_struct *task)
+#endif
{
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+ return find_lowest_rq_fluid(task, wake_flags);
+#else
struct sched_domain *sd;
struct cpumask *lowest_mask = this_cpu_cpumask_var_ptr(local_cpu_mask);
int this_cpu = smp_processor_id();
if (cpu < nr_cpu_ids)
return cpu;
return -1;
+#endif /* CONFIG_SCHED_USE_FLUID_RT */
}
/* Will lock the rq it finds */
int cpu;
for (tries = 0; tries < RT_MAX_TRIES; tries++) {
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+ cpu = find_lowest_rq(task, 0);
+#else
cpu = find_lowest_rq(task);
-
+#endif
if ((cpu == -1) || (cpu == rq->cpu))
break;
lowest_rq = cpu_rq(cpu);
-
- if (lowest_rq->rt.highest_prio.curr <= task->prio) {
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+ if (task->rt.sync_flag == 1 && lowest_rq->rt.highest_prio.curr == task->prio) {
+ /*
+ * If the sync flag is set,
+ * let the task go even though its priority is the same as the current task's.
+ */
+ trace_sched_fluid_stat(task, &task->se.avg, cpu, "SYNC AGAIN");
+ } else
+#else
+ if (lowest_rq->rt.highest_prio.curr <= task->prio)
+#endif
+ {
/*
* Target rq has tasks of equal or higher priority,
* retrying does not release any lock and is unlikely
lowest_rq = NULL;
break;
}
/* if the prio of this runqueue changed, try again */
if (double_lock_balance(rq, lowest_rq)) {
p->nr_cpus_allowed > 1 &&
(dl_task(rq->curr) || rt_task(rq->curr)) &&
(rq->curr->nr_cpus_allowed < 2 ||
- rq->curr->prio <= p->prio))
+ rq->curr->prio <= p->prio)) {
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+ if (p->rt.sync_flag && rq->curr->prio < p->prio) {
+ p->rt.sync_flag = 0;
+ push_rt_tasks(rq);
+ }
+#else
push_rt_tasks(rq);
+#endif
+ }
+#ifdef CONFIG_SCHED_USE_FLUID_RT
+ p->rt.sync_flag = 0;
+#endif
}
/* Assumes rq->lock is held */
*/
static void switched_from_rt(struct rq *rq, struct task_struct *p)
{
+ detach_task_rt_rq(p);
/*
* If there are other RT tasks then we will reschedule
* and the scheduling of the other RT tasks will handle
GFP_KERNEL, cpu_to_node(i));
}
}
+#else
+void update_rt_load_avg(u64 now, struct sched_rt_entity *rt_se)
+{
+}
#endif /* CONFIG_SMP */
+extern void
+copy_sched_avg(struct sched_avg *from, struct sched_avg *to, unsigned int ratio);
+
/*
* When switching a task to RT, we may overload the runqueue
* with RT tasks. In this case we try to push them off to
*/
static void switched_to_rt(struct rq *rq, struct task_struct *p)
{
+ /* Copy fair sched avg into rt sched avg */
+ copy_sched_avg(&p->se.avg, &p->rt.avg, 100);
/*
* If we are already running, then there's nothing
* that needs to be done. But if we are not running
static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued)
{
struct sched_rt_entity *rt_se = &p->rt;
+ u64 now = rq_clock_task(rq);
update_curr_rt(rq);
- update_rt_rq_load_avg(rq_clock_task(rq), cpu_of(rq), &rq->rt, 1);
+
+ for_each_sched_rt_entity(rt_se)
+ update_rt_load_avg(now, rt_se);
watchdog(rq, p);
static void set_curr_task_rt(struct rq *rq)
{
struct task_struct *p = rq->curr;
+ struct sched_rt_entity *rt_se = &p->rt;
p->se.exec_start = rq_clock_task(rq);
+ for_each_sched_rt_entity(rt_se) {
+ struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
+ rt_rq->curr = rt_se;
+ }
+
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
}
#ifdef CONFIG_SMP
.select_task_rq = select_task_rq_rt,
+ .migrate_task_rq = migrate_task_rq_rt,
+ .task_dead = task_dead_rt,
.set_cpus_allowed = set_cpus_allowed_common,
.rq_online = rq_online_rt,
.rq_offline = rq_offline_rt,
.switched_to = switched_to_rt,
.update_curr = update_curr_rt,
+#ifdef CONFIG_RT_GROUP_SCHED
+ .task_change_group = task_change_group_rt,
+#endif
};
#ifdef CONFIG_RT_GROUP_SCHED