ANDROID: sched: Introduce Window Assisted Load Tracking (WALT)
author     Srivatsa Vaddagiri <vatsa@codeaurora.org>    Wed, 8 Nov 2017 19:56:30 +0000 (19:56 +0000)
committer  Chris Redpath <chris.redpath@arm.com>        Tue, 19 Dec 2017 19:45:37 +0000 (19:45 +0000)
This patch is a combination of many patches which have been
previously applied to Android/EAS kernels. As with other EAS
components, we are squashing these to present a more orderly
view of component history and relationships.

The original description of WALT was:

Use a window based view of time in order to track task demand
and CPU utilization in the scheduler.

WALT accounts for two major statistics: CPU load and cumulative task
demand.

CPU load, which accumulates each CPU's absolute execution time, is used
for CPU frequency guidance.  Cumulative task demand, which reflects
each CPU's instantaneous load at a given time, is used for task
placement decisions.

Use cumulative task demand for cpu_util() for task placement and
introduce cpu_util_freq() for frequency guidance.
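
Roughly, the WALT-only path of these two helpers looks like the sketch
below (simplified from the cpu_util()/cpu_util_freq() hunks in
kernel/sched/sched.h further down; the PELT fallback, the signed delta
handling and the sysctl/walt_disabled checks are omitted, and the
helper names here are illustrative only):

    /* Sketch: WALT path only, PELT fallback omitted. */
    static inline unsigned long walt_util(int cpu)      /* task placement */
    {
            u64 util = cpu_rq(cpu)->cumulative_runnable_avg <<
                                            SCHED_CAPACITY_SHIFT;

            do_div(util, walt_ravg_window);
            return min_t(unsigned long, util, capacity_orig_of(cpu));
    }

    static inline unsigned long walt_util_freq(int cpu) /* freq guidance */
    {
            u64 util = cpu_rq(cpu)->prev_runnable_sum <<
                                            SCHED_CAPACITY_SHIFT;

            do_div(util, walt_ravg_window);
            return min_t(unsigned long, util, capacity_orig_of(cpu));
    }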

This version includes the "cumulative window demand" statistic
which was originally described as:

Energy cost estimation has been a long-standing challenge for WALT
because WALT guides CPU frequency based on the CPU utilization of the
previous window.  Consequently it is not possible to know a newly
waking task's energy cost until the end of WALT's current window.

WALT already tracks 'Previous Runnable Sum' (prev_runnable_sum)
and 'Cumulative Runnable Average' (cr_avg).  They are designed for
CPU frequency guidance and task placement, but unfortunately neither
is suitable for energy cost estimation.

Using prev_runnable_sum for energy cost calculation would account CPU
and task energy solely based on activity in the previous window; for
example, any task that had no activity in the previous window would be
accounted as a 'zero energy cost' task.
Energy estimation with cr_avg is what energy_diff() relies on at
present.  However, cr_avg can only represent an instantaneous picture
of energy cost; for example, if a CPU was fully occupied for an entire
WALT window and became idle just before the window boundary, a wake-up
at that point makes energy_diff() account that CPU as a 'zero energy
cost' CPU.

As a result, introduce a new accounting unit, 'Cumulative Window
Demand'.  The cumulative window demand tracks the demand of all tasks
seen in the current window; it is neither instantaneous load nor actual
execution time.  Because a task's demand represents its estimated,
frequency-scaled execution time when it runs for a full window,
accumulating all tasks' demands gives the predicted CPU load at the
end of the window.

Thus we can estimate the CPU's frequency at the end of the current
WALT window with the cumulative window demand.
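
As an illustration only (the consumer of cum_window_demand, e.g.
energy_diff(), is not part of the hunks shown here, and this helper
name is hypothetical), the predicted end-of-window utilization can be
derived the same way as the other window sums:

    /* Hypothetical helper, not part of this patch. */
    static inline unsigned long cpu_util_cum(int cpu)
    {
            u64 util = cpu_rq(cpu)->cum_window_demand <<
                                            SCHED_CAPACITY_SHIFT;

            do_div(util, walt_ravg_window);
            return min_t(unsigned long, util, capacity_orig_of(cpu));
    }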

This version is extracted wholesale from the version currently
available in android-4.4 and android-4.9.

Window Assisted Load Tracking (WALT) implementation credits:
 Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa,
 Joonwoo Park, Pavan Kumar Kondeti, Olav Haugan,
 Srinath Sridharan, Vikram Mulukutla, Todd Kjos, Juri Lelli,
 John Stultz, Andres Oportus

Change-Id: If92dd9db843374073be59d2cb83febfef993b562
Signed-off-by: Chris Redpath <chris.redpath@arm.com>
15 files changed:
include/linux/sched.h
include/linux/sched/sysctl.h
include/trace/events/sched.h
init/Kconfig
kernel/sched/Makefile
kernel/sched/core.c
kernel/sched/cputime.c
kernel/sched/deadline.c
kernel/sched/fair.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sched/stop_task.c
kernel/sched/walt.c [new file with mode: 0644]
kernel/sched/walt.h [new file with mode: 0644]
kernel/sysctl.c

index fdf74f27acf1e9801051c5b6c22f1ec5008b9f42..30c35a2ecb46a8deb5e48ec98c4468f9f9bf62de 100644 (file)
@@ -166,6 +166,15 @@ struct task_group;
 /* Task command name length: */
 #define TASK_COMM_LEN                  16
 
+enum task_event {
+       PUT_PREV_TASK   = 0,
+       PICK_NEXT_TASK  = 1,
+       TASK_WAKE       = 2,
+       TASK_MIGRATE    = 3,
+       TASK_UPDATE     = 4,
+       IRQ_UPDATE      = 5,
+};
+
 extern cpumask_var_t                   cpu_isolated_map;
 
 extern void scheduler_tick(void);
@@ -410,6 +419,41 @@ struct sched_entity {
 #endif
 };
 
+#ifdef CONFIG_SCHED_WALT
+#define RAVG_HIST_SIZE_MAX  5
+
+/* ravg represents frequency scaled cpu-demand of tasks */
+struct ravg {
+       /*
+        * 'mark_start' marks the beginning of an event (task waking up, task
+        * starting to execute, task being preempted) within a window
+        *
+        * 'sum' represents how runnable a task has been within current
+        * window. It incorporates both running time and wait time and is
+        * frequency scaled.
+        *
+        * 'sum_history' keeps track of history of 'sum' seen over previous
+        * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
+        * ignored.
+        *
+        * 'demand' represents maximum sum seen over previous
+        * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
+        * demand for tasks.
+        *
+        * 'curr_window' represents task's contribution to cpu busy time
+        * statistics (rq->curr_runnable_sum) in current window
+        *
+        * 'prev_window' represents task's contribution to cpu busy time
+        * statistics (rq->prev_runnable_sum) in previous window
+        */
+       u64 mark_start;
+       u32 sum, demand;
+       u32 sum_history[RAVG_HIST_SIZE_MAX];
+       u32 curr_window, prev_window;
+       u16 active_windows;
+};
+#endif
+
 struct sched_rt_entity {
        struct list_head                run_list;
        unsigned long                   timeout;
@@ -562,6 +606,16 @@ struct task_struct {
        const struct sched_class        *sched_class;
        struct sched_entity             se;
        struct sched_rt_entity          rt;
+#ifdef CONFIG_SCHED_WALT
+       struct ravg ravg;
+       /*
+        * 'init_load_pct' represents the initial task load assigned to children
+        * of this task
+        */
+       u32 init_load_pct;
+       u64 last_sleep_ts;
+#endif
+
 #ifdef CONFIG_CGROUP_SCHED
        struct task_group               *sched_task_group;
 #endif
index 4824a948b4066c651602c3af00ab609dd286e461..e076ff8179b209dbe26e0f09e32e40d446f12c3a 100644 (file)
@@ -25,6 +25,12 @@ extern unsigned int sysctl_sched_sync_hint_enable;
 extern unsigned int sysctl_sched_cstate_aware;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int sysctl_sched_walt_init_task_load_pct;
+extern unsigned int sysctl_sched_walt_cpu_high_irqload;
+#endif
 
 enum sched_tunable_scaling {
        SCHED_TUNABLESCALING_NONE,
index 682c911f76b3d6aae7102f01e1bfbde19881d216..f881dcf92caed41af47d73836fc4d6e176c2f8e5 100644 (file)
@@ -667,6 +667,52 @@ TRACE_EVENT(sched_load_rt_rq,
                  __entry->util)
 );
 
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int walt_ravg_window;
+extern bool walt_disabled;
+#endif
+
+/*
+ * Tracepoint for accounting cpu root cfs_rq
+ */
+TRACE_EVENT(sched_load_avg_cpu,
+
+        TP_PROTO(int cpu, struct cfs_rq *cfs_rq),
+
+        TP_ARGS(cpu, cfs_rq),
+
+        TP_STRUCT__entry(
+                __field( int,   cpu                             )
+                __field( unsigned long, load_avg                )
+                __field( unsigned long, util_avg                )
+                __field( unsigned long, util_avg_pelt           )
+                __field( unsigned long, util_avg_walt           )
+        ),
+
+        TP_fast_assign(
+                __entry->cpu                    = cpu;
+                __entry->load_avg               = cfs_rq->avg.load_avg;
+                __entry->util_avg               = cfs_rq->avg.util_avg;
+                __entry->util_avg_pelt  = cfs_rq->avg.util_avg;
+                __entry->util_avg_walt  = 0;
+#ifdef CONFIG_SCHED_WALT
+                __entry->util_avg_walt  =
+                                cpu_rq(cpu)->prev_runnable_sum << SCHED_CAPACITY_SHIFT;
+                do_div(__entry->util_avg_walt, walt_ravg_window);
+                if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+                        __entry->util_avg               = __entry->util_avg_walt;
+#endif
+        ),
+
+        TP_printk("cpu=%d load_avg=%lu util_avg=%lu "
+                          "util_avg_pelt=%lu util_avg_walt=%lu",
+                  __entry->cpu, __entry->load_avg, __entry->util_avg,
+                  __entry->util_avg_pelt, __entry->util_avg_walt)
+);
+
+
 /*
  * Tracepoint for sched_entity load tracking:
  */
@@ -684,6 +730,8 @@ TRACE_EVENT(sched_load_se,
                __field(        pid_t,          pid                           )
                __field(        unsigned long,  load                          )
                __field(        unsigned long,  util                          )
+               __field(        unsigned long,  util_pelt                     )
+               __field(        unsigned long,  util_walt                     )
        ),
 
        TP_fast_assign(
@@ -698,11 +746,23 @@ TRACE_EVENT(sched_load_se,
                __entry->pid = p ? p->pid : -1;
                __entry->load = se->avg.load_avg;
                __entry->util = se->avg.util_avg;
+               __entry->util_pelt  = __entry->util;
+               __entry->util_walt  = 0;
+#ifdef CONFIG_SCHED_WALT
+               if (!se->my_q) {
+                       struct task_struct *p = container_of(se, struct task_struct, se);
+                       __entry->util_walt = p->ravg.demand;
+                       do_div(__entry->util_walt, walt_ravg_window >> SCHED_CAPACITY_SHIFT);
+                       if (!walt_disabled && sysctl_sched_use_walt_task_util)
+                               __entry->util = __entry->util_walt;
+               }
+#endif
        ),
 
-       TP_printk("cpu=%d path=%s comm=%s pid=%d load=%lu util=%lu",
+       TP_printk("cpu=%d path=%s comm=%s pid=%d load=%lu util=%lu util_pelt=%lu util_walt=%lu",
                  __entry->cpu, __get_str(path), __entry->comm,
-                 __entry->pid, __entry->load, __entry->util)
+                 __entry->pid, __entry->load, __entry->util,
+                 __entry->util_pelt, __entry->util_walt)
 );
 
 /*
@@ -921,6 +981,163 @@ TRACE_EVENT(sched_find_best_target,
                __entry->target)
 );
 
+#ifdef CONFIG_SCHED_WALT
+struct rq;
+
+TRACE_EVENT(walt_update_task_ravg,
+
+       TP_PROTO(struct task_struct *p, struct rq *rq, int evt,
+                                               u64 wallclock, u64 irqtime),
+
+       TP_ARGS(p, rq, evt, wallclock, irqtime),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(        pid_t,  cur_pid                 )
+               __field(        u64,    wallclock               )
+               __field(        u64,    mark_start              )
+               __field(        u64,    delta_m                 )
+               __field(        u64,    win_start               )
+               __field(        u64,    delta                   )
+               __field(        u64,    irqtime                 )
+               __array(    char,   evt, 16                     )
+               __field(unsigned int,   demand                  )
+               __field(unsigned int,   sum                     )
+               __field(         int,   cpu                     )
+               __field(        u64,    cs                      )
+               __field(        u64,    ps                      )
+               __field(        u32,    curr_window             )
+               __field(        u32,    prev_window             )
+               __field(        u64,    nt_cs                   )
+               __field(        u64,    nt_ps                   )
+               __field(        u32,    active_windows          )
+       ),
+
+       TP_fast_assign(
+                       static const char* walt_event_names[] =
+                       {
+                               "PUT_PREV_TASK",
+                               "PICK_NEXT_TASK",
+                               "TASK_WAKE",
+                               "TASK_MIGRATE",
+                               "TASK_UPDATE",
+                               "IRQ_UPDATE"
+                       };
+               __entry->wallclock      = wallclock;
+               __entry->win_start      = rq->window_start;
+               __entry->delta          = (wallclock - rq->window_start);
+               strcpy(__entry->evt, walt_event_names[evt]);
+               __entry->cpu            = rq->cpu;
+               __entry->cur_pid        = rq->curr->pid;
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->mark_start     = p->ravg.mark_start;
+               __entry->delta_m        = (wallclock - p->ravg.mark_start);
+               __entry->demand         = p->ravg.demand;
+               __entry->sum            = p->ravg.sum;
+               __entry->irqtime        = irqtime;
+               __entry->cs             = rq->curr_runnable_sum;
+               __entry->ps             = rq->prev_runnable_sum;
+               __entry->curr_window    = p->ravg.curr_window;
+               __entry->prev_window    = p->ravg.prev_window;
+               __entry->nt_cs          = rq->nt_curr_runnable_sum;
+               __entry->nt_ps          = rq->nt_prev_runnable_sum;
+               __entry->active_windows = p->ravg.active_windows;
+       ),
+
+       TP_printk("wallclock=%llu window_start=%llu delta=%llu event=%s cpu=%d cur_pid=%d pid=%d comm=%s"
+               " mark_start=%llu delta=%llu demand=%u sum=%u irqtime=%llu"
+               " curr_runnable_sum=%llu prev_runnable_sum=%llu cur_window=%u"
+               " prev_window=%u nt_curr_runnable_sum=%llu nt_prev_runnable_sum=%llu active_windows=%u",
+               __entry->wallclock, __entry->win_start, __entry->delta,
+               __entry->evt, __entry->cpu, __entry->cur_pid,
+               __entry->pid, __entry->comm, __entry->mark_start,
+               __entry->delta_m, __entry->demand,
+               __entry->sum, __entry->irqtime,
+               __entry->cs, __entry->ps,
+               __entry->curr_window, __entry->prev_window,
+               __entry->nt_cs, __entry->nt_ps,
+               __entry->active_windows
+               )
+);
+
+TRACE_EVENT(walt_update_history,
+
+       TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
+                       int evt),
+
+       TP_ARGS(rq, p, runtime, samples, evt),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(unsigned int,   runtime                 )
+               __field(         int,   samples                 )
+               __field(         int,   evt                     )
+               __field(         u64,   demand                  )
+               __field(unsigned int,   walt_avg                )
+               __field(unsigned int,   pelt_avg                )
+               __array(         u32,   hist, RAVG_HIST_SIZE_MAX)
+               __field(         int,   cpu                     )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->runtime        = runtime;
+               __entry->samples        = samples;
+               __entry->evt            = evt;
+               __entry->demand         = p->ravg.demand;
+               __entry->walt_avg = (__entry->demand << 10) / walt_ravg_window,
+               __entry->pelt_avg       = p->se.avg.util_avg;
+               memcpy(__entry->hist, p->ravg.sum_history,
+                                       RAVG_HIST_SIZE_MAX * sizeof(u32));
+               __entry->cpu            = rq->cpu;
+       ),
+
+       TP_printk("pid=%d comm=%s runtime=%u samples=%d event=%d demand=%llu ravg_window=%u"
+               " walt=%u pelt=%u hist0=%u hist1=%u hist2=%u hist3=%u hist4=%u cpu=%d",
+               __entry->pid, __entry->comm,
+               __entry->runtime, __entry->samples, __entry->evt,
+               __entry->demand,
+               walt_ravg_window,
+               __entry->walt_avg,
+               __entry->pelt_avg,
+               __entry->hist[0], __entry->hist[1],
+               __entry->hist[2], __entry->hist[3],
+               __entry->hist[4], __entry->cpu)
+);
+
+TRACE_EVENT(walt_migration_update_sum,
+
+       TP_PROTO(struct rq *rq, struct task_struct *p),
+
+       TP_ARGS(rq, p),
+
+       TP_STRUCT__entry(
+               __field(int,            cpu                     )
+               __field(int,            pid                     )
+               __field(        u64,    cs                      )
+               __field(        u64,    ps                      )
+               __field(        s64,    nt_cs                   )
+               __field(        s64,    nt_ps                   )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu            = cpu_of(rq);
+               __entry->cs             = rq->curr_runnable_sum;
+               __entry->ps             = rq->prev_runnable_sum;
+               __entry->nt_cs          = (s64)rq->nt_curr_runnable_sum;
+               __entry->nt_ps          = (s64)rq->nt_prev_runnable_sum;
+               __entry->pid            = p->pid;
+       ),
+
+       TP_printk("cpu=%d curr_runnable_sum=%llu prev_runnable_sum=%llu nt_curr_runnable_sum=%lld nt_prev_runnable_sum=%lld pid=%d",
+                 __entry->cpu, __entry->cs, __entry->ps,
+                 __entry->nt_cs, __entry->nt_ps, __entry->pid)
+);
+#endif /* CONFIG_SCHED_WALT */
 #endif /* CONFIG_SMP */
 #endif /* _TRACE_SCHED_H */
 
index de490d85a2eec9bd7fa576f8e0904d9358d5773b..09eb610b8dc35b209038a41187975ba62011a390 100644 (file)
@@ -400,6 +400,15 @@ config IRQ_TIME_ACCOUNTING
 
          If in doubt, say N here.
 
+config SCHED_WALT
+        bool "Support window based load tracking"
+        depends on SMP
+        help
+        This feature will allow the scheduler to maintain a tunable window
+       based set of metrics for tasks and runqueues. These metrics can be
+       used to guide task placement as well as task frequency requirements
+       for cpufreq governors.
+
 config BSD_PROCESS_ACCT
        bool "BSD Process Accounting"
        depends on MULTIUSER
index e6a7b240e0c544a491b434e45be388b6a3ff0b3a..b9207a9caa8616bde019367347305ce2a9de73f1 100644 (file)
@@ -21,6 +21,7 @@ obj-y += idle_task.o fair.o rt.o deadline.o
 obj-y += wait.o wait_bit.o swait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
 obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += energy.o
+obj-$(CONFIG_SCHED_WALT) += walt.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
index f48a8887695dd920f7245dcca3f06e662cd7447e..00c6f5784cdbbfd01c9813f6ee93326c473d0cb7 100644 (file)
@@ -39,6 +39,7 @@
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
+#include "walt.h"
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -1192,6 +1193,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                        p->sched_class->migrate_task_rq(p);
                p->se.nr_migrations++;
                perf_event_task_migrate(p);
+
+               walt_fixup_busy_time(p, new_cpu);
        }
 
        __set_task_cpu(p, new_cpu);
@@ -1956,6 +1959,26 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  *
  */
 
+#ifdef CONFIG_SMP
+#ifdef CONFIG_SCHED_WALT
+/* utility function to update walt signals at wakeup */
+static inline void walt_try_to_wake_up(struct task_struct *p)
+{
+       struct rq *rq = cpu_rq(task_cpu(p));
+       struct rq_flags rf;
+       u64 wallclock;
+
+       rq_lock_irqsave(rq, &rf);
+       wallclock = walt_ktime_clock();
+       walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+       walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+       rq_unlock_irqrestore(rq, &rf);
+}
+#else
+#define walt_try_to_wake_up(a) {}
+#endif
+#endif
+
 /**
  * try_to_wake_up - wake up a thread
  * @p: the thread to be awakened
@@ -2054,6 +2077,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
         */
        smp_cond_load_acquire(&p->on_cpu, !VAL);
 
+       walt_try_to_wake_up(p);
+
        p->sched_contributes_to_load = !!task_contributes_to_load(p);
        p->state = TASK_WAKING;
 
@@ -2124,6 +2149,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
        trace_sched_waking(p);
 
        if (!task_on_rq_queued(p)) {
+               u64 wallclock = walt_ktime_clock();
+
+               walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+               walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+
                if (p->in_iowait) {
                        delayacct_blkio_end();
                        atomic_dec(&rq->nr_iowait);
@@ -2176,7 +2206,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        p->se.prev_sum_exec_runtime     = 0;
        p->se.nr_migrations             = 0;
        p->se.vruntime                  = 0;
+#ifdef CONFIG_SCHED_WALT
+       p->last_sleep_ts                = 0;
+#endif
+
        INIT_LIST_HEAD(&p->se.group_node);
+       walt_init_new_task_load(p);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        p->se.cfs_rq                    = NULL;
@@ -2453,6 +2488,9 @@ void wake_up_new_task(struct task_struct *p)
        struct rq *rq;
 
        raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+
+       walt_init_new_task_load(p);
+
        p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
        /*
@@ -2470,6 +2508,8 @@ void wake_up_new_task(struct task_struct *p)
        post_init_entity_util_avg(&p->se);
 
        activate_task(rq, p, ENQUEUE_NOCLOCK);
+       walt_mark_task_starting(p);
+
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
        check_preempt_curr(rq, p, WF_FORK);
@@ -3023,6 +3063,9 @@ void scheduler_tick(void)
 
        rq_lock(rq, &rf);
 
+       walt_set_window_start(rq, &rf);
+       walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+                       walt_ktime_clock(), 0);
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        cpu_load_update_active(rq);
@@ -3294,6 +3337,7 @@ static void __sched notrace __schedule(bool preempt)
        struct rq_flags rf;
        struct rq *rq;
        int cpu;
+       u64 wallclock;
 
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
@@ -3349,10 +3393,17 @@ static void __sched notrace __schedule(bool preempt)
        }
 
        next = pick_next_task(rq, prev, &rf);
+       wallclock = walt_ktime_clock();
+       walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+       walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
        clear_tsk_need_resched(prev);
        clear_preempt_need_resched();
 
        if (likely(prev != next)) {
+#ifdef CONFIG_SCHED_WALT
+               if (!prev->on_rq)
+                       prev->last_sleep_ts = wallclock;
+#endif
                rq->nr_switches++;
                rq->curr = next;
                /*
@@ -5704,6 +5755,9 @@ int sched_cpu_dying(unsigned int cpu)
        sched_ttwu_pending();
 
        rq_lock_irqsave(rq, &rf);
+
+       walt_migrate_sync_cpu(cpu);
+
        if (rq->rd) {
                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                set_rq_offline(rq);
@@ -5929,6 +5983,11 @@ void __init sched_init(void)
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
                rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_WALT
+               rq->cur_irqload = 0;
+               rq->avg_irqload = 0;
+               rq->irqload_ts = 0;
+#endif
 
                INIT_LIST_HEAD(&rq->cfs_tasks);
 
index 14d2dbf97c531db0dd0c1bba3113a35cbe8d3bd9..029b505aca494c58ec224eeb1c6bead63450c169 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/context_tracking.h>
 #include <linux/sched/cputime.h>
 #include "sched.h"
+#include "walt.h"
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 
@@ -55,11 +56,18 @@ void irqtime_account_irq(struct task_struct *curr)
        struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
        s64 delta;
        int cpu;
+#ifdef CONFIG_SCHED_WALT
+       u64 wallclock;
+       bool account = true;
+#endif
 
        if (!sched_clock_irqtime)
                return;
 
        cpu = smp_processor_id();
+#ifdef CONFIG_SCHED_WALT
+       wallclock = sched_clock_cpu(cpu);
+#endif
        delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
        irqtime->irq_start_time += delta;
 
@@ -73,6 +81,13 @@ void irqtime_account_irq(struct task_struct *curr)
                irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
        else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
                irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+#ifdef CONFIG_SCHED_WALT
+       else
+               account = false;
+
+       if (account)
+               walt_account_irqtime(cpu, curr, delta, wallclock);
+#endif
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
index b60f444500fd852728d580fb82f47b56ee2a4209..f982a3fa825393319047764690dfe21ce3e15fb5 100644 (file)
@@ -20,6 +20,8 @@
 #include <linux/slab.h>
 #include <uapi/linux/sched/types.h>
 
+#include "walt.h"
+
 struct dl_bandwidth def_dl_bandwidth;
 
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -1290,6 +1292,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
        WARN_ON(!dl_prio(prio));
        dl_rq->dl_nr_running++;
        add_nr_running(rq_of_dl_rq(dl_rq), 1);
+       walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
 
        inc_dl_deadline(dl_rq, deadline);
        inc_dl_migration(dl_se, dl_rq);
@@ -1304,6 +1307,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
        WARN_ON(!dl_rq->dl_nr_running);
        dl_rq->dl_nr_running--;
        sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+       walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
 
        dec_dl_deadline(dl_rq, dl_se->deadline);
        dec_dl_migration(dl_se, dl_rq);
@@ -2018,7 +2022,9 @@ retry:
        deactivate_task(rq, next_task, 0);
        sub_running_bw(next_task->dl.dl_bw, &rq->dl);
        sub_rq_bw(next_task->dl.dl_bw, &rq->dl);
+       next_task->on_rq = TASK_ON_RQ_MIGRATING;
        set_task_cpu(next_task, later_rq->cpu);
+       next_task->on_rq = TASK_ON_RQ_QUEUED;
        add_rq_bw(next_task->dl.dl_bw, &later_rq->dl);
        add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
        activate_task(later_rq, next_task, 0);
@@ -2110,7 +2116,9 @@ static void pull_dl_task(struct rq *this_rq)
                        deactivate_task(src_rq, p, 0);
                        sub_running_bw(p->dl.dl_bw, &src_rq->dl);
                        sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
+                       p->on_rq = TASK_ON_RQ_MIGRATING;
                        set_task_cpu(p, this_cpu);
+                       p->on_rq = TASK_ON_RQ_QUEUED;
                        add_rq_bw(p->dl.dl_bw, &this_rq->dl);
                        add_running_bw(p->dl.dl_bw, &this_rq->dl);
                        activate_task(this_rq, p, 0);
index 44a7154dd220fb3d459817b61017de4140333397..ba43bd69da8fab3d828d8bb547114dd98aa14117 100644 (file)
@@ -38,6 +38,7 @@
 
 #include "sched.h"
 #include "tune.h"
+#include "walt.h"
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -110,6 +111,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity    = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost   = 500000UL;
 
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+    (10 * NSEC_PER_MSEC);
+#endif
+
 #ifdef CONFIG_SMP
 /*
  * For asym packing, by default the lower numbered cpu has higher priority.
@@ -1440,7 +1448,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 static unsigned long weighted_cpuload(struct rq *rq);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
-static unsigned long capacity_of(int cpu);
 
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
@@ -2821,6 +2828,9 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
                 * See cpu_util().
                 */
                cpufreq_update_util(rq, 0);
+#ifdef CONFIG_SMP
+               trace_sched_load_avg_cpu(cpu_of(rq), cfs_rq);
+#endif
        }
 }
 
@@ -4952,8 +4962,13 @@ static inline void update_overutilized_status(struct rq *rq)
                set_sd_overutilized(sd);
        rcu_read_unlock();
 }
+
+unsigned long boosted_cpu_util(int cpu);
 #else
+
 #define update_overutilized_status(rq) do {} while (0)
+#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -4991,6 +5006,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                flags = ENQUEUE_WAKEUP;
        }
@@ -4998,6 +5014,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -5029,7 +5046,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                add_nr_running(rq, 1);
                if (!task_new)
                        update_overutilized_status(rq);
+               walt_inc_cumulative_runnable_avg(rq, p);
        }
+
        hrtick_update(rq);
 }
 
@@ -5059,6 +5078,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
@@ -5078,6 +5098,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -5095,8 +5116,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         */
        schedtune_dequeue_task(p, cpu_of(rq));
 
-       if (!se)
+       if (!se) {
                sub_nr_running(rq, 1);
+               walt_dec_cumulative_runnable_avg(rq, p);
+       }
 
        hrtick_update(rq);
 }
@@ -5405,16 +5428,6 @@ static unsigned long target_load(int cpu, int type)
        return max(rq->cpu_load[type-1], total);
 }
 
-static unsigned long capacity_of(int cpu)
-{
-       return cpu_rq(cpu)->cpu_capacity;
-}
-
-static unsigned long capacity_orig_of(int cpu)
-{
-       return cpu_rq(cpu)->cpu_capacity_orig;
-}
-
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
@@ -5427,49 +5440,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
        return 0;
 }
 
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
-static unsigned long __cpu_util(int cpu, int delta)
-{
-       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
-       unsigned long capacity = capacity_orig_of(cpu);
-
-       delta += util;
-       if (delta < 0)
-               return 0;
-
-       return (delta >= capacity) ? capacity : delta;
-}
-
-static unsigned long cpu_util(int cpu)
-{
-       return __cpu_util(cpu, 0);
-}
-
 static void record_wakee(struct task_struct *p)
 {
        /*
@@ -6138,7 +6108,7 @@ schedtune_task_margin(struct task_struct *task)
 unsigned long
 boosted_cpu_util(int cpu)
 {
-       unsigned long util = cpu_util(cpu);
+       unsigned long util = cpu_util_freq(cpu);
        long margin = schedtune_cpu_margin(util, cpu);
 
        trace_sched_boost_cpu(cpu, util, margin);
@@ -6685,6 +6655,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 
 static inline unsigned long task_util(struct task_struct *p)
 {
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+               return (p->ravg.demand / (walt_ravg_window >> SCHED_CAPACITY_SHIFT));
+       }
+#endif
        return p->se.avg.util_avg;
 }
 
@@ -6696,6 +6671,16 @@ static int cpu_util_wake(int cpu, struct task_struct *p)
 {
        unsigned long util, capacity;
 
+#ifdef CONFIG_SCHED_WALT
+       /*
+        * WALT does not decay idle tasks in the same manner
+        * as PELT, so it makes little sense to subtract task
+        * utilization from cpu utilization. Instead just use
+        * cpu_util for this case.
+        */
+       if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+               return cpu_util(cpu);
+#endif
        /* Task has no contribution or is new */
        if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
                return cpu_util(cpu);
@@ -6759,6 +6744,9 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
                        if (!cpu_online(i))
                                continue;
 
+                       if (walt_cpu_high_irqload(i))
+                               continue;
+
                        /*
                         * p's blocked utilization is still accounted for on prev_cpu
                         * so prev_cpu will receive a negative bias due to the double
index 8874c09b8f7ad17c4b35be1cb5f48c5bb227ba19..0e5660145cc545d5404d2cd4bad300e25c128399 100644 (file)
@@ -9,6 +9,8 @@
 #include <linux/slab.h>
 #include <linux/irq_work.h>
 
+#include "walt.h"
+
 int sched_rr_timeslice = RR_TIMESLICE;
 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
 
@@ -1335,6 +1337,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
                rt_se->timeout = 0;
 
        enqueue_rt_entity(rt_se, flags);
+       walt_inc_cumulative_runnable_avg(rq, p);
 
        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
@@ -1346,6 +1349,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
        update_curr_rt(rq);
        dequeue_rt_entity(rt_se, flags);
+       walt_dec_cumulative_runnable_avg(rq, p);
 
        dequeue_pushable_task(rq, p);
 }
@@ -1863,7 +1867,9 @@ retry:
        }
 
        deactivate_task(rq, next_task, 0);
+       next_task->on_rq = TASK_ON_RQ_MIGRATING;
        set_task_cpu(next_task, lowest_rq->cpu);
+       next_task->on_rq = TASK_ON_RQ_QUEUED;
        activate_task(lowest_rq, next_task, 0);
        ret = 1;
 
@@ -2198,7 +2204,9 @@ static void pull_rt_task(struct rq *this_rq)
                        resched = true;
 
                        deactivate_task(src_rq, p, 0);
+                       p->on_rq = TASK_ON_RQ_MIGRATING;
                        set_task_cpu(p, this_cpu);
+                       p->on_rq = TASK_ON_RQ_QUEUED;
                        activate_task(this_rq, p, 0);
                        /*
                         * We continue with the search, just in
index 671af4025fd81e6bae62963e6ad53522dc4c472b..d767020c89150a0fc6a7179024ce740972844b1a 100644 (file)
@@ -483,6 +483,10 @@ struct cfs_rq {
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */
 
+#ifdef CONFIG_SCHED_WALT
+       u64 cumulative_runnable_avg;
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
        int runtime_enabled;
        u64 runtime_expires;
@@ -759,6 +763,20 @@ struct rq {
        u64 max_idle_balance_cost;
 #endif
 
+#ifdef CONFIG_SCHED_WALT
+       u64 cumulative_runnable_avg;
+       u64 window_start;
+       u64 curr_runnable_sum;
+       u64 prev_runnable_sum;
+       u64 nt_curr_runnable_sum;
+       u64 nt_prev_runnable_sum;
+       u64 cur_irqload;
+       u64 avg_irqload;
+       u64 irqload_ts;
+       u64 cum_window_demand;
+#endif /* CONFIG_SCHED_WALT */
+
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
        u64 prev_irq_time;
 #endif
@@ -1710,6 +1728,86 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 }
 #endif
 
+#ifdef CONFIG_SMP
+static inline unsigned long capacity_of(int cpu)
+{
+       return cpu_rq(cpu)->cpu_capacity;
+}
+
+static inline unsigned long capacity_orig_of(int cpu)
+{
+       return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int walt_ravg_window;
+extern bool walt_disabled;
+
+/*
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ */
+static inline unsigned long __cpu_util(int cpu, int delta)
+{
+       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+       unsigned long capacity = capacity_orig_of(cpu);
+
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+               util = cpu_rq(cpu)->cumulative_runnable_avg << SCHED_CAPACITY_SHIFT;
+               util = div_u64(util, walt_ravg_window);
+       }
+#endif
+       delta += util;
+       if (delta < 0)
+               return 0;
+
+       return (delta >= capacity) ? capacity : delta;
+}
+
+static inline unsigned long cpu_util(int cpu)
+{
+       return __cpu_util(cpu, 0);
+}
+
+static inline unsigned long cpu_util_freq(int cpu)
+{
+       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+       unsigned long capacity = capacity_orig_of(cpu);
+
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+               util = cpu_rq(cpu)->prev_runnable_sum << SCHED_CAPACITY_SHIFT;
+               do_div(util, walt_ravg_window);
+       }
+#endif
+       return (util >= capacity) ? capacity : util;
+}
+
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
        rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
@@ -2123,6 +2221,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #endif /* CONFIG_CPU_FREQ */
 
+#ifdef CONFIG_SCHED_WALT
+
+static inline bool
+walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
+{
+       return cpu_of(rq) == task_cpu(p) &&
+              (p->on_rq || p->last_sleep_ts >= rq->window_start);
+}
+
+#endif /* CONFIG_SCHED_WALT */
+
 #ifdef arch_scale_freq_capacity
 #ifndef arch_scale_freq_invariant
 #define arch_scale_freq_invariant()    (true)
index ea6de023db26f76848e123dc093956090c3ea68c..7ca03e528c45d3224f8662a09d2a485162e886cf 100644 (file)
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "sched.h"
+#include "walt.h"
 
 /*
  * stop-task scheduling class.
@@ -44,12 +45,14 @@ static void
 enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
        add_nr_running(rq, 1);
+       walt_inc_cumulative_runnable_avg(rq, p);
 }
 
 static void
 dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
        sub_nr_running(rq, 1);
+       walt_dec_cumulative_runnable_avg(rq, p);
 }
 
 static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
new file mode 100644 (file)
index 0000000..a6aef2f
--- /dev/null
@@ -0,0 +1,901 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ *
+ * Window Assisted Load Tracking (WALT) implementation credits:
+ * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
+ * Pavan Kumar Kondeti, Olav Haugan
+ *
+ * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
+ *             and Todd Kjos
+ */
+
+#include <linux/acpi.h>
+#include <linux/syscore_ops.h>
+#include <trace/events/sched.h>
+#include "sched.h"
+#include "walt.h"
+
+#define WINDOW_STATS_RECENT            0
+#define WINDOW_STATS_MAX               1
+#define WINDOW_STATS_MAX_RECENT_AVG    2
+#define WINDOW_STATS_AVG               3
+#define WINDOW_STATS_INVALID_POLICY    4
+
+#define EXITING_TASK_MARKER    0xdeaddead
+
+static __read_mostly unsigned int walt_ravg_hist_size = 5;
+static __read_mostly unsigned int walt_window_stats_policy =
+       WINDOW_STATS_MAX_RECENT_AVG;
+static __read_mostly unsigned int walt_account_wait_time = 1;
+static __read_mostly unsigned int walt_freq_account_wait_time = 0;
+static __read_mostly unsigned int walt_io_is_busy = 0;
+
+unsigned int sysctl_sched_walt_init_task_load_pct = 15;
+
+/* true -> use PELT based load stats, false -> use window-based load stats */
+bool __read_mostly walt_disabled = false;
+
+/*
+ * Window size (in ns). Adjust for the tick size so that the window
+ * rollover occurs just before the tick boundary.
+ */
+__read_mostly unsigned int walt_ravg_window =
+                                           (20000000 / TICK_NSEC) * TICK_NSEC;
+#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC)
+#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)
+
+static unsigned int sync_cpu;
+static ktime_t ktime_last;
+static __read_mostly bool walt_ktime_suspended;
+
+static unsigned int task_load(struct task_struct *p)
+{
+       return p->ravg.demand;
+}
+
+static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
+{
+       rq->cum_window_demand += delta;
+       if (unlikely((s64)rq->cum_window_demand < 0))
+               rq->cum_window_demand = 0;
+}
+
+void
+walt_inc_cumulative_runnable_avg(struct rq *rq,
+                                struct task_struct *p)
+{
+       rq->cumulative_runnable_avg += p->ravg.demand;
+
+       /*
+        * Add a task's contribution to the cumulative window demand when
+        *
+        * (1) task is enqueued with on_rq = 1 i.e migration,
+        *     prio/cgroup/class change.
+        * (2) task is waking for the first time in this window.
+        */
+       if (p->on_rq || (p->last_sleep_ts < rq->window_start))
+               fixup_cum_window_demand(rq, p->ravg.demand);
+}
+
+void
+walt_dec_cumulative_runnable_avg(struct rq *rq,
+                                struct task_struct *p)
+{
+       rq->cumulative_runnable_avg -= p->ravg.demand;
+       BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+
+       /*
+        * on_rq will be 1 for sleeping tasks. So check if the task
+        * is migrating or dequeuing in RUNNING state to change the
+        * prio/cgroup/class.
+        */
+       if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
+               fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
+}
+
+static void
+fixup_cumulative_runnable_avg(struct rq *rq,
+                             struct task_struct *p, u64 new_task_load)
+{
+       s64 task_load_delta = (s64)new_task_load - task_load(p);
+
+       rq->cumulative_runnable_avg += task_load_delta;
+       if ((s64)rq->cumulative_runnable_avg < 0)
+               panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
+                       task_load_delta, task_load(p));
+
+       fixup_cum_window_demand(rq, task_load_delta);
+}
+
+u64 walt_ktime_clock(void)
+{
+       if (unlikely(walt_ktime_suspended))
+               return ktime_to_ns(ktime_last);
+       return ktime_get_ns();
+}
+
+static void walt_resume(void)
+{
+       walt_ktime_suspended = false;
+}
+
+static int walt_suspend(void)
+{
+       ktime_last = ktime_get();
+       walt_ktime_suspended = true;
+       return 0;
+}
+
+static struct syscore_ops walt_syscore_ops = {
+       .resume = walt_resume,
+       .suspend = walt_suspend
+};
+
+static int __init walt_init_ops(void)
+{
+       register_syscore_ops(&walt_syscore_ops);
+       return 0;
+}
+late_initcall(walt_init_ops);
+
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+               struct task_struct *p)
+{
+       cfs_rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+               struct task_struct *p)
+{
+       cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
+}
+
+static int exiting_task(struct task_struct *p)
+{
+       if (p->flags & PF_EXITING) {
+               if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) {
+                       p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+               }
+               return 1;
+       }
+       return 0;
+}
+
+static int __init set_walt_ravg_window(char *str)
+{
+       unsigned int adj_window;
+       bool no_walt = walt_disabled;
+
+       get_option(&str, &walt_ravg_window);
+
+       /* Adjust for CONFIG_HZ */
+       adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC;
+
+       /* Warn if we're a bit too far away from the expected window size */
+       WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC,
+            "tick-adjusted window size %u, original was %u\n", adj_window,
+            walt_ravg_window);
+
+       walt_ravg_window = adj_window;
+
+       walt_disabled = walt_disabled ||
+                       (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+                        walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+
+       WARN(!no_walt && walt_disabled,
+            "invalid window size, disabling WALT\n");
+
+       return 0;
+}
+
+early_param("walt_ravg_window", set_walt_ravg_window);
+
+static void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+       s64 delta;
+       int nr_windows;
+
+       delta = wallclock - rq->window_start;
+       /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */
+       if (delta < 0) {
+               delta = 0;
+               WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
+       }
+
+       if (delta < walt_ravg_window)
+               return;
+
+       nr_windows = div64_u64(delta, walt_ravg_window);
+       rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+
+       rq->cum_window_demand = rq->cumulative_runnable_avg;
+}
+
+extern unsigned long capacity_curr_of(int cpu);
+/*
+ * Translate absolute delta time accounted on a CPU
+ * to a scale where 1024 is the capacity of the most
+ * capable CPU running at FMAX
+ */
+static u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+       unsigned long capcurr = capacity_curr_of(cpu_of(rq));
+
+       return (delta * capcurr) >> SCHED_CAPACITY_SHIFT;
+}
+
+static int cpu_is_waiting_on_io(struct rq *rq)
+{
+       if (!walt_io_is_busy)
+               return 0;
+
+       return atomic_read(&rq->nr_iowait);
+}
+
+void walt_account_irqtime(int cpu, struct task_struct *curr,
+                                u64 delta, u64 wallclock)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags, nr_windows;
+       u64 cur_jiffies_ts;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       /*
+        * cputime (wallclock) uses sched_clock so use the same here for
+        * consistency.
+        */
+       delta += sched_clock() - wallclock;
+       cur_jiffies_ts = get_jiffies_64();
+
+       if (is_idle_task(curr))
+               walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
+                                delta);
+
+       nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+       if (nr_windows) {
+               if (nr_windows < 10) {
+                       /* Decay CPU's irqload by 3/4 for each window. */
+                       rq->avg_irqload *= (3 * nr_windows);
+                       rq->avg_irqload = div64_u64(rq->avg_irqload,
+                                                   4 * nr_windows);
+               } else {
+                       rq->avg_irqload = 0;
+               }
+               rq->avg_irqload += rq->cur_irqload;
+               rq->cur_irqload = 0;
+       }
+
+       rq->cur_irqload += delta;
+       rq->irqload_ts = cur_jiffies_ts;
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+
+#define WALT_HIGH_IRQ_TIMEOUT 3
+
+u64 walt_irqload(int cpu) {
+       struct rq *rq = cpu_rq(cpu);
+       s64 delta;
+       delta = get_jiffies_64() - rq->irqload_ts;
+
+        /*
+        * Current context can be preempted by irq and rq->irqload_ts can be
+        * updated by irq context so that delta can be negative.
+        * But this is okay and we can safely return as this means there
+        * was recent irq occurrence.
+        */
+
+        if (delta < WALT_HIGH_IRQ_TIMEOUT)
+               return rq->avg_irqload;
+        else
+               return 0;
+}
+
+int walt_cpu_high_irqload(int cpu) {
+       return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
+}
+
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+                                    u64 irqtime, int event)
+{
+       if (is_idle_task(p)) {
+               /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
+               if (event == PICK_NEXT_TASK)
+                       return 0;
+
+               /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
+               return irqtime || cpu_is_waiting_on_io(rq);
+       }
+
+       if (event == TASK_WAKE)
+               return 0;
+
+       if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
+                                        event == TASK_UPDATE)
+               return 1;
+
+       /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+       return walt_freq_account_wait_time;
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+            int event, u64 wallclock, u64 irqtime)
+{
+       int new_window, nr_full_windows = 0;
+       int p_is_curr_task = (p == rq->curr);
+       u64 mark_start = p->ravg.mark_start;
+       u64 window_start = rq->window_start;
+       u32 window_size = walt_ravg_window;
+       u64 delta;
+
+       new_window = mark_start < window_start;
+       if (new_window) {
+               nr_full_windows = div64_u64((window_start - mark_start),
+                                               window_size);
+               if (p->ravg.active_windows < USHRT_MAX)
+                       p->ravg.active_windows++;
+       }
+
+       /* Handle per-task window rollover. We don't care about the idle
+        * task or exiting tasks. */
+       if (new_window && !is_idle_task(p) && !exiting_task(p)) {
+               u32 curr_window = 0;
+
+               if (!nr_full_windows)
+                       curr_window = p->ravg.curr_window;
+
+               p->ravg.prev_window = curr_window;
+               p->ravg.curr_window = 0;
+       }
+
+       if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
+               /* account_busy_for_cpu_time() = 0, so no update to the
+                * task's current window needs to be made. This could be
+                * for example
+                *
+                *   - a wakeup event on a task within the current
+                *     window (!new_window below, no action required),
+                *   - switching to a new task from idle (PICK_NEXT_TASK)
+                *     in a new window where irqtime is 0 and we aren't
+                *     waiting on IO */
+
+               if (!new_window)
+                       return;
+
+               /* A new window has started. The RQ demand must be rolled
+                * over if p is the current task. */
+               if (p_is_curr_task) {
+                       u64 prev_sum = 0;
+
+                       /* p is either idle task or an exiting task */
+                       if (!nr_full_windows) {
+                               prev_sum = rq->curr_runnable_sum;
+                       }
+
+                       rq->prev_runnable_sum = prev_sum;
+                       rq->curr_runnable_sum = 0;
+               }
+
+               return;
+       }
+
+       if (!new_window) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. No rollover
+                * since we didn't start a new window. An example of this is
+                * when a task starts execution and then sleeps within the
+                * same window. */
+
+               if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+                       delta = wallclock - mark_start;
+               else
+                       delta = irqtime;
+               delta = scale_exec_time(delta, rq);
+               rq->curr_runnable_sum += delta;
+               if (!is_idle_task(p) && !exiting_task(p))
+                       p->ravg.curr_window += delta;
+
+               return;
+       }
+
+       if (!p_is_curr_task) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. A new window
+                * has also started, but p is not the current task, so the
+                * window is not rolled over - just split up and account
+                * as necessary into curr and prev. The window is only
+                * rolled over when a new window is processed for the current
+                * task.
+                *
+                * Irqtime can't be accounted by a task that isn't the
+                * currently running task. */
+
+               if (!nr_full_windows) {
+                       /* A full window hasn't elapsed, account partial
+                        * contribution to previous completed window. */
+                       delta = scale_exec_time(window_start - mark_start, rq);
+                       if (!exiting_task(p))
+                               p->ravg.prev_window += delta;
+               } else {
+                       /* Since at least one full window has elapsed,
+                        * the contribution to the previous window is the
+                        * full window (window_size). */
+                       delta = scale_exec_time(window_size, rq);
+                       if (!exiting_task(p))
+                               p->ravg.prev_window = delta;
+               }
+               rq->prev_runnable_sum += delta;
+
+               /* Account piece of busy time in the current window. */
+               delta = scale_exec_time(wallclock - window_start, rq);
+               rq->curr_runnable_sum += delta;
+               if (!exiting_task(p))
+                       p->ravg.curr_window = delta;
+
+               return;
+       }
+
+       if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. A new window
+                * has started and p is the current task so rollover is
+                * needed. If any of the three conditions above is true,
+                * then this busy time can't be accounted as irqtime.
+                *
+                * Busy time for the idle task or exiting tasks need not
+                * be accounted.
+                *
+                * An example of this would be a task that starts execution
+                * and then sleeps once a new window has begun. */
+
+               if (!nr_full_windows) {
+                       /* A full window hasn't elapsed, account partial
+                        * contribution to previous completed window. */
+                       delta = scale_exec_time(window_start - mark_start, rq);
+                       if (!is_idle_task(p) && !exiting_task(p))
+                               p->ravg.prev_window += delta;
+
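+                       /* Carry the busy time already accumulated in what has
+                        * just become the previous window into the rollover
+                        * below. */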
+                       delta += rq->curr_runnable_sum;
+               } else {
+                       /* Since at least one full window has elapsed,
+                        * the contribution to the previous window is the
+                        * full window (window_size). */
+                       delta = scale_exec_time(window_size, rq);
+                       if (!is_idle_task(p) && !exiting_task(p))
+                               p->ravg.prev_window = delta;
+
+               }
+               /*
+                * Rollover of the runqueue runnable sums is done here by
+                * overwriting the values in prev_runnable_sum and
+                * curr_runnable_sum. The task's prev_window contribution was
+                * handled by the previous if-else statement; its curr_window
+                * contribution is accounted below.
+                */
+               rq->prev_runnable_sum = delta;
+
+               /* Account piece of busy time in the current window. */
+               delta = scale_exec_time(wallclock - window_start, rq);
+               rq->curr_runnable_sum = delta;
+               if (!is_idle_task(p) && !exiting_task(p))
+                       p->ravg.curr_window = delta;
+
+               return;
+       }
+
+       if (irqtime) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. A new window
+                * has started and p is the current task so rollover is
+                * needed. The current task must be the idle task because
+                * irqtime is not accounted for any other task.
+                *
+                * Irqtime will be accounted each time we process IRQ activity
+                * after a period of idleness, so we know the IRQ busy time
+                * started at wallclock - irqtime. */
+
+               BUG_ON(!is_idle_task(p));
+               mark_start = wallclock - irqtime;
+
+               /* Roll window over. If IRQ busy time was just in the current
+                * window then that is all that need be accounted. */
+               rq->prev_runnable_sum = rq->curr_runnable_sum;
+               if (mark_start > window_start) {
+                       rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+                       return;
+               }
+
+               /* The IRQ busy time spanned multiple windows. Process the
+                * busy time preceding the current window start first. */
+               delta = window_start - mark_start;
+               if (delta > window_size)
+                       delta = window_size;
+               delta = scale_exec_time(delta, rq);
+               rq->prev_runnable_sum += delta;
+
+               /* Process the remaining IRQ busy time in the current window. */
+               delta = wallclock - window_start;
+               rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+               return;
+       }
+
+       BUG();
+}
+
+static int account_busy_for_task_demand(struct task_struct *p, int event)
+{
+       /* No need to bother updating task demand for exiting tasks
+        * or the idle task. */
+       if (exiting_task(p) || is_idle_task(p))
+               return 0;
+
+       /* When a task is waking up it is completing a segment of non-busy
+        * time. Likewise, if wait time is not treated as busy time, then
+        * when a task begins to run or is migrated, it is not running and
+        * is completing a segment of non-busy time. */
+       if (event == TASK_WAKE || (!walt_account_wait_time &&
+                        (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+                        u32 runtime, int samples, int event)
+{
+       u32 *hist = &p->ravg.sum_history[0];
+       int ridx, widx;
+       u32 max = 0, avg, demand;
+       u64 sum = 0;
+
+       /* Ignore windows where task had no activity */
+       if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+               goto done;
+
+       /* Push new 'runtime' value onto stack */
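+       /* For example, with walt_ravg_hist_size == 5 and samples == 2, the
+        * loops below shift hist[0..2] into hist[2..4] (dropping the two
+        * oldest samples) and then store 'runtime' in hist[0] and hist[1]. */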
+       widx = walt_ravg_hist_size - 1;
+       ridx = widx - samples;
+       for (; ridx >= 0; --widx, --ridx) {
+               hist[widx] = hist[ridx];
+               sum += hist[widx];
+               if (hist[widx] > max)
+                       max = hist[widx];
+       }
+
+       for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
+               hist[widx] = runtime;
+               sum += hist[widx];
+               if (hist[widx] > max)
+                       max = hist[widx];
+       }
+
+       p->ravg.sum = 0;
+
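+       /* Turn the window history into a single demand figure according to
+        * the selected policy: most recent window, maximum, average, or
+        * max(average, most recent). */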
+       if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
+               demand = runtime;
+       } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
+               demand = max;
+       } else {
+               avg = div64_u64(sum, walt_ravg_hist_size);
+               if (walt_window_stats_policy == WINDOW_STATS_AVG)
+                       demand = avg;
+               else
+                       demand = max(avg, runtime);
+       }
+
+       /*
+        * A throttled deadline sched class task gets dequeued without
+        * changing p->on_rq. Since the dequeue already decrements the
+        * runqueue stats, avoid decrementing them here again.
+        *
+        * When window is rolled over, the cumulative window demand
+        * is reset to the cumulative runnable average (contribution from
+        * the tasks on the runqueue). If the current task is dequeued
+        * already, its demand is not included in the cumulative runnable
+        * average. So add the task demand separately to cumulative window
+        * demand.
+        */
+       if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
+               if (task_on_rq_queued(p))
+                       fixup_cumulative_runnable_avg(rq, p, demand);
+               else if (rq->curr == p)
+                       fixup_cum_window_demand(rq, demand);
+       }
+
+       p->ravg.demand = demand;
+
+done:
+       trace_walt_update_history(rq, p, runtime, samples, event);
+       return;
+}
+
+static void add_to_task_demand(struct rq *rq, struct task_struct *p,
+                               u64 delta)
+{
+       delta = scale_exec_time(delta, rq);
+       p->ravg.sum += delta;
+       if (unlikely(p->ravg.sum > walt_ravg_window))
+               p->ravg.sum = walt_ravg_window;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ *     a) Task event is contained within one window.
+ *             window_start < mark_start < wallclock
+ *
+ *             ws   ms  wc
+ *             |    |   |
+ *             V    V   V
+ *             |---------------|
+ *
+ *     In this case, p->ravg.sum is updated *iff* event is appropriate
+ *     (ex: event == PUT_PREV_TASK)
+ *
+ *     b) Task event spans two windows.
+ *             mark_start < window_start < wallclock
+ *
+ *             ms   ws   wc
+ *             |    |    |
+ *             V    V    V
+ *             -----|-------------------
+ *
+ *     In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ *     is appropriate, then a new window sample is recorded followed
+ *     by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ *     c) Task event spans more than two windows.
+ *
+ *             ms ws_tmp                          ws  wc
+ *             |  |                               |   |
+ *             V  V                               V   V
+ *             ---|-------|-------|-------|-------|------
+ *                |                               |
+ *                |<------ nr_full_windows ------>|
+ *
+ *     In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ *     event is appropriate, window sample of p->ravg.sum is recorded,
+ *     'nr_full_windows' samples of window_size are also recorded *iff*
+ *     event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ *     *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static void update_task_demand(struct task_struct *p, struct rq *rq,
+            int event, u64 wallclock)
+{
+       u64 mark_start = p->ravg.mark_start;
+       u64 delta, window_start = rq->window_start;
+       int new_window, nr_full_windows;
+       u32 window_size = walt_ravg_window;
+
+       new_window = mark_start < window_start;
+       if (!account_busy_for_task_demand(p, event)) {
+               if (new_window)
+                       /* If this time isn't being accounted as busy time,
+                        * and a new window has started, only the
+                        * previous window need be closed out with the
+                        * pre-existing demand. Multiple windows may have
+                        * elapsed, but since empty windows are dropped,
+                        * it is not necessary to account those. */
+                       update_history(rq, p, p->ravg.sum, 1, event);
+               return;
+       }
+
+       if (!new_window) {
+               /* The simple case - busy time contained within the existing
+                * window. */
+               add_to_task_demand(rq, p, wallclock - mark_start);
+               return;
+       }
+
+       /* Busy time spans at least two windows. Temporarily rewind
+        * window_start to first window boundary after mark_start. */
+       delta = window_start - mark_start;
+       nr_full_windows = div64_u64(delta, window_size);
+       window_start -= (u64)nr_full_windows * (u64)window_size;
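+       /* For example, with a 20ms window, a mark_start 5ms into window N and
+        * a wallclock 3ms into window N+3, nr_full_windows is 2 and
+        * window_start is temporarily moved back to the start of window N+1. */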
+
+       /* Process (window_start - mark_start) first */
+       add_to_task_demand(rq, p, window_start - mark_start);
+
+       /* Push new sample(s) into task's demand history */
+       update_history(rq, p, p->ravg.sum, 1, event);
+       if (nr_full_windows)
+               update_history(rq, p, scale_exec_time(window_size, rq),
+                              nr_full_windows, event);
+
+       /* Roll window_start forward again to the current window boundary to
+        * process any remainder in the current window. */
+       window_start += (u64)nr_full_windows * (u64)window_size;
+
+       /* Process (wallclock - window_start) next */
+       mark_start = window_start;
+       add_to_task_demand(rq, p, wallclock - mark_start);
+}
+
+/* Reflect task activity on its demand and cpu's busy time statistics */
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+            int event, u64 wallclock, u64 irqtime)
+{
+       if (walt_disabled || !rq->window_start)
+               return;
+
+       lockdep_assert_held(&rq->lock);
+
+       update_window_start(rq, wallclock);
+
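+       /* A task with no mark_start has no accounting baseline yet; skip the
+        * update and simply stamp mark_start below. */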
+       if (!p->ravg.mark_start)
+               goto done;
+
+       update_task_demand(p, rq, event, wallclock);
+       update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+
+done:
+       trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);
+
+       p->ravg.mark_start = wallclock;
+}
+
+static void reset_task_stats(struct task_struct *p)
+{
+       u32 sum = 0;
+
+       if (exiting_task(p))
+               sum = EXITING_TASK_MARKER;
+
+       memset(&p->ravg, 0, sizeof(struct ravg));
+       /* Retain EXITING_TASK marker */
+       p->ravg.sum_history[0] = sum;
+}
+
+void walt_mark_task_starting(struct task_struct *p)
+{
+       u64 wallclock;
+       struct rq *rq = task_rq(p);
+
+       if (!rq->window_start) {
+               reset_task_stats(p);
+               return;
+       }
+
+       wallclock = walt_ktime_clock();
+       p->ravg.mark_start = wallclock;
+}
+
+void walt_set_window_start(struct rq *rq, struct rq_flags *rf)
+{
+       if (likely(rq->window_start))
+               return;
+
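+       /* Window boundaries must line up across CPUs: the sync_cpu seeds its
+        * own window_start, while every other CPU copies the sync_cpu's value
+        * under both runqueue locks. */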
+       if (cpu_of(rq) == sync_cpu) {
+               rq->window_start = 1;
+       } else {
+               struct rq *sync_rq = cpu_rq(sync_cpu);
+               rq_unpin_lock(rq, rf);
+               double_lock_balance(rq, sync_rq);
+               rq->window_start = sync_rq->window_start;
+               rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+               raw_spin_unlock(&sync_rq->lock);
+               rq_repin_lock(rq, rf);
+       }
+
+       rq->curr->ravg.mark_start = rq->window_start;
+}
+
+void walt_migrate_sync_cpu(int cpu)
+{
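+       /* If 'cpu' was the sync_cpu, hand the role over to the current CPU. */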
+       if (cpu == sync_cpu)
+               sync_cpu = smp_processor_id();
+}
+
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+       struct rq *src_rq = task_rq(p);
+       struct rq *dest_rq = cpu_rq(new_cpu);
+       u64 wallclock;
+
+       if (!p->on_rq && p->state != TASK_WAKING)
+               return;
+
+       if (exiting_task(p))
+               return;
+
+       if (p->state == TASK_WAKING)
+               double_rq_lock(src_rq, dest_rq);
+
+       wallclock = walt_ktime_clock();
+
+       walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
+                       TASK_UPDATE, wallclock, 0);
+       walt_update_task_ravg(dest_rq->curr, dest_rq,
+                       TASK_UPDATE, wallclock, 0);
+
+       walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+
+       /*
+        * When a task is migrating during the wakeup, adjust
+        * the task's contribution towards cumulative window
+        * demand.
+        */
+       if (p->state == TASK_WAKING &&
+           p->last_sleep_ts >= src_rq->window_start) {
+               fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
+               fixup_cum_window_demand(dest_rq, p->ravg.demand);
+       }
+
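+       /* The task's contributions to the current and previous window busy
+        * sums follow it, keeping both runqueues' counters consistent. */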
+       if (p->ravg.curr_window) {
+               src_rq->curr_runnable_sum -= p->ravg.curr_window;
+               dest_rq->curr_runnable_sum += p->ravg.curr_window;
+       }
+
+       if (p->ravg.prev_window) {
+               src_rq->prev_runnable_sum -= p->ravg.prev_window;
+               dest_rq->prev_runnable_sum += p->ravg.prev_window;
+       }
+
+       if ((s64)src_rq->prev_runnable_sum < 0) {
+               src_rq->prev_runnable_sum = 0;
+               WARN_ON(1);
+       }
+       if ((s64)src_rq->curr_runnable_sum < 0) {
+               src_rq->curr_runnable_sum = 0;
+               WARN_ON(1);
+       }
+
+       trace_walt_migration_update_sum(src_rq, p);
+       trace_walt_migration_update_sum(dest_rq, p);
+
+       if (p->state == TASK_WAKING)
+               double_rq_unlock(src_rq, dest_rq);
+}
+
+void walt_init_new_task_load(struct task_struct *p)
+{
+       int i;
+       u32 init_load_windows =
+                       div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
+                          (u64)walt_ravg_window, 100);
+       u32 init_load_pct = current->init_load_pct;
+
+       p->init_load_pct = 0;
+       memset(&p->ravg, 0, sizeof(struct ravg));
+
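+       /* A new task has no history to learn from, so seed its demand with a
+        * percentage of the window: the parent's init_load_pct if it set one,
+        * otherwise the sched_walt_init_task_load_pct sysctl. */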
+       if (init_load_pct) {
+               init_load_windows = div64_u64((u64)init_load_pct *
+                         (u64)walt_ravg_window, 100);
+       }
+
+       p->ravg.demand = init_load_windows;
+       for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+               p->ravg.sum_history[i] = init_load_windows;
+}
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
new file mode 100644 (file)
index 0000000..c7a4ef9
--- /dev/null
+++ b/kernel/sched/walt.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __WALT_H
+#define __WALT_H
+
+#ifdef CONFIG_SCHED_WALT
+
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+               u64 wallclock, u64 irqtime);
+void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p);
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p);
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu);
+void walt_init_new_task_load(struct task_struct *p);
+void walt_mark_task_starting(struct task_struct *p);
+void walt_set_window_start(struct rq *rq, struct rq_flags *rf);
+void walt_migrate_sync_cpu(int cpu);
+u64 walt_ktime_clock(void);
+void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta,
+                                  u64 wallclock);
+
+u64 walt_irqload(int cpu);
+int walt_cpu_high_irqload(int cpu);
+
+#else /* CONFIG_SCHED_WALT */
+
+static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+               int event, u64 wallclock, u64 irqtime) { }
+static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p) { }
+static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p) { }
+static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void walt_init_new_task_load(struct task_struct *p) { }
+static inline void walt_mark_task_starting(struct task_struct *p) { }
+static inline void walt_set_window_start(struct rq *rq, struct rq_flags *rf) { }
+static inline void walt_migrate_sync_cpu(int cpu) { }
+static inline u64 walt_ktime_clock(void) { return 0; }
+
+#define walt_cpu_high_irqload(cpu) false
+
+#endif /* CONFIG_SCHED_WALT */
+
+extern bool walt_disabled;
+
+#endif
index 4d931de68a188ed6c2af9f43a5055735e27a5949..bc15c2a8fe7ba52b8d17fd953f4c86c56957df4a 100644 (file)
@@ -329,6 +329,36 @@ static struct ctl_table kern_table[] = {
                .extra1         = &min_sched_granularity_ns,
                .extra2         = &max_sched_granularity_ns,
        },
+#ifdef CONFIG_SCHED_WALT
+       {
+               .procname       = "sched_use_walt_cpu_util",
+               .data           = &sysctl_sched_use_walt_cpu_util,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_use_walt_task_util",
+               .data           = &sysctl_sched_use_walt_task_util,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_walt_init_task_load_pct",
+               .data           = &sysctl_sched_walt_init_task_load_pct,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_walt_cpu_high_irqload",
+               .data           = &sysctl_sched_walt_cpu_high_irqload,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+#endif
        {
                .procname       = "sched_sync_hint_enable",
                .data           = &sysctl_sched_sync_hint_enable,