ANDROID: sched: Introduce Window Assisted Load Tracking (WALT)
author     Srivatsa Vaddagiri <vatsa@codeaurora.org>    Wed, 8 Nov 2017 19:56:30 +0000 (19:56 +0000)
committer  Chris Redpath <chris.redpath@arm.com>        Tue, 19 Dec 2017 19:45:37 +0000 (19:45 +0000)
This patch is a combination of many patches which have been
previously applied to Android/EAS kernels. As with other EAS
components, we are squashing these to present a more orderly
view of component history and relationships.

The original description of WALT was:

Use a window based view of time in order to track task demand
and CPU utilization in the scheduler.

WALT accounts for two major statistics: CPU load and cumulative task
demand.

CPU load, which accumulates each CPU's absolute execution time, is used
for CPU frequency guidance.  Cumulative task demand, which reflects
each CPU's instantaneous load at a given time, is used for task
placement decisions.

Use cumulative task demand for cpu_util() for task placement and
introduce cpu_util_freq() for frequency guidance.
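
Roughly, the WALT-only path of these two helpers looks like the sketch
below (simplified from the cpu_util()/cpu_util_freq() hunks in
kernel/sched/sched.h further down; the PELT fallback, the signed delta
handling and the sysctl/walt_disabled checks are omitted, and the
helper names here are illustrative only):

    /* Sketch: WALT path only, PELT fallback omitted. */
    static inline unsigned long walt_util(int cpu)      /* task placement */
    {
            u64 util = cpu_rq(cpu)->cumulative_runnable_avg <<
                                            SCHED_CAPACITY_SHIFT;

            do_div(util, walt_ravg_window);
            return min_t(unsigned long, util, capacity_orig_of(cpu));
    }

    static inline unsigned long walt_util_freq(int cpu) /* freq guidance */
    {
            u64 util = cpu_rq(cpu)->prev_runnable_sum <<
                                            SCHED_CAPACITY_SHIFT;

            do_div(util, walt_ravg_window);
            return min_t(unsigned long, util, capacity_orig_of(cpu));
    }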

This version includes the "cumulative window demand" statistic
which was originally described as:

Energy cost estimation has been a long-standing challenge for WALT
because WALT guides CPU frequency based on the CPU utilization of the
previous window.  Consequently it is not possible to know a newly
waking task's energy cost until the end of WALT's current window.

WALT already tracks 'Previous Runnable Sum' (prev_runnable_sum)
and 'Cumulative Runnable Average' (cr_avg).  They are designed for
CPU frequency guidance and task placement, but unfortunately neither
is suitable for energy cost estimation.

Using prev_runnable_sum for energy cost calculation would account CPU
and task energy solely based on activity in the previous window; for
example, any task that had no activity in the previous window would be
accounted as a 'zero energy cost' task.
Energy estimation with cr_avg is what energy_diff() relies on at
present.  However, cr_avg can only represent an instantaneous picture
of energy cost; for example, if a CPU was fully occupied for an entire
WALT window and became idle just before the window boundary, a wake-up
at that point makes energy_diff() account that CPU as a 'zero energy
cost' CPU.

As a result, introduce a new accounting unit, 'Cumulative Window
Demand'.  The cumulative window demand tracks the demand of all tasks
seen in the current window; it is neither instantaneous load nor actual
execution time.  Because a task's demand represents its estimated,
frequency-scaled execution time when it runs for a full window,
accumulating all tasks' demands gives the predicted CPU load at the
end of the window.

Thus we can estimate the CPU's frequency at the end of the current
WALT window with the cumulative window demand.
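
As an illustration only (the consumer of cum_window_demand, e.g.
energy_diff(), is not part of the hunks shown here, and this helper
name is hypothetical), the predicted end-of-window utilization can be
derived the same way as the other window sums:

    /* Hypothetical helper, not part of this patch. */
    static inline unsigned long cpu_util_cum(int cpu)
    {
            u64 util = cpu_rq(cpu)->cum_window_demand <<
                                            SCHED_CAPACITY_SHIFT;

            do_div(util, walt_ravg_window);
            return min_t(unsigned long, util, capacity_orig_of(cpu));
    }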

This version is extracted wholesale from the version currently
available in android-4.4 and android-4.9.

Window Assisted Load Tracking (WALT) implementation credits:
 Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa,
 Joonwoo Park, Pavan Kumar Kondeti, Olav Haugan,
 Srinath Sridharan, Vikram Mulukutla, Todd Kjos, Juri Lelli,
 John Stultz, Andres Oportus

Change-Id: If92dd9db843374073be59d2cb83febfef993b562
Signed-off-by: Chris Redpath <chris.redpath@arm.com>
15 files changed:
include/linux/sched.h
include/linux/sched/sysctl.h
include/trace/events/sched.h
init/Kconfig
kernel/sched/Makefile
kernel/sched/core.c
kernel/sched/cputime.c
kernel/sched/deadline.c
kernel/sched/fair.c
kernel/sched/rt.c
kernel/sched/sched.h
kernel/sched/stop_task.c
kernel/sched/walt.c [new file with mode: 0644]
kernel/sched/walt.h [new file with mode: 0644]
kernel/sysctl.c

index fdf74f27acf1e9801051c5b6c22f1ec5008b9f42..30c35a2ecb46a8deb5e48ec98c4468f9f9bf62de 100644 (file)
@@ -166,6 +166,15 @@ struct task_group;
 /* Task command name length: */
 #define TASK_COMM_LEN                  16
 
+enum task_event {
+       PUT_PREV_TASK   = 0,
+       PICK_NEXT_TASK  = 1,
+       TASK_WAKE       = 2,
+       TASK_MIGRATE    = 3,
+       TASK_UPDATE     = 4,
+       IRQ_UPDATE      = 5,
+};
+
 extern cpumask_var_t                   cpu_isolated_map;
 
 extern void scheduler_tick(void);
@@ -410,6 +419,41 @@ struct sched_entity {
 #endif
 };
 
+#ifdef CONFIG_SCHED_WALT
+#define RAVG_HIST_SIZE_MAX  5
+
+/* ravg represents frequency scaled cpu-demand of tasks */
+struct ravg {
+       /*
+        * 'mark_start' marks the beginning of an event (task waking up, task
+        * starting to execute, task being preempted) within a window
+        *
+        * 'sum' represents how runnable a task has been within current
+        * window. It incorporates both running time and wait time and is
+        * frequency scaled.
+        *
+        * 'sum_history' keeps track of history of 'sum' seen over previous
+        * RAVG_HIST_SIZE windows. Windows where task was entirely sleeping are
+        * ignored.
+        *
+        * 'demand' represents maximum sum seen over previous
+        * sysctl_sched_ravg_hist_size windows. 'demand' could drive frequency
+        * demand for tasks.
+        *
+        * 'curr_window' represents task's contribution to cpu busy time
+        * statistics (rq->curr_runnable_sum) in current window
+        *
+        * 'prev_window' represents task's contribution to cpu busy time
+        * statistics (rq->prev_runnable_sum) in previous window
+        */
+       u64 mark_start;
+       u32 sum, demand;
+       u32 sum_history[RAVG_HIST_SIZE_MAX];
+       u32 curr_window, prev_window;
+       u16 active_windows;
+};
+#endif
+
 struct sched_rt_entity {
        struct list_head                run_list;
        unsigned long                   timeout;
@@ -562,6 +606,16 @@ struct task_struct {
        const struct sched_class        *sched_class;
        struct sched_entity             se;
        struct sched_rt_entity          rt;
+#ifdef CONFIG_SCHED_WALT
+       struct ravg ravg;
+       /*
+        * 'init_load_pct' represents the initial task load assigned to children
+        * of this task
+        */
+       u32 init_load_pct;
+       u64 last_sleep_ts;
+#endif
+
 #ifdef CONFIG_CGROUP_SCHED
        struct task_group               *sched_task_group;
 #endif
index 4824a948b4066c651602c3af00ab609dd286e461..e076ff8179b209dbe26e0f09e32e40d446f12c3a 100644 (file)
@@ -25,6 +25,12 @@ extern unsigned int sysctl_sched_sync_hint_enable;
 extern unsigned int sysctl_sched_cstate_aware;
 extern unsigned int sysctl_sched_wakeup_granularity;
 extern unsigned int sysctl_sched_child_runs_first;
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int sysctl_sched_walt_init_task_load_pct;
+extern unsigned int sysctl_sched_walt_cpu_high_irqload;
+#endif
 
 enum sched_tunable_scaling {
        SCHED_TUNABLESCALING_NONE,
index 682c911f76b3d6aae7102f01e1bfbde19881d216..f881dcf92caed41af47d73836fc4d6e176c2f8e5 100644 (file)
@@ -667,6 +667,52 @@ TRACE_EVENT(sched_load_rt_rq,
                  __entry->util)
 );
 
+#ifdef CONFIG_SCHED_WALT
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int sysctl_sched_use_walt_task_util;
+extern unsigned int walt_ravg_window;
+extern bool walt_disabled;
+#endif
+
+/*
+ * Tracepoint for accounting cpu root cfs_rq
+ */
+TRACE_EVENT(sched_load_avg_cpu,
+
+        TP_PROTO(int cpu, struct cfs_rq *cfs_rq),
+
+        TP_ARGS(cpu, cfs_rq),
+
+        TP_STRUCT__entry(
+                __field( int,   cpu                             )
+                __field( unsigned long, load_avg                )
+                __field( unsigned long, util_avg                )
+                __field( unsigned long, util_avg_pelt           )
+                __field( unsigned long, util_avg_walt           )
+        ),
+
+        TP_fast_assign(
+                __entry->cpu                    = cpu;
+                __entry->load_avg               = cfs_rq->avg.load_avg;
+                __entry->util_avg               = cfs_rq->avg.util_avg;
+                __entry->util_avg_pelt  = cfs_rq->avg.util_avg;
+                __entry->util_avg_walt  = 0;
+#ifdef CONFIG_SCHED_WALT
+                __entry->util_avg_walt  =
+                                cpu_rq(cpu)->prev_runnable_sum << SCHED_CAPACITY_SHIFT;
+                do_div(__entry->util_avg_walt, walt_ravg_window);
+                if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+                        __entry->util_avg               = __entry->util_avg_walt;
+#endif
+        ),
+
+        TP_printk("cpu=%d load_avg=%lu util_avg=%lu "
+                          "util_avg_pelt=%lu util_avg_walt=%lu",
+                  __entry->cpu, __entry->load_avg, __entry->util_avg,
+                  __entry->util_avg_pelt, __entry->util_avg_walt)
+);
+
+
 /*
  * Tracepoint for sched_entity load tracking:
  */
@@ -684,6 +730,8 @@ TRACE_EVENT(sched_load_se,
                __field(        pid_t,          pid                           )
                __field(        unsigned long,  load                          )
                __field(        unsigned long,  util                          )
+               __field(        unsigned long,  util_pelt                     )
+               __field(        unsigned long,  util_walt                     )
        ),
 
        TP_fast_assign(
@@ -698,11 +746,23 @@ TRACE_EVENT(sched_load_se,
                __entry->pid = p ? p->pid : -1;
                __entry->load = se->avg.load_avg;
                __entry->util = se->avg.util_avg;
+               __entry->util_pelt  = __entry->util;
+               __entry->util_walt  = 0;
+#ifdef CONFIG_SCHED_WALT
+               if (!se->my_q) {
+                       struct task_struct *p = container_of(se, struct task_struct, se);
+                       __entry->util_walt = p->ravg.demand;
+                       do_div(__entry->util_walt, walt_ravg_window >> SCHED_CAPACITY_SHIFT);
+                       if (!walt_disabled && sysctl_sched_use_walt_task_util)
+                               __entry->util = __entry->util_walt;
+               }
+#endif
        ),
 
-       TP_printk("cpu=%d path=%s comm=%s pid=%d load=%lu util=%lu",
+       TP_printk("cpu=%d path=%s comm=%s pid=%d load=%lu util=%lu util_pelt=%lu util_walt=%lu",
                  __entry->cpu, __get_str(path), __entry->comm,
-                 __entry->pid, __entry->load, __entry->util)
+                 __entry->pid, __entry->load, __entry->util,
+                 __entry->util_pelt, __entry->util_walt)
 );
 
 /*
@@ -921,6 +981,163 @@ TRACE_EVENT(sched_find_best_target,
                __entry->target)
 );
 
+#ifdef CONFIG_SCHED_WALT
+struct rq;
+
+TRACE_EVENT(walt_update_task_ravg,
+
+       TP_PROTO(struct task_struct *p, struct rq *rq, int evt,
+                                               u64 wallclock, u64 irqtime),
+
+       TP_ARGS(p, rq, evt, wallclock, irqtime),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(        pid_t,  cur_pid                 )
+               __field(        u64,    wallclock               )
+               __field(        u64,    mark_start              )
+               __field(        u64,    delta_m                 )
+               __field(        u64,    win_start               )
+               __field(        u64,    delta                   )
+               __field(        u64,    irqtime                 )
+               __array(    char,   evt, 16                     )
+               __field(unsigned int,   demand                  )
+               __field(unsigned int,   sum                     )
+               __field(         int,   cpu                     )
+               __field(        u64,    cs                      )
+               __field(        u64,    ps                      )
+               __field(        u32,    curr_window             )
+               __field(        u32,    prev_window             )
+               __field(        u64,    nt_cs                   )
+               __field(        u64,    nt_ps                   )
+               __field(        u32,    active_windows          )
+       ),
+
+       TP_fast_assign(
+                       static const char* walt_event_names[] =
+                       {
+                               "PUT_PREV_TASK",
+                               "PICK_NEXT_TASK",
+                               "TASK_WAKE",
+                               "TASK_MIGRATE",
+                               "TASK_UPDATE",
+                               "IRQ_UPDATE"
+                       };
+               __entry->wallclock      = wallclock;
+               __entry->win_start      = rq->window_start;
+               __entry->delta          = (wallclock - rq->window_start);
+               strcpy(__entry->evt, walt_event_names[evt]);
+               __entry->cpu            = rq->cpu;
+               __entry->cur_pid        = rq->curr->pid;
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->mark_start     = p->ravg.mark_start;
+               __entry->delta_m        = (wallclock - p->ravg.mark_start);
+               __entry->demand         = p->ravg.demand;
+               __entry->sum            = p->ravg.sum;
+               __entry->irqtime        = irqtime;
+               __entry->cs             = rq->curr_runnable_sum;
+               __entry->ps             = rq->prev_runnable_sum;
+               __entry->curr_window    = p->ravg.curr_window;
+               __entry->prev_window    = p->ravg.prev_window;
+               __entry->nt_cs          = rq->nt_curr_runnable_sum;
+               __entry->nt_ps          = rq->nt_prev_runnable_sum;
+               __entry->active_windows = p->ravg.active_windows;
+       ),
+
+       TP_printk("wallclock=%llu window_start=%llu delta=%llu event=%s cpu=%d cur_pid=%d pid=%d comm=%s"
+               " mark_start=%llu delta=%llu demand=%u sum=%u irqtime=%llu"
+               " curr_runnable_sum=%llu prev_runnable_sum=%llu cur_window=%u"
+               " prev_window=%u nt_curr_runnable_sum=%llu nt_prev_runnable_sum=%llu active_windows=%u",
+               __entry->wallclock, __entry->win_start, __entry->delta,
+               __entry->evt, __entry->cpu, __entry->cur_pid,
+               __entry->pid, __entry->comm, __entry->mark_start,
+               __entry->delta_m, __entry->demand,
+               __entry->sum, __entry->irqtime,
+               __entry->cs, __entry->ps,
+               __entry->curr_window, __entry->prev_window,
+               __entry->nt_cs, __entry->nt_ps,
+               __entry->active_windows
+               )
+);
+
+TRACE_EVENT(walt_update_history,
+
+       TP_PROTO(struct rq *rq, struct task_struct *p, u32 runtime, int samples,
+                       int evt),
+
+       TP_ARGS(rq, p, runtime, samples, evt),
+
+       TP_STRUCT__entry(
+               __array(        char,   comm,   TASK_COMM_LEN   )
+               __field(        pid_t,  pid                     )
+               __field(unsigned int,   runtime                 )
+               __field(         int,   samples                 )
+               __field(         int,   evt                     )
+               __field(         u64,   demand                  )
+               __field(unsigned int,   walt_avg                )
+               __field(unsigned int,   pelt_avg                )
+               __array(         u32,   hist, RAVG_HIST_SIZE_MAX)
+               __field(         int,   cpu                     )
+       ),
+
+       TP_fast_assign(
+               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
+               __entry->pid            = p->pid;
+               __entry->runtime        = runtime;
+               __entry->samples        = samples;
+               __entry->evt            = evt;
+               __entry->demand         = p->ravg.demand;
+               __entry->walt_avg = (__entry->demand << 10) / walt_ravg_window,
+               __entry->pelt_avg       = p->se.avg.util_avg;
+               memcpy(__entry->hist, p->ravg.sum_history,
+                                       RAVG_HIST_SIZE_MAX * sizeof(u32));
+               __entry->cpu            = rq->cpu;
+       ),
+
+       TP_printk("pid=%d comm=%s runtime=%u samples=%d event=%d demand=%llu ravg_window=%u"
+               " walt=%u pelt=%u hist0=%u hist1=%u hist2=%u hist3=%u hist4=%u cpu=%d",
+               __entry->pid, __entry->comm,
+               __entry->runtime, __entry->samples, __entry->evt,
+               __entry->demand,
+               walt_ravg_window,
+               __entry->walt_avg,
+               __entry->pelt_avg,
+               __entry->hist[0], __entry->hist[1],
+               __entry->hist[2], __entry->hist[3],
+               __entry->hist[4], __entry->cpu)
+);
+
+TRACE_EVENT(walt_migration_update_sum,
+
+       TP_PROTO(struct rq *rq, struct task_struct *p),
+
+       TP_ARGS(rq, p),
+
+       TP_STRUCT__entry(
+               __field(int,            cpu                     )
+               __field(int,            pid                     )
+               __field(        u64,    cs                      )
+               __field(        u64,    ps                      )
+               __field(        s64,    nt_cs                   )
+               __field(        s64,    nt_ps                   )
+       ),
+
+       TP_fast_assign(
+               __entry->cpu            = cpu_of(rq);
+               __entry->cs             = rq->curr_runnable_sum;
+               __entry->ps             = rq->prev_runnable_sum;
+               __entry->nt_cs          = (s64)rq->nt_curr_runnable_sum;
+               __entry->nt_ps          = (s64)rq->nt_prev_runnable_sum;
+               __entry->pid            = p->pid;
+       ),
+
+       TP_printk("cpu=%d curr_runnable_sum=%llu prev_runnable_sum=%llu nt_curr_runnable_sum=%lld nt_prev_runnable_sum=%lld pid=%d",
+                 __entry->cpu, __entry->cs, __entry->ps,
+                 __entry->nt_cs, __entry->nt_ps, __entry->pid)
+);
+#endif /* CONFIG_SCHED_WALT */
 #endif /* CONFIG_SMP */
 #endif /* _TRACE_SCHED_H */
 
index de490d85a2eec9bd7fa576f8e0904d9358d5773b..09eb610b8dc35b209038a41187975ba62011a390 100644 (file)
@@ -400,6 +400,15 @@ config IRQ_TIME_ACCOUNTING
 
          If in doubt, say N here.
 
+config SCHED_WALT
+        bool "Support window based load tracking"
+        depends on SMP
+        help
+        This feature will allow the scheduler to maintain a tunable window
+       based set of metrics for tasks and runqueues. These metrics can be
+       used to guide task placement as well as task frequency requirements
+       for cpufreq governors.
+
 config BSD_PROCESS_ACCT
        bool "BSD Process Accounting"
        depends on MULTIUSER
index e6a7b240e0c544a491b434e45be388b6a3ff0b3a..b9207a9caa8616bde019367347305ce2a9de73f1 100644 (file)
@@ -21,6 +21,7 @@ obj-y += idle_task.o fair.o rt.o deadline.o
 obj-y += wait.o wait_bit.o swait.o completion.o idle.o
 obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o
 obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += energy.o
+obj-$(CONFIG_SCHED_WALT) += walt.o
 obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
 obj-$(CONFIG_SCHEDSTATS) += stats.o
 obj-$(CONFIG_SCHED_DEBUG) += debug.o
index f48a8887695dd920f7245dcca3f06e662cd7447e..00c6f5784cdbbfd01c9813f6ee93326c473d0cb7 100644 (file)
@@ -39,6 +39,7 @@
 
 #define CREATE_TRACE_POINTS
 #include <trace/events/sched.h>
+#include "walt.h"
 
 DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues);
 
@@ -1192,6 +1193,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu)
                        p->sched_class->migrate_task_rq(p);
                p->se.nr_migrations++;
                perf_event_task_migrate(p);
+
+               walt_fixup_busy_time(p, new_cpu);
        }
 
        __set_task_cpu(p, new_cpu);
@@ -1956,6 +1959,26 @@ static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags)
  *
  */
 
+#ifdef CONFIG_SMP
+#ifdef CONFIG_SCHED_WALT
+/* utility function to update walt signals at wakeup */
+static inline void walt_try_to_wake_up(struct task_struct *p)
+{
+       struct rq *rq = cpu_rq(task_cpu(p));
+       struct rq_flags rf;
+       u64 wallclock;
+
+       rq_lock_irqsave(rq, &rf);
+       wallclock = walt_ktime_clock();
+       walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+       walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+       rq_unlock_irqrestore(rq, &rf);
+}
+#else
+#define walt_try_to_wake_up(a) {}
+#endif
+#endif
+
 /**
  * try_to_wake_up - wake up a thread
  * @p: the thread to be awakened
@@ -2054,6 +2077,8 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags,
         */
        smp_cond_load_acquire(&p->on_cpu, !VAL);
 
+       walt_try_to_wake_up(p);
+
        p->sched_contributes_to_load = !!task_contributes_to_load(p);
        p->state = TASK_WAKING;
 
@@ -2124,6 +2149,11 @@ static void try_to_wake_up_local(struct task_struct *p, struct rq_flags *rf)
        trace_sched_waking(p);
 
        if (!task_on_rq_queued(p)) {
+               u64 wallclock = walt_ktime_clock();
+
+               walt_update_task_ravg(rq->curr, rq, TASK_UPDATE, wallclock, 0);
+               walt_update_task_ravg(p, rq, TASK_WAKE, wallclock, 0);
+
                if (p->in_iowait) {
                        delayacct_blkio_end();
                        atomic_dec(&rq->nr_iowait);
@@ -2176,7 +2206,12 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        p->se.prev_sum_exec_runtime     = 0;
        p->se.nr_migrations             = 0;
        p->se.vruntime                  = 0;
+#ifdef CONFIG_SCHED_WALT
+       p->last_sleep_ts                = 0;
+#endif
+
        INIT_LIST_HEAD(&p->se.group_node);
+       walt_init_new_task_load(p);
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
        p->se.cfs_rq                    = NULL;
@@ -2453,6 +2488,9 @@ void wake_up_new_task(struct task_struct *p)
        struct rq *rq;
 
        raw_spin_lock_irqsave(&p->pi_lock, rf.flags);
+
+       walt_init_new_task_load(p);
+
        p->state = TASK_RUNNING;
 #ifdef CONFIG_SMP
        /*
@@ -2470,6 +2508,8 @@ void wake_up_new_task(struct task_struct *p)
        post_init_entity_util_avg(&p->se);
 
        activate_task(rq, p, ENQUEUE_NOCLOCK);
+       walt_mark_task_starting(p);
+
        p->on_rq = TASK_ON_RQ_QUEUED;
        trace_sched_wakeup_new(p);
        check_preempt_curr(rq, p, WF_FORK);
@@ -3023,6 +3063,9 @@ void scheduler_tick(void)
 
        rq_lock(rq, &rf);
 
+       walt_set_window_start(rq, &rf);
+       walt_update_task_ravg(rq->curr, rq, TASK_UPDATE,
+                       walt_ktime_clock(), 0);
        update_rq_clock(rq);
        curr->sched_class->task_tick(rq, curr, 0);
        cpu_load_update_active(rq);
@@ -3294,6 +3337,7 @@ static void __sched notrace __schedule(bool preempt)
        struct rq_flags rf;
        struct rq *rq;
        int cpu;
+       u64 wallclock;
 
        cpu = smp_processor_id();
        rq = cpu_rq(cpu);
@@ -3349,10 +3393,17 @@ static void __sched notrace __schedule(bool preempt)
        }
 
        next = pick_next_task(rq, prev, &rf);
+       wallclock = walt_ktime_clock();
+       walt_update_task_ravg(prev, rq, PUT_PREV_TASK, wallclock, 0);
+       walt_update_task_ravg(next, rq, PICK_NEXT_TASK, wallclock, 0);
        clear_tsk_need_resched(prev);
        clear_preempt_need_resched();
 
        if (likely(prev != next)) {
+#ifdef CONFIG_SCHED_WALT
+               if (!prev->on_rq)
+                       prev->last_sleep_ts = wallclock;
+#endif
                rq->nr_switches++;
                rq->curr = next;
                /*
@@ -5704,6 +5755,9 @@ int sched_cpu_dying(unsigned int cpu)
        sched_ttwu_pending();
 
        rq_lock_irqsave(rq, &rf);
+
+       walt_migrate_sync_cpu(cpu);
+
        if (rq->rd) {
                BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span));
                set_rq_offline(rq);
@@ -5929,6 +5983,11 @@ void __init sched_init(void)
                rq->idle_stamp = 0;
                rq->avg_idle = 2*sysctl_sched_migration_cost;
                rq->max_idle_balance_cost = sysctl_sched_migration_cost;
+#ifdef CONFIG_SCHED_WALT
+               rq->cur_irqload = 0;
+               rq->avg_irqload = 0;
+               rq->irqload_ts = 0;
+#endif
 
                INIT_LIST_HEAD(&rq->cfs_tasks);
 
index 14d2dbf97c531db0dd0c1bba3113a35cbe8d3bd9..029b505aca494c58ec224eeb1c6bead63450c169 100644 (file)
@@ -6,6 +6,7 @@
 #include <linux/context_tracking.h>
 #include <linux/sched/cputime.h>
 #include "sched.h"
+#include "walt.h"
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 
@@ -55,11 +56,18 @@ void irqtime_account_irq(struct task_struct *curr)
        struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
        s64 delta;
        int cpu;
+#ifdef CONFIG_SCHED_WALT
+       u64 wallclock;
+       bool account = true;
+#endif
 
        if (!sched_clock_irqtime)
                return;
 
        cpu = smp_processor_id();
+#ifdef CONFIG_SCHED_WALT
+       wallclock = sched_clock_cpu(cpu);
+#endif
        delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
        irqtime->irq_start_time += delta;
 
@@ -73,6 +81,13 @@ void irqtime_account_irq(struct task_struct *curr)
                irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
        else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
                irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
+#ifdef CONFIG_SCHED_WALT
+       else
+               account = false;
+
+       if (account)
+               walt_account_irqtime(cpu, curr, delta, wallclock);
+#endif
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
index b60f444500fd852728d580fb82f47b56ee2a4209..f982a3fa825393319047764690dfe21ce3e15fb5 100644 (file)
@@ -20,6 +20,8 @@
 #include <linux/slab.h>
 #include <uapi/linux/sched/types.h>
 
+#include "walt.h"
+
 struct dl_bandwidth def_dl_bandwidth;
 
 static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se)
@@ -1290,6 +1292,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
        WARN_ON(!dl_prio(prio));
        dl_rq->dl_nr_running++;
        add_nr_running(rq_of_dl_rq(dl_rq), 1);
+       walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
 
        inc_dl_deadline(dl_rq, deadline);
        inc_dl_migration(dl_se, dl_rq);
@@ -1304,6 +1307,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq)
        WARN_ON(!dl_rq->dl_nr_running);
        dl_rq->dl_nr_running--;
        sub_nr_running(rq_of_dl_rq(dl_rq), 1);
+       walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se));
 
        dec_dl_deadline(dl_rq, dl_se->deadline);
        dec_dl_migration(dl_se, dl_rq);
@@ -2018,7 +2022,9 @@ retry:
        deactivate_task(rq, next_task, 0);
        sub_running_bw(next_task->dl.dl_bw, &rq->dl);
        sub_rq_bw(next_task->dl.dl_bw, &rq->dl);
+       next_task->on_rq = TASK_ON_RQ_MIGRATING;
        set_task_cpu(next_task, later_rq->cpu);
+       next_task->on_rq = TASK_ON_RQ_QUEUED;
        add_rq_bw(next_task->dl.dl_bw, &later_rq->dl);
        add_running_bw(next_task->dl.dl_bw, &later_rq->dl);
        activate_task(later_rq, next_task, 0);
@@ -2110,7 +2116,9 @@ static void pull_dl_task(struct rq *this_rq)
                        deactivate_task(src_rq, p, 0);
                        sub_running_bw(p->dl.dl_bw, &src_rq->dl);
                        sub_rq_bw(p->dl.dl_bw, &src_rq->dl);
+                       p->on_rq = TASK_ON_RQ_MIGRATING;
                        set_task_cpu(p, this_cpu);
+                       p->on_rq = TASK_ON_RQ_QUEUED;
                        add_rq_bw(p->dl.dl_bw, &this_rq->dl);
                        add_running_bw(p->dl.dl_bw, &this_rq->dl);
                        activate_task(this_rq, p, 0);
index 44a7154dd220fb3d459817b61017de4140333397..ba43bd69da8fab3d828d8bb547114dd98aa14117 100644 (file)
@@ -38,6 +38,7 @@
 
 #include "sched.h"
 #include "tune.h"
+#include "walt.h"
 
 /*
  * Targeted preemption latency for CPU-bound tasks:
@@ -110,6 +111,13 @@ unsigned int normalized_sysctl_sched_wakeup_granularity    = 1000000UL;
 
 const_debug unsigned int sysctl_sched_migration_cost   = 500000UL;
 
+#ifdef CONFIG_SCHED_WALT
+unsigned int sysctl_sched_use_walt_cpu_util = 1;
+unsigned int sysctl_sched_use_walt_task_util = 1;
+__read_mostly unsigned int sysctl_sched_walt_cpu_high_irqload =
+    (10 * NSEC_PER_MSEC);
+#endif
+
 #ifdef CONFIG_SMP
 /*
  * For asym packing, by default the lower numbered cpu has higher priority.
@@ -1440,7 +1448,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
 static unsigned long weighted_cpuload(struct rq *rq);
 static unsigned long source_load(int cpu, int type);
 static unsigned long target_load(int cpu, int type);
-static unsigned long capacity_of(int cpu);
 
 /* Cached statistics for all CPUs within a node */
 struct numa_stats {
@@ -2821,6 +2828,9 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq)
                 * See cpu_util().
                 */
                cpufreq_update_util(rq, 0);
+#ifdef CONFIG_SMP
+               trace_sched_load_avg_cpu(cpu_of(rq), cfs_rq);
+#endif
        }
 }
 
@@ -4952,8 +4962,13 @@ static inline void update_overutilized_status(struct rq *rq)
                set_sd_overutilized(sd);
        rcu_read_unlock();
 }
+
+unsigned long boosted_cpu_util(int cpu);
 #else
+
 #define update_overutilized_status(rq) do {} while (0)
+#define boosted_cpu_util(cpu) cpu_util_freq(cpu)
+
 #endif /* CONFIG_SMP */
 
 /*
@@ -4991,6 +5006,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                flags = ENQUEUE_WAKEUP;
        }
@@ -4998,6 +5014,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running++;
+               walt_inc_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -5029,7 +5046,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                add_nr_running(rq, 1);
                if (!task_new)
                        update_overutilized_status(rq);
+               walt_inc_cumulative_runnable_avg(rq, p);
        }
+
        hrtick_update(rq);
 }
 
@@ -5059,6 +5078,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                /* Don't dequeue parent if it has other entities besides us */
                if (cfs_rq->load.weight) {
@@ -5078,6 +5098,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
        for_each_sched_entity(se) {
                cfs_rq = cfs_rq_of(se);
                cfs_rq->h_nr_running--;
+               walt_dec_cfs_cumulative_runnable_avg(cfs_rq, p);
 
                if (cfs_rq_throttled(cfs_rq))
                        break;
@@ -5095,8 +5116,10 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
         */
        schedtune_dequeue_task(p, cpu_of(rq));
 
-       if (!se)
+       if (!se) {
                sub_nr_running(rq, 1);
+               walt_dec_cumulative_runnable_avg(rq, p);
+       }
 
        hrtick_update(rq);
 }
@@ -5405,16 +5428,6 @@ static unsigned long target_load(int cpu, int type)
        return max(rq->cpu_load[type-1], total);
 }
 
-static unsigned long capacity_of(int cpu)
-{
-       return cpu_rq(cpu)->cpu_capacity;
-}
-
-static unsigned long capacity_orig_of(int cpu)
-{
-       return cpu_rq(cpu)->cpu_capacity_orig;
-}
-
 static unsigned long cpu_avg_load_per_task(int cpu)
 {
        struct rq *rq = cpu_rq(cpu);
@@ -5427,49 +5440,6 @@ static unsigned long cpu_avg_load_per_task(int cpu)
        return 0;
 }
 
-/*
- * cpu_util returns the amount of capacity of a CPU that is used by CFS
- * tasks. The unit of the return value must be the one of capacity so we can
- * compare the utilization with the capacity of the CPU that is available for
- * CFS task (ie cpu_capacity).
- *
- * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
- * recent utilization of currently non-runnable tasks on a CPU. It represents
- * the amount of utilization of a CPU in the range [0..capacity_orig] where
- * capacity_orig is the cpu_capacity available at the highest frequency
- * (arch_scale_freq_capacity()).
- * The utilization of a CPU converges towards a sum equal to or less than the
- * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
- * the running time on this CPU scaled by capacity_curr.
- *
- * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
- * higher than capacity_orig because of unfortunate rounding in
- * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
- * the average stabilizes with the new running time. We need to check that the
- * utilization stays within the range of [0..capacity_orig] and cap it if
- * necessary. Without utilization capping, a group could be seen as overloaded
- * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
- * available capacity. We allow utilization to overshoot capacity_curr (but not
- * capacity_orig) as it useful for predicting the capacity required after task
- * migrations (scheduler-driven DVFS).
- */
-static unsigned long __cpu_util(int cpu, int delta)
-{
-       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
-       unsigned long capacity = capacity_orig_of(cpu);
-
-       delta += util;
-       if (delta < 0)
-               return 0;
-
-       return (delta >= capacity) ? capacity : delta;
-}
-
-static unsigned long cpu_util(int cpu)
-{
-       return __cpu_util(cpu, 0);
-}
-
 static void record_wakee(struct task_struct *p)
 {
        /*
@@ -6138,7 +6108,7 @@ schedtune_task_margin(struct task_struct *task)
 unsigned long
 boosted_cpu_util(int cpu)
 {
-       unsigned long util = cpu_util(cpu);
+       unsigned long util = cpu_util_freq(cpu);
        long margin = schedtune_cpu_margin(util, cpu);
 
        trace_sched_boost_cpu(cpu, util, margin);
@@ -6685,6 +6655,11 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target)
 
 static inline unsigned long task_util(struct task_struct *p)
 {
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_task_util) {
+               return (p->ravg.demand / (walt_ravg_window >> SCHED_CAPACITY_SHIFT));
+       }
+#endif
        return p->se.avg.util_avg;
 }
 
@@ -6696,6 +6671,16 @@ static int cpu_util_wake(int cpu, struct task_struct *p)
 {
        unsigned long util, capacity;
 
+#ifdef CONFIG_SCHED_WALT
+       /*
+        * WALT does not decay idle tasks in the same manner
+        * as PELT, so it makes little sense to subtract task
+        * utilization from cpu utilization. Instead just use
+        * cpu_util for this case.
+        */
+       if (!walt_disabled && sysctl_sched_use_walt_cpu_util)
+               return cpu_util(cpu);
+#endif
        /* Task has no contribution or is new */
        if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
                return cpu_util(cpu);
@@ -6759,6 +6744,9 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
                        if (!cpu_online(i))
                                continue;
 
+                       if (walt_cpu_high_irqload(i))
+                               continue;
+
                        /*
                         * p's blocked utilization is still accounted for on prev_cpu
                         * so prev_cpu will receive a negative bias due to the double
index 8874c09b8f7ad17c4b35be1cb5f48c5bb227ba19..0e5660145cc545d5404d2cd4bad300e25c128399 100644 (file)
@@ -9,6 +9,8 @@
 #include <linux/slab.h>
 #include <linux/irq_work.h>
 
+#include "walt.h"
+
 int sched_rr_timeslice = RR_TIMESLICE;
 int sysctl_sched_rr_timeslice = (MSEC_PER_SEC / HZ) * RR_TIMESLICE;
 
@@ -1335,6 +1337,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
                rt_se->timeout = 0;
 
        enqueue_rt_entity(rt_se, flags);
+       walt_inc_cumulative_runnable_avg(rq, p);
 
        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
@@ -1346,6 +1349,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
 
        update_curr_rt(rq);
        dequeue_rt_entity(rt_se, flags);
+       walt_dec_cumulative_runnable_avg(rq, p);
 
        dequeue_pushable_task(rq, p);
 }
@@ -1863,7 +1867,9 @@ retry:
        }
 
        deactivate_task(rq, next_task, 0);
+       next_task->on_rq = TASK_ON_RQ_MIGRATING;
        set_task_cpu(next_task, lowest_rq->cpu);
+       next_task->on_rq = TASK_ON_RQ_QUEUED;
        activate_task(lowest_rq, next_task, 0);
        ret = 1;
 
@@ -2198,7 +2204,9 @@ static void pull_rt_task(struct rq *this_rq)
                        resched = true;
 
                        deactivate_task(src_rq, p, 0);
+                       p->on_rq = TASK_ON_RQ_MIGRATING;
                        set_task_cpu(p, this_cpu);
+                       p->on_rq = TASK_ON_RQ_QUEUED;
                        activate_task(this_rq, p, 0);
                        /*
                         * We continue with the search, just in
index 671af4025fd81e6bae62963e6ad53522dc4c472b..d767020c89150a0fc6a7179024ce740972844b1a 100644 (file)
@@ -483,6 +483,10 @@ struct cfs_rq {
        struct list_head leaf_cfs_rq_list;
        struct task_group *tg;  /* group that "owns" this runqueue */
 
+#ifdef CONFIG_SCHED_WALT
+       u64 cumulative_runnable_avg;
+#endif
+
 #ifdef CONFIG_CFS_BANDWIDTH
        int runtime_enabled;
        u64 runtime_expires;
@@ -759,6 +763,20 @@ struct rq {
        u64 max_idle_balance_cost;
 #endif
 
+#ifdef CONFIG_SCHED_WALT
+       u64 cumulative_runnable_avg;
+       u64 window_start;
+       u64 curr_runnable_sum;
+       u64 prev_runnable_sum;
+       u64 nt_curr_runnable_sum;
+       u64 nt_prev_runnable_sum;
+       u64 cur_irqload;
+       u64 avg_irqload;
+       u64 irqload_ts;
+       u64 cum_window_demand;
+#endif /* CONFIG_SCHED_WALT */
+
+
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
        u64 prev_irq_time;
 #endif
@@ -1710,6 +1728,86 @@ unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu)
 }
 #endif
 
+#ifdef CONFIG_SMP
+static inline unsigned long capacity_of(int cpu)
+{
+       return cpu_rq(cpu)->cpu_capacity;
+}
+
+static inline unsigned long capacity_orig_of(int cpu)
+{
+       return cpu_rq(cpu)->cpu_capacity_orig;
+}
+
+extern unsigned int sysctl_sched_use_walt_cpu_util;
+extern unsigned int walt_ravg_window;
+extern bool walt_disabled;
+
+/*
+ * cpu_util returns the amount of capacity of a CPU that is used by CFS
+ * tasks. The unit of the return value must be the one of capacity so we can
+ * compare the utilization with the capacity of the CPU that is available for
+ * CFS task (ie cpu_capacity).
+ *
+ * cfs_rq.avg.util_avg is the sum of running time of runnable tasks plus the
+ * recent utilization of currently non-runnable tasks on a CPU. It represents
+ * the amount of utilization of a CPU in the range [0..capacity_orig] where
+ * capacity_orig is the cpu_capacity available at the highest frequency
+ * (arch_scale_freq_capacity()).
+ * The utilization of a CPU converges towards a sum equal to or less than the
+ * current capacity (capacity_curr <= capacity_orig) of the CPU because it is
+ * the running time on this CPU scaled by capacity_curr.
+ *
+ * Nevertheless, cfs_rq.avg.util_avg can be higher than capacity_curr or even
+ * higher than capacity_orig because of unfortunate rounding in
+ * cfs.avg.util_avg or just after migrating tasks and new task wakeups until
+ * the average stabilizes with the new running time. We need to check that the
+ * utilization stays within the range of [0..capacity_orig] and cap it if
+ * necessary. Without utilization capping, a group could be seen as overloaded
+ * (CPU0 utilization at 121% + CPU1 utilization at 80%) whereas CPU1 has 20% of
+ * available capacity. We allow utilization to overshoot capacity_curr (but not
+ * capacity_orig) as it useful for predicting the capacity required after task
+ * migrations (scheduler-driven DVFS).
+ */
+static inline unsigned long __cpu_util(int cpu, int delta)
+{
+       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+       unsigned long capacity = capacity_orig_of(cpu);
+
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+               util = cpu_rq(cpu)->cumulative_runnable_avg << SCHED_CAPACITY_SHIFT;
+               util = div_u64(util, walt_ravg_window);
+       }
+#endif
+       delta += util;
+       if (delta < 0)
+               return 0;
+
+       return (delta >= capacity) ? capacity : delta;
+}
+
+static inline unsigned long cpu_util(int cpu)
+{
+       return __cpu_util(cpu, 0);
+}
+
+static inline unsigned long cpu_util_freq(int cpu)
+{
+       unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg;
+       unsigned long capacity = capacity_orig_of(cpu);
+
+#ifdef CONFIG_SCHED_WALT
+       if (!walt_disabled && sysctl_sched_use_walt_cpu_util) {
+               util = cpu_rq(cpu)->prev_runnable_sum << SCHED_CAPACITY_SHIFT;
+               do_div(util, walt_ravg_window);
+       }
+#endif
+       return (util >= capacity) ? capacity : util;
+}
+
+#endif
+
 static inline void sched_rt_avg_update(struct rq *rq, u64 rt_delta)
 {
        rq->rt_avg += rt_delta * arch_scale_freq_capacity(NULL, cpu_of(rq));
@@ -2123,6 +2221,17 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags)
 static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {}
 #endif /* CONFIG_CPU_FREQ */
 
+#ifdef CONFIG_SCHED_WALT
+
+static inline bool
+walt_task_in_cum_window_demand(struct rq *rq, struct task_struct *p)
+{
+       return cpu_of(rq) == task_cpu(p) &&
+              (p->on_rq || p->last_sleep_ts >= rq->window_start);
+}
+
+#endif /* CONFIG_SCHED_WALT */
+
 #ifdef arch_scale_freq_capacity
 #ifndef arch_scale_freq_invariant
 #define arch_scale_freq_invariant()    (true)
index ea6de023db26f76848e123dc093956090c3ea68c..7ca03e528c45d3224f8662a09d2a485162e886cf 100644 (file)
@@ -1,5 +1,6 @@
 // SPDX-License-Identifier: GPL-2.0
 #include "sched.h"
+#include "walt.h"
 
 /*
  * stop-task scheduling class.
@@ -44,12 +45,14 @@ static void
 enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
        add_nr_running(rq, 1);
+       walt_inc_cumulative_runnable_avg(rq, p);
 }
 
 static void
 dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags)
 {
        sub_nr_running(rq, 1);
+       walt_dec_cumulative_runnable_avg(rq, p);
 }
 
 static void yield_task_stop(struct rq *rq)
diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c
new file mode 100644 (file)
index 0000000..a6aef2f
--- /dev/null
@@ -0,0 +1,901 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ *
+ * Window Assisted Load Tracking (WALT) implementation credits:
+ * Srivatsa Vaddagiri, Steve Muckle, Syed Rameez Mustafa, Joonwoo Park,
+ * Pavan Kumar Kondeti, Olav Haugan
+ *
+ * 2016-03-06: Integration with EAS/refactoring by Vikram Mulukutla
+ *             and Todd Kjos
+ */
+
+#include <linux/acpi.h>
+#include <linux/syscore_ops.h>
+#include <trace/events/sched.h>
+#include "sched.h"
+#include "walt.h"
+
+#define WINDOW_STATS_RECENT            0
+#define WINDOW_STATS_MAX               1
+#define WINDOW_STATS_MAX_RECENT_AVG    2
+#define WINDOW_STATS_AVG               3
+#define WINDOW_STATS_INVALID_POLICY    4
+
+#define EXITING_TASK_MARKER    0xdeaddead
+
+static __read_mostly unsigned int walt_ravg_hist_size = 5;
+static __read_mostly unsigned int walt_window_stats_policy =
+       WINDOW_STATS_MAX_RECENT_AVG;
+static __read_mostly unsigned int walt_account_wait_time = 1;
+static __read_mostly unsigned int walt_freq_account_wait_time = 0;
+static __read_mostly unsigned int walt_io_is_busy = 0;
+
+unsigned int sysctl_sched_walt_init_task_load_pct = 15;
+
+/* true -> use PELT based load stats, false -> use window-based load stats */
+bool __read_mostly walt_disabled = false;
+
+/*
+ * Window size (in ns). Adjust for the tick size so that the window
+ * rollover occurs just before the tick boundary.
+ */
+__read_mostly unsigned int walt_ravg_window =
+                                           (20000000 / TICK_NSEC) * TICK_NSEC;
+#define MIN_SCHED_RAVG_WINDOW ((10000000 / TICK_NSEC) * TICK_NSEC)
+#define MAX_SCHED_RAVG_WINDOW ((1000000000 / TICK_NSEC) * TICK_NSEC)
+
+static unsigned int sync_cpu;
+static ktime_t ktime_last;
+static __read_mostly bool walt_ktime_suspended;
+
+static unsigned int task_load(struct task_struct *p)
+{
+       return p->ravg.demand;
+}
+
+static inline void fixup_cum_window_demand(struct rq *rq, s64 delta)
+{
+       rq->cum_window_demand += delta;
+       if (unlikely((s64)rq->cum_window_demand < 0))
+               rq->cum_window_demand = 0;
+}
+
+void
+walt_inc_cumulative_runnable_avg(struct rq *rq,
+                                struct task_struct *p)
+{
+       rq->cumulative_runnable_avg += p->ravg.demand;
+
+       /*
+        * Add a task's contribution to the cumulative window demand when
+        *
+        * (1) task is enqueued with on_rq = 1 i.e migration,
+        *     prio/cgroup/class change.
+        * (2) task is waking for the first time in this window.
+        */
+       if (p->on_rq || (p->last_sleep_ts < rq->window_start))
+               fixup_cum_window_demand(rq, p->ravg.demand);
+}
+
+void
+walt_dec_cumulative_runnable_avg(struct rq *rq,
+                                struct task_struct *p)
+{
+       rq->cumulative_runnable_avg -= p->ravg.demand;
+       BUG_ON((s64)rq->cumulative_runnable_avg < 0);
+
+       /*
+        * on_rq will be 1 for sleeping tasks. So check if the task
+        * is migrating or dequeuing in RUNNING state to change the
+        * prio/cgroup/class.
+        */
+       if (task_on_rq_migrating(p) || p->state == TASK_RUNNING)
+               fixup_cum_window_demand(rq, -(s64)p->ravg.demand);
+}
+
+static void
+fixup_cumulative_runnable_avg(struct rq *rq,
+                             struct task_struct *p, u64 new_task_load)
+{
+       s64 task_load_delta = (s64)new_task_load - task_load(p);
+
+       rq->cumulative_runnable_avg += task_load_delta;
+       if ((s64)rq->cumulative_runnable_avg < 0)
+               panic("cra less than zero: tld: %lld, task_load(p) = %u\n",
+                       task_load_delta, task_load(p));
+
+       fixup_cum_window_demand(rq, task_load_delta);
+}
+
+u64 walt_ktime_clock(void)
+{
+       if (unlikely(walt_ktime_suspended))
+               return ktime_to_ns(ktime_last);
+       return ktime_get_ns();
+}
+
+static void walt_resume(void)
+{
+       walt_ktime_suspended = false;
+}
+
+static int walt_suspend(void)
+{
+       ktime_last = ktime_get();
+       walt_ktime_suspended = true;
+       return 0;
+}
+
+static struct syscore_ops walt_syscore_ops = {
+       .resume = walt_resume,
+       .suspend = walt_suspend
+};
+
+static int __init walt_init_ops(void)
+{
+       register_syscore_ops(&walt_syscore_ops);
+       return 0;
+}
+late_initcall(walt_init_ops);
+
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+               struct task_struct *p)
+{
+       cfs_rq->cumulative_runnable_avg += p->ravg.demand;
+}
+
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *cfs_rq,
+               struct task_struct *p)
+{
+       cfs_rq->cumulative_runnable_avg -= p->ravg.demand;
+}
+
+static int exiting_task(struct task_struct *p)
+{
+       if (p->flags & PF_EXITING) {
+               if (p->ravg.sum_history[0] != EXITING_TASK_MARKER) {
+                       p->ravg.sum_history[0] = EXITING_TASK_MARKER;
+               }
+               return 1;
+       }
+       return 0;
+}
+
+static int __init set_walt_ravg_window(char *str)
+{
+       unsigned int adj_window;
+       bool no_walt = walt_disabled;
+
+       get_option(&str, &walt_ravg_window);
+
+       /* Adjust for CONFIG_HZ */
+       adj_window = (walt_ravg_window / TICK_NSEC) * TICK_NSEC;
+
+       /* Warn if we're a bit too far away from the expected window size */
+       WARN(adj_window < walt_ravg_window - NSEC_PER_MSEC,
+            "tick-adjusted window size %u, original was %u\n", adj_window,
+            walt_ravg_window);
+
+       walt_ravg_window = adj_window;
+
+       walt_disabled = walt_disabled ||
+                       (walt_ravg_window < MIN_SCHED_RAVG_WINDOW ||
+                        walt_ravg_window > MAX_SCHED_RAVG_WINDOW);
+
+       WARN(!no_walt && walt_disabled,
+            "invalid window size, disabling WALT\n");
+
+       return 0;
+}
+
+early_param("walt_ravg_window", set_walt_ravg_window);
+
+static void
+update_window_start(struct rq *rq, u64 wallclock)
+{
+       s64 delta;
+       int nr_windows;
+
+       delta = wallclock - rq->window_start;
+       /* If the MPM global timer is cleared, set delta as 0 to avoid kernel BUG happening */
+       if (delta < 0) {
+               delta = 0;
+               WARN_ONCE(1, "WALT wallclock appears to have gone backwards or reset\n");
+       }
+
+       if (delta < walt_ravg_window)
+               return;
+
+       nr_windows = div64_u64(delta, walt_ravg_window);
+       rq->window_start += (u64)nr_windows * (u64)walt_ravg_window;
+
+       rq->cum_window_demand = rq->cumulative_runnable_avg;
+}
+
+extern unsigned long capacity_curr_of(int cpu);
+/*
+ * Translate absolute delta time accounted on a CPU
+ * to a scale where 1024 is the capacity of the most
+ * capable CPU running at FMAX
+ */
+static u64 scale_exec_time(u64 delta, struct rq *rq)
+{
+       unsigned long capcurr = capacity_curr_of(cpu_of(rq));
+
+       return (delta * capcurr) >> SCHED_CAPACITY_SHIFT;
+}
+
+static int cpu_is_waiting_on_io(struct rq *rq)
+{
+       if (!walt_io_is_busy)
+               return 0;
+
+       return atomic_read(&rq->nr_iowait);
+}
+
+void walt_account_irqtime(int cpu, struct task_struct *curr,
+                                u64 delta, u64 wallclock)
+{
+       struct rq *rq = cpu_rq(cpu);
+       unsigned long flags, nr_windows;
+       u64 cur_jiffies_ts;
+
+       raw_spin_lock_irqsave(&rq->lock, flags);
+
+       /*
+        * cputime (wallclock) uses sched_clock so use the same here for
+        * consistency.
+        */
+       delta += sched_clock() - wallclock;
+       cur_jiffies_ts = get_jiffies_64();
+
+       if (is_idle_task(curr))
+               walt_update_task_ravg(curr, rq, IRQ_UPDATE, walt_ktime_clock(),
+                                delta);
+
+       nr_windows = cur_jiffies_ts - rq->irqload_ts;
+
+       if (nr_windows) {
+               if (nr_windows < 10) {
+                       /* Decay CPU's irqload by 3/4 for each window. */
+                       rq->avg_irqload *= (3 * nr_windows);
+                       rq->avg_irqload = div64_u64(rq->avg_irqload,
+                                                   4 * nr_windows);
+               } else {
+                       rq->avg_irqload = 0;
+               }
+               rq->avg_irqload += rq->cur_irqload;
+               rq->cur_irqload = 0;
+       }
+
+       rq->cur_irqload += delta;
+       rq->irqload_ts = cur_jiffies_ts;
+       raw_spin_unlock_irqrestore(&rq->lock, flags);
+}
+
+
+#define WALT_HIGH_IRQ_TIMEOUT 3
+
+u64 walt_irqload(int cpu) {
+       struct rq *rq = cpu_rq(cpu);
+       s64 delta;
+       delta = get_jiffies_64() - rq->irqload_ts;
+
+        /*
+        * Current context can be preempted by irq and rq->irqload_ts can be
+        * updated by irq context so that delta can be negative.
+        * But this is okay and we can safely return as this means there
+        * was recent irq occurrence.
+        */
+
+        if (delta < WALT_HIGH_IRQ_TIMEOUT)
+               return rq->avg_irqload;
+        else
+               return 0;
+}
+
+int walt_cpu_high_irqload(int cpu) {
+       return walt_irqload(cpu) >= sysctl_sched_walt_cpu_high_irqload;
+}
+
+static int account_busy_for_cpu_time(struct rq *rq, struct task_struct *p,
+                                    u64 irqtime, int event)
+{
+       if (is_idle_task(p)) {
+               /* TASK_WAKE && TASK_MIGRATE is not possible on idle task! */
+               if (event == PICK_NEXT_TASK)
+                       return 0;
+
+               /* PUT_PREV_TASK, TASK_UPDATE && IRQ_UPDATE are left */
+               return irqtime || cpu_is_waiting_on_io(rq);
+       }
+
+       if (event == TASK_WAKE)
+               return 0;
+
+       if (event == PUT_PREV_TASK || event == IRQ_UPDATE ||
+                                        event == TASK_UPDATE)
+               return 1;
+
+       /* Only TASK_MIGRATE && PICK_NEXT_TASK left */
+       return walt_freq_account_wait_time;
+}
+
+/*
+ * Account cpu activity in its busy time counters (rq->curr/prev_runnable_sum)
+ */
+static void update_cpu_busy_time(struct task_struct *p, struct rq *rq,
+            int event, u64 wallclock, u64 irqtime)
+{
+       int new_window, nr_full_windows = 0;
+       int p_is_curr_task = (p == rq->curr);
+       u64 mark_start = p->ravg.mark_start;
+       u64 window_start = rq->window_start;
+       u32 window_size = walt_ravg_window;
+       u64 delta;
+
+       new_window = mark_start < window_start;
+       if (new_window) {
+               nr_full_windows = div64_u64((window_start - mark_start),
+                                               window_size);
+               if (p->ravg.active_windows < USHRT_MAX)
+                       p->ravg.active_windows++;
+       }
+
+       /* Handle per-task window rollover. We don't care about the idle
+        * task or exiting tasks. */
+       if (new_window && !is_idle_task(p) && !exiting_task(p)) {
+               u32 curr_window = 0;
+
+               if (!nr_full_windows)
+                       curr_window = p->ravg.curr_window;
+
+               p->ravg.prev_window = curr_window;
+               p->ravg.curr_window = 0;
+       }
+
+       if (!account_busy_for_cpu_time(rq, p, irqtime, event)) {
+               /* account_busy_for_cpu_time() = 0, so no update to the
+                * task's current window needs to be made. This could be
+                * for example
+                *
+                *   - a wakeup event on a task within the current
+                *     window (!new_window below, no action required),
+                *   - switching to a new task from idle (PICK_NEXT_TASK)
+                *     in a new window where irqtime is 0 and we aren't
+                *     waiting on IO */
+
+               if (!new_window)
+                       return;
+
+               /* A new window has started. The RQ demand must be rolled
+                * over if p is the current task. */
+               if (p_is_curr_task) {
+                       u64 prev_sum = 0;
+
+                       /* p is either idle task or an exiting task */
+                       if (!nr_full_windows) {
+                               prev_sum = rq->curr_runnable_sum;
+                       }
+
+                       rq->prev_runnable_sum = prev_sum;
+                       rq->curr_runnable_sum = 0;
+               }
+
+               return;
+       }
+
+       if (!new_window) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. No rollover
+                * since we didn't start a new window. An example of this is
+                * when a task starts execution and then sleeps within the
+                * same window. */
+
+               if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq))
+                       delta = wallclock - mark_start;
+               else
+                       delta = irqtime;
+               delta = scale_exec_time(delta, rq);
+               rq->curr_runnable_sum += delta;
+               if (!is_idle_task(p) && !exiting_task(p))
+                       p->ravg.curr_window += delta;
+
+               return;
+       }
+
+       if (!p_is_curr_task) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. A new window
+                * has also started, but p is not the current task, so the
+                * window is not rolled over - just split up and account
+                * as necessary into curr and prev. The window is only
+                * rolled over when a new window is processed for the current
+                * task.
+                *
+                * Irqtime can't be accounted by a task that isn't the
+                * currently running task. */
+
+               if (!nr_full_windows) {
+                       /* A full window hasn't elapsed, account partial
+                        * contribution to previous completed window. */
+                       delta = scale_exec_time(window_start - mark_start, rq);
+                       if (!exiting_task(p))
+                               p->ravg.prev_window += delta;
+               } else {
+                       /* Since at least one full window has elapsed,
+                        * the contribution to the previous window is the
+                        * full window (window_size). */
+                       delta = scale_exec_time(window_size, rq);
+                       if (!exiting_task(p))
+                               p->ravg.prev_window = delta;
+               }
+               rq->prev_runnable_sum += delta;
+
+               /* Account piece of busy time in the current window. */
+               delta = scale_exec_time(wallclock - window_start, rq);
+               rq->curr_runnable_sum += delta;
+               if (!exiting_task(p))
+                       p->ravg.curr_window = delta;
+
+               return;
+       }
+
+       if (!irqtime || !is_idle_task(p) || cpu_is_waiting_on_io(rq)) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. A new window
+                * has started and p is the current task so rollover is
+                * needed. If any of the three conditions above is true,
+                * then this busy time can't be accounted as irqtime.
+                *
+                * Busy time for the idle task or exiting tasks need not
+                * be accounted.
+                *
+                * An example of this would be a task that starts execution
+                * and then sleeps once a new window has begun. */
+
+               if (!nr_full_windows) {
+                       /* A full window hasn't elapsed, account partial
+                        * contribution to previous completed window. */
+                       delta = scale_exec_time(window_start - mark_start, rq);
+                       if (!is_idle_task(p) && !exiting_task(p))
+                               p->ravg.prev_window += delta;
+
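+                       /* Carry the busy time already accumulated in what has
+                        * just become the previous window into the rollover
+                        * below. */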
+                       delta += rq->curr_runnable_sum;
+               } else {
+                       /* Since at least one full window has elapsed,
+                        * the contribution to the previous window is the
+                        * full window (window_size). */
+                       delta = scale_exec_time(window_size, rq);
+                       if (!is_idle_task(p) && !exiting_task(p))
+                               p->ravg.prev_window = delta;
+
+               }
+               /*
+                * Rollover of the runqueue runnable sums is done here by
+                * overwriting the values in prev_runnable_sum and
+                * curr_runnable_sum. The task's prev_window contribution was
+                * handled by the previous if-else statement; its curr_window
+                * contribution is accounted below.
+                */
+               rq->prev_runnable_sum = delta;
+
+               /* Account piece of busy time in the current window. */
+               delta = scale_exec_time(wallclock - window_start, rq);
+               rq->curr_runnable_sum = delta;
+               if (!is_idle_task(p) && !exiting_task(p))
+                       p->ravg.curr_window = delta;
+
+               return;
+       }
+
+       if (irqtime) {
+               /* account_busy_for_cpu_time() = 1 so busy time needs
+                * to be accounted to the current window. A new window
+                * has started and p is the current task so rollover is
+                * needed. The current task must be the idle task because
+                * irqtime is not accounted for any other task.
+                *
+                * Irqtime will be accounted each time we process IRQ activity
+                * after a period of idleness, so we know the IRQ busy time
+                * started at wallclock - irqtime. */
+
+               BUG_ON(!is_idle_task(p));
+               mark_start = wallclock - irqtime;
+
+               /* Roll window over. If IRQ busy time was just in the current
+                * window then that is all that need be accounted. */
+               rq->prev_runnable_sum = rq->curr_runnable_sum;
+               if (mark_start > window_start) {
+                       rq->curr_runnable_sum = scale_exec_time(irqtime, rq);
+                       return;
+               }
+
+               /* The IRQ busy time spanned multiple windows. Process the
+                * busy time preceding the current window start first. */
+               delta = window_start - mark_start;
+               if (delta > window_size)
+                       delta = window_size;
+               delta = scale_exec_time(delta, rq);
+               rq->prev_runnable_sum += delta;
+
+               /* Process the remaining IRQ busy time in the current window. */
+               delta = wallclock - window_start;
+               rq->curr_runnable_sum = scale_exec_time(delta, rq);
+
+               return;
+       }
+
+       BUG();
+}
+
+static int account_busy_for_task_demand(struct task_struct *p, int event)
+{
+       /* No need to bother updating task demand for exiting tasks
+        * or the idle task. */
+       if (exiting_task(p) || is_idle_task(p))
+               return 0;
+
+       /* When a task is waking up it is completing a segment of non-busy
+        * time. Likewise, if wait time is not treated as busy time, then
+        * when a task begins to run or is migrated, it is not running and
+        * is completing a segment of non-busy time. */
+       if (event == TASK_WAKE || (!walt_account_wait_time &&
+                        (event == PICK_NEXT_TASK || event == TASK_MIGRATE)))
+               return 0;
+
+       return 1;
+}
+
+/*
+ * Called when new window is starting for a task, to record cpu usage over
+ * recently concluded window(s). Normally 'samples' should be 1. It can be > 1
+ * when, say, a real-time task runs without preemption for several windows at a
+ * stretch.
+ */
+static void update_history(struct rq *rq, struct task_struct *p,
+                        u32 runtime, int samples, int event)
+{
+       u32 *hist = &p->ravg.sum_history[0];
+       int ridx, widx;
+       u32 max = 0, avg, demand;
+       u64 sum = 0;
+
+       /* Ignore windows where task had no activity */
+       if (!runtime || is_idle_task(p) || exiting_task(p) || !samples)
+               goto done;
+
+       /* Push new 'runtime' value onto stack */
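+       /* For example, with walt_ravg_hist_size == 5 and samples == 2, the
+        * loops below shift hist[0..2] into hist[2..4] (dropping the two
+        * oldest samples) and then store 'runtime' in hist[0] and hist[1]. */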
+       widx = walt_ravg_hist_size - 1;
+       ridx = widx - samples;
+       for (; ridx >= 0; --widx, --ridx) {
+               hist[widx] = hist[ridx];
+               sum += hist[widx];
+               if (hist[widx] > max)
+                       max = hist[widx];
+       }
+
+       for (widx = 0; widx < samples && widx < walt_ravg_hist_size; widx++) {
+               hist[widx] = runtime;
+               sum += hist[widx];
+               if (hist[widx] > max)
+                       max = hist[widx];
+       }
+
+       p->ravg.sum = 0;
+
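+       /* Turn the window history into a single demand figure according to
+        * the selected policy: most recent window, maximum, average, or
+        * max(average, most recent). */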
+       if (walt_window_stats_policy == WINDOW_STATS_RECENT) {
+               demand = runtime;
+       } else if (walt_window_stats_policy == WINDOW_STATS_MAX) {
+               demand = max;
+       } else {
+               avg = div64_u64(sum, walt_ravg_hist_size);
+               if (walt_window_stats_policy == WINDOW_STATS_AVG)
+                       demand = avg;
+               else
+                       demand = max(avg, runtime);
+       }
+
+       /*
+        * A throttled deadline sched class task gets dequeued without
+        * changing p->on_rq. Since the dequeue already decrements the
+        * runqueue stats, avoid decrementing them here again.
+        *
+        * When window is rolled over, the cumulative window demand
+        * is reset to the cumulative runnable average (contribution from
+        * the tasks on the runqueue). If the current task is dequeued
+        * already, its demand is not included in the cumulative runnable
+        * average. So add the task demand separately to cumulative window
+        * demand.
+        */
+       if (!task_has_dl_policy(p) || !p->dl.dl_throttled) {
+               if (task_on_rq_queued(p))
+                       fixup_cumulative_runnable_avg(rq, p, demand);
+               else if (rq->curr == p)
+                       fixup_cum_window_demand(rq, demand);
+       }
+
+       p->ravg.demand = demand;
+
+done:
+       trace_walt_update_history(rq, p, runtime, samples, event);
+       return;
+}
+
+static void add_to_task_demand(struct rq *rq, struct task_struct *p,
+                               u64 delta)
+{
+       delta = scale_exec_time(delta, rq);
+       p->ravg.sum += delta;
+       if (unlikely(p->ravg.sum > walt_ravg_window))
+               p->ravg.sum = walt_ravg_window;
+}
+
+/*
+ * Account cpu demand of task and/or update task's cpu demand history
+ *
+ * ms = p->ravg.mark_start;
+ * wc = wallclock
+ * ws = rq->window_start
+ *
+ * Three possibilities:
+ *
+ *     a) Task event is contained within one window.
+ *             window_start < mark_start < wallclock
+ *
+ *             ws   ms  wc
+ *             |    |   |
+ *             V    V   V
+ *             |---------------|
+ *
+ *     In this case, p->ravg.sum is updated *iff* event is appropriate
+ *     (ex: event == PUT_PREV_TASK)
+ *
+ *     b) Task event spans two windows.
+ *             mark_start < window_start < wallclock
+ *
+ *             ms   ws   wc
+ *             |    |    |
+ *             V    V    V
+ *             -----|-------------------
+ *
+ *     In this case, p->ravg.sum is updated with (ws - ms) *iff* event
+ *     is appropriate, then a new window sample is recorded followed
+ *     by p->ravg.sum being set to (wc - ws) *iff* event is appropriate.
+ *
+ *     c) Task event spans more than two windows.
+ *
+ *             ms ws_tmp                          ws  wc
+ *             |  |                               |   |
+ *             V  V                               V   V
+ *             ---|-------|-------|-------|-------|------
+ *                |                               |
+ *                |<------ nr_full_windows ------>|
+ *
+ *     In this case, p->ravg.sum is updated with (ws_tmp - ms) first *iff*
+ *     event is appropriate, window sample of p->ravg.sum is recorded,
+ *     'nr_full_windows' samples of window_size are also recorded *iff*
+ *     event is appropriate and finally p->ravg.sum is set to (wc - ws)
+ *     *iff* event is appropriate.
+ *
+ * IMPORTANT : Leave p->ravg.mark_start unchanged, as update_cpu_busy_time()
+ * depends on it!
+ */
+static void update_task_demand(struct task_struct *p, struct rq *rq,
+            int event, u64 wallclock)
+{
+       u64 mark_start = p->ravg.mark_start;
+       u64 delta, window_start = rq->window_start;
+       int new_window, nr_full_windows;
+       u32 window_size = walt_ravg_window;
+
+       new_window = mark_start < window_start;
+       if (!account_busy_for_task_demand(p, event)) {
+               if (new_window)
+                       /* If this time isn't being accounted as busy time,
+                        * and a new window has started, only the
+                        * previous window need be closed out with the
+                        * pre-existing demand. Multiple windows may have
+                        * elapsed, but since empty windows are dropped,
+                        * it is not necessary to account those. */
+                       update_history(rq, p, p->ravg.sum, 1, event);
+               return;
+       }
+
+       if (!new_window) {
+               /* The simple case - busy time contained within the existing
+                * window. */
+               add_to_task_demand(rq, p, wallclock - mark_start);
+               return;
+       }
+
+       /* Busy time spans at least two windows. Temporarily rewind
+        * window_start to first window boundary after mark_start. */
+       delta = window_start - mark_start;
+       nr_full_windows = div64_u64(delta, window_size);
+       window_start -= (u64)nr_full_windows * (u64)window_size;
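+       /* For example, with a 20ms window, a mark_start 5ms into window N and
+        * a wallclock 3ms into window N+3, nr_full_windows is 2 and
+        * window_start is temporarily moved back to the start of window N+1. */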
+
+       /* Process (window_start - mark_start) first */
+       add_to_task_demand(rq, p, window_start - mark_start);
+
+       /* Push new sample(s) into task's demand history */
+       update_history(rq, p, p->ravg.sum, 1, event);
+       if (nr_full_windows)
+               update_history(rq, p, scale_exec_time(window_size, rq),
+                              nr_full_windows, event);
+
+       /* Roll window_start forward again to the current window boundary to
+        * process any remainder in the current window. */
+       window_start += (u64)nr_full_windows * (u64)window_size;
+
+       /* Process (wallclock - window_start) next */
+       mark_start = window_start;
+       add_to_task_demand(rq, p, wallclock - mark_start);
+}
+
+/* Reflect task activity on its demand and cpu's busy time statistics */
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+            int event, u64 wallclock, u64 irqtime)
+{
+       if (walt_disabled || !rq->window_start)
+               return;
+
+       lockdep_assert_held(&rq->lock);
+
+       update_window_start(rq, wallclock);
+
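+       /* A task with no mark_start has no accounting baseline yet; skip the
+        * update and simply stamp mark_start below. */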
+       if (!p->ravg.mark_start)
+               goto done;
+
+       update_task_demand(p, rq, event, wallclock);
+       update_cpu_busy_time(p, rq, event, wallclock, irqtime);
+
+done:
+       trace_walt_update_task_ravg(p, rq, event, wallclock, irqtime);
+
+       p->ravg.mark_start = wallclock;
+}
+
+static void reset_task_stats(struct task_struct *p)
+{
+       u32 sum = 0;
+
+       if (exiting_task(p))
+               sum = EXITING_TASK_MARKER;
+
+       memset(&p->ravg, 0, sizeof(struct ravg));
+       /* Retain EXITING_TASK marker */
+       p->ravg.sum_history[0] = sum;
+}
+
+void walt_mark_task_starting(struct task_struct *p)
+{
+       u64 wallclock;
+       struct rq *rq = task_rq(p);
+
+       if (!rq->window_start) {
+               reset_task_stats(p);
+               return;
+       }
+
+       wallclock = walt_ktime_clock();
+       p->ravg.mark_start = wallclock;
+}
+
+void walt_set_window_start(struct rq *rq, struct rq_flags *rf)
+{
+       if (likely(rq->window_start))
+               return;
+
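+       /* Window boundaries must line up across CPUs: the sync_cpu seeds its
+        * own window_start, while every other CPU copies the sync_cpu's value
+        * under both runqueue locks. */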
+       if (cpu_of(rq) == sync_cpu) {
+               rq->window_start = 1;
+       } else {
+               struct rq *sync_rq = cpu_rq(sync_cpu);
+               rq_unpin_lock(rq, rf);
+               double_lock_balance(rq, sync_rq);
+               rq->window_start = sync_rq->window_start;
+               rq->curr_runnable_sum = rq->prev_runnable_sum = 0;
+               raw_spin_unlock(&sync_rq->lock);
+               rq_repin_lock(rq, rf);
+       }
+
+       rq->curr->ravg.mark_start = rq->window_start;
+}
+
+void walt_migrate_sync_cpu(int cpu)
+{
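+       /* If 'cpu' was the sync_cpu, hand the role over to the current CPU. */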
+       if (cpu == sync_cpu)
+               sync_cpu = smp_processor_id();
+}
+
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu)
+{
+       struct rq *src_rq = task_rq(p);
+       struct rq *dest_rq = cpu_rq(new_cpu);
+       u64 wallclock;
+
+       if (!p->on_rq && p->state != TASK_WAKING)
+               return;
+
+       if (exiting_task(p))
+               return;
+
+       if (p->state == TASK_WAKING)
+               double_rq_lock(src_rq, dest_rq);
+
+       wallclock = walt_ktime_clock();
+
+       walt_update_task_ravg(task_rq(p)->curr, task_rq(p),
+                       TASK_UPDATE, wallclock, 0);
+       walt_update_task_ravg(dest_rq->curr, dest_rq,
+                       TASK_UPDATE, wallclock, 0);
+
+       walt_update_task_ravg(p, task_rq(p), TASK_MIGRATE, wallclock, 0);
+
+       /*
+        * When a task is migrating during the wakeup, adjust
+        * the task's contribution towards cumulative window
+        * demand.
+        */
+       if (p->state == TASK_WAKING &&
+           p->last_sleep_ts >= src_rq->window_start) {
+               fixup_cum_window_demand(src_rq, -(s64)p->ravg.demand);
+               fixup_cum_window_demand(dest_rq, p->ravg.demand);
+       }
+
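+       /* The task's contributions to the current and previous window busy
+        * sums follow it, keeping both runqueues' counters consistent. */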
+       if (p->ravg.curr_window) {
+               src_rq->curr_runnable_sum -= p->ravg.curr_window;
+               dest_rq->curr_runnable_sum += p->ravg.curr_window;
+       }
+
+       if (p->ravg.prev_window) {
+               src_rq->prev_runnable_sum -= p->ravg.prev_window;
+               dest_rq->prev_runnable_sum += p->ravg.prev_window;
+       }
+
+       if ((s64)src_rq->prev_runnable_sum < 0) {
+               src_rq->prev_runnable_sum = 0;
+               WARN_ON(1);
+       }
+       if ((s64)src_rq->curr_runnable_sum < 0) {
+               src_rq->curr_runnable_sum = 0;
+               WARN_ON(1);
+       }
+
+       trace_walt_migration_update_sum(src_rq, p);
+       trace_walt_migration_update_sum(dest_rq, p);
+
+       if (p->state == TASK_WAKING)
+               double_rq_unlock(src_rq, dest_rq);
+}
+
+void walt_init_new_task_load(struct task_struct *p)
+{
+       int i;
+       u32 init_load_windows =
+                       div64_u64((u64)sysctl_sched_walt_init_task_load_pct *
+                          (u64)walt_ravg_window, 100);
+       u32 init_load_pct = current->init_load_pct;
+
+       p->init_load_pct = 0;
+       memset(&p->ravg, 0, sizeof(struct ravg));
+
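+       /* A new task has no history to learn from, so seed its demand with a
+        * percentage of the window: the parent's init_load_pct if it set one,
+        * otherwise the sched_walt_init_task_load_pct sysctl. */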
+       if (init_load_pct) {
+               init_load_windows = div64_u64((u64)init_load_pct *
+                         (u64)walt_ravg_window, 100);
+       }
+
+       p->ravg.demand = init_load_windows;
+       for (i = 0; i < RAVG_HIST_SIZE_MAX; ++i)
+               p->ravg.sum_history[i] = init_load_windows;
+}
diff --git a/kernel/sched/walt.h b/kernel/sched/walt.h
new file mode 100644 (file)
index 0000000..c7a4ef9
--- /dev/null
+++ b/kernel/sched/walt.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#ifndef __WALT_H
+#define __WALT_H
+
+#ifdef CONFIG_SCHED_WALT
+
+void walt_update_task_ravg(struct task_struct *p, struct rq *rq, int event,
+               u64 wallclock, u64 irqtime);
+void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p);
+void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p);
+void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p);
+void walt_fixup_busy_time(struct task_struct *p, int new_cpu);
+void walt_init_new_task_load(struct task_struct *p);
+void walt_mark_task_starting(struct task_struct *p);
+void walt_set_window_start(struct rq *rq, struct rq_flags *rf);
+void walt_migrate_sync_cpu(int cpu);
+u64 walt_ktime_clock(void);
+void walt_account_irqtime(int cpu, struct task_struct *curr, u64 delta,
+                                  u64 wallclock);
+
+u64 walt_irqload(int cpu);
+int walt_cpu_high_irqload(int cpu);
+
+#else /* CONFIG_SCHED_WALT */
+
+static inline void walt_update_task_ravg(struct task_struct *p, struct rq *rq,
+               int event, u64 wallclock, u64 irqtime) { }
+static inline void walt_inc_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_dec_cumulative_runnable_avg(struct rq *rq, struct task_struct *p) { }
+static inline void walt_inc_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p) { }
+static inline void walt_dec_cfs_cumulative_runnable_avg(struct cfs_rq *rq,
+               struct task_struct *p) { }
+static inline void walt_fixup_busy_time(struct task_struct *p, int new_cpu) { }
+static inline void walt_init_new_task_load(struct task_struct *p) { }
+static inline void walt_mark_task_starting(struct task_struct *p) { }
+static inline void walt_set_window_start(struct rq *rq, struct rq_flags *rf) { }
+static inline void walt_migrate_sync_cpu(int cpu) { }
+static inline u64 walt_ktime_clock(void) { return 0; }
+
+#define walt_cpu_high_irqload(cpu) false
+
+#endif /* CONFIG_SCHED_WALT */
+
+extern bool walt_disabled;
+
+#endif
index 4d931de68a188ed6c2af9f43a5055735e27a5949..bc15c2a8fe7ba52b8d17fd953f4c86c56957df4a 100644 (file)
@@ -329,6 +329,36 @@ static struct ctl_table kern_table[] = {
                .extra1         = &min_sched_granularity_ns,
                .extra2         = &max_sched_granularity_ns,
        },
+#ifdef CONFIG_SCHED_WALT
+       {
+               .procname       = "sched_use_walt_cpu_util",
+               .data           = &sysctl_sched_use_walt_cpu_util,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_use_walt_task_util",
+               .data           = &sysctl_sched_use_walt_task_util,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_walt_init_task_load_pct",
+               .data           = &sysctl_sched_walt_init_task_load_pct,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+       {
+               .procname       = "sched_walt_cpu_high_irqload",
+               .data           = &sysctl_sched_walt_cpu_high_irqload,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
+#endif
        {
                .procname       = "sched_sync_hint_enable",
                .data           = &sysctl_sched_sync_hint_enable,