From 94219ee2a96c9c7444012b9960a6e0ff83d1dc8f Mon Sep 17 00:00:00 2001 From: Park Bumgyu Date: Tue, 16 Jan 2018 19:01:05 +0900 Subject: [PATCH] [COMMON] sched: ehmp: support EHMP(Exynos HMP) Change-Id: Ie7ee8a84ed0fdc3a62d10a5b55488477edcdba7f Signed-off-by: Park Bumgyu --- include/linux/ehmp.h | 88 ++ include/linux/sched.h | 22 + include/trace/events/ehmp.h | 340 +++++++ include/trace/events/sched.h | 61 ++ init/Kconfig | 12 + kernel/sched/Makefile | 1 + kernel/sched/ehmp.c | 1670 ++++++++++++++++++++++++++++++++++ kernel/sched/fair.c | 135 +-- kernel/sched/features.h | 5 + kernel/sched/sched.h | 82 ++ kernel/sched/tune.c | 468 ++++++++++ kernel/sched/tune.h | 12 + 12 files changed, 2811 insertions(+), 85 deletions(-) create mode 100644 include/linux/ehmp.h create mode 100644 include/trace/events/ehmp.h create mode 100644 kernel/sched/ehmp.c diff --git a/include/linux/ehmp.h b/include/linux/ehmp.h new file mode 100644 index 000000000000..24948551e089 --- /dev/null +++ b/include/linux/ehmp.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017 Samsung Electronics Co., Ltd + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 and + * only version 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include +#include + +#ifdef CONFIG_SCHED_TUNE +enum stune_group { + STUNE_ROOT, + STUNE_FOREGROUND, + STUNE_BACKGROUND, + STUNE_TOPAPP, + STUNE_GROUP_COUNT, +}; +#endif + +struct gb_qos_request { + struct plist_node node; + char *name; + bool active; +}; + +#ifdef CONFIG_SCHED_EHMP +extern void exynos_init_entity_util_avg(struct sched_entity *se); +extern int exynos_need_active_balance(enum cpu_idle_type idle, + struct sched_domain *sd, int src_cpu, int dst_cpu); + +extern unsigned long global_boost(void); +extern int find_second_max_cap(void); + +extern int exynos_select_cpu(struct task_struct *p, int prev_cpu, + int sync, int sd_flag); + +extern void ontime_migration(void); +extern int ontime_can_migration(struct task_struct *p, int cpu); +extern void ontime_update_load_avg(u64 delta, int cpu, unsigned long weight, + struct sched_avg *sa); +extern void ontime_new_entity_load(struct task_struct *parent, + struct sched_entity *se); +extern void ontime_trace_task_info(struct task_struct *p); +extern void ehmp_update_max_cpu_capacity(int cpu, unsigned long val); + +extern void ehmp_update_overutilized(int cpu, unsigned long capacity); +extern bool ehmp_trigger_lb(int src_cpu, int dst_cpu); + +extern void gb_qos_update_request(struct gb_qos_request *req, u32 new_value); + +extern void request_kernel_prefer_perf(int grp_idx, int enable); +#else +static inline void exynos_init_entity_util_avg(struct sched_entity *se) { } +static inline int exynos_need_active_balance(enum cpu_idle_type idle, + struct sched_domain *sd, int src_cpu, int dst_cpu) { return 0; } + +static inline unsigned long global_boost(void) { return 0; } +static inline int find_second_max_cap(void) { return -EINVAL; } + +static inline int exynos_select_cpu(struct task_struct *p, + int prev_cpu) { return -EINVAL; } +static inline int exynos_select_cpu(struct task_struct *p, int prev_cpu, + int sync, int sd_flag) { return -EINVAL; } + +static inline void ontime_migration(void) { } +static inline int 
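+/* stub: with EHMP disabled, ontime never restricts task migration */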
ontime_can_migration(struct task_struct *p, int cpu) { return 1; } +static inline void ontime_update_load_avg(u64 delta, int cpu, unsigned long weight, + struct sched_avg *sa) { } +static inline void ontime_new_entity_load(struct task_struct *p, + struct sched_entity *se) { } +static inline void ontime_trace_task_info(struct task_struct *p) { } + +static inline void ehmp_update_max_cpu_capacity(int cpu, unsigned long val) { } + +static inline void ehmp_update_overutilized(int cpu, unsigned long capacity) { } +static inline bool ehmp_trigger_lb(int src_cpu, int dst_cpu) { return false; } + +static inline void gb_qos_update_request(struct gb_qos_request *req, u32 new_value) { } + +static inline void request_kernel_prefer_perf(int grp_idx, int enable) { } +#endif /* CONFIG_SCHED_EHMP */ diff --git a/include/linux/sched.h b/include/linux/sched.h index f56992ff5508..9e4757aa1704 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -417,6 +417,25 @@ struct sched_avg { struct util_est util_est; }; +#ifdef CONFIG_SCHED_EHMP +#define NOT_ONTIME 1 +#define ONTIME_MIGRATING 2 +#define ONTIME 4 + +struct ontime_avg { + u64 ontime_migration_time; + u64 load_sum; + u32 period_contrib; + unsigned long load_avg; +}; + +struct ontime_entity { + struct ontime_avg avg; + int flags; + int cpu; +}; +#endif + struct sched_statistics { #ifdef CONFIG_SCHEDSTATS u64 wait_start; @@ -487,6 +506,9 @@ struct sched_entity { */ struct sched_avg avg ____cacheline_aligned_in_smp; #endif +#ifdef CONFIG_SCHED_EHMP + struct ontime_entity ontime; +#endif }; #ifdef CONFIG_SCHED_WALT diff --git a/include/trace/events/ehmp.h b/include/trace/events/ehmp.h new file mode 100644 index 000000000000..cd99ba3e93e3 --- /dev/null +++ b/include/trace/events/ehmp.h @@ -0,0 +1,340 @@ +/* + * Copyright (C) 2017 Park Bumgyu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM ehmp + +#if !defined(_TRACE_EHMP_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EHMP_H + +#include +#include + +/* + * Tracepoint for selection of boost cpu + */ +TRACE_EVENT(ehmp_select_boost_cpu, + + TP_PROTO(struct task_struct *p, int cpu, int trigger, char *state), + + TP_ARGS(p, cpu, trigger, state), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( int, trigger ) + __array( char, state, 64 ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->cpu = cpu; + __entry->trigger = trigger; + memcpy(__entry->state, state, 64); + ), + + TP_printk("comm=%s pid=%d target_cpu=%d trigger=%d state=%s", + __entry->comm, __entry->pid, __entry->cpu, + __entry->trigger, __entry->state) +); + +/* + * Tracepoint for selection of group balancer + */ +TRACE_EVENT(ehmp_select_group_boost, + + TP_PROTO(struct task_struct *p, int cpu, char *state), + + TP_ARGS(p, cpu, state), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __array( char, state, 64 ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->cpu = cpu; + memcpy(__entry->state, state, 64); + ), + + TP_printk("comm=%s pid=%d target_cpu=%d state=%s", + __entry->comm, __entry->pid, __entry->cpu, __entry->state) +); + +TRACE_EVENT(ehmp_global_boost, + + TP_PROTO(char *name, unsigned long boost), + + TP_ARGS(name, boost), + + TP_STRUCT__entry( + __array( char, name, 64 ) + __field( unsigned long, boost ) + ), + + TP_fast_assign( + memcpy(__entry->name, name, 64); + __entry->boost = boost; + ), + + TP_printk("name=%s global_boost_value=%ld", __entry->name, __entry->boost) +); + +/* + * Tracepoint for prefer idle + */ +TRACE_EVENT(ehmp_prefer_idle, + + TP_PROTO(struct task_struct *p, int orig_cpu, int target_cpu, + unsigned long task_util, unsigned long new_util, int idle), + + TP_ARGS(p, orig_cpu, target_cpu, task_util, new_util, idle), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, orig_cpu ) + __field( int, target_cpu ) + __field( unsigned long, task_util ) + __field( unsigned long, new_util ) + __field( int, idle ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->orig_cpu = orig_cpu; + __entry->target_cpu = target_cpu; + __entry->task_util = task_util; + __entry->new_util = new_util; + __entry->idle = idle; + ), + + TP_printk("comm=%s pid=%d orig_cpu=%d target_cpu=%d task_util=%lu new_util=%lu idle=%d", + __entry->comm, __entry->pid, __entry->orig_cpu, __entry->target_cpu, + __entry->task_util, __entry->new_util, __entry->idle) +); + +TRACE_EVENT(ehmp_prefer_idle_cpu_select, + + TP_PROTO(struct task_struct *p, int cpu), + + TP_ARGS(p, cpu), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->cpu = cpu; + ), + + TP_printk("comm=%s pid=%d target_cpu=%d", + __entry->comm, __entry->pid, __entry->cpu) +); + +/* + * Tracepoint for cpu selection + */ +TRACE_EVENT(ehmp_find_best_target_stat, + + TP_PROTO(int cpu, unsigned long cap, unsigned long util, unsigned long target_util), + + TP_ARGS(cpu, cap, util, target_util), + + TP_STRUCT__entry( + __field( int, cpu ) + __field( unsigned long, 
cap ) + __field( unsigned long, util ) + __field( unsigned long, target_util ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->cap = cap; + __entry->util = util; + __entry->target_util = target_util; + ), + + TP_printk("find_best : [cpu%d] capacity %lu, util %lu, target_util %lu\n", + __entry->cpu, __entry->cap, __entry->util, __entry->target_util) +); + +TRACE_EVENT(ehmp_find_best_target_candi, + + TP_PROTO(unsigned int cpu), + + TP_ARGS(cpu), + + TP_STRUCT__entry( + __field( unsigned int, cpu ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + ), + + TP_printk("find_best: energy candidate cpu %d\n", __entry->cpu) +); + +TRACE_EVENT(ehmp_find_best_target_cpu, + + TP_PROTO(unsigned int cpu, unsigned long target_util), + + TP_ARGS(cpu, target_util), + + TP_STRUCT__entry( + __field( unsigned int, cpu ) + __field( unsigned long, target_util ) + ), + + TP_fast_assign( + __entry->cpu = cpu; + __entry->target_util = target_util; + ), + + TP_printk("find_best: target_cpu %d, target_util %lu\n", __entry->cpu, __entry->target_util) +); + +/* + * Tracepoint for ontime migration + */ +TRACE_EVENT(ehmp_ontime_migration, + + TP_PROTO(struct task_struct *p, unsigned long load, + int src_cpu, int dst_cpu, int boost_migration), + + TP_ARGS(p, load, src_cpu, dst_cpu, boost_migration), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( unsigned long, load ) + __field( int, src_cpu ) + __field( int, dst_cpu ) + __field( int, bm ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->load = load; + __entry->src_cpu = src_cpu; + __entry->dst_cpu = dst_cpu; + __entry->bm = boost_migration; + ), + + TP_printk("comm=%s pid=%d ontime_load_avg=%lu src_cpu=%d dst_cpu=%d boost_migration=%d", + __entry->comm, __entry->pid, __entry->load, + __entry->src_cpu, __entry->dst_cpu, __entry->bm) +); + +/* + * Tracepoint for accounting ontime load averages for tasks. + */ +TRACE_EVENT(ehmp_ontime_new_entity_load, + + TP_PROTO(struct task_struct *tsk, struct ontime_avg *avg), + + TP_ARGS(tsk, avg), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( u64, load_sum ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = task_cpu(tsk); + __entry->load_avg = avg->load_avg; + __entry->load_sum = avg->load_sum; + ), + TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu load_sum=%llu", + __entry->comm, + __entry->pid, + __entry->cpu, + __entry->load_avg, + (u64)__entry->load_sum) +); + +/* + * Tracepoint for accounting ontime load averages for tasks. 
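+ * (ontime_flag: 1=NOT_ONTIME, 2=ONTIME_MIGRATING, 4=ONTIME; an example
+ * record, with illustrative values, reads:
+ *   comm=foo pid=1234 cpu=4 load_avg=620 load_sum=29000000 ontime_flag=4)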
+ */ +TRACE_EVENT(ehmp_ontime_load_avg_task, + + TP_PROTO(struct task_struct *tsk, struct ontime_avg *avg, int ontime_flag), + + TP_ARGS(tsk, avg, ontime_flag), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( unsigned long, load_avg ) + __field( u64, load_sum ) + __field( int, ontime_flag ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = task_cpu(tsk); + __entry->load_avg = avg->load_avg; + __entry->load_sum = avg->load_sum; + __entry->ontime_flag = ontime_flag; + ), + TP_printk("comm=%s pid=%d cpu=%d load_avg=%lu load_sum=%llu ontime_flag=%d", + __entry->comm, __entry->pid, __entry->cpu, __entry->load_avg, + (u64)__entry->load_sum, __entry->ontime_flag) +); + +TRACE_EVENT(ehmp_ontime_check_migrate, + + TP_PROTO(struct task_struct *tsk, int cpu, int migrate, char *label), + + TP_ARGS(tsk, cpu, migrate, label), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, cpu ) + __field( int, migrate ) + __array( char, label, 64 ) + ), + + TP_fast_assign( + memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN); + __entry->pid = tsk->pid; + __entry->cpu = cpu; + __entry->migrate = migrate; + strncpy(__entry->label, label, 64); + ), + + TP_printk("comm=%s pid=%d target_cpu=%d migrate=%d reason=%s", + __entry->comm, __entry->pid, __entry->cpu, + __entry->migrate, __entry->label) +); + +#endif /* _TRACE_EHMP_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index f1cb20ce6892..6494144d42aa 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -967,6 +967,67 @@ TRACE_EVENT(sched_tune_tasks_update, __entry->group_ts) ); +/* + * Tracepoint for schedtune_grouputil_update + */ +TRACE_EVENT(sched_tune_grouputil_update, + + TP_PROTO(int idx, int total, int accumulated, unsigned long group_util, + struct task_struct *heaviest_p, unsigned long biggest_util), + + TP_ARGS(idx, total, accumulated, group_util, heaviest_p, biggest_util), + + TP_STRUCT__entry( + __field( int, idx ) + __field( int, total ) + __field( int, accumulated ) + __field( unsigned long, group_util ) + __field( pid_t, pid ) + __array( char, comm, TASK_COMM_LEN ) + __field( unsigned long, biggest_util ) + ), + + TP_fast_assign( + __entry->idx = idx; + __entry->total = total; + __entry->accumulated = accumulated; + __entry->group_util = group_util; + __entry->pid = heaviest_p->pid; + memcpy(__entry->comm, heaviest_p->comm, TASK_COMM_LEN); + __entry->biggest_util = biggest_util; + ), + + TP_printk("idx=%d total=%d accumulated=%d group_util=%lu " + "heaviest task(pid=%d comm=%s util=%lu)", + __entry->idx, __entry->total, __entry->accumulated, __entry->group_util, + __entry->pid, __entry->comm, __entry->biggest_util) +); + +/* + * Tracepoint for checking group balancing + */ +TRACE_EVENT(sched_tune_check_group_balance, + + TP_PROTO(int idx, int ib_count, bool balancing), + + TP_ARGS(idx, ib_count, balancing), + + TP_STRUCT__entry( + __field( int, idx ) + __field( int, ib_count ) + __field( bool, balancing ) + ), + + TP_fast_assign( + __entry->idx = idx; + __entry->ib_count = ib_count; + __entry->balancing = balancing; + ), + + TP_printk("idx=%d imbalance_count=%d balancing=%d", + __entry->idx, __entry->ib_count, __entry->balancing) +); + /* * Tracepoint for schedtune_boostgroup_update */ diff --git a/init/Kconfig b/init/Kconfig index 3988656c235d..f1fa25160c22 
100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1066,6 +1066,18 @@ config SCHED_TUNE If unsure, say N. +config SCHED_EHMP + bool "Exynos scheduler for Heterogeneous Multi-Processor" + depends on SMP + help + This option supports Exynos scheduler for HMP architecture. It is + designed to secure the limits of energy aware scheduler. This option + provides features such as independent boosting functinos such as + global boost and on-time migration, and prefer_perf and enhanced + prefer_idle that work in conjunction with SCHEDTUNE. + + If unsure, say N. + config DEFAULT_USE_ENERGY_AWARE bool "Default to enabling the Energy Aware Scheduler feature" default n diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile index 7d5422367729..3fedfec4697a 100644 --- a/kernel/sched/Makefile +++ b/kernel/sched/Makefile @@ -20,6 +20,7 @@ obj-y += core.o loadavg.o clock.o cputime.o obj-y += idle_task.o fair.o rt.o deadline.o obj-y += wait.o wait_bit.o swait.o completion.o idle.o obj-$(CONFIG_SMP) += cpupri.o cpudeadline.o topology.o stop_task.o +obj-$(CONFIG_SCHED_EHMP) += ehmp.o obj-$(CONFIG_GENERIC_ARCH_TOPOLOGY) += energy.o obj-$(CONFIG_SCHED_WALT) += walt.o obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o diff --git a/kernel/sched/ehmp.c b/kernel/sched/ehmp.c new file mode 100644 index 000000000000..73b1692f16f5 --- /dev/null +++ b/kernel/sched/ehmp.c @@ -0,0 +1,1670 @@ +/* + * Exynos scheduler for Heterogeneous Multi-Processing (HMP) + * + * Copyright (C) 2017 Samsung Electronics Co., Ltd + * Park Bumgyu + */ + +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +#include "sched.h" +#include "tune.h" + +static unsigned long task_util(struct task_struct *p) +{ + return p->se.avg.util_avg; +} + +static inline struct task_struct *task_of(struct sched_entity *se) +{ + return container_of(se, struct task_struct, se); +} + +static inline struct sched_entity *se_of(struct sched_avg *sa) +{ + return container_of(sa, struct sched_entity, avg); +} + +#define entity_is_cfs_rq(se) (se->my_q) +#define entity_is_task(se) (!se->my_q) +#define LOAD_AVG_MAX 47742 + +static unsigned long maxcap_val = 1024; +static int maxcap_cpu = 7; + +void ehmp_update_max_cpu_capacity(int cpu, unsigned long val) +{ + maxcap_cpu = cpu; + maxcap_val = val; +} + +static inline struct device_node *get_ehmp_node(void) +{ + return of_find_node_by_path("/cpus/ehmp"); +} + +static inline struct cpumask *sched_group_cpus(struct sched_group *sg) +{ + return to_cpumask(sg->cpumask); +} + +static bool sd_overutilized(struct sched_domain *sd) +{ + return sd->shared->overutilized; +} + +#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed) + +/********************************************************************** + * task initialization * + **********************************************************************/ +void exynos_init_entity_util_avg(struct sched_entity *se) +{ + struct cfs_rq *cfs_rq = se->cfs_rq; + struct sched_avg *sa = &se->avg; + int cpu = cpu_of(cfs_rq->rq); + unsigned long cap_org = capacity_orig_of(cpu); + long cap = (long)(cap_org - cfs_rq->avg.util_avg) / 2; + + if (cap > 0) { + if (cfs_rq->avg.util_avg != 0) { + sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; + sa->util_avg /= (cfs_rq->avg.load_avg + 1); + + if (sa->util_avg > cap) + sa->util_avg = cap; + } else { + sa->util_avg = cap_org >> 2; + } + /* + * If we wish to restore tuning via setting initial util, + * this is where we should do it. 
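+	 * For example (illustrative numbers): a new task forked onto an idle
+	 * cpu with capacity_orig 1024 starts at util_avg = 1024 >> 2 = 256;
+	 * on a busy cpu the value is scaled by the task's weight and capped
+	 * at half of the spare capacity computed above.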
+ */ + sa->util_sum = sa->util_avg * LOAD_AVG_MAX; + } +} + +/********************************************************************** + * load balance * + **********************************************************************/ +bool cpu_overutilized(int cpu); + +#define lb_sd_parent(sd) \ + (sd->parent && sd->parent->groups != sd->parent->groups->next) + +static inline int +check_cpu_capacity(struct rq *rq, struct sched_domain *sd) +{ + return ((rq->cpu_capacity * sd->imbalance_pct) < + (rq->cpu_capacity_orig * 100)); +} + +unsigned long global_boost(void); +int exynos_need_active_balance(enum cpu_idle_type idle, struct sched_domain *sd, + int src_cpu, int dst_cpu) +{ + unsigned int src_imb_pct = lb_sd_parent(sd) ? sd->imbalance_pct : 1; + unsigned int dst_imb_pct = lb_sd_parent(sd) ? 100 : 1; + unsigned long src_cap = capacity_of(src_cpu); + unsigned long dst_cap = capacity_of(dst_cpu); + + if ((idle != CPU_NOT_IDLE) && + (cpu_rq(src_cpu)->cfs.h_nr_running == 1)) { + if ((check_cpu_capacity(cpu_rq(src_cpu), sd)) && + (src_cap * sd->imbalance_pct < dst_cap * 100)) { + return 1; + } + + if (!lb_sd_parent(sd) && src_cap < dst_cap) + if (cpu_overutilized(src_cpu) || global_boost()) + return 1; + } + + if ((src_cap * src_imb_pct < dst_cap * dst_imb_pct) && + cpu_rq(src_cpu)->cfs.h_nr_running == 1 && + cpu_overutilized(src_cpu) && + !cpu_overutilized(dst_cpu)) { + return 1; + } + + return unlikely(sd->nr_balance_failed > sd->cache_nice_tries + 2); +} + +/********************************************************************** + * load balance_trigger * + **********************************************************************/ +struct lbt_overutil { + /* + * overutil_ratio means + * N < 0 : disable user_overutilized + * N == 0 : Always overutilized + * N > 0 : overutil_cap = org_capacity * overutil_ratio / 100 + */ + unsigned long overutil_cap; + int overutil_ratio; +}; + +DEFINE_PER_CPU(struct lbt_overutil, ehmp_bot_overutil); +DEFINE_PER_CPU(struct lbt_overutil, ehmp_top_overutil); +#define DISABLE_OU -1 + +bool cpu_overutilized(int cpu) +{ + struct lbt_overutil *ou = &per_cpu(ehmp_top_overutil, cpu); + + /* + * If top overutil is disabled, use main stream condition + * in the fair.c + */ + if (ou->overutil_ratio == DISABLE_OU) + return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * 1280); + + return cpu_util(cpu) > ou->overutil_cap; +} + +static bool inline lbt_top_overutilized(int cpu) +{ +// struct rq *rq = cpu_rq(cpu); +// return sched_feat(ENERGY_AWARE) && rq->rd->overutilized; + return sched_feat(ENERGY_AWARE); +} + +static bool inline lbt_bot_overutilized(int cpu) +{ + struct lbt_overutil *ou = &per_cpu(ehmp_bot_overutil, cpu); + + /* if bot overutil is disabled, return false */ + if (ou->overutil_ratio == DISABLE_OU) + return false; + + return cpu_util(cpu) > ou->overutil_cap; +} + +static void inline lbt_update_overutilized(int cpu, + unsigned long capacity, bool top) +{ + struct lbt_overutil *ou; + ou = top ? 
&per_cpu(ehmp_top_overutil, cpu) : + &per_cpu(ehmp_bot_overutil, cpu); + + if (ou->overutil_ratio == DISABLE_OU) + ou->overutil_cap = 0; + else + ou->overutil_cap = (capacity * ou->overutil_ratio) / 100; +} + +void ehmp_update_overutilized(int cpu, unsigned long capacity) +{ + lbt_update_overutilized(cpu, capacity, true); + lbt_update_overutilized(cpu, capacity, false); +} + +static bool lbt_is_same_group(int src_cpu, int dst_cpu) +{ + struct sched_domain *sd = rcu_dereference(per_cpu(sd_ea, src_cpu)); + struct sched_group *sg; + + if (!sd) + return false; + + sg = sd->groups; + return cpumask_test_cpu(dst_cpu, sched_group_cpus(sg)); +} + +static bool lbt_overutilized(int src_cpu, int dst_cpu) +{ + bool top_overutilized, bot_overutilized; + + /* src and dst are in the same domain, check top_overutilized */ + top_overutilized = lbt_top_overutilized(src_cpu); + if (!lbt_is_same_group(src_cpu, dst_cpu)) + return top_overutilized; + + /* check bot overutilized */ + bot_overutilized = lbt_bot_overutilized(src_cpu); + return bot_overutilized || top_overutilized; +} + +static ssize_t _show_overutil(char *buf, bool top) +{ + struct sched_domain *sd; + struct sched_group *sg; + struct lbt_overutil *ou; + int cpu, ret = 0; + + rcu_read_lock(); + + sd = rcu_dereference(per_cpu(sd_ea, 0)); + if (!sd) { + rcu_read_unlock(); + return ret; + } + + sg = sd->groups; + do { + for_each_cpu_and(cpu, sched_group_cpus(sg), cpu_active_mask) { + ou = top ? &per_cpu(ehmp_top_overutil, cpu) : + &per_cpu(ehmp_bot_overutil, cpu); + ret += sprintf(buf + ret, "cpu%d ratio:%3d cap:%4lu\n", + cpu, ou->overutil_ratio, ou->overutil_cap); + + } + } while (sg = sg->next, sg != sd->groups); + + rcu_read_unlock(); + return ret; +} + +static ssize_t _store_overutil(const char *buf, + size_t count, bool top) +{ + struct sched_domain *sd; + struct sched_group *sg; + struct lbt_overutil *ou; + unsigned long capacity; + int cpu; + const char *cp = buf; + int tokenized_data; + + rcu_read_lock(); + + sd = rcu_dereference(per_cpu(sd_ea, 0)); + if (!sd) { + rcu_read_unlock(); + return count; + } + + sg = sd->groups; + do { + if (sscanf(cp, "%d", &tokenized_data) != 1) + tokenized_data = -1; + + for_each_cpu_and(cpu, sched_group_cpus(sg), cpu_active_mask) { + ou = top ? 
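+			/* one ratio token per sched group, tokens separated by
+			 * ' ' or ':' (see strpbrk() below); a negative value
+			 * disables the per-group overutil setting */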
&per_cpu(ehmp_top_overutil, cpu) : + &per_cpu(ehmp_bot_overutil, cpu); + ou->overutil_ratio = tokenized_data; + + capacity = arch_scale_cpu_capacity(sd, cpu); + ehmp_update_overutilized(cpu, capacity); + } + + cp = strpbrk(cp, " :"); + if (!cp) + break; + cp++; + } while (sg = sg->next, sg != sd->groups); + + rcu_read_unlock(); + return count; +} + +static ssize_t show_top_overutil(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return _show_overutil(buf, true); +} +static ssize_t store_top_overutil(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + return _store_overutil(buf, count, true); +} +static ssize_t show_bot_overutil(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return _show_overutil(buf, false); +} +static ssize_t store_bot_overutil(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + return _store_overutil(buf, count, false); +} + +static struct kobj_attribute top_overutil_attr = +__ATTR(top_overutil, 0644, show_top_overutil, store_top_overutil); +static struct kobj_attribute bot_overutil_attr = +__ATTR(bot_overutil, 0644, show_bot_overutil, store_bot_overutil); + +static int __init init_lbt(void) +{ + struct device_node *dn; + int top_ou[NR_CPUS] = {-1, }, bot_ou[NR_CPUS] = {-1, }; + int cpu; + + dn = get_ehmp_node(); + if (!dn) + return 0; + + if (of_property_read_u32_array(dn, "top-overutil", top_ou, NR_CPUS) < 0) + return 0; + + if (of_property_read_u32_array(dn, "bot-overutil", bot_ou, NR_CPUS) < 0) + return 0; + + for_each_possible_cpu(cpu) { + per_cpu(ehmp_top_overutil, cpu).overutil_ratio = top_ou[cpu]; + per_cpu(ehmp_bot_overutil, cpu).overutil_ratio = bot_ou[cpu]; + } + + return 0; +} +pure_initcall(init_lbt); + +bool ehmp_trigger_lb(int src_cpu, int dst_cpu) +{ + /* check overutilized condition */ + return lbt_overutilized(src_cpu, dst_cpu); +} + +/********************************************************************** + * Global boost * + **********************************************************************/ +static unsigned long gb_value = 0; +static unsigned long gb_max_value = 0; +static struct gb_qos_request gb_req_user = +{ + .name = "ehmp_gb_req_user", +}; + +static struct plist_head gb_list = PLIST_HEAD_INIT(gb_list); + +static DEFINE_SPINLOCK(gb_lock); + +static int gb_qos_max_value(void) +{ + return plist_last(&gb_list)->prio; +} + +static int gb_qos_req_value(struct gb_qos_request *req) +{ + return req->node.prio; +} + +void gb_qos_update_request(struct gb_qos_request *req, u32 new_value) +{ + unsigned long flags; + + if (req->node.prio == new_value) + return; + + spin_lock_irqsave(&gb_lock, flags); + + if (req->active) + plist_del(&req->node, &gb_list); + else + req->active = 1; + + plist_node_init(&req->node, new_value); + plist_add(&req->node, &gb_list); + + gb_value = gb_max_value * gb_qos_max_value() / 100; + trace_ehmp_global_boost(req->name, new_value); + + spin_unlock_irqrestore(&gb_lock, flags); +} + +static ssize_t show_global_boost(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + struct gb_qos_request *req; + int ret = 0; + + plist_for_each_entry(req, &gb_list, node) + ret += snprintf(buf + ret, 30, "%s : %d\n", + req->name, gb_qos_req_value(req)); + + return ret; +} + +static ssize_t store_global_boost(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + unsigned int input; + + if (!sscanf(buf, "%d", &input)) + return -EINVAL; + + gb_qos_update_request(&gb_req_user, 
input); + + return count; +} + +static struct kobj_attribute global_boost_attr = +__ATTR(global_boost, 0644, show_global_boost, store_global_boost); + +#define BOOT_BOOST_DURATION 40000000 /* microseconds */ +unsigned long global_boost(void) +{ + u64 now = ktime_to_us(ktime_get()); + + if (now < BOOT_BOOST_DURATION) + return gb_max_value; + + return gb_value; +} + +int find_second_max_cap(void) +{ + struct sched_domain *sd = rcu_dereference(per_cpu(sd_ea, 0)); + struct sched_group *sg; + int max_cap = 0, second_max_cap = 0; + + if (!sd) + return 0; + + sg = sd->groups; + do { + int i; + + for_each_cpu(i, sched_group_cpus(sg)) { + if (max_cap < cpu_rq(i)->cpu_capacity_orig) { + second_max_cap = max_cap; + max_cap = cpu_rq(i)->cpu_capacity_orig; + } + } + } while (sg = sg->next, sg != sd->groups); + + return second_max_cap; +} + +static int __init init_global_boost(void) +{ + gb_max_value = find_second_max_cap() + 1; + + return 0; +} +pure_initcall(init_global_boost); + +/********************************************************************** + * Boost cpu selection (global boost, schedtune.prefer_perf) * + **********************************************************************/ +#define cpu_selected(cpu) (cpu >= 0) + +int kernel_prefer_perf(int grp_idx); +static ssize_t show_prefer_perf(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + int i, ret = 0; + + for (i = 0; i < STUNE_GROUP_COUNT; i++) + ret += snprintf(buf + ret, 10, "%d ", kernel_prefer_perf(i)); + + ret += snprintf(buf + ret, 10, "\n"); + + return ret; +} + +static struct kobj_attribute prefer_perf_attr = +__ATTR(kernel_prefer_perf, 0444, show_prefer_perf, NULL); + +enum { + BT_PREFER_PERF = 0, + BT_GROUP_BALANCE, + BT_GLOBAL_BOOST, +}; + +struct boost_trigger { + int trigger; + int boost_val; +}; + +static int check_boost_trigger(struct task_struct *p, struct boost_trigger *bt) +{ + int gb; + +#ifdef CONFIG_SCHED_TUNE + if (schedtune_prefer_perf(p) > 0) { + bt->trigger = BT_PREFER_PERF; + bt->boost_val = schedtune_perf_threshold(); + return 1; + } + + if (schedtune_need_group_balance(p) > 0) { + bt->trigger = BT_GROUP_BALANCE; + bt->boost_val = schedtune_perf_threshold(); + return 1; + } +#endif + + gb = global_boost(); + if (gb) { + bt->trigger = BT_GLOBAL_BOOST; + bt->boost_val = gb; + return 1; + } + + /* not boost state */ + return 0; +} + +static int boost_select_cpu(struct task_struct *p, struct cpumask *target_cpus) +{ + int i, cpu = 0; + + if (cpumask_empty(target_cpus)) + return -1; + + if (cpumask_test_cpu(task_cpu(p), target_cpus)) + return task_cpu(p); + + /* Return last cpu in target_cpus */ + for_each_cpu(i, target_cpus) + cpu = i; + + return cpu; +} + +static void mark_shallowest_cpu(int cpu, unsigned int *min_exit_latency, + struct cpumask *shallowest_cpus) +{ + struct rq *rq = cpu_rq(cpu); + struct cpuidle_state *idle = idle_get_state(rq); + + /* Before enabling cpuidle, all idle cpus are marked */ + if (!idle) { + cpumask_set_cpu(cpu, shallowest_cpus); + return; + } + + /* Deeper idle cpu is ignored */ + if (idle->exit_latency > *min_exit_latency) + return; + + /* if shallower idle cpu is found, previsouly founded cpu is ignored */ + if (idle->exit_latency < *min_exit_latency) { + cpumask_clear(shallowest_cpus); + *min_exit_latency = idle->exit_latency; + } + + cpumask_set_cpu(cpu, shallowest_cpus); +} +static int check_migration_task(struct task_struct *p) +{ + return !p->se.avg.last_update_time; +} + +static unsigned long cpu_util_wake(int cpu, struct task_struct *p) +{ + unsigned long util, 
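+	/* utilization of @cpu with @p's own contribution removed (p is waking
+	 * up and may still be accounted here), clamped to capacity_orig */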
capacity; + + /* Task has no contribution or is new */ + if (cpu != task_cpu(p) || check_migration_task(p)) + return cpu_util(cpu); + + capacity = capacity_orig_of(cpu); + util = max_t(long, cpu_util(cpu) - task_util(p), 0); + + return (util >= capacity) ? capacity : util; +} + +static int find_group_boost_target(struct task_struct *p) +{ + struct sched_domain *sd; + int shallowest_cpu = -1; + int lowest_cpu = -1; + unsigned int min_exit_latency = UINT_MAX; + unsigned long lowest_util = ULONG_MAX; + int target_cpu = -1; + int cpu; + char state[30] = "fail"; + + sd = rcu_dereference(per_cpu(sd_ea, maxcap_cpu)); + if (!sd) + return target_cpu; + + if (cpumask_test_cpu(task_cpu(p), sched_group_cpus(sd->groups))) { + if (idle_cpu(task_cpu(p))) { + target_cpu = task_cpu(p); + strcpy(state, "current idle"); + goto find_target; + } + } + + for_each_cpu_and(cpu, tsk_cpus_allowed(p), sched_group_cpus(sd->groups)) { + unsigned long util = cpu_util_wake(cpu, p); + + if (idle_cpu(cpu)) { + struct cpuidle_state *idle; + + idle = idle_get_state(cpu_rq(cpu)); + if (!idle) { + target_cpu = cpu; + strcpy(state, "idle wakeup"); + goto find_target; + } + + if (idle->exit_latency < min_exit_latency) { + min_exit_latency = idle->exit_latency; + shallowest_cpu = cpu; + continue; + } + } + + if (cpu_selected(shallowest_cpu)) + continue; + + if (util < lowest_util) { + lowest_cpu = cpu; + lowest_util = util; + } + } + + if (cpu_selected(shallowest_cpu)) { + target_cpu = shallowest_cpu; + strcpy(state, "shallowest idle"); + goto find_target; + } + + if (cpu_selected(lowest_cpu)) { + target_cpu = lowest_cpu; + strcpy(state, "lowest util"); + } + +find_target: + trace_ehmp_select_group_boost(p, target_cpu, state); + + return target_cpu; +} + +static int +find_boost_target(struct sched_domain *sd, struct task_struct *p, + unsigned long min_util, struct boost_trigger *bt) +{ + struct sched_group *sg; + int boost = bt->boost_val; + unsigned long max_capacity; + struct cpumask boost_candidates; + struct cpumask backup_boost_candidates; + unsigned int min_exit_latency = UINT_MAX; + unsigned int backup_min_exit_latency = UINT_MAX; + int target_cpu; + bool go_up = false; + unsigned long lowest_util = ULONG_MAX; + int lowest_cpu = -1; + char state[30] = "fail"; + + if (bt->trigger == BT_GROUP_BALANCE) + return find_group_boost_target(p); + + cpumask_setall(&boost_candidates); + cpumask_clear(&backup_boost_candidates); + + max_capacity = maxcap_val; + + sg = sd->groups; + + do { + int i; + + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + unsigned long new_util, wake_util; + + if (!cpu_online(i)) + continue; + + wake_util = cpu_util_wake(i, p); + new_util = wake_util + task_util(p); + new_util = max(min_util, new_util); + + if (min(new_util + boost, max_capacity) > capacity_orig_of(i)) { + if (!cpu_rq(i)->nr_running) + mark_shallowest_cpu(i, &backup_min_exit_latency, + &backup_boost_candidates); + else if (cpumask_test_cpu(task_cpu(p), sched_group_cpus(sg))) + go_up = true; + + continue; + } + + if (cpumask_weight(&boost_candidates) >= nr_cpu_ids) + cpumask_clear(&boost_candidates); + + if (!cpu_rq(i)->nr_running) { + mark_shallowest_cpu(i, &min_exit_latency, &boost_candidates); + continue; + } + + if (wake_util < lowest_util) { + lowest_util = wake_util; + lowest_cpu = i; + } + } + + if (cpumask_weight(&boost_candidates) >= nr_cpu_ids) + continue; + + target_cpu = boost_select_cpu(p, &boost_candidates); + if (cpu_selected(target_cpu)) { + strcpy(state, "big idle"); + goto out; + } + + target_cpu = 
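+		/* no idle cpu could absorb the boosted utilization; fall back
+		 * to the idle-but-over-capacity cpus ("little idle" below) */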
boost_select_cpu(p, &backup_boost_candidates); + if (cpu_selected(target_cpu)) { + strcpy(state, "little idle"); + goto out; + } + } while (sg = sg->next, sg != sd->groups); + + if (go_up) { + strcpy(state, "lowest big cpu"); + target_cpu = lowest_cpu; + goto out; + } + + strcpy(state, "current cpu"); + target_cpu = task_cpu(p); + +out: + trace_ehmp_select_boost_cpu(p, target_cpu, bt->trigger, state); + return target_cpu; +} + +/********************************************************************** + * schedtune.prefer_idle * + **********************************************************************/ +static void mark_lowest_cpu(int cpu, unsigned long new_util, + int *lowest_cpu, unsigned long *lowest_util) +{ + if (new_util >= *lowest_util) + return; + + *lowest_util = new_util; + *lowest_cpu = cpu; +} + +static int find_prefer_idle_target(struct sched_domain *sd, + struct task_struct *p, unsigned long min_util) +{ + struct sched_group *sg; + int target_cpu = -1; + int lowest_cpu = -1; + int lowest_idle_cpu = -1; + int overcap_cpu = -1; + unsigned long lowest_util = ULONG_MAX; + unsigned long lowest_idle_util = ULONG_MAX; + unsigned long overcap_util = ULONG_MAX; + struct cpumask idle_candidates; + struct cpumask overcap_idle_candidates; + + cpumask_clear(&idle_candidates); + cpumask_clear(&overcap_idle_candidates); + + sg = sd->groups; + + do { + int i; + + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + unsigned long new_util, wake_util; + + if (!cpu_online(i)) + continue; + + wake_util = cpu_util_wake(i, p); + new_util = wake_util + task_util(p); + new_util = max(min_util, new_util); + + trace_ehmp_prefer_idle(p, task_cpu(p), i, task_util(p), + new_util, idle_cpu(i)); + + if (new_util > capacity_orig_of(i)) { + if (idle_cpu(i)) { + cpumask_set_cpu(i, &overcap_idle_candidates); + mark_lowest_cpu(i, new_util, + &overcap_cpu, &overcap_util); + } + + continue; + } + + if (idle_cpu(i)) { + if (task_cpu(p) == i) { + target_cpu = i; + break; + } + + cpumask_set_cpu(i, &idle_candidates); + mark_lowest_cpu(i, new_util, + &lowest_idle_cpu, &lowest_idle_util); + + continue; + } + + mark_lowest_cpu(i, new_util, &lowest_cpu, &lowest_util); + } + + if (cpu_selected(target_cpu)) + break; + + if (cpumask_weight(&idle_candidates)) { + target_cpu = lowest_idle_cpu; + break; + } + + if (cpu_selected(lowest_cpu)) { + target_cpu = lowest_cpu; + break; + } + + } while (sg = sg->next, sg != sd->groups); + + if (cpu_selected(target_cpu)) + goto out; + + if (cpumask_weight(&overcap_idle_candidates)) { + if (cpumask_test_cpu(task_cpu(p), &overcap_idle_candidates)) + target_cpu = task_cpu(p); + else + target_cpu = overcap_cpu; + + goto out; + } + +out: + trace_ehmp_prefer_idle_cpu_select(p, target_cpu); + + return target_cpu; +} + +/********************************************************************** + * On-time migration * + **********************************************************************/ +static unsigned long up_threshold; +static unsigned long down_threshold; +static unsigned int min_residency_us; + +static ssize_t show_min_residency(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, 10, "%d\n", min_residency_us); +} + +static ssize_t store_min_residency(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + int input; + + if (!sscanf(buf, "%d", &input)) + return -EINVAL; + + input = input < 0 ? 
0 : input; + + min_residency_us = input; + + return count; +} + +static struct kobj_attribute min_residency_attr = +__ATTR(min_residency, 0644, show_min_residency, store_min_residency); + +static ssize_t show_up_threshold(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, 10, "%ld\n", up_threshold); +} + +static ssize_t store_up_threshold(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + long input; + + if (!sscanf(buf, "%ld", &input)) + return -EINVAL; + + input = input < 0 ? 0 : input; + input = input > 1024 ? 1024 : input; + + up_threshold = input; + + return count; +} + +static struct kobj_attribute up_threshold_attr = +__ATTR(up_threshold, 0644, show_up_threshold, store_up_threshold); + +static ssize_t show_down_threshold(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return snprintf(buf, 10, "%ld\n", down_threshold); +} + +static ssize_t store_down_threshold(struct kobject *kobj, + struct kobj_attribute *attr, const char *buf, + size_t count) +{ + long input; + + if (!sscanf(buf, "%ld", &input)) + return -EINVAL; + + input = input < 0 ? 0 : input; + input = input > 1024 ? 1024 : input; + + down_threshold = input; + + return count; +} + +static struct kobj_attribute down_threshold_attr = +__ATTR(down_threshold, 0644, show_down_threshold, store_down_threshold); + +#define ontime_flag(p) (ontime_of(p)->flags) +#define ontime_migration_time(p) (ontime_of(p)->avg.ontime_migration_time) +#define ontime_load_avg(p) (ontime_of(p)->avg.load_avg) + +static inline struct ontime_entity *ontime_of(struct task_struct *p) +{ + return &p->se.ontime; +} + +static inline void include_ontime_task(struct task_struct *p) +{ + ontime_flag(p) = ONTIME; + + /* Manage time based on clock task of boot cpu(cpu0) */ + ontime_migration_time(p) = cpu_rq(0)->clock_task; +} + +static inline void exclude_ontime_task(struct task_struct *p) +{ + ontime_migration_time(p) = 0; + ontime_flag(p) = NOT_ONTIME; +} + +static int +ontime_select_target_cpu(struct sched_group *sg, const struct cpumask *mask) +{ + int cpu; + int dest_cpu = -1; + unsigned int min_exit_latency = UINT_MAX; + struct cpuidle_state *idle; + + for_each_cpu_and(cpu, sched_group_cpus(sg), mask) { + if (!idle_cpu(cpu)) + continue; + + if (cpu_rq(cpu)->ontime_migrating) + continue; + + idle = idle_get_state(cpu_rq(cpu)); + if (!idle) + return cpu; + + if (idle && idle->exit_latency < min_exit_latency) { + min_exit_latency = idle->exit_latency; + dest_cpu = cpu; + } + } + + return dest_cpu; +} + +#define TASK_TRACK_COUNT 5 + +extern struct sched_entity *__pick_next_entity(struct sched_entity *se); +static struct task_struct * +ontime_pick_heavy_task(struct sched_entity *se, struct cpumask *dst_cpus, + int *boost_migration) +{ + struct task_struct *heaviest_task = NULL; + struct task_struct *p; + unsigned int max_util_avg = 0; + int task_count = 0; + int boosted = !!global_boost(); + + /* + * Since current task does not exist in entity list of cfs_rq, + * check first that current task is heavy. 
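+ * Then up to TASK_TRACK_COUNT entities on this runqueue are scanned and
+ * the task with the largest ontime load (above up_threshold unless
+ * globally boosted) that is allowed on dst_cpus is chosen; a
+ * schedtune.prefer_perf task is taken immediately as a boost migration.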
+ */ + if (boosted || ontime_load_avg(task_of(se)) >= up_threshold) { + heaviest_task = task_of(se); + max_util_avg = ontime_load_avg(task_of(se)); + if (boosted) + *boost_migration = 1; + } + + se = __pick_first_entity(se->cfs_rq); + while (se && task_count < TASK_TRACK_COUNT) { + /* Skip non-task entity */ + if (entity_is_cfs_rq(se)) + goto next_entity; + + p = task_of(se); + if (schedtune_prefer_perf(p)) { + heaviest_task = p; + *boost_migration = 1; + break; + } + + if (!boosted && ontime_load_avg(p) < up_threshold) + goto next_entity; + + if (ontime_load_avg(p) > max_util_avg && + cpumask_intersects(dst_cpus, tsk_cpus_allowed(p))) { + heaviest_task = p; + max_util_avg = ontime_load_avg(p); + *boost_migration = boosted; + } + +next_entity: + se = __pick_next_entity(se); + task_count++; + } + + return heaviest_task; +} + +void ontime_new_entity_load(struct task_struct *parent, struct sched_entity *se) +{ + struct ontime_entity *ontime; + + if (entity_is_cfs_rq(se)) + return; + + ontime = &se->ontime; + + ontime->avg.load_sum = ontime_of(parent)->avg.load_sum; + ontime->avg.load_avg = ontime_of(parent)->avg.load_avg; + ontime->avg.ontime_migration_time = 0; + ontime->avg.period_contrib = 1023; + ontime->flags = NOT_ONTIME; + + trace_ehmp_ontime_new_entity_load(task_of(se), &ontime->avg); +} + +/* Structure of ontime migration environment */ +struct ontime_env { + struct rq *dst_rq; + int dst_cpu; + struct rq *src_rq; + int src_cpu; + struct task_struct *target_task; + int boost_migration; +}; +DEFINE_PER_CPU(struct ontime_env, ontime_env); + +static int can_migrate(struct task_struct *p, struct ontime_env *env) +{ + if (!cpumask_test_cpu(env->dst_cpu, tsk_cpus_allowed(p))) + return 0; + + if (task_running(env->src_rq, p)) + return 0; + + return 1; +} + +static void move_task(struct task_struct *p, struct ontime_env *env) +{ + p->on_rq = TASK_ON_RQ_MIGRATING; + deactivate_task(env->src_rq, p, 0); + set_task_cpu(p, env->dst_cpu); + + activate_task(env->dst_rq, p, 0); + p->on_rq = TASK_ON_RQ_QUEUED; + check_preempt_curr(env->dst_rq, p, 0); +} + +static int move_specific_task(struct task_struct *target, struct ontime_env *env) +{ + struct task_struct *p, *n; + + list_for_each_entry_safe(p, n, &env->src_rq->cfs_tasks, se.group_node) { + if (!can_migrate(p, env)) + continue; + + if (p != target) + continue; + + move_task(p, env); + return 1; + } + + return 0; +} + +static int ontime_migration_cpu_stop(void *data) +{ + struct ontime_env *env = data; + struct rq *src_rq, *dst_rq; + int src_cpu, dst_cpu; + struct task_struct *p; + struct sched_domain *sd; + int boost_migration; + + /* Initialize environment data */ + src_rq = env->src_rq; + dst_rq = env->dst_rq = cpu_rq(env->dst_cpu); + src_cpu = env->src_cpu = env->src_rq->cpu; + dst_cpu = env->dst_cpu; + p = env->target_task; + boost_migration = env->boost_migration; + + raw_spin_lock_irq(&src_rq->lock); + + if (!(ontime_flag(p) & ONTIME_MIGRATING)) + goto out_unlock; + + if (p->exit_state) + goto out_unlock; + + if (unlikely(src_cpu != smp_processor_id())) + goto out_unlock; + + if (src_rq->nr_running <= 1) + goto out_unlock; + + if (src_rq != task_rq(p)) + goto out_unlock; + + BUG_ON(src_rq == dst_rq); + + double_lock_balance(src_rq, dst_rq); + + rcu_read_lock(); + for_each_domain(dst_cpu, sd) + if (cpumask_test_cpu(src_cpu, sched_domain_span(sd))) + break; + + if (likely(sd) && move_specific_task(p, env)) { + if (boost_migration) { + /* boost task is not classified as ontime task */ + exclude_ontime_task(p); + } else + 
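+				/* load-driven migration: tag the task ONTIME so
+				 * a later move back to LITTLE is resisted until
+				 * it cools down (see ontime_can_migration()) */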
include_ontime_task(p); + + rcu_read_unlock(); + double_unlock_balance(src_rq, dst_rq); + + trace_ehmp_ontime_migration(p, ontime_of(p)->avg.load_avg, + src_cpu, dst_cpu, boost_migration); + goto success_unlock; + } + + rcu_read_unlock(); + double_unlock_balance(src_rq, dst_rq); + +out_unlock: + exclude_ontime_task(p); + +success_unlock: + src_rq->active_balance = 0; + dst_rq->ontime_migrating = 0; + + raw_spin_unlock_irq(&src_rq->lock); + put_task_struct(p); + + return 0; +} + +DEFINE_PER_CPU(struct cpu_stop_work, ontime_migration_work); + +static DEFINE_SPINLOCK(om_lock); + +void ontime_migration(void) +{ + struct sched_domain *sd; + struct sched_group *src_sg, *dst_sg; + int cpu; + + if (!spin_trylock(&om_lock)) + return; + + rcu_read_lock(); + + sd = rcu_dereference(per_cpu(sd_ea, 0)); + if (!sd) + goto ontime_migration_exit; + + src_sg = sd->groups; + + do { + dst_sg = src_sg->next; + for_each_cpu_and(cpu, sched_group_cpus(src_sg), cpu_active_mask) { + unsigned long flags; + struct rq *rq; + struct sched_entity *se; + struct task_struct *p; + int dst_cpu; + struct ontime_env *env = &per_cpu(ontime_env, cpu); + int boost_migration = 0; + + rq = cpu_rq(cpu); + raw_spin_lock_irqsave(&rq->lock, flags); + + /* + * Ontime migration is not performed when active balance + * is in progress. + */ + if (rq->active_balance) { + raw_spin_unlock_irqrestore(&rq->lock, flags); + continue; + } + + /* + * No need to migration if source cpu does not have cfs + * tasks. + */ + if (!rq->cfs.curr) { + raw_spin_unlock_irqrestore(&rq->lock, flags); + continue; + } + + se = rq->cfs.curr; + + /* Find task entity if entity is cfs_rq. */ + if (entity_is_cfs_rq(se)) { + struct cfs_rq *cfs_rq; + + cfs_rq = se->my_q; + while (cfs_rq) { + se = cfs_rq->curr; + cfs_rq = se->my_q; + } + } + + /* + * Select cpu to migrate the task to. Return negative number + * if there is no idle cpu in sg. + */ + dst_cpu = ontime_select_target_cpu(dst_sg, cpu_active_mask); + if (dst_cpu < 0) { + raw_spin_unlock_irqrestore(&rq->lock, flags); + continue; + } + + /* + * Pick task to be migrated. Return NULL if there is no + * heavy task in rq. 
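+			 * If the task is picked for boosting rather than for
+			 * its load, boost_migration is set and the task is not
+			 * tagged ONTIME after the move.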
+ */ + p = ontime_pick_heavy_task(se, sched_group_cpus(dst_sg), + &boost_migration); + if (!p) { + raw_spin_unlock_irqrestore(&rq->lock, flags); + continue; + } + + ontime_flag(p) = ONTIME_MIGRATING; + get_task_struct(p); + + /* Set environment data */ + env->dst_cpu = dst_cpu; + env->src_rq = rq; + env->target_task = p; + env->boost_migration = boost_migration; + + /* Prevent active balance to use stopper for migration */ + rq->active_balance = 1; + + cpu_rq(dst_cpu)->ontime_migrating = 1; + + raw_spin_unlock_irqrestore(&rq->lock, flags); + + /* Migrate task through stopper */ + stop_one_cpu_nowait(cpu, + ontime_migration_cpu_stop, env, + &per_cpu(ontime_migration_work, cpu)); + } + } while (src_sg = src_sg->next, src_sg->next != sd->groups); + +ontime_migration_exit: + rcu_read_unlock(); + spin_unlock(&om_lock); +} + +int ontime_can_migration(struct task_struct *p, int dst_cpu) +{ + u64 delta; + + if (ontime_flag(p) & NOT_ONTIME) { + trace_ehmp_ontime_check_migrate(p, dst_cpu, true, "not ontime"); + return true; + } + + if (ontime_flag(p) & ONTIME_MIGRATING) { + trace_ehmp_ontime_check_migrate(p, dst_cpu, false, "migrating"); + return false; + } + + if (cpumask_test_cpu(dst_cpu, cpu_coregroup_mask(maxcap_cpu))) { + trace_ehmp_ontime_check_migrate(p, dst_cpu, true, "ontime on big"); + return true; + } + + /* + * At this point, task is "ontime task" and running on big + * and load balancer is trying to migrate task to LITTLE. + */ + delta = cpu_rq(0)->clock_task - ontime_migration_time(p); + delta = delta >> 10; + if (delta <= min_residency_us) { + trace_ehmp_ontime_check_migrate(p, dst_cpu, false, "min residency"); + return false; + } + + if (cpu_rq(task_cpu(p))->nr_running > 1) { + trace_ehmp_ontime_check_migrate(p, dst_cpu, true, "big is busy"); + goto release; + } + + if (ontime_load_avg(p) >= down_threshold) { + trace_ehmp_ontime_check_migrate(p, dst_cpu, false, "heavy task"); + return false; + } + + trace_ehmp_ontime_check_migrate(p, dst_cpu, true, "ontime_release"); +release: + exclude_ontime_task(p); + + return true; +} + +static int ontime_task_wakeup(struct task_struct *p) +{ + struct sched_domain *sd; + u64 delta; + int target_cpu = -1; + + if (ontime_flag(p) & NOT_ONTIME) + if (ontime_load_avg(p) < up_threshold) + return -1; + + if (ontime_flag(p) & ONTIME) { + delta = cpu_rq(0)->clock_task - ontime_migration_time(p); + delta = delta >> 10; + + if (delta > min_residency_us && + ontime_load_avg(p) < down_threshold) { + exclude_ontime_task(p); + return -1; + } + + if (idle_cpu(task_cpu(p))) + return task_cpu(p); + } + + /* caller must hold rcu for sched domain */ + sd = rcu_dereference(per_cpu(sd_ea, maxcap_cpu)); + if (!sd) + return -1; + + target_cpu = ontime_select_target_cpu(sd->groups, tsk_cpus_allowed(p)); + if (cpu_selected(target_cpu)) { + if (ontime_flag(p) & NOT_ONTIME) + include_ontime_task(p); + } else { + if (ontime_flag(p) & ONTIME) + exclude_ontime_task(p); + } + + return target_cpu; +} + +static void ontime_update_next_balance(int cpu, struct ontime_avg *oa) +{ + if (cpumask_test_cpu(cpu, cpu_coregroup_mask(maxcap_cpu))) + return; + + if (oa->load_avg < up_threshold) + return; + + /* + * Update the next_balance of this cpu because tick is most likely + * to occur first in currently running cpu. 
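+	 * Setting it to the current jiffies makes that tick kick a rebalance
+	 * immediately instead of waiting for the normal balance interval.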
+ */ + cpu_rq(smp_processor_id())->next_balance = jiffies; +} + +#define cap_scale(v, s) ((v)*(s) >> SCHED_CAPACITY_SHIFT) + +extern u64 decay_load(u64 val, u64 n); + +static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3) +{ + u32 c1, c2, c3 = d3; + + c1 = decay_load((u64)d1, periods); + c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, periods) - 1024; + + return c1 + c2 + c3; +} + +/* + * ontime_update_load_avg : load tracking for ontime-migration + * + * @sa : sched_avg to be updated + * @delta : elapsed time since last update + * @period_contrib : amount already accumulated against our next period + * @scale_freq : scale vector of cpu frequency + * @scale_cpu : scale vector of cpu capacity + */ +void ontime_update_load_avg(u64 delta, int cpu, unsigned long weight, struct sched_avg *sa) +{ + struct ontime_avg *oa = &se_of(sa)->ontime.avg; + unsigned long scale_freq, scale_cpu; + u32 contrib = (u32)delta; /* p == 0 -> delta < 1024 */ + u64 periods; + + scale_freq = arch_scale_freq_capacity(NULL, cpu); + scale_cpu = arch_scale_cpu_capacity(NULL, cpu); + + delta += oa->period_contrib; + periods = delta / 1024; /* A period is 1024us (~1ms) */ + + if (periods) { + oa->load_sum = decay_load(oa->load_sum, periods); + + delta %= 1024; + contrib = __accumulate_pelt_segments(periods, + 1024 - oa->period_contrib, delta); + } + oa->period_contrib = delta; + + if (weight) { + contrib = cap_scale(contrib, scale_freq); + oa->load_sum += contrib * scale_cpu; + } + + if (!periods) + return; + + oa->load_avg = div_u64(oa->load_sum, LOAD_AVG_MAX - 1024 + oa->period_contrib); + ontime_update_next_balance(cpu, oa); +} + +void ontime_trace_task_info(struct task_struct *p) +{ + trace_ehmp_ontime_load_avg_task(p, &ontime_of(p)->avg, ontime_flag(p)); +} + +static inline unsigned long mincap_of(int cpu) +{ + return sge_array[cpu][SD_LEVEL0]->cap_states[0].cap; +} + +static int __init init_ontime(void) +{ + struct device_node *dn; + u32 prop; + + dn = get_ehmp_node(); + if (!dn) + return 0; + + /* + * Initilize default values: + * up_threshold = 40% of LITTLE maximum capacity + * down_threshold = 50% of big minimum capacity + * min_residency = 8ms + */ + up_threshold = capacity_orig_of(0) * 40 / 100; + down_threshold = mincap_of(maxcap_cpu) * 50 / 100; + min_residency_us = 8192; + + of_property_read_u32(dn, "up-threshold", &prop); + up_threshold = prop; + + of_property_read_u32(dn, "down-threshold", &prop); + down_threshold = prop; + + of_property_read_u32(dn, "min-residency-us", &prop); + min_residency_us = prop; + + return 0; +} +pure_initcall(init_ontime); + +/********************************************************************** + * cpu selection * + **********************************************************************/ +extern unsigned long boosted_task_util(struct task_struct *task); +extern unsigned long capacity_curr_of(int cpu); +extern struct energy_env *get_eenv(struct task_struct *p, int prev_cpu); +extern int select_energy_cpu_idx(struct energy_env *eenv); +extern int find_best_target(struct task_struct *p, int *backup_cpu, + bool boosted, bool prefer_idle); + +#define EAS_CPU_PRV 0 +#define EAS_CPU_NXT 1 +#define EAS_CPU_BKP 2 + +static int select_energy_cpu(struct sched_domain *sd, struct task_struct *p, + int prev_cpu, bool boosted) +{ + struct energy_env *eenv; + int energy_cpu = -1; + + eenv = get_eenv(p, prev_cpu); + if (eenv->max_cpu_count < 2) + return energy_cpu; + + eenv->max_cpu_count = EAS_CPU_BKP + 1; + + /* Find a cpu with sufficient capacity */ + eenv->cpu[EAS_CPU_NXT].cpu_id 
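+	/* find_best_target() picks the next candidate cpu and may also fill
+	 * in a backup; either slot is -1 when nothing suitable is found */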
= find_best_target(p, + &eenv->cpu[EAS_CPU_BKP].cpu_id, boosted, 0); + + /* take note if no backup was found */ + if (eenv->cpu[EAS_CPU_BKP].cpu_id < 0) + eenv->max_cpu_count = EAS_CPU_BKP; + + /* take note if no target was found */ + if (eenv->cpu[EAS_CPU_NXT].cpu_id < 0) + eenv->max_cpu_count = EAS_CPU_NXT; + + if (eenv->max_cpu_count == EAS_CPU_NXT) { + /* + * we did not find any energy-awareness + * candidates beyond prev_cpu, so we will + * fall-back to the regular slow-path. + */ + return energy_cpu; + } + + /* find most energy-efficient CPU */ + energy_cpu = select_energy_cpu_idx(eenv) < 0 ? -1 : + eenv->cpu[eenv->next_idx].cpu_id; + + return energy_cpu; +} + +int exynos_select_cpu(struct task_struct *p, int prev_cpu, int sync, int sd_flag) +{ + struct sched_domain *sd, *prev_sd; + int target_cpu = -1; + bool boosted, prefer_idle; + unsigned long min_util; + struct boost_trigger trigger = { + .trigger = 0, + .boost_val = 0 + }; + + rcu_read_lock(); + + target_cpu = ontime_task_wakeup(p); + if (cpu_selected(target_cpu)) + goto unlock; + + /* Find target cpu from lowest capacity domain(cpu0) */ + sd = rcu_dereference(per_cpu(sd_ea, 0)); + if (!sd) + goto unlock; + + boosted = schedtune_task_boost(p) > 0; + prefer_idle = sched_feat(EAS_PREFER_IDLE) ? (schedtune_task_boost(p) > 0) : 0; + + min_util = boosted_task_util(p); + + if (check_boost_trigger(p, &trigger)) { + target_cpu = find_boost_target(sd, p, min_util, &trigger); + if (cpu_selected(target_cpu)) + goto unlock; + } + + if (sysctl_sched_sync_hint_enable && sync) { + int cpu = smp_processor_id(); + + if (cpumask_test_cpu(cpu, tsk_cpus_allowed(p))) { + target_cpu = cpu; + goto unlock; + } + } + + if (prefer_idle) { + target_cpu = find_prefer_idle_target(sd, p, min_util); + if (cpu_selected(target_cpu)) + goto unlock; + } + + prev_sd = rcu_dereference_sched(cpu_rq(prev_cpu)->sd); + if (sched_feat(ENERGY_AWARE) && sd_overutilized(sd)) + target_cpu = select_energy_cpu(sd, p, prev_cpu, boosted); + +unlock: + rcu_read_unlock(); + + return target_cpu; +} + +/********************************************************************** + * Sysfs * + **********************************************************************/ +static struct attribute *ehmp_attrs[] = { + &global_boost_attr.attr, + &min_residency_attr.attr, + &up_threshold_attr.attr, + &down_threshold_attr.attr, + &top_overutil_attr.attr, + &bot_overutil_attr.attr, + &prefer_perf_attr.attr, + NULL, +}; + +static const struct attribute_group ehmp_group = { + .attrs = ehmp_attrs, +}; + +static struct kobject *ehmp_kobj; + +static int __init init_sysfs(void) +{ + int ret; + + ehmp_kobj = kobject_create_and_add("ehmp", kernel_kobj); + ret = sysfs_create_group(ehmp_kobj, &ehmp_group); + + return 0; +} +late_initcall(init_sysfs); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index be08e5e6c12c..95b50a697842 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -610,7 +611,7 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) return rb_entry(left, struct sched_entity, run_node); } -static struct sched_entity *__pick_next_entity(struct sched_entity *se) +struct sched_entity *__pick_next_entity(struct sched_entity *se) { struct rb_node *next = rb_next(&se->run_node); @@ -756,6 +757,8 @@ void init_entity_runnable_average(struct sched_entity *se) sa->load_avg = scale_load_down(se->load.weight); sa->load_sum = sa->load_avg * LOAD_AVG_MAX; /* when this task enqueue'ed, it will contribute to its cfs_rq's 
load_avg */ + + ontime_new_entity_load(current, se); } static inline u64 cfs_rq_clock_task(struct cfs_rq *cfs_rq); @@ -794,6 +797,11 @@ void post_init_entity_util_avg(struct sched_entity *se) long cpu_scale = arch_scale_cpu_capacity(NULL, cpu_of(rq_of(cfs_rq))); long cap = (long)(cpu_scale - cfs_rq->avg.util_avg) / 2; + if (sched_feat(EXYNOS_HMP)) { + exynos_init_entity_util_avg(se); + goto util_init_done; + } + if (cap > 0) { if (cfs_rq->avg.util_avg != 0) { sa->util_avg = cfs_rq->avg.util_avg * se->load.weight; @@ -807,6 +815,7 @@ void post_init_entity_util_avg(struct sched_entity *se) sa->util_sum = sa->util_avg * LOAD_AVG_MAX; } +util_init_done: if (entity_is_task(se)) { struct task_struct *p = task_of(se); if (p->sched_class != &fair_sched_class) { @@ -2858,7 +2867,7 @@ static inline void cfs_rq_util_change(struct cfs_rq *cfs_rq) * Approximate: * val * y^n, where y^32 ~= 0.5 (~1 scheduling period) */ -static u64 decay_load(u64 val, u64 n) +u64 decay_load(u64 val, u64 n) { unsigned int local_n; @@ -3052,6 +3061,9 @@ ___update_load_avg(u64 now, int cpu, struct sched_avg *sa, if (!weight) running = 0; + if (!cfs_rq && !rt_rq) + ontime_update_load_avg(delta, cpu, weight, sa); + /* * Now we know we crossed measurement unit boundaries. The *_avg * accrues by two steps: @@ -3523,6 +3535,9 @@ static inline void update_load_avg(struct sched_entity *se, int flags) if (decayed && (flags & UPDATE_TG)) update_tg_load_avg(cfs_rq, 0); + + if (entity_is_task(se)) + ontime_trace_task_info(task_of(se)); } /** @@ -5773,85 +5788,6 @@ static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) * Hence - be careful when enabling DEBUG_EENV_DECISIONS * expecially if WALT is the task signal. */ -/*#define DEBUG_EENV_DECISIONS*/ - -#ifdef DEBUG_EENV_DECISIONS -/* max of 8 levels of sched groups traversed */ -#define EAS_EENV_DEBUG_LEVELS 16 - -struct _eenv_debug { - unsigned long cap; - unsigned long norm_util; - unsigned long cap_energy; - unsigned long idle_energy; - unsigned long this_energy; - unsigned long this_busy_energy; - unsigned long this_idle_energy; - cpumask_t group_cpumask; - unsigned long cpu_util[1]; -}; -#endif - -struct eenv_cpu { - /* CPU ID, must be in cpus_mask */ - int cpu_id; - - /* - * Index (into sched_group_energy::cap_states) of the OPP the - * CPU needs to run at if the task is placed on it. - * This includes the both active and blocked load, due to - * other tasks on this CPU, as well as the task's own - * utilization. - */ - int cap_idx; - int cap; - - /* Estimated system energy */ - unsigned long energy; - - /* Estimated energy variation wrt EAS_CPU_PRV */ - long nrg_delta; - -#ifdef DEBUG_EENV_DECISIONS - struct _eenv_debug *debug; - int debug_idx; -#endif /* DEBUG_EENV_DECISIONS */ -}; - -struct energy_env { - /* Utilization to move */ - struct task_struct *p; - unsigned long util_delta; - unsigned long util_delta_boosted; - - /* Mask of CPUs candidates to evaluate */ - cpumask_t cpus_mask; - - /* CPU candidates to evaluate */ - struct eenv_cpu *cpu; - int eenv_cpu_count; - -#ifdef DEBUG_EENV_DECISIONS - /* pointer to the memory block reserved - * for debug on this CPU - there will be - * sizeof(struct _eenv_debug) * - * (EAS_CPU_CNT * EAS_EENV_DEBUG_LEVELS) - * bytes allocated here. 
- */ - struct _eenv_debug *debug; -#endif - /* - * Index (into energy_env::cpu) of the morst energy efficient CPU for - * the specified energy_env::task - */ - int next_idx; - int max_cpu_count; - - /* Support data */ - struct sched_group *sg_top; - struct sched_group *sg_cap; - struct sched_group *sg; -}; /** * Amount of capacity of a CPU that is (estimated to be) used by CFS tasks @@ -6466,7 +6402,7 @@ static void dump_eenv_debug(struct energy_env *eenv) * A value greater than zero means that the most energy efficient CPU is the * one represented by eenv->cpu[eenv->next_idx].cpu_id. */ -static inline int select_energy_cpu_idx(struct energy_env *eenv) +int select_energy_cpu_idx(struct energy_env *eenv) { int last_cpu_idx = eenv->max_cpu_count - 1; struct sched_domain *sd; @@ -6745,7 +6681,7 @@ boosted_cpu_util(int cpu) return util + margin; } -static inline unsigned long +unsigned long boosted_task_util(struct task_struct *task) { unsigned long util = task_util_est(task); @@ -7299,7 +7235,7 @@ static int start_cpu(bool boosted) return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; } -static inline int find_best_target(struct task_struct *p, int *backup_cpu, +int find_best_target(struct task_struct *p, int *backup_cpu, bool boosted, bool prefer_idle) { unsigned long min_util = boosted_task_util(p); @@ -7743,7 +7679,7 @@ static inline void reset_eenv(struct energy_env *eenv) * filled in here. Callers are responsible for adding * other CPU candidates up to eenv->max_cpu_count. */ -static inline struct energy_env *get_eenv(struct task_struct *p, int prev_cpu) +struct energy_env *get_eenv(struct task_struct *p, int prev_cpu) { struct energy_env *eenv; cpumask_t cpumask_possible_cpus; @@ -7970,6 +7906,14 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f cpumask_test_cpu(cpu, &p->cpus_allowed); } + if (sched_feat(EXYNOS_HMP)) { + int selected_cpu; + + selected_cpu = exynos_select_cpu(p, prev_cpu, sync, sd_flag); + if (selected_cpu >= 0) + return selected_cpu; + } + for_each_domain(cpu, tmp) { if (!(tmp->flags & SD_LOAD_BALANCE)) break; @@ -8720,6 +8664,11 @@ static inline int migrate_degrades_locality(struct task_struct *p, } #endif +static inline bool smaller_cpu_capacity(int cpu, int ref) +{ + return capacity_orig_of(cpu) < capacity_orig_of(ref); +} + /* * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? */ @@ -8732,11 +8681,21 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) /* * We do not migrate tasks that are: + * 0) cannot be migrated to smaller capacity cpu due to schedtune.prefer_perf, or * 1) throttled_lb_pair, or * 2) cannot be migrated to this CPU due to cpus_allowed, or * 3) running (obviously), or * 4) are cache-hot on their current CPU. */ + if (!ontime_can_migration(p, env->dst_cpu)) + return 0; + +#ifdef CONFIG_SCHED_TUNE + if (smaller_cpu_capacity(env->dst_cpu, env->src_cpu) && + schedtune_prefer_perf(p)) + return 0; +#endif + if (throttled_lb_pair(task_group(p), env->src_cpu, env->dst_cpu)) return 0; @@ -10301,6 +10260,9 @@ static int need_active_balance(struct lb_env *env) return 1; } + if (sched_feat(EXYNOS_HMP)) + return exynos_need_active_balance(env->idle, sd, env->src_cpu, env->dst_cpu); + /* * The dst_cpu is idle and the src_cpu CPU has only 1 CFS task. 
* It's worth migrating the task if the src_cpu's capacity is reduced @@ -11387,6 +11349,9 @@ static __latent_entropy void run_rebalance_domains(struct softirq_action *h) #else rebalance_domains(this_rq, idle); #endif + + ontime_migration(); + schedtune_group_util_update(); } /* diff --git a/kernel/sched/features.h b/kernel/sched/features.h index dbade300ef8c..ca512de98d61 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -119,6 +119,11 @@ SCHED_FEAT(EAS_PREFER_IDLE, true) SCHED_FEAT(FIND_BEST_TARGET, true) SCHED_FEAT(FBT_STRICT_ORDER, true) +#ifdef CONFIG_SCHED_EHMP +SCHED_FEAT(EXYNOS_HMP, true) +#else +SCHED_FEAT(EXYNOS_HMP, false) +#endif /* * Apply schedtune boost hold to tasks of all sched classes. * If enabled, schedtune will hold the boost applied to a CPU diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index a91c79cb112e..d6987ba1cfc6 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -856,6 +856,9 @@ struct rq { u64 cum_window_demand; #endif /* CONFIG_SCHED_WALT */ +#ifdef CONFIG_SCHED_EHMP + bool ontime_migrating; +#endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING u64 prev_irq_time; @@ -2160,6 +2163,85 @@ extern void nohz_balance_exit_idle(unsigned int cpu); static inline void nohz_balance_exit_idle(unsigned int cpu) { } #endif +/*#define DEBUG_EENV_DECISIONS*/ + +#ifdef DEBUG_EENV_DECISIONS +/* max of 8 levels of sched groups traversed */ +#define EAS_EENV_DEBUG_LEVELS 16 + +struct _eenv_debug { + unsigned long cap; + unsigned long norm_util; + unsigned long cap_energy; + unsigned long idle_energy; + unsigned long this_energy; + unsigned long this_busy_energy; + unsigned long this_idle_energy; + cpumask_t group_cpumask; + unsigned long cpu_util[1]; +}; +#endif + +struct eenv_cpu { + /* CPU ID, must be in cpus_mask */ + int cpu_id; + + /* + * Index (into sched_group_energy::cap_states) of the OPP the + * CPU needs to run at if the task is placed on it. + * This includes the both active and blocked load, due to + * other tasks on this CPU, as well as the task's own + * utilization. + */ + int cap_idx; + int cap; + + /* Estimated system energy */ + unsigned long energy; + + /* Estimated energy variation wrt EAS_CPU_PRV */ + long nrg_delta; + +#ifdef DEBUG_EENV_DECISIONS + struct _eenv_debug *debug; + int debug_idx; +#endif /* DEBUG_EENV_DECISIONS */ +}; + +struct energy_env { + /* Utilization to move */ + struct task_struct *p; + unsigned long util_delta; + unsigned long util_delta_boosted; + + /* Mask of CPUs candidates to evaluate */ + cpumask_t cpus_mask; + + /* CPU candidates to evaluate */ + struct eenv_cpu *cpu; + int eenv_cpu_count; + +#ifdef DEBUG_EENV_DECISIONS + /* pointer to the memory block reserved + * for debug on this CPU - there will be + * sizeof(struct _eenv_debug) * + * (EAS_CPU_CNT * EAS_EENV_DEBUG_LEVELS) + * bytes allocated here. 
+ */ + struct _eenv_debug *debug; +#endif + /* + * Index (into energy_env::cpu) of the morst energy efficient CPU for + * the specified energy_env::task + */ + int next_idx; + int max_cpu_count; + + /* Support data */ + struct sched_group *sg_top; + struct sched_group *sg_cap; + struct sched_group *sg; +}; #ifdef CONFIG_SMP diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 74a45606dc8c..534ee933ceb6 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -17,6 +18,52 @@ extern struct reciprocal_value schedtune_spc_rdiv; /* We hold schedtune boost in effect for at least this long */ #define SCHEDTUNE_BOOST_HOLD_NS 50000000ULL +static int perf_threshold = 0; + +int schedtune_perf_threshold(void) +{ + return perf_threshold + 1; +} + +struct group_balancer { + /* sum of task utilization in group */ + unsigned long util; + + /* group balancing threshold */ + unsigned long threshold; + + /* imbalance ratio by heaviest task */ + unsigned int imbalance_ratio; + + /* balance ratio by heaviest task */ + unsigned int balance_ratio; + + /* heaviest task utilization in group */ + unsigned long heaviest_util; + + /* group utilization update interval */ + unsigned long update_interval; + + /* next group utilization update time */ + unsigned long next_update_time; + + /* + * group imbalance time = imbalance_count * update_interval + * imbalance_count >= imbalance_duration -> need balance + */ + unsigned int imbalance_duration; + unsigned int imbalance_count; + + /* utilization tracking window size */ + unsigned long window; + + /* group balancer locking */ + raw_spinlock_t lock; + + /* need group balancing? */ + bool need_balance; +}; + /* * EAS scheduler tunables for task groups. */ @@ -35,6 +82,13 @@ struct schedtune { /* Hint to bias scheduling of tasks on that SchedTune CGroup * towards idle CPUs */ int prefer_idle; + + /* Hint to bias scheduling of tasks on that SchedTune CGroup + * towards high performance CPUs */ + int prefer_perf; + + /* SchedTune group balancer */ + struct group_balancer gb; }; static inline struct schedtune *css_st(struct cgroup_subsys_state *css) @@ -65,6 +119,7 @@ static struct schedtune root_schedtune = { .boost = 0, .prefer_idle = 0, + .prefer_perf = 0, }; /* @@ -442,6 +497,337 @@ int schedtune_prefer_idle(struct task_struct *p) return prefer_idle; } +#ifdef CONFIG_SCHED_EHMP +static atomic_t kernel_prefer_perf_req[BOOSTGROUPS_COUNT]; +int kernel_prefer_perf(int grp_idx) +{ + if (grp_idx >= BOOSTGROUPS_COUNT) + return -EINVAL; + + return atomic_read(&kernel_prefer_perf_req[grp_idx]); +} + +void request_kernel_prefer_perf(int grp_idx, int enable) +{ + if (grp_idx >= BOOSTGROUPS_COUNT) + return; + + if (enable) + atomic_inc(&kernel_prefer_perf_req[grp_idx]); + else + BUG_ON(atomic_dec_return(&kernel_prefer_perf_req[grp_idx]) < 0); +} +#else +static inline int kernel_prefer_perf(int grp_idx) { return 0; } +#endif + +int schedtune_prefer_perf(struct task_struct *p) +{ + struct schedtune *st; + int prefer_perf; + + if (unlikely(!schedtune_initialized)) + return 0; + + /* Get prefer_perf value */ + rcu_read_lock(); + st = task_schedtune(p); + prefer_perf = max(st->prefer_perf, kernel_prefer_perf(st->idx)); + rcu_read_unlock(); + + return prefer_perf; +} + +int schedtune_need_group_balance(struct task_struct *p) +{ + bool balance; + + if (unlikely(!schedtune_initialized)) + return 0; + + rcu_read_lock(); + balance = task_schedtune(p)->gb.need_balance; + rcu_read_unlock(); + + return balance; +} + +static 
inline void +check_need_group_balance(int group_idx, struct group_balancer *gb) +{ + int heaviest_ratio; + + if (!gb->util) { + gb->imbalance_count = 0; + gb->need_balance = false; + + goto out; + } + + heaviest_ratio = gb->heaviest_util * 100 / gb->util; + + if (gb->need_balance) { + if (gb->util < gb->threshold || heaviest_ratio < gb->balance_ratio) { + gb->imbalance_count = 0; + gb->need_balance = false; + } + + goto out; + } + + if (gb->util >= gb->threshold && heaviest_ratio > gb->imbalance_ratio) { + gb->imbalance_count++; + + if (gb->imbalance_count >= gb->imbalance_duration) + gb->need_balance = true; + } else { + gb->imbalance_count = 0; + } + +out: + trace_sched_tune_check_group_balance(group_idx, + gb->imbalance_count, gb->need_balance); +} + +static void __schedtune_group_util_update(struct schedtune *st) +{ + struct group_balancer *gb = &st->gb; + unsigned long now = cpu_rq(0)->clock_task; + struct css_task_iter it; + struct task_struct *p; + struct task_struct *heaviest_p = NULL; + unsigned long util_sum = 0; + unsigned long heaviest_util = 0; + unsigned int total = 0, accumulated = 0; + + if (!raw_spin_trylock(&gb->lock)) + return; + + if (!gb->update_interval) + goto out; + + if (time_before(now, gb->next_update_time)) + goto out; + + css_task_iter_start(&st->css, 0, &it); + while ((p = css_task_iter_next(&it))) { + unsigned long clock_task, delta, util; + + total++; + + clock_task = task_rq(p)->clock_task; + delta = clock_task - p->se.avg.last_update_time; + if (p->se.avg.last_update_time && delta > gb->window) + continue; + + util = p->se.avg.util_avg; + if (util > heaviest_util) { + heaviest_util = util; + heaviest_p = p; + } + + util_sum += p->se.avg.util_avg; + accumulated++; + } + css_task_iter_end(&it); + + gb->util = util_sum; + gb->heaviest_util = heaviest_util; + gb->next_update_time = now + gb->update_interval; + + /* if there is no task in group, heaviest_p is always NULL */ + if (heaviest_p) + trace_sched_tune_grouputil_update(st->idx, total, accumulated, + gb->util, heaviest_p, gb->heaviest_util); + + check_need_group_balance(st->idx, gb); +out: + raw_spin_unlock(&gb->lock); +} + +void schedtune_group_util_update(void) +{ + int idx; + + if (unlikely(!schedtune_initialized)) + return; + + rcu_read_lock(); + + for (idx = 1; idx < BOOSTGROUPS_COUNT; idx++) { + struct schedtune *st = allocated_group[idx]; + + if (!st) + continue; + __schedtune_group_util_update(st); + } + + rcu_read_unlock(); +} + +static u64 +gb_util_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->gb.util; +} + +static u64 +gb_heaviest_ratio_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + if (!st->gb.util) + return 0; + + return st->gb.heaviest_util * 100 / st->gb.util; +} + +static u64 +gb_threshold_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->gb.threshold; +} + +static int +gb_threshold_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 threshold) +{ + struct schedtune *st = css_st(css); + struct group_balancer *gb = &st->gb; + + raw_spin_lock(&gb->lock); + gb->threshold = threshold; + check_need_group_balance(st->idx, gb); + raw_spin_unlock(&gb->lock); + + return 0; +} + +static u64 +gb_imbalance_ratio_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->gb.imbalance_ratio; +} + +static int +gb_imbalance_ratio_write(struct 
cgroup_subsys_state *css, struct cftype *cft, + u64 ratio) +{ + struct schedtune *st = css_st(css); + struct group_balancer *gb = &st->gb; + + ratio = min_t(u64, ratio, 100); + + raw_spin_lock(&gb->lock); + gb->imbalance_ratio = ratio; + check_need_group_balance(st->idx, gb); + raw_spin_unlock(&gb->lock); + + return 0; +} + +static u64 +gb_balance_ratio_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->gb.balance_ratio; +} + +static int +gb_balance_ratio_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 ratio) +{ + struct schedtune *st = css_st(css); + struct group_balancer *gb = &st->gb; + + ratio = min_t(u64, ratio, 100); + + raw_spin_lock(&gb->lock); + gb->balance_ratio = ratio; + check_need_group_balance(st->idx, gb); + raw_spin_unlock(&gb->lock); + + return 0; +} + +static u64 +gb_interval_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->gb.update_interval / NSEC_PER_USEC; +} + +static int +gb_interval_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 interval_us) +{ + struct schedtune *st = css_st(css); + struct group_balancer *gb = &st->gb; + + raw_spin_lock(&gb->lock); + gb->update_interval = interval_us * NSEC_PER_USEC; + if (!interval_us) { + gb->util = 0; + gb->need_balance = false; + } + raw_spin_unlock(&gb->lock); + + return 0; +} + +static u64 +gb_duration_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->gb.imbalance_duration; +} + +static int +gb_duration_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 duration) +{ + struct schedtune *st = css_st(css); + struct group_balancer *gb = &st->gb; + + raw_spin_lock(&gb->lock); + gb->imbalance_duration = duration; + check_need_group_balance(st->idx, gb); + raw_spin_unlock(&gb->lock); + + return 0; +} + +static u64 +gb_window_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->gb.window / NSEC_PER_MSEC; +} + +static int +gb_window_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 window) +{ + struct schedtune *st = css_st(css); + struct group_balancer *gb = &st->gb; + + raw_spin_lock(&gb->lock); + gb->window = window * NSEC_PER_MSEC; + raw_spin_unlock(&gb->lock); + + return 0; +} + static u64 prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -460,6 +846,24 @@ prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft, return 0; } +static u64 +prefer_perf_read(struct cgroup_subsys_state *css, struct cftype *cft) +{ + struct schedtune *st = css_st(css); + + return st->prefer_perf; +} + +static int +prefer_perf_write(struct cgroup_subsys_state *css, struct cftype *cft, + u64 prefer_perf) +{ + struct schedtune *st = css_st(css); + st->prefer_perf = prefer_perf; + + return 0; +} + static s64 boost_read(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -496,6 +900,49 @@ static struct cftype files[] = { .read_u64 = prefer_idle_read, .write_u64 = prefer_idle_write, }, + { + .name = "prefer_perf", + .read_u64 = prefer_perf_read, + .write_u64 = prefer_perf_write, + }, + { + .name = "gb_util", + .read_u64 = gb_util_read, + }, + { + .name = "gb_heaviest_ratio", + .read_u64 = gb_heaviest_ratio_read, + }, + { + .name = "gb_threshold", + .read_u64 = gb_threshold_read, + .write_u64 = gb_threshold_write, + }, + { + .name = "gb_imbalance_ratio", + .read_u64 = gb_imbalance_ratio_read, + .write_u64 
= gb_imbalance_ratio_write, + }, + { + .name = "gb_balance_ratio", + .read_u64 = gb_balance_ratio_read, + .write_u64 = gb_balance_ratio_write, + }, + { + .name = "gb_interval_us", + .read_u64 = gb_interval_read, + .write_u64 = gb_interval_write, + }, + { + .name = "gb_duration", + .read_u64 = gb_duration_read, + .write_u64 = gb_duration_write, + }, + { + .name = "gb_window_ms", + .read_u64 = gb_window_read, + .write_u64 = gb_window_write, + }, { } /* terminate */ }; @@ -519,6 +966,22 @@ schedtune_boostgroup_init(struct schedtune *st) return 0; } +static void +schedtune_group_balancer_init(struct schedtune *st) +{ + raw_spin_lock_init(&st->gb.lock); + + st->gb.threshold = ULONG_MAX; + st->gb.imbalance_ratio = 0; /* 0% */ + st->gb.update_interval = 0; /* disable update */ + st->gb.next_update_time = cpu_rq(0)->clock_task; + + st->gb.imbalance_duration = 0; + st->gb.imbalance_count = 0; + + st->gb.window = 100 * NSEC_PER_MSEC; /* 100ms */ +} + static struct cgroup_subsys_state * schedtune_css_alloc(struct cgroup_subsys_state *parent_css) { @@ -548,6 +1011,8 @@ schedtune_css_alloc(struct cgroup_subsys_state *parent_css) if (!st) goto out; + schedtune_group_balancer_init(st); + /* Initialize per CPUs boost group support */ st->idx = idx; if (schedtune_boostgroup_init(st)) @@ -616,6 +1081,9 @@ schedtune_init(void) { schedtune_spc_rdiv = reciprocal_value(100); schedtune_init_cgroups(); + + perf_threshold = find_second_max_cap(); + return 0; } postcore_initcall(schedtune_init); diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h index e79e1b198921..1588ba24bff9 100644 --- a/kernel/sched/tune.h +++ b/kernel/sched/tune.h @@ -15,7 +15,13 @@ struct target_nrg { int schedtune_cpu_boost(int cpu); int schedtune_task_boost(struct task_struct *tsk); +void schedtune_group_util_update(void); +int schedtune_need_group_balance(struct task_struct *p); + +int schedtune_perf_threshold(void); + int schedtune_prefer_idle(struct task_struct *tsk); +int schedtune_prefer_perf(struct task_struct *tsk); void schedtune_enqueue_task(struct task_struct *p, int cpu); void schedtune_dequeue_task(struct task_struct *p, int cpu); @@ -25,7 +31,13 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu); #define schedtune_cpu_boost(cpu) 0 #define schedtune_task_boost(tsk) 0 +#define schedtune_group_util_update() do { } while (0) +#define schedtune_need_group_balance(task) 0 + +#define schedtune_perf_threshold() 0 + #define schedtune_prefer_idle(tsk) 0 +#define schedtune_prefer_perf(tsk) 0 #define schedtune_enqueue_task(task, cpu) do { } while (0) #define schedtune_dequeue_task(task, cpu) do { } while (0) -- 2.20.1
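
The order of checks in exynos_select_cpu() is the core of the EHMP wake-up path: ontime wake-up placement first, then boost-trigger targets, then the sync hint (only if the waker's CPU is allowed for the task), then prefer-idle, and finally the energy-aware candidates. A minimal userspace sketch of that priority chain; every model_*() helper is a hypothetical stand-in, and -1 means "no CPU selected here, fall through to the next policy".

/* Sketch only: stand-ins for the individual placement policies. */
#include <stdio.h>

static int cpu_selected(int cpu) { return cpu >= 0; }

static int model_ontime_wakeup(void)  { return -1; }
static int model_boost_trigger(void)  { return -1; }
static int model_sync_hint(void)      { return  2; } /* waker's CPU, if allowed */
static int model_prefer_idle(void)    { return -1; }
static int model_energy_aware(void)   { return  0; }

static int model_select_cpu(void)
{
	int cpu;

	if (cpu_selected(cpu = model_ontime_wakeup()))
		return cpu;
	if (cpu_selected(cpu = model_boost_trigger()))
		return cpu;
	if (cpu_selected(cpu = model_sync_hint()))
		return cpu;
	if (cpu_selected(cpu = model_prefer_idle()))
		return cpu;
	return model_energy_aware();
}

int main(void)
{
	printf("selected cpu: %d\n", model_select_cpu());
	return 0;
}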
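
decay_load(), made non-static above so the ontime load tracking can reuse it, scales a contribution by y^n with y^32 == 0.5, i.e. a sample loses half its weight every 32 scheduling periods. A small floating-point illustration of the same geometric decay; the kernel uses a fixed-point lookup table rather than pow().

/* Geometric decay model; compile with -lm. */
#include <math.h>
#include <stdio.h>

static unsigned long long model_decay_load(unsigned long long val, unsigned int n)
{
	const double y = pow(0.5, 1.0 / 32.0);	/* chosen so that y^32 == 0.5 */

	return (unsigned long long)(val * pow(y, (double)n));
}

int main(void)
{
	printf("1024 after  0 periods: %llu\n", model_decay_load(1024, 0));
	printf("1024 after 32 periods: %llu\n", model_decay_load(1024, 32));
	printf("1024 after 64 periods: %llu\n", model_decay_load(1024, 64));
	return 0;
}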
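
can_migrate_task() gains two early rejections in this patch: the ontime filter, and (under CONFIG_SCHED_TUNE) a refusal to move a prefer_perf task to a CPU with smaller original capacity than its current one. A standalone model of the capacity check; the capacity table and CPU numbering are made-up inputs, not values from the patch.

/* Model of the prefer_perf capacity filter in can_migrate_task(). */
#include <stdbool.h>
#include <stdio.h>

static const unsigned long capacity_orig[8] = {
	460, 460, 460, 460, 1024, 1024, 1024, 1024	/* assumed LITTLE/big split */
};

static bool smaller_cpu_capacity(int cpu, int ref)
{
	return capacity_orig[cpu] < capacity_orig[ref];
}

static bool may_migrate(bool prefer_perf, int src_cpu, int dst_cpu)
{
	if (prefer_perf && smaller_cpu_capacity(dst_cpu, src_cpu))
		return false;	/* keep performance-preferring tasks on big CPUs */
	return true;
}

int main(void)
{
	printf("prefer_perf task, big -> LITTLE: %s\n",
	       may_migrate(true, 4, 1) ? "allowed" : "rejected");
	printf("normal task,      big -> LITTLE: %s\n",
	       may_migrate(false, 4, 1) ? "allowed" : "rejected");
	return 0;
}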
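
The EXYNOS_HMP scheduler feature defaults to true when CONFIG_SCHED_EHMP is set and gates every point where EHMP policy overrides the generic path (util initialisation, CPU selection, need_active_balance). A sketch of that gating pattern under assumed placeholder decision functions; it models the structure, not the actual balance policy.

/* Compile-time default plus runtime flag, mirroring SCHED_FEAT(EXYNOS_HMP, ...). */
#include <stdbool.h>
#include <stdio.h>

#ifdef CONFIG_SCHED_EHMP
static bool feat_exynos_hmp = true;
#else
static bool feat_exynos_hmp = false;
#endif

/* Hypothetical stand-in for exynos_need_active_balance(). */
static int model_exynos_need_active_balance(int src_cpu, int dst_cpu)
{
	return src_cpu != dst_cpu;	/* placeholder policy */
}

/* Hypothetical stand-in for the generic need_active_balance() tail. */
static int model_generic_need_active_balance(void)
{
	return 0;
}

static int model_need_active_balance(int src_cpu, int dst_cpu)
{
	if (feat_exynos_hmp)
		return model_exynos_need_active_balance(src_cpu, dst_cpu);

	return model_generic_need_active_balance();
}

int main(void)
{
	printf("active balance (cpu4 -> cpu0): %d\n",
	       model_need_active_balance(4, 0));
	return 0;
}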
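
request_kernel_prefer_perf() lets kernel code reference-count a per-group performance preference, and schedtune_prefer_perf() reports the maximum of the cgroup knob and that kernel request. A userspace model of the same bookkeeping; the group count and index used here are assumptions, and the real counters are atomics with a BUG_ON guarding underflow.

/* Model of the kernel prefer_perf refcounting in tune.c. */
#include <stdio.h>

#define BOOSTGROUPS 5	/* assumed group count for this model */

static int kernel_prefer_perf_req[BOOSTGROUPS];	/* kernel-side refcounts */
static int cgroup_prefer_perf[BOOSTGROUPS];	/* per-group cgroup knob */

static void model_request_kernel_prefer_perf(int grp, int enable)
{
	if (grp < 0 || grp >= BOOSTGROUPS)
		return;
	kernel_prefer_perf_req[grp] += enable ? 1 : -1;	/* atomic in the kernel */
}

static int model_prefer_perf(int grp)
{
	int kernel = kernel_prefer_perf_req[grp];

	return cgroup_prefer_perf[grp] > kernel ? cgroup_prefer_perf[grp] : kernel;
}

int main(void)
{
	int grp = 3;	/* assumed index of a boosted group */

	model_request_kernel_prefer_perf(grp, 1);	/* e.g. a driver boosts the group */
	printf("prefer_perf(grp %d) = %d\n", grp, model_prefer_perf(grp));
	model_request_kernel_prefer_perf(grp, 0);	/* request released */
	printf("prefer_perf(grp %d) = %d\n", grp, model_prefer_perf(grp));
	return 0;
}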
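
check_need_group_balance() implements a hysteresis: a group is flagged for balancing only after its heaviest task has dominated the group utilization (above imbalance_ratio) while the group stays over its threshold for imbalance_duration consecutive updates, and the flag drops again once utilization or the heaviest-task share falls back under the balance limits. The same logic as a runnable userspace model with assumed tunable values.

/* Model of the group-balancer hysteresis in check_need_group_balance(). */
#include <stdbool.h>
#include <stdio.h>

struct gb_model {
	unsigned long util, heaviest_util, threshold;
	unsigned int imbalance_ratio, balance_ratio;
	unsigned int imbalance_duration, imbalance_count;
	bool need_balance;
};

static void check_balance(struct gb_model *gb)
{
	unsigned int heaviest_ratio;

	if (!gb->util) {
		gb->imbalance_count = 0;
		gb->need_balance = false;
		return;
	}

	heaviest_ratio = gb->heaviest_util * 100 / gb->util;

	if (gb->need_balance) {
		/* already balancing: clear only when the group calms down */
		if (gb->util < gb->threshold || heaviest_ratio < gb->balance_ratio) {
			gb->imbalance_count = 0;
			gb->need_balance = false;
		}
		return;
	}

	if (gb->util >= gb->threshold && heaviest_ratio > gb->imbalance_ratio) {
		if (++gb->imbalance_count >= gb->imbalance_duration)
			gb->need_balance = true;
	} else {
		gb->imbalance_count = 0;
	}
}

int main(void)
{
	struct gb_model gb = {
		.util = 800, .heaviest_util = 700, .threshold = 500,
		.imbalance_ratio = 80, .balance_ratio = 50,
		.imbalance_duration = 3,	/* assumed tunables */
	};

	for (int i = 0; i < 4; i++) {
		check_balance(&gb);
		printf("update %d: count=%u need_balance=%d\n",
		       i, gb.imbalance_count, gb.need_balance);
	}
	return 0;
}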
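
The gb_interval_us and gb_window_ms cgroup files keep their values internally in nanoseconds; writing 0 to the interval disables group-utilization updates and clears the balance state. A sketch of the unit handling only, without the cgroup plumbing or locking.

/* Unit conversion behind gb_interval_us and gb_window_ms. */
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL
#define NSEC_PER_MSEC 1000000ULL

struct gb_units {
	unsigned long long update_interval;	/* ns; 0 disables updates */
	unsigned long long window;		/* ns */
};

static void gb_interval_write_us(struct gb_units *gb, unsigned long long us)
{
	gb->update_interval = us * NSEC_PER_USEC;
}

static unsigned long long gb_interval_read_us(const struct gb_units *gb)
{
	return gb->update_interval / NSEC_PER_USEC;
}

static void gb_window_write_ms(struct gb_units *gb, unsigned long long ms)
{
	gb->window = ms * NSEC_PER_MSEC;
}

static unsigned long long gb_window_read_ms(const struct gb_units *gb)
{
	return gb->window / NSEC_PER_MSEC;
}

int main(void)
{
	struct gb_units gb = { 0, 0 };

	gb_interval_write_us(&gb, 16000);	/* e.g. 16 ms update interval */
	gb_window_write_ms(&gb, 100);		/* 100 ms tracking window (the init default) */
	printf("interval: %llu us, window: %llu ms\n",
	       gb_interval_read_us(&gb), gb_window_read_ms(&gb));
	return 0;
}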
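
schedtune_init() seeds perf_threshold from find_second_max_cap(), and schedtune_perf_threshold() reports that value plus one. Assuming find_second_max_cap() returns the second-largest per-CPU original capacity (i.e. the capacity of the second-fastest cluster), the following userspace illustration shows what that seeding amounts to; the capacity values are made up.

/* Illustration of the perf_threshold seeding in schedtune_init(). */
#include <stdio.h>

static int model_find_second_max_cap(const unsigned long *cap, int nr_cpus)
{
	unsigned long max = 0, second = 0;

	for (int i = 0; i < nr_cpus; i++) {
		if (cap[i] > max) {
			second = max;
			max = cap[i];
		} else if (cap[i] > second && cap[i] < max) {
			second = cap[i];
		}
	}
	return (int)second;
}

int main(void)
{
	/* assumed big.LITTLE capacities, not values from this patch */
	const unsigned long cap[8] = { 460, 460, 460, 460, 1024, 1024, 1024, 1024 };
	int perf_threshold = model_find_second_max_cap(cap, 8);

	printf("perf_threshold = %d, schedtune_perf_threshold() = %d\n",
	       perf_threshold, perf_threshold + 1);
	return 0;
}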