sched: ems: introduce task band
LineageOS/android_kernel_motorola_exynos9610.git: kernel/sched/ems/core.c
/*
 * Core Exynos Mobile Scheduler
 *
 * Copyright (C) 2018 Samsung Electronics Co., Ltd
 * Park Bumgyu <bumgyu.park@samsung.com>
 */

#include <linux/ems.h>

#define CREATE_TRACE_POINTS
#include <trace/events/ems.h>

#include "ems.h"
#include "../sched.h"

int task_util(struct task_struct *p)
{
        if (rt_task(p))
                return p->rt.avg.util_avg;
        else
                return p->se.avg.util_avg;
}

int cpu_util_wake(int cpu, struct task_struct *p)
{
        struct cfs_rq *cfs_rq;
        unsigned int util;

        /* Task has no contribution or is new */
        if (cpu != task_cpu(p) || !READ_ONCE(p->se.avg.last_update_time))
                return cpu_util(cpu);

        cfs_rq = &cpu_rq(cpu)->cfs;
        util = READ_ONCE(cfs_rq->avg.util_avg);

        /* Discount task's blocked util from CPU's util */
        util -= min_t(unsigned int, util, task_util_est(p));

        /*
         * Covered cases:
         *
         * a) if *p is the only task sleeping on this CPU, then:
         *      cpu_util (== task_util) > util_est (== 0)
         *    and thus we return:
         *      cpu_util_wake = (cpu_util - task_util) = 0
         *
         * b) if other tasks are SLEEPING on this CPU, which is now exiting
         *    IDLE, then:
         *      cpu_util >= task_util
         *      cpu_util > util_est (== 0)
         *    and thus we discount *p's blocked utilization to return:
         *      cpu_util_wake = (cpu_util - task_util) >= 0
         *
         * c) if other tasks are RUNNABLE on that CPU and
         *      util_est > cpu_util
         *    then we use util_est since it returns a more restrictive
         *    estimation of the spare capacity on that CPU, by just
         *    considering the expected utilization of tasks already
         *    runnable on that CPU.
         *
         * Cases a) and b) are covered by the above code, while case c) is
         * covered by the following code when estimated utilization is
         * enabled.
         */
        if (sched_feat(UTIL_EST))
                util = max(util, READ_ONCE(cfs_rq->avg.util_est.enqueued));

        /*
         * Utilization (estimated) can exceed the CPU capacity, thus let's
         * clamp to the maximum CPU capacity to ensure consistency with
         * the cpu_util call.
         */
        return min_t(unsigned long, util, capacity_orig_of(cpu));
}
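/*
 * Worked example for cpu_util_wake() above. The numbers are illustrative
 * assumptions, not values taken from this file: suppose the cfs_rq's
 * util_avg is 600, the waking task's task_util_est() is 250 and the rq's
 * util_est.enqueued is 400. The discount step leaves 600 - 250 = 350; with
 * UTIL_EST enabled, the max() raises the estimate back to 400, the expected
 * utilization of the tasks already runnable on that cpu, and the result is
 * finally clamped to capacity_orig_of(cpu).
 */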

static inline int
check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
{
        return ((rq->cpu_capacity * sd->imbalance_pct) <
                                (rq->cpu_capacity_orig * 100));
}
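/*
 * Worked example for check_cpu_capacity() above. The numbers are
 * illustrative assumptions, not values defined in this file: with
 * sd->imbalance_pct == 117 and cpu_capacity_orig == 1024, the check reads
 *
 *      cpu_capacity * 117 < 1024 * 100, i.e. cpu_capacity < ~875
 *
 * so it reports true once pressure from other scheduling classes has
 * reduced the cpu's remaining capacity below roughly 85% of its original
 * capacity.
 */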

#define lb_sd_parent(sd) \
        (sd->parent && sd->parent->groups != sd->parent->groups->next)

int exynos_need_active_balance(enum cpu_idle_type idle, struct sched_domain *sd,
                                        int src_cpu, int dst_cpu)
{
        unsigned int src_imb_pct = lb_sd_parent(sd) ? sd->imbalance_pct : 1;
        unsigned int dst_imb_pct = lb_sd_parent(sd) ? 100 : 1;
        unsigned long src_cap = capacity_of(src_cpu);
        unsigned long dst_cap = capacity_of(dst_cpu);
        int level = sd->level;

        /* dst_cpu is idle */
        if ((idle != CPU_NOT_IDLE) &&
            (cpu_rq(src_cpu)->cfs.h_nr_running == 1)) {
                if ((check_cpu_capacity(cpu_rq(src_cpu), sd)) &&
                    (src_cap * sd->imbalance_pct < dst_cap * 100)) {
                        return 1;
                }

                /* This domain is the top level and dst_cpu is bigger than src_cpu */
                if (!lb_sd_parent(sd) && src_cap < dst_cap)
                        if (lbt_overutilized(src_cpu, level) || global_boosted())
                                return 1;
        }

        if ((src_cap * src_imb_pct < dst_cap * dst_imb_pct) &&
                        cpu_rq(src_cpu)->cfs.h_nr_running == 1 &&
                        lbt_overutilized(src_cpu, level) &&
                        !lbt_overutilized(dst_cpu, level)) {
                return 1;
        }

        return unlikely(sd->nr_balance_failed > sd->cache_nice_tries + 2);
}
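/*
 * Illustrative call-site sketch for exynos_need_active_balance(). This is a
 * hypothetical fragment for documentation only; the real hook-up is done in
 * the fair-class load balancer, and the lb_env field names used here are
 * assumptions:
 *
 *      if (exynos_need_active_balance(env->idle, env->sd,
 *                                     env->src_cpu, env->dst_cpu))
 *              return 1;       (force an active migration attempt)
 */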

static int select_proper_cpu(struct task_struct *p, int prev_cpu)
{
        int cpu;
        unsigned long best_min_util = ULONG_MAX;
        int best_cpu = -1;

        for_each_possible_cpu(cpu) {
                int i;

                /* visit each coregroup only once */
                if (cpu != cpumask_first(cpu_coregroup_mask(cpu)))
                        continue;

                /* skip if task cannot be assigned to coregroup */
                if (!cpumask_intersects(&p->cpus_allowed, cpu_coregroup_mask(cpu)))
                        continue;

                for_each_cpu_and(i, tsk_cpus_allowed(p), cpu_coregroup_mask(cpu)) {
                        unsigned long capacity_orig = capacity_orig_of(i);
                        unsigned long wake_util, new_util;

                        wake_util = cpu_util_wake(i, p);
                        new_util = wake_util + task_util_est(p);

                        /* skip over-capacity cpu */
                        if (new_util > capacity_orig)
                                continue;

                        /*
                         * According to the criteria determined by the LBT
                         * (Load Balance Trigger), skip any cpu that would
                         * become over-utilized once the task is assigned
                         * to it.
                         */
                        if (lbt_bring_overutilize(i, p))
                                continue;

                        /*
                         * Best target) lowest utilization among lowest-capacity cpus
                         *
                         * If the sequence reaches this function, the wakeup
                         * task does not require performance and the prev cpu
                         * is over-utilized, so load balancing should be done
                         * without considering the energy side. Therefore,
                         * select the cpu with the smallest capacity and the
                         * least utilization among the cpus that fit the task.
                         */
                        if (best_min_util < new_util)
                                continue;

                        best_min_util = new_util;
                        best_cpu = i;
                }

                /*
                 * If no best cpu has been found in this coregroup, visit the
                 * next coregroup.
                 */
                if (cpu_selected(best_cpu))
                        break;
        }

        trace_ems_select_proper_cpu(p, best_cpu, best_min_util);

        /*
         * If it fails to find the best cpu, choosing any cpu is meaningless.
         * Return the prev cpu.
         */
        return cpu_selected(best_cpu) ? best_cpu : prev_cpu;
}

extern void sync_entity_load_avg(struct sched_entity *se);

int exynos_wakeup_balance(struct task_struct *p, int prev_cpu, int sd_flag, int sync)
{
        int target_cpu = -1;
        char state[30] = "fail";

        /*
         * A task's utilization was last accumulated before it went to sleep,
         * so update it here before deciding which cpu the task will be
         * assigned to. New tasks are excluded.
         */
        if (!(sd_flag & SD_BALANCE_FORK)) {
                unsigned long old_util = task_util(p);

                sync_entity_load_avg(&p->se);
                /* update the band if a large amount of task util has decayed */
                update_band(p, old_util);
        }

        /*
         * Priority 1 : ontime task
         *
         * If a task with more utilization than the threshold wakes up, the
         * task is classified as an "ontime task" and assigned to a
         * performance cpu. Conversely, if a heavy task that has been
         * classified as an ontime task sleeps for a long time and its
         * utilization becomes small, it is excluded from the ontime
         * classification and is no longer guaranteed to run on a
         * performance cpu.
         *
         * An ontime task is very performance sensitive because it is usually
         * the main task of an application. Therefore, it has the highest
         * priority.
         */
        target_cpu = ontime_task_wakeup(p);
        if (cpu_selected(target_cpu)) {
                strcpy(state, "ontime migration");
                goto out;
        }

        /*
         * Priority 2 : prefer-perf
         *
         * Prefer-perf is a function that operates on a cgroup basis managed
         * by schedtune. When prefer-perf is set to 1, the tasks in the group
         * are preferentially assigned to a performance cpu.
         *
         * It has a high priority because it is a function that is turned on
         * temporarily in scenarios requiring responsiveness (touch, app
         * launching).
         */
        target_cpu = prefer_perf_cpu(p);
        if (cpu_selected(target_cpu)) {
                strcpy(state, "prefer-perf");
                goto out;
        }

        /*
         * Priority 3 : task band
         *
         * The tasks in a process are likely to interact, and their operations
         * are sequential and share resources. Therefore, if these tasks are
         * packed and assigned to a specific cpu or cluster, the latency of
         * their interaction decreases and cache reusability increases,
         * thereby improving performance.
         *
         * The "task band" is a function that groups tasks on a per-process
         * basis and assigns them to a specific cpu or cluster. If the
         * schedtune cgroup attribute "band" is set to '1', task band
         * operates on that cgroup.
         */
        target_cpu = band_play_cpu(p);
        if (cpu_selected(target_cpu)) {
                strcpy(state, "task band");
                goto out;
        }

        /*
         * Priority 4 : global boosting
         *
         * Global boost is a function that preferentially assigns all tasks
         * in the system to a performance cpu. Unlike prefer-perf, which
         * targets only group tasks, global boost targets all tasks, so it
         * maximizes performance cpu utilization.
         *
         * Typically, prefer-perf operates on groups that contain UX-related
         * tasks, such as "top-app" or "foreground", so that major tasks are
         * likely to be assigned to a performance cpu. On the other hand,
         * global boost assigns all tasks to a performance cpu, which is not
         * as effective as prefer-perf. For this reason, global boost has a
         * lower priority than prefer-perf.
         */
        target_cpu = global_boosting(p);
        if (cpu_selected(target_cpu)) {
                strcpy(state, "global boosting");
                goto out;
        }

        /*
         * Priority 5 : group balancing
         */
        target_cpu = group_balancing(p);
        if (cpu_selected(target_cpu)) {
                strcpy(state, "group balancing");
                goto out;
        }

        /*
         * Priority 6 : prefer-idle
         *
         * Prefer-idle is a function that operates on a cgroup basis managed
         * by schedtune. When prefer-idle is set to 1, the tasks in the group
         * are preferentially assigned to an idle cpu.
         *
         * Prefer-idle has a smaller performance impact than the functions
         * above. Therefore it has a relatively low priority.
         */
        target_cpu = prefer_idle_cpu(p);
        if (cpu_selected(target_cpu)) {
                strcpy(state, "prefer-idle");
                goto out;
        }

        /*
         * Priority 7 : energy cpu
         *
         * A scheduling scheme based on cpu energy: when assigning a task, it
         * finds the cpu with the least power consumption by referring to the
         * energy table.
         */
        target_cpu = select_energy_cpu(p, prev_cpu, sd_flag, sync);
        if (cpu_selected(target_cpu)) {
                strcpy(state, "energy cpu");
                goto out;
        }

        /*
         * Priority 8 : proper cpu
         */
        target_cpu = select_proper_cpu(p, prev_cpu);
        if (cpu_selected(target_cpu))
                strcpy(state, "proper cpu");

out:
        trace_ems_wakeup_balance(p, target_cpu, state);
        return target_cpu;
}
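/*
 * Illustrative caller sketch for exynos_wakeup_balance(). This is a
 * hypothetical fragment for documentation only; the actual wiring lives in
 * the fair-class wakeup path, not in this file:
 *
 *      target = exynos_wakeup_balance(p, prev_cpu, sd_flag, sync);
 *      if (cpu_selected(target))
 *              new_cpu = target;       (otherwise keep the default choice)
 *
 * Note that select_proper_cpu() falls back to prev_cpu, so once priority 8
 * is reached a valid cpu is always returned.
 */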

struct kobject *ems_kobj;

static int __init init_sysfs(void)
{
        ems_kobj = kobject_create_and_add("ems", kernel_kobj);

        return 0;
}
core_initcall(init_sysfs);
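/*
 * Minimal sketch of how an EMS sub-module could expose a tunable under
 * /sys/kernel/ems through the ems_kobj created above. The attribute name
 * and the show callback are hypothetical, not part of this file:
 *
 *      static ssize_t foo_show(struct kobject *k, struct kobj_attribute *a,
 *                              char *buf)
 *      {
 *              return snprintf(buf, PAGE_SIZE, "%d\n", foo_value);
 *      }
 *
 *      static struct kobj_attribute foo_attr = __ATTR_RO(foo);
 *
 *      static int __init foo_sysfs_init(void)
 *      {
 *              return sysfs_create_file(ems_kobj, &foo_attr.attr);
 *      }
 *      late_initcall(foo_sysfs_init);
 */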