sched: ems: separate energy efficient cpu selection
authorPark Bumgyu <bumgyu.park@samsung.com>
Fri, 6 Apr 2018 04:46:39 +0000 (13:46 +0900)
committerChungwoo Park <cww.park@samsung.com>
Mon, 21 May 2018 08:35:34 +0000 (17:35 +0900)
Separate code related to energy efficient cpu selection to
facilitate code management.

Change-Id: I0a18490da3f178483108c6cb8c34b904cbaca3d6
Signed-off-by: Park Bumgyu <bumgyu.park@samsung.com>
kernel/sched/ems/Makefile
kernel/sched/ems/core.c
kernel/sched/ems/ems.h
kernel/sched/ems/energy.c [new file with mode: 0644]

index bbfe44f9efbecb07260eb59e286551ab5ed61e73..2b53ebfdf9bf7ea76389f0677b1ce49a44692242 100644 (file)
@@ -1,4 +1,4 @@
-obj-y += core.o pcf.o global_boost.o lbt.o ontime.o
+obj-y += core.o pcf.o global_boost.o lbt.o ontime.o energy.o
 
 obj-$(CONFIG_SCHED_TUNE) += st_addon.o
 obj-$(CONFIG_SCHED_EMS) += ehmp.o
index f09b875a82cdf92f7b22587b8267119ddb6cd315..c464588e7090d2b3fc9b6dd542b25452f5334364 100644 (file)
 
 #define cpu_selected(cpu)      (cpu >= 0)
 
-static int task_util(struct task_struct *p)
-{
-       return p->se.avg.util_avg;
-}
-
-static int cpu_util_wake(int cpu, struct task_struct *p)
-{
-       unsigned long util, capacity;
-
-       /* Task has no contribution or is new */
-       if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
-               return cpu_util(cpu);
-
-       capacity = capacity_orig_of(cpu);
-       util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
-
-       return (util >= capacity) ? capacity : util;
-}
-
-/*
- * The compute capacity, power consumption at this compute capacity and
- * frequency of state. The cap and power are used to find the energy
- * efficiency cpu, and the frequency is used to create the capacity table.
- */
-struct energy_state {
-       unsigned long cap;
-       unsigned long power;
-       unsigned long frequency;
-};
-
-/*
- * Each cpu can have its own mips, coefficient and energy table. Generally,
- * cpus in the same frequency domain have the same mips, coefficient and
- * energy table.
- */
-struct energy_table {
-       unsigned int mips;
-       unsigned int coefficient;;
-
-       struct energy_state *states;
-       unsigned int nr_states;
-};
-DEFINE_PER_CPU(struct energy_table, energy_table);
-
-/*
- * When choosing cpu considering energy efficiency, decide best cpu and
- * backup cpu according to policy, and then choose cpu which consumes the
- * least energy including prev cpu.
- */
-struct eco_env {
-       struct task_struct *p;
-
-       int prev_cpu;
-       int best_cpu;
-       int backup_cpu;
-};
-
-static void find_eco_target(struct eco_env *eenv)
-{
-       struct task_struct *p = eenv->p;
-       unsigned long best_min_cap_orig = ULONG_MAX;
-       unsigned long backup_min_cap_orig = ULONG_MAX;
-       unsigned long best_spare_cap = 0;
-       int backup_idle_cstate = INT_MAX;
-       int best_cpu = -1;
-       int backup_cpu = -1;
-       int cpu;
-
-       /*
-        * It is meaningless to find an energy cpu when the energy table is
-        * not created or has not been created yet.
-        */
-       if (!per_cpu(energy_table, eenv->prev_cpu).nr_states)
-               return;
-
-       rcu_read_lock();
-
-       for_each_cpu_and(cpu, &p->cpus_allowed, cpu_active_mask) {
-               unsigned long capacity_orig = capacity_orig_of(cpu);
-               unsigned long wake_util, new_util;
-
-               wake_util = cpu_util_wake(cpu, p);
-               new_util = wake_util + task_util(p);
-
-               /* checking prev cpu is meaningless */
-               if (eenv->prev_cpu == cpu)
-                       continue;
-
-               /* skip over-capacity cpu */
-               if (new_util > capacity_orig)
-                       continue;
-
-               /*
-                * According to the criteria determined by the LBT(Load
-                * Balance trigger), the cpu that becomes overutilized when
-                * the task is assigned is skipped.
-                */
-               if (lbt_bring_overutilize(cpu, p))
-                       continue;
-
-               /*
-                * Backup target) shallowest idle cpu among min-cap cpu
-                *
-                * In general, assigning a task to an idle cpu is
-                * disadvantagerous in energy. To minimize the energy increase
-                * associated with selecting idle cpu, choose a cpu that is
-                * in the lowest performance and shallowest idle state.
-                */
-               if (idle_cpu(cpu)) {
-                       int idle_idx;
-
-                       if (backup_min_cap_orig < capacity_orig)
-                               continue;
-
-                       idle_idx = idle_get_state_idx(cpu_rq(cpu));
-                       if (backup_idle_cstate <= idle_idx)
-                               continue;
-
-                       backup_min_cap_orig = capacity_orig;
-                       backup_idle_cstate = idle_idx;
-                       backup_cpu = cpu;
-                       continue;
-               }
-
-               /*
-                * Best target) biggest spare cpu among min-cap cpu
-                *
-                * Select the cpu with the biggest spare capacity to maintain
-                * frequency as possible without waking up idle cpu. Also, to
-                * maximize the use of energy-efficient cpu, we choose the
-                * lowest performance cpu.
-                */
-               if (best_min_cap_orig < capacity_orig)
-                       continue;
-
-               if (best_spare_cap > (capacity_orig - new_util))
-                       continue;
-
-               best_spare_cap = capacity_orig - new_util;
-               best_min_cap_orig = capacity_orig;
-               best_cpu = cpu;
-       }
-
-       rcu_read_unlock();
-
-       eenv->best_cpu = best_cpu;
-       eenv->backup_cpu = backup_cpu;
-}
-
-static int __init init_sched_energy_data(void)
-{
-       struct device_node *cpu_node, *cpu_phandle;
-       int cpu;
-
-       for_each_possible_cpu(cpu) {
-               struct energy_table *table;
-
-               cpu_node = of_get_cpu_node(cpu, NULL);
-               if (!cpu_node) {
-                       pr_warn("CPU device node missing for CPU %d\n", cpu);
-                       return -ENODATA;
-               }
-
-               cpu_phandle = of_parse_phandle(cpu_node, "sched-energy-data", 0);
-               if (!cpu_phandle) {
-                       pr_warn("CPU device node has no sched-energy-data\n");
-                       return -ENODATA;
-               }
-
-               table = &per_cpu(energy_table, cpu);
-               if (of_property_read_u32(cpu_phandle, "capacity-mips", &table->mips)) {
-                       pr_warn("No capacity-mips data\n");
-                       return -ENODATA;
-               }
-
-               if (of_property_read_u32(cpu_phandle, "power-coefficient", &table->coefficient)) {
-                       pr_warn("No power-coefficient data\n");
-                       return -ENODATA;
-               }
-
-               of_node_put(cpu_phandle);
-               of_node_put(cpu_node);
-
-               pr_info("cpu%d mips=%d, coefficient=%d\n", cpu, table->mips, table->coefficient);
-       }
-
-       return 0;
-}
-pure_initcall(init_sched_energy_data);
-
-static void
-fill_power_table(struct energy_table *table, int table_size,
-                       unsigned long *f_table, unsigned int *v_table,
-                       int max_f, int min_f)
-{
-       int i, index = 0;
-       int c = table->coefficient, v;
-       unsigned long f, power;
-
-       /* energy table and frequency table are inverted */
-       for (i = table_size - 1; i >= 0; i--) {
-               if (f_table[i] > max_f || f_table[i] < min_f)
-                       continue;
-
-               f = f_table[i] / 1000;  /* KHz -> MHz */
-               v = v_table[i] / 1000;  /* uV -> mV */
-
-               /*
-                * power = coefficent * frequency * voltage^2
-                */
-               power = c * f * v * v;
-
-               /*
-                * Generally, frequency is more than treble figures in MHz and
-                * voltage is also more then treble figures in mV, so the
-                * calculated power is larger than 10^9. For convenience of
-                * calculation, divide the value by 10^9.
-                */
-               do_div(power, 1000000000);
-               table->states[index].power = power;
-
-               /* save frequency to energy table */
-               table->states[index].frequency = f_table[i];
-               index++;
-       }
-}
-
-static void
-fill_cap_table(struct energy_table *table, int max_mips, unsigned long max_mips_freq)
-{
-       int i, m = table->mips;
-       unsigned long f;
-
-       for (i = 0; i < table->nr_states; i++) {
-               f = table->states[i].frequency;
-
-               /*
-                * capacity = freq/max_freq * mips/max_mips * 1024
-                */
-               table->states[i].cap = f * m * 1024 / max_mips_freq / max_mips;
-       }
-}
-
-static void show_energy_table(struct energy_table *table, int cpu)
-{
-       int i;
-
-       pr_info("[Energy Table : cpu%d]\n", cpu);
-       for (i = 0; i < table->nr_states; i++) {
-               pr_info("[%d] .cap=%lu .power=%lu\n", i,
-                       table->states[i].cap, table->states[i].power);
-       }
-}
-
-/*
- * Whenever frequency domain is registered, and energy table corresponding to
- * the domain is created. Because cpu in the same frequency domain has the same
- * energy table. Capacity is calculated based on the max frequency of the fastest
- * cpu, so once the frequency domain of the faster cpu is regsitered, capacity
- * is recomputed.
- */
-void init_sched_energy_table(struct cpumask *cpus, int table_size,
-                               unsigned long *f_table, unsigned int *v_table,
-                               int max_f, int min_f)
-{
-       struct energy_table *table;
-       int cpu, i, mips, valid_table_size = 0;
-       int max_mips = 0;
-       unsigned long max_mips_freq = 0;
-
-       mips = per_cpu(energy_table, cpumask_any(cpus)).mips;
-       for_each_cpu(cpu, cpus) {
-               /*
-                * All cpus in a frequency domain must have the smae capacity.
-                * Otherwise, it does not create an energy table because it
-                * is likely to be a human error.
-                */
-               if (mips != per_cpu(energy_table, cpu).mips) {
-                       pr_warn("cpu%d has different cpacity!!\n", cpu);
-                       return;
-               }
-       }
-
-       /* get size of valid frequency table to allocate energy table */
-       for (i = 0; i < table_size; i++) {
-               if (f_table[i] > max_f || f_table[i] < min_f)
-                       continue;
-
-               valid_table_size++;
-       }
-
-       /* there is no valid row in the table, energy table is not created */
-       if (!valid_table_size)
-               return;
-
-       /* allocate memory for energy table and fill power table */
-       for_each_cpu(cpu, cpus) {
-               table = &per_cpu(energy_table, cpu);
-               table->states = kcalloc(valid_table_size,
-                                       sizeof(struct energy_state), GFP_KERNEL);
-               if (unlikely(!table->states))
-                       return;
-
-               table->nr_states = valid_table_size;
-               fill_power_table(table, table_size, f_table, v_table, max_f, min_f);
-       }
-
-       /*
-        * Find fastest cpu among the cpu to which the energy table is allocated.
-        * The mips and max frequency of fastest cpu are needed to calculate
-        * capacity.
-        */
-       for_each_possible_cpu(cpu) {
-               table = &per_cpu(energy_table, cpu);
-               if (!table->states)
-                       continue;
-
-               if (table->mips > max_mips) {
-                       int last_state = table->nr_states - 1;
-
-                       max_mips = table->mips;
-                       max_mips_freq = table->states[last_state].frequency;
-               }
-       }
-
-       /*
-        * Calculate and fill capacity table.
-        * Recalculate the capacity whenever frequency domain changes because
-        * the fastest cpu may have changed and the capacity needs to be
-        * recalculated.
-        */
-       for_each_possible_cpu(cpu) {
-               table = &per_cpu(energy_table, cpu);
-               if (!table->states)
-                       continue;
-
-               fill_cap_table(table, max_mips, max_mips_freq);
-               show_energy_table(table, cpu);
-       }
-}
-
-static unsigned int calculate_energy(struct task_struct *p, int target_cpu)
-{
-       unsigned long util[NR_CPUS] = {0, };
-       unsigned int total_energy = 0;
-       int cpu;
-
-       /*
-        * 0. Calculate utilization of the entire active cpu when task
-        *    is assigned to target cpu.
-        */
-       for_each_cpu(cpu, cpu_active_mask) {
-               util[cpu] = cpu_util_wake(cpu, p);
-
-               if (unlikely(cpu == target_cpu))
-                       util[cpu] += task_util(p);
-       }
-
-       for_each_possible_cpu(cpu) {
-               struct energy_table *table;
-               unsigned long max_util = 0, util_sum = 0;
-               unsigned long capacity;
-               int i, cap_idx;
-
-               /* Compute coregroup energy with only one cpu per coregroup */
-               if (cpu != cpumask_first(cpu_coregroup_mask(cpu)))
-                       continue;
-
-               /*
-                * 1. The cpu in the coregroup has same capacity and the
-                *    capacity depends on the cpu that has the biggest
-                *    utilization. Find biggest utilization in the coregroup
-                *    to know what capacity the cpu will have.
-                */
-               for_each_cpu(i, cpu_coregroup_mask(cpu))
-                       if (util[i] > max_util)
-                               max_util = util[i];
-
-               /*
-                * 2. Find the capacity according to biggest utilization in
-                *    coregroup.
-                */
-               table = &per_cpu(energy_table, cpu);
-               cap_idx = table->nr_states - 1;
-               for (i = 0; i < table->nr_states; i++) {
-                       if (table->states[i].cap >= max_util) {
-                               capacity = table->states[i].cap;
-                               cap_idx = i;
-                               break;
-                       }
-               }
-
-               /*
-                * 3. Get the utilization sum of coregroup. Since cpu
-                *    utilization of CFS reflects the performance of cpu,
-                *    normalize the utilization to calculate the amount of
-                *    cpu usuage that excludes cpu performance.
-                */
-               for_each_cpu(i, cpu_coregroup_mask(cpu)) {
-                       /* utilization with task exceeds max capacity of cpu */
-                       if (util[i] >= capacity) {
-                               util_sum += SCHED_CAPACITY_SCALE;
-                               continue;
-                       }
-
-                       /* normalize cpu utilization */
-                       util_sum += (util[i] << SCHED_CAPACITY_SHIFT) / capacity;
-               }
-
-               /*
-                * 4. compute active energy
-                */
-               total_energy += util_sum * table->states[cap_idx].power;
-       }
-
-       return total_energy;
-}
-
-static int select_eco_cpu(struct eco_env *eenv)
-{
-       unsigned int prev_energy, best_energy, backup_energy;
-       unsigned int temp_energy;
-       int temp_cpu;
-       int eco_cpu = eenv->prev_cpu;
-       int margin;
-
-       prev_energy = calculate_energy(eenv->p, eenv->prev_cpu);
-
-       /*
-        * find_eco_target() may not find best or backup cup. Ignore unfound
-        * cpu, and if both are found, select a cpu that consumes less energy
-        * when assigning task.
-        */
-       best_energy = backup_energy = UINT_MAX;
-
-       if (cpu_selected(eenv->best_cpu))
-               best_energy = calculate_energy(eenv->p, eenv->best_cpu);
-
-       if (cpu_selected(eenv->backup_cpu))
-               backup_energy = calculate_energy(eenv->p, eenv->backup_cpu);
-
-       if (best_energy < backup_energy) {
-               temp_energy = best_energy;
-               temp_cpu = eenv->best_cpu;
-       } else {
-               temp_energy = backup_energy;
-               temp_cpu = eenv->backup_cpu;
-       }
-
-       /*
-        * Compare prev cpu to target cpu among best and backup cpu to determine
-        * whether keeping the task on PREV CPU and sending the task to TARGET
-        * CPU is beneficial for energy.
-        */
-       if (temp_energy < prev_energy) {
-               /*
-                * Compute the dead-zone margin used to prevent too many task
-                * migrations with negligible energy savings.
-                * An energy saving is considered meaningful if it reduces the
-                * energy consumption of PREV CPU candidate by at least ~1.56%.
-                */
-               margin = prev_energy >> 6;
-               if ((prev_energy - temp_energy) < margin)
-                       goto out;
-
-               eco_cpu = temp_cpu;
-       }
-
-out:
-       trace_ems_select_eco_cpu(eenv->p, eco_cpu,
-                       eenv->prev_cpu, eenv->best_cpu, eenv->backup_cpu,
-                       prev_energy, best_energy, backup_energy);
-       return eco_cpu;
-}
-
-static int
-select_energy_cpu(struct task_struct *p, int prev_cpu, int sd_flag, int sync)
-{
-       struct sched_domain *sd = NULL;
-       int cpu = smp_processor_id();
-       struct eco_env eenv = {
-               .p = p,
-               .prev_cpu = prev_cpu,
-               .best_cpu = -1,
-               .backup_cpu = -1,
-       };
-
-       if (!sched_feat(ENERGY_AWARE))
-               return -1;
-
-       /*
-        * Energy-aware wakeup placement on overutilized cpu is hard to get
-        * energy gain.
-        */
-       rcu_read_lock();
-       sd = rcu_dereference_sched(cpu_rq(prev_cpu)->sd);
-       if (!sd || sd->shared->overutilized) {
-               rcu_read_unlock();
-               return -1;
-       }
-       rcu_read_unlock();
-
-       /*
-        * We cannot do energy-aware wakeup placement sensibly for tasks
-        * with 0 utilization, so let them be placed according to the normal
-        * strategy.
-        */
-       if (!task_util(p))
-               return -1;
-
-       if (sysctl_sched_sync_hint_enable && sync)
-               if (cpumask_test_cpu(cpu, &p->cpus_allowed))
-                       return cpu;
-
-       /*
-        * Find eco-friendly target.
-        * After selecting the best and backup cpu according to strategy, we
-        * choose a cpu that is energy efficient compared to prev cpu.
-        */
-       find_eco_target(&eenv);
-       if (eenv.best_cpu < 0 && eenv.backup_cpu < 0)
-               return prev_cpu;
-
-       return select_eco_cpu(&eenv);
-}
-
 static int select_proper_cpu(struct task_struct *p)
 {
        return -1;
index 8d2a02b5b4d4753d02c122bac24ee343848486df..b82d54245db30fc60c0504b228694066878300e2 100644 (file)
@@ -22,6 +22,7 @@ extern int select_perf_cpu(struct task_struct *p);
 extern int global_boosting(struct task_struct *p);
 extern int global_boosted(void);
 extern bool lbt_bring_overutilize(int cpu, struct task_struct *p);
+extern int select_energy_cpu(struct task_struct *p, int prev_cpu, int sd_flag, int sync);
 
 #ifdef CONFIG_SCHED_TUNE
 extern int prefer_perf_cpu(struct task_struct *p);
diff --git a/kernel/sched/ems/energy.c b/kernel/sched/ems/energy.c
new file mode 100644 (file)
index 0000000..8b3ffa1
--- /dev/null
@@ -0,0 +1,536 @@
+/*
+ * Energy efficient cpu selection
+ *
+ * Copyright (C) 2018 Samsung Electronics Co., Ltd
+ * Park Bumgyu <bumgyu.park@samsung.com>
+ */
+
+#include <trace/events/ems.h>
+
+#include "ems.h"
+#include "../sched.h"
+
+static int task_util(struct task_struct *p)
+{
+       return p->se.avg.util_avg;
+}
+
+static int cpu_util_wake(int cpu, struct task_struct *p)
+{
+       unsigned long util, capacity;
+
+       /* Task has no contribution or is new */
+       if (cpu != task_cpu(p) || !p->se.avg.last_update_time)
+               return cpu_util(cpu);
+
+       capacity = capacity_orig_of(cpu);
+       util = max_t(long, cpu_rq(cpu)->cfs.avg.util_avg - task_util(p), 0);
+
+       return (util >= capacity) ? capacity : util;
+}
+
+/*
+ * The compute capacity, power consumption at this compute capacity and
+ * frequency of state. The cap and power are used to find the energy
+ * efficiency cpu, and the frequency is used to create the capacity table.
+ */
+struct energy_state {
+       unsigned long cap;
+       unsigned long power;
+       unsigned long frequency;
+};
+
+/*
+ * Each cpu can have its own mips, coefficient and energy table. Generally,
+ * cpus in the same frequency domain have the same mips, coefficient and
+ * energy table.
+ */
+struct energy_table {
+       unsigned int mips;
+       unsigned int coefficient;
+
+       struct energy_state *states;
+       unsigned int nr_states;
+};
+DEFINE_PER_CPU(struct energy_table, energy_table);
+
+/*
+ * When choosing cpu considering energy efficiency, decide best cpu and
+ * backup cpu according to policy, and then choose cpu which consumes the
+ * least energy including prev cpu.
+ */
+struct eco_env {
+       struct task_struct *p;
+
+       int prev_cpu;
+       int best_cpu;
+       int backup_cpu;
+};
+
+static void find_eco_target(struct eco_env *eenv)
+{
+       struct task_struct *p = eenv->p;
+       unsigned long best_min_cap_orig = ULONG_MAX;
+       unsigned long backup_min_cap_orig = ULONG_MAX;
+       unsigned long best_spare_cap = 0;
+       int backup_idle_cstate = INT_MAX;
+       int best_cpu = -1;
+       int backup_cpu = -1;
+       int cpu;
+
+       /*
+        * It is meaningless to find an energy cpu when the energy table is
+        * not created or has not been created yet.
+        */
+       if (!per_cpu(energy_table, eenv->prev_cpu).nr_states)
+               return;
+
+       rcu_read_lock();
+
+       for_each_cpu_and(cpu, &p->cpus_allowed, cpu_active_mask) {
+               unsigned long capacity_orig = capacity_orig_of(cpu);
+               unsigned long wake_util, new_util;
+
+               wake_util = cpu_util_wake(cpu, p);
+               new_util = wake_util + task_util(p);
+
+               /* checking prev cpu is meaningless */
+               if (eenv->prev_cpu == cpu)
+                       continue;
+
+               /* skip over-capacity cpu */
+               if (new_util > capacity_orig)
+                       continue;
+
+               /*
+                * According to the criteria determined by the LBT(Load
+                * Balance trigger), the cpu that becomes overutilized when
+                * the task is assigned is skipped.
+                */
+               if (lbt_bring_overutilize(cpu, p))
+                       continue;
+
+               /*
+                * Backup target) shallowest idle cpu among min-cap cpu
+                *
+                * In general, assigning a task to an idle cpu is
+        * disadvantageous in energy. To minimize the energy increase
+                * associated with selecting idle cpu, choose a cpu that is
+                * in the lowest performance and shallowest idle state.
+                */
+               if (idle_cpu(cpu)) {
+                       int idle_idx;
+
+                       if (backup_min_cap_orig < capacity_orig)
+                               continue;
+
+                       idle_idx = idle_get_state_idx(cpu_rq(cpu));
+                       if (backup_idle_cstate <= idle_idx)
+                               continue;
+
+                       backup_min_cap_orig = capacity_orig;
+                       backup_idle_cstate = idle_idx;
+                       backup_cpu = cpu;
+                       continue;
+               }
+
+               /*
+                * Best target) biggest spare cpu among min-cap cpu
+                *
+                * Select the cpu with the biggest spare capacity to maintain
+                * frequency as possible without waking up idle cpu. Also, to
+                * maximize the use of energy-efficient cpu, we choose the
+                * lowest performance cpu.
+                */
+               if (best_min_cap_orig < capacity_orig)
+                       continue;
+
+               if (best_spare_cap > (capacity_orig - new_util))
+                       continue;
+
+               best_spare_cap = capacity_orig - new_util;
+               best_min_cap_orig = capacity_orig;
+               best_cpu = cpu;
+       }
+
+       rcu_read_unlock();
+
+       eenv->best_cpu = best_cpu;
+       eenv->backup_cpu = backup_cpu;
+}
+
+static unsigned int calculate_energy(struct task_struct *p, int target_cpu)
+{
+       unsigned long util[NR_CPUS] = {0, };
+       unsigned int total_energy = 0;
+       int cpu;
+
+       /*
+        * 0. Calculate utilization of the entire active cpu when task
+        *    is assigned to target cpu.
+        */
+       for_each_cpu(cpu, cpu_active_mask) {
+               util[cpu] = cpu_util_wake(cpu, p);
+
+               if (unlikely(cpu == target_cpu))
+                       util[cpu] += task_util(p);
+       }
+
+       for_each_possible_cpu(cpu) {
+               struct energy_table *table;
+               unsigned long max_util = 0, util_sum = 0;
+               unsigned long capacity;
+               int i, cap_idx;
+
+               /* Compute coregroup energy with only one cpu per coregroup */
+               if (cpu != cpumask_first(cpu_coregroup_mask(cpu)))
+                       continue;
+
+               /*
+                * 1. The cpu in the coregroup has same capacity and the
+                *    capacity depends on the cpu that has the biggest
+                *    utilization. Find biggest utilization in the coregroup
+                *    to know what capacity the cpu will have.
+                */
+               for_each_cpu(i, cpu_coregroup_mask(cpu))
+                       if (util[i] > max_util)
+                               max_util = util[i];
+
+               /*
+                * 2. Find the capacity according to biggest utilization in
+                *    coregroup.
+                */
+               table = &per_cpu(energy_table, cpu);
+               cap_idx = table->nr_states - 1;
+               capacity = table->states[cap_idx].cap;
+               for (i = 0; i < table->nr_states; i++) {
+                       if (table->states[i].cap >= max_util) {
+                               capacity = table->states[i].cap;
+                               cap_idx = i;
+                               break;
+                       }
+               }
+
+               /*
+                * 3. Get the utilization sum of coregroup. Since cpu
+                *    utilization of CFS reflects the performance of cpu,
+                *    normalize the utilization to calculate the amount of
+        *    cpu usage that excludes cpu performance.
+                */
+               for_each_cpu(i, cpu_coregroup_mask(cpu)) {
+                       /* utilization with task exceeds max capacity of cpu */
+                       if (util[i] >= capacity) {
+                               util_sum += SCHED_CAPACITY_SCALE;
+                               continue;
+                       }
+
+                       /* normalize cpu utilization */
+                       util_sum += (util[i] << SCHED_CAPACITY_SHIFT) / capacity;
+               }
+
+               /*
+                * 4. compute active energy
+                */
+               total_energy += util_sum * table->states[cap_idx].power;
+       }
+
+       return total_energy;
+}
+
+static int select_eco_cpu(struct eco_env *eenv)
+{
+       unsigned int prev_energy, best_energy, backup_energy;
+       unsigned int temp_energy;
+       int temp_cpu;
+       int eco_cpu = eenv->prev_cpu;
+       int margin;
+
+       prev_energy = calculate_energy(eenv->p, eenv->prev_cpu);
+
+       /*
+        * find_eco_target() may not find best or backup cpu. Ignore unfound
+        * cpu, and if both are found, select a cpu that consumes less energy
+        * when assigning task.
+        */
+       best_energy = backup_energy = UINT_MAX;
+
+       if (cpu_selected(eenv->best_cpu))
+               best_energy = calculate_energy(eenv->p, eenv->best_cpu);
+
+       if (cpu_selected(eenv->backup_cpu))
+               backup_energy = calculate_energy(eenv->p, eenv->backup_cpu);
+
+       if (best_energy < backup_energy) {
+               temp_energy = best_energy;
+               temp_cpu = eenv->best_cpu;
+       } else {
+               temp_energy = backup_energy;
+               temp_cpu = eenv->backup_cpu;
+       }
+
+       /*
+        * Compare prev cpu to target cpu among best and backup cpu to determine
+        * whether keeping the task on PREV CPU and sending the task to TARGET
+        * CPU is beneficial for energy.
+        */
+       if (temp_energy < prev_energy) {
+               /*
+                * Compute the dead-zone margin used to prevent too many task
+                * migrations with negligible energy savings.
+                * An energy saving is considered meaningful if it reduces the
+                * energy consumption of PREV CPU candidate by at least ~1.56%.
+                */
+               margin = prev_energy >> 6;
+               if ((prev_energy - temp_energy) < margin)
+                       goto out;
+
+               eco_cpu = temp_cpu;
+       }
+
+out:
+       trace_ems_select_eco_cpu(eenv->p, eco_cpu,
+                       eenv->prev_cpu, eenv->best_cpu, eenv->backup_cpu,
+                       prev_energy, best_energy, backup_energy);
+       return eco_cpu;
+}
+
+int select_energy_cpu(struct task_struct *p, int prev_cpu, int sd_flag, int sync)
+{
+       struct sched_domain *sd = NULL;
+       int cpu = smp_processor_id();
+       struct eco_env eenv = {
+               .p = p,
+               .prev_cpu = prev_cpu,
+               .best_cpu = -1,
+               .backup_cpu = -1,
+       };
+
+       if (!sched_feat(ENERGY_AWARE))
+               return -1;
+
+       /*
+        * Energy-aware wakeup placement on overutilized cpu is hard to get
+        * energy gain.
+        */
+       rcu_read_lock();
+       sd = rcu_dereference_sched(cpu_rq(prev_cpu)->sd);
+       if (!sd || sd->shared->overutilized) {
+               rcu_read_unlock();
+               return -1;
+       }
+       rcu_read_unlock();
+
+       /*
+        * We cannot do energy-aware wakeup placement sensibly for tasks
+        * with 0 utilization, so let them be placed according to the normal
+        * strategy.
+        */
+       if (!task_util(p))
+               return -1;
+
+       if (sysctl_sched_sync_hint_enable && sync)
+               if (cpumask_test_cpu(cpu, &p->cpus_allowed))
+                       return cpu;
+
+       /*
+        * Find eco-friendly target.
+        * After selecting the best and backup cpu according to strategy, we
+        * choose a cpu that is energy efficient compared to prev cpu.
+        */
+       find_eco_target(&eenv);
+       if (eenv.best_cpu < 0 && eenv.backup_cpu < 0)
+               return prev_cpu;
+
+       return select_eco_cpu(&eenv);
+}
+
+/*
+ * fill_power_table - compute the per-state dynamic power of a cpu.
+ *
+ * Walk the raw OPP frequency/voltage tables (ordered inversely to the
+ * energy table) and fill table->states[] with power and frequency for
+ * every row within [min_f, max_f]. table->states must already be
+ * allocated with room for all valid rows (see init_sched_energy_table()).
+ */
+static void
+fill_power_table(struct energy_table *table, int table_size,
+                       unsigned long *f_table, unsigned int *v_table,
+                       int max_f, int min_f)
+{
+       int i, index = 0;
+       int c = table->coefficient, v;
+       unsigned long f;
+       u64 power;
+
+       /* energy table and frequency table are inverted */
+       for (i = table_size - 1; i >= 0; i--) {
+               /* skip rows outside the usable frequency range */
+               if (f_table[i] > max_f || f_table[i] < min_f)
+                       continue;
+
+               f = f_table[i] / 1000;  /* KHz -> MHz */
+               v = v_table[i] / 1000;  /* uV -> mV */
+
+               /*
+                * power = coefficient * frequency * voltage^2
+                *
+                * Accumulate in u64: coefficient * MHz * mV^2 easily
+                * exceeds 32 bits (an unsigned long would overflow on
+                * 32-bit kernels), and do_div() requires a u64 anyway.
+                */
+               power = (u64)c * f * v * v;
+
+               /*
+                * Generally, frequency is more than treble figures in MHz and
+                * voltage is also more than treble figures in mV, so the
+                * calculated power is larger than 10^9. For convenience of
+                * calculation, divide the value by 10^9.
+                */
+               do_div(power, 1000000000);
+               table->states[index].power = power;
+
+               /* save frequency to energy table */
+               table->states[index].frequency = f_table[i];
+               index++;
+       }
+}
+
+/*
+ * fill_cap_table - derive the compute capacity of every state.
+ *
+ * Capacity is scaled against the fastest cpu in the system: the state
+ * running at @max_mips_freq on the @max_mips cpu is rated 1024.
+ */
+static void
+fill_cap_table(struct energy_table *table, int max_mips, unsigned long max_mips_freq)
+{
+       int state, mips = table->mips;
+       unsigned long freq;
+
+       for (state = 0; state < table->nr_states; state++) {
+               freq = table->states[state].frequency;
+
+               /*
+                * capacity = freq/max_freq * mips/max_mips * 1024
+                * (multiplied first, then divided, to keep precision in
+                * integer arithmetic)
+                */
+               table->states[state].cap =
+                       freq * mips * 1024 / max_mips_freq / max_mips;
+       }
+}
+
+/* Dump the computed capacity/power states of @cpu's table to the log. */
+static void show_energy_table(struct energy_table *table, int cpu)
+{
+       int state;
+
+       pr_info("[Energy Table : cpu%d]\n", cpu);
+       for (state = 0; state < table->nr_states; state++)
+               pr_info("[%d] .cap=%lu .power=%lu\n", state,
+                       table->states[state].cap, table->states[state].power);
+}
+
+/*
+ * Whenever a frequency domain is registered, an energy table corresponding
+ * to the domain is created, because cpus in the same frequency domain share
+ * the same energy table. Capacity is calculated based on the max frequency
+ * of the fastest cpu, so once the frequency domain of a faster cpu is
+ * registered, capacity is recomputed.
+ */
+/*
+ * init_sched_energy_table - build the energy table of a frequency domain.
+ *
+ * @cpus:       cpus belonging to the registering frequency domain
+ * @table_size: number of rows in @f_table / @v_table
+ * @f_table:    frequency table of the domain (KHz)
+ * @v_table:    voltage table of the domain (uV)
+ * @max_f:      highest usable frequency
+ * @min_f:      lowest usable frequency
+ *
+ * Rows outside [min_f, max_f] are dropped. After the power table of the
+ * new domain is filled, the capacity of every registered cpu is
+ * recomputed, because the fastest cpu - the capacity reference - may
+ * have changed.
+ */
+void init_sched_energy_table(struct cpumask *cpus, int table_size,
+                               unsigned long *f_table, unsigned int *v_table,
+                               int max_f, int min_f)
+{
+       struct energy_table *table;
+       int cpu, i, mips, valid_table_size = 0;
+       int max_mips = 0;
+       unsigned long max_mips_freq = 0;
+
+       mips = per_cpu(energy_table, cpumask_any(cpus)).mips;
+       for_each_cpu(cpu, cpus) {
+               /*
+                * All cpus in a frequency domain must have the same capacity.
+                * Otherwise, it does not create an energy table because it
+                * is likely to be a human error.
+                */
+               if (mips != per_cpu(energy_table, cpu).mips) {
+                       pr_warn("cpu%d has different capacity!!\n", cpu);
+                       return;
+               }
+       }
+
+       /* get size of valid frequency table to allocate energy table */
+       for (i = 0; i < table_size; i++) {
+               if (f_table[i] > max_f || f_table[i] < min_f)
+                       continue;
+
+               valid_table_size++;
+       }
+
+       /* there is no valid row in the table, energy table is not created */
+       if (!valid_table_size)
+               return;
+
+       /* allocate memory for energy table and fill power table */
+       for_each_cpu(cpu, cpus) {
+               table = &per_cpu(energy_table, cpu);
+               table->states = kcalloc(valid_table_size,
+                                       sizeof(struct energy_state), GFP_KERNEL);
+               if (unlikely(!table->states))
+                       goto fail;
+
+               table->nr_states = valid_table_size;
+               fill_power_table(table, table_size, f_table, v_table, max_f, min_f);
+       }
+
+       /*
+        * Find fastest cpu among the cpu to which the energy table is allocated.
+        * The mips and max frequency of fastest cpu are needed to calculate
+        * capacity.
+        */
+       for_each_possible_cpu(cpu) {
+               table = &per_cpu(energy_table, cpu);
+               if (!table->states)
+                       continue;
+
+               if (table->mips > max_mips) {
+                       int last_state = table->nr_states - 1;
+
+                       max_mips = table->mips;
+                       max_mips_freq = table->states[last_state].frequency;
+               }
+       }
+
+       /*
+        * Calculate and fill capacity table.
+        * Recalculate the capacity whenever frequency domain changes because
+        * the fastest cpu may have changed and the capacity needs to be
+        * recalculated.
+        */
+       for_each_possible_cpu(cpu) {
+               table = &per_cpu(energy_table, cpu);
+               if (!table->states)
+                       continue;
+
+               fill_cap_table(table, max_mips, max_mips_freq);
+               show_energy_table(table, cpu);
+       }
+
+       return;
+
+fail:
+       /*
+        * Roll back the cpus of this domain that were already allocated:
+        * a half-registered domain (states != NULL on only some of its
+        * cpus) would otherwise leak and be treated as valid by the
+        * capacity pass above. kfree(NULL) is a no-op, so the whole mask
+        * can be walked unconditionally.
+        */
+       for_each_cpu(cpu, cpus) {
+               table = &per_cpu(energy_table, cpu);
+               kfree(table->states);
+               table->states = NULL;
+               table->nr_states = 0;
+       }
+}
+
+/*
+ * init_sched_energy_data - parse per-cpu energy constants from DT.
+ *
+ * Reads "capacity-mips" and "power-coefficient" from each cpu's
+ * "sched-energy-data" device-tree node into the per-cpu energy_table.
+ * The actual power/capacity tables are filled in later, when each
+ * frequency domain registers via init_sched_energy_table().
+ *
+ * Returns 0 on success, -ENODATA if any cpu lacks the required data.
+ */
+static int __init init_sched_energy_data(void)
+{
+       struct device_node *cpu_node, *cpu_phandle;
+       int cpu;
+
+       for_each_possible_cpu(cpu) {
+               struct energy_table *table;
+
+               cpu_node = of_get_cpu_node(cpu, NULL);
+               if (!cpu_node) {
+                       pr_warn("CPU device node missing for CPU %d\n", cpu);
+                       return -ENODATA;
+               }
+
+               cpu_phandle = of_parse_phandle(cpu_node, "sched-energy-data", 0);
+               if (!cpu_phandle) {
+                       pr_warn("CPU device node has no sched-energy-data\n");
+                       /* drop the reference taken by of_get_cpu_node() */
+                       of_node_put(cpu_node);
+                       return -ENODATA;
+               }
+
+               table = &per_cpu(energy_table, cpu);
+               if (of_property_read_u32(cpu_phandle, "capacity-mips", &table->mips)) {
+                       pr_warn("No capacity-mips data\n");
+                       /* release both node references on the error path */
+                       of_node_put(cpu_phandle);
+                       of_node_put(cpu_node);
+                       return -ENODATA;
+               }
+
+               if (of_property_read_u32(cpu_phandle, "power-coefficient", &table->coefficient)) {
+                       pr_warn("No power-coefficient data\n");
+                       /* release both node references on the error path */
+                       of_node_put(cpu_phandle);
+                       of_node_put(cpu_node);
+                       return -ENODATA;
+               }
+
+               of_node_put(cpu_phandle);
+               of_node_put(cpu_node);
+
+               pr_info("cpu%d mips=%d, coefficient=%d\n", cpu, table->mips, table->coefficient);
+       }
+
+       return 0;
+}
+pure_initcall(init_sched_energy_data);