}
static unsigned long weighted_cpuload(const int cpu);
+static unsigned long source_load(int cpu, int type);
+static unsigned long target_load(int cpu, int type);
+static unsigned long power_of(int cpu);
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
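+
+/*
+ * Statistics snapshot used by task_numa_migrate() to compare the source
+ * CPU with candidate CPUs on the task's preferred node.
+ */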
+struct numa_stats {
+ unsigned long load;
+ s64 eff_load;
+ unsigned long faults;
+};
-static int
-find_idlest_cpu_node(int this_cpu, int nid)
-{
- unsigned long load, min_load = ULONG_MAX;
- int i, idlest_cpu = this_cpu;
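+
+/*
+ * Working state for task_numa_migrate(): the task being placed, its source
+ * and candidate destination CPU/node, and the best destination found so far.
+ */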
+struct task_numa_env {
+ struct task_struct *p;
- BUG_ON(cpu_to_node(this_cpu) == nid);
+ int src_cpu, src_nid;
+ int dst_cpu, dst_nid;
- rcu_read_lock();
- for_each_cpu(i, cpumask_of_node(nid)) {
- load = weighted_cpuload(i);
+ struct numa_stats src_stats, dst_stats;
- if (load < min_load) {
- min_load = load;
- idlest_cpu = i;
+ unsigned long best_load;
+ int best_cpu;
+};
+
+static int task_numa_migrate(struct task_struct *p)
+{
+ int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid));
+ struct task_numa_env env = {
+ .p = p,
+ .src_cpu = task_cpu(p),
+ .src_nid = cpu_to_node(task_cpu(p)),
+ .dst_cpu = node_cpu,
+ .dst_nid = p->numa_preferred_nid,
+ .best_load = ULONG_MAX,
+ .best_cpu = task_cpu(p),
+ };
+ struct sched_domain *sd;
+ int cpu;
+ struct task_group *tg = task_group(p);
+ unsigned long weight;
+ bool balanced;
+ int imbalance_pct, idx = -1;
+
+ /*
+ * Find the lowest scheduling domain that covers both the CPU the task
+ * is currently running on and the target NUMA node.
+ */
+ rcu_read_lock();
+ for_each_domain(env.src_cpu, sd) {
+ if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) {
+ /*
+ * busy_idx is used for the load decision as it is the
+ * same index used by the regular load balancer for an
+ * active cpu.
+ */
+ idx = sd->busy_idx;
+ imbalance_pct = sd->imbalance_pct;
+ break;
}
}
rcu_read_unlock();
- return idlest_cpu;
+ if (WARN_ON_ONCE(idx == -1))
+ return 0;
+
+ /*
+ * XXX the below is mostly nicked from wake_affine(); we should
+ * see about sharing a bit if at all possible; also it might want
+ * some per entity weight love.
+ */
+ weight = p->se.load.weight;
+ env.src_stats.load = source_load(env.src_cpu, idx);
+ env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2;
+ env.src_stats.eff_load *= power_of(env.src_cpu);
+ env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight);
+
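+ /*
+ * The source side above is scaled by half the domain's imbalance
+ * margin (100 + (imbalance_pct - 100) / 2) and uses
+ * effective_load(-weight) to model the task's weight being removed;
+ * each destination below uses a factor of 100 and
+ * effective_load(+weight) to model the weight being added. This
+ * biases the comparison towards moving the task to its preferred
+ * node.
+ */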
+ for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) {
+ env.dst_cpu = cpu;
+ env.dst_stats.load = target_load(cpu, idx);
+
+ /* If the CPU is idle, use it */
+ if (!env.dst_stats.load) {
+ env.best_cpu = cpu;
+ goto migrate;
+ }
+
+ /* Otherwise check the target CPU load */
+ env.dst_stats.eff_load = 100;
+ env.dst_stats.eff_load *= power_of(cpu);
+ env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight);
+
+ /*
+ * The destination is considered balanced if its effective load, with
+ * the task added, does not exceed the source's effective load scaled
+ * by the imbalance factor above. Unfortunately there is a risk that
+ * a task running on a lightly loaded CPU will not migrate to its
+ * preferred node due to load imbalances.
+ */
+ balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load);
+ if (!balanced)
+ continue;
+
+ if (env.dst_stats.eff_load < env.best_load) {
+ env.best_load = env.dst_stats.eff_load;
+ env.best_cpu = cpu;
+ }
+ }
+
+migrate:
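+ /*
+ * If no balanced destination was found, best_cpu still holds the
+ * task's current CPU and the task stays where it is.
+ */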
+ return migrate_task_to(p, env.best_cpu);
}
static void task_numa_placement(struct task_struct *p)
* the working set placement.
*/
if (max_faults && max_nid != p->numa_preferred_nid) {
- int preferred_cpu;
-
- /*
- * If the task is not on the preferred node then find the most
- * idle CPU to migrate to.
- */
- preferred_cpu = task_cpu(p);
- if (cpu_to_node(preferred_cpu) != max_nid) {
- preferred_cpu = find_idlest_cpu_node(preferred_cpu,
- max_nid);
- }
-
/* Update the preferred nid and migrate task if possible */
p->numa_preferred_nid = max_nid;
p->numa_migrate_seq = 1;
- migrate_task_to(p, preferred_cpu);
+ task_numa_migrate(p);
}
}
{
struct sched_entity *se = tg->se[cpu];
- if (!tg->parent) /* the trivial, non-cgroup case */
+ if (!tg->parent || !wl) /* the trivial, non-cgroup or no-change case */
return wl;
for_each_sched_entity(se) {
}
#else
-static inline unsigned long effective_load(struct task_group *tg, int cpu,
- unsigned long wl, unsigned long wg)
+static long effective_load(struct task_group *tg, int cpu, long wl, long wg)
{
return wl;
}