sched/numa: Use a system-wide search to find swap/migration candidates

author Mel Gorman <mgorman@suse.de>

Mon, 7 Oct 2013 10:29:17 +0000 (11:29 +0100)

committer Ingo Molnar <mingo@kernel.org>

Wed, 9 Oct 2013 12:47:25 +0000 (14:47 +0200)
author Mel Gorman <mgorman@suse.de>
Mon, 7 Oct 2013 10:29:17 +0000 (11:29 +0100)
committer Ingo Molnar <mingo@kernel.org>
Wed, 9 Oct 2013 12:47:25 +0000 (14:47 +0200)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 32a2b29c261088e576b48a14e4c63064d7e9ee0b..1fe59da280e31501cb141a9eae2febee43f4e2a8 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5236,6 +5236,7 @@ static void destroy_sched_domains(struct sched_domain *sd, int cpu)
  DEFINE_PER_CPU(struct sched_domain *, sd_llc);
  DEFINE_PER_CPU(int, sd_llc_size);
  DEFINE_PER_CPU(int, sd_llc_id);
+DEFINE_PER_CPU(struct sched_domain *, sd_numa);
  
  static void update_top_cache_domain(int cpu)
  {
@@ -5252,6 +5253,9 @@ static void update_top_cache_domain(int cpu)
         rcu_assign_pointer(per_cpu(sd_llc, cpu), sd);
         per_cpu(sd_llc_size, cpu) = size;
         per_cpu(sd_llc_id, cpu) = id;
+
+       sd = lowest_flag_domain(cpu, SD_NUMA);
+       rcu_assign_pointer(per_cpu(sd_numa, cpu), sd);
  }
  
  /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index b1e5061287ab6e935b1b5da9aef1d2f54cae7a14..1422765d4b86b950243179b228520be2779a768c 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -681,6 +681,8 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
  }
  
  #ifdef CONFIG_SMP
+static unsigned long task_h_load(struct task_struct *p);
+
  static inline void __update_task_entity_contrib(struct sched_entity *se);
  
  /* Give new task start runnable values to heavy its load in infant time */
@@ -906,12 +908,40 @@ static unsigned long target_load(int cpu, int type);
  static unsigned long power_of(int cpu);
  static long effective_load(struct task_group *tg, int cpu, long wl, long wg);
  
+/* Cached statistics for all CPUs within a node */
  struct numa_stats {
+       unsigned long nr_running;
         unsigned long load;
-       s64 eff_load;
-       unsigned long faults;
+
+       /* Total compute capacity of CPUs on a node */
+       unsigned long power;
+
+       /* Approximate capacity in terms of runnable tasks on a node */
+       unsigned long capacity;
+       int has_capacity;
  };
  
+/*
+ * XXX borrowed from update_sg_lb_stats
+ */
+static void update_numa_stats(struct numa_stats *ns, int nid)
+{
+       int cpu;
+
+       memset(ns, 0, sizeof(*ns));
+       for_each_cpu(cpu, cpumask_of_node(nid)) {
+               struct rq *rq = cpu_rq(cpu);
+
+               ns->nr_running += rq->nr_running;
+               ns->load += weighted_cpuload(cpu);
+               ns->power += power_of(cpu);
+       }
+
+       ns->load = (ns->load * SCHED_POWER_SCALE) / ns->power;
+       ns->capacity = DIV_ROUND_CLOSEST(ns->power, SCHED_POWER_SCALE);
+       ns->has_capacity = (ns->nr_running < ns->capacity);
+}
+
  struct task_numa_env {
         struct task_struct *p;
  
@@ -920,95 +950,178 @@ struct task_numa_env {
  
         struct numa_stats src_stats, dst_stats;
  
-       unsigned long best_load;
+       int imbalance_pct, idx;
+
+       struct task_struct *best_task;
+       long best_imp;
         int best_cpu;
  };
  
+static void task_numa_assign(struct task_numa_env *env,
+                            struct task_struct *p, long imp)
+{
+       if (env->best_task)
+               put_task_struct(env->best_task);
+       if (p)
+               get_task_struct(p);
+
+       env->best_task = p;
+       env->best_imp = imp;
+       env->best_cpu = env->dst_cpu;
+}
+
+/*
+ * This checks if the overall compute and NUMA accesses of the system would
+ * be improved if the source tasks was migrated to the target dst_cpu taking
+ * into account that it might be best if task running on the dst_cpu should
+ * be exchanged with the source task
+ */
+static void task_numa_compare(struct task_numa_env *env, long imp)
+{
+       struct rq *src_rq = cpu_rq(env->src_cpu);
+       struct rq *dst_rq = cpu_rq(env->dst_cpu);
+       struct task_struct *cur;
+       long dst_load, src_load;
+       long load;
+
+       rcu_read_lock();
+       cur = ACCESS_ONCE(dst_rq->curr);
+       if (cur->pid == 0) /* idle */
+               cur = NULL;
+
+       /*
+        * "imp" is the fault differential for the source task between the
+        * source and destination node. Calculate the total differential for
+        * the source task and potential destination task. The more negative
+        * the value is, the more rmeote accesses that would be expected to
+        * be incurred if the tasks were swapped.
+        */
+       if (cur) {
+               /* Skip this swap candidate if cannot move to the source cpu */
+               if (!cpumask_test_cpu(env->src_cpu, tsk_cpus_allowed(cur)))
+                       goto unlock;
+
+               imp += task_faults(cur, env->src_nid) -
+                      task_faults(cur, env->dst_nid);
+       }
+
+       if (imp < env->best_imp)
+               goto unlock;
+
+       if (!cur) {
+               /* Is there capacity at our destination? */
+               if (env->src_stats.has_capacity &&
+                   !env->dst_stats.has_capacity)
+                       goto unlock;
+
+               goto balance;
+       }
+
+       /* Balance doesn't matter much if we're running a task per cpu */
+       if (src_rq->nr_running == 1 && dst_rq->nr_running == 1)
+               goto assign;
+
+       /*
+        * In the overloaded case, try and keep the load balanced.
+        */
+balance:
+       dst_load = env->dst_stats.load;
+       src_load = env->src_stats.load;
+
+       /* XXX missing power terms */
+       load = task_h_load(env->p);
+       dst_load += load;
+       src_load -= load;
+
+       if (cur) {
+               load = task_h_load(cur);
+               dst_load -= load;
+               src_load += load;
+       }
+
+       /* make src_load the smaller */
+       if (dst_load < src_load)
+               swap(dst_load, src_load);
+
+       if (src_load * env->imbalance_pct < dst_load * 100)
+               goto unlock;
+
+assign:
+       task_numa_assign(env, cur, imp);
+unlock:
+       rcu_read_unlock();
+}
+
  static int task_numa_migrate(struct task_struct *p)
  {
-       int node_cpu = cpumask_first(cpumask_of_node(p->numa_preferred_nid));
         struct task_numa_env env = {
                 .p = p,
+
                 .src_cpu = task_cpu(p),
                 .src_nid = cpu_to_node(task_cpu(p)),
-               .dst_cpu = node_cpu,
-               .dst_nid = p->numa_preferred_nid,
-               .best_load = ULONG_MAX,
-               .best_cpu = task_cpu(p),
+
+               .imbalance_pct = 112,
+
+               .best_task = NULL,
+               .best_imp = 0,
+               .best_cpu = -1
         };
         struct sched_domain *sd;
-       int cpu;
-       struct task_group *tg = task_group(p);
-       unsigned long weight;
-       bool balanced;
-       int imbalance_pct, idx = -1;
+       unsigned long faults;
+       int nid, cpu, ret;
  
         /*
-        * Find the lowest common scheduling domain covering the nodes of both
-        * the CPU the task is currently running on and the target NUMA node.
+        * Pick the lowest SD_NUMA domain, as that would have the smallest
+        * imbalance and would be the first to start moving tasks about.
+        *
+        * And we want to avoid any moving of tasks about, as that would create
+        * random movement of tasks -- counter the numa conditions we're trying
+        * to satisfy here.
          */
         rcu_read_lock();
-       for_each_domain(env.src_cpu, sd) {
-               if (cpumask_test_cpu(node_cpu, sched_domain_span(sd))) {
-                       /*
-                        * busy_idx is used for the load decision as it is the
-                        * same index used by the regular load balancer for an
-                        * active cpu.
-                        */
-                       idx = sd->busy_idx;
-                       imbalance_pct = sd->imbalance_pct;
-                       break;
-               }
-       }
+       sd = rcu_dereference(per_cpu(sd_numa, env.src_cpu));
+       env.imbalance_pct = 100 + (sd->imbalance_pct - 100) / 2;
         rcu_read_unlock();
  
-       if (WARN_ON_ONCE(idx == -1))
-               return 0;
+       faults = task_faults(p, env.src_nid);
+       update_numa_stats(&env.src_stats, env.src_nid);
  
-       /*
-        * XXX the below is mostly nicked from wake_affine(); we should
-        * see about sharing a bit if at all possible; also it might want
-        * some per entity weight love.
-        */
-       weight = p->se.load.weight;
-       env.src_stats.load = source_load(env.src_cpu, idx);
-       env.src_stats.eff_load = 100 + (imbalance_pct - 100) / 2;
-       env.src_stats.eff_load *= power_of(env.src_cpu);
-       env.src_stats.eff_load *= env.src_stats.load + effective_load(tg, env.src_cpu, -weight, -weight);
-
-       for_each_cpu(cpu, cpumask_of_node(env.dst_nid)) {
-               env.dst_cpu = cpu;
-               env.dst_stats.load = target_load(cpu, idx);
-
-               /* If the CPU is idle, use it */
-               if (!env.dst_stats.load) {
-                       env.best_cpu = cpu;
-                       goto migrate;
-               }
+       /* Find an alternative node with relatively better statistics */
+       for_each_online_node(nid) {
+               long imp;
  
-               /* Otherwise check the target CPU load */
-               env.dst_stats.eff_load = 100;
-               env.dst_stats.eff_load *= power_of(cpu);
-               env.dst_stats.eff_load *= env.dst_stats.load + effective_load(tg, cpu, weight, weight);
+               if (nid == env.src_nid)
+                       continue;
  
-               /*
-                * Destination is considered balanced if the destination CPU is
-                * less loaded than the source CPU. Unfortunately there is a
-                * risk that a task running on a lightly loaded CPU will not
-                * migrate to its preferred node due to load imbalances.
-                */
-               balanced = (env.dst_stats.eff_load <= env.src_stats.eff_load);
-               if (!balanced)
+               /* Only consider nodes that recorded more faults */
+               imp = task_faults(p, nid) - faults;
+               if (imp < 0)
                         continue;
  
-               if (env.dst_stats.eff_load < env.best_load) {
-                       env.best_load = env.dst_stats.eff_load;
-                       env.best_cpu = cpu;
+               env.dst_nid = nid;
+               update_numa_stats(&env.dst_stats, env.dst_nid);
+               for_each_cpu(cpu, cpumask_of_node(nid)) {
+                       /* Skip this CPU if the source task cannot migrate */
+                       if (!cpumask_test_cpu(cpu, tsk_cpus_allowed(p)))
+                               continue;
+
+                       env.dst_cpu = cpu;
+                       task_numa_compare(&env, imp);
                 }
         }
  
-migrate:
-       return migrate_task_to(p, env.best_cpu);
+       /* No better CPU than the current one was found. */
+       if (env.best_cpu == -1)
+               return -EAGAIN;
+
+       if (env.best_task == NULL) {
+               int ret = migrate_task_to(p, env.best_cpu);
+               return ret;
+       }
+
+       ret = migrate_swap(p, env.best_task);
+       put_task_struct(env.best_task);
+       return ret;
  }
  
  /* Attempt to migrate a task to a CPU on the preferred node. */
@@ -1050,7 +1163,7 @@ static void task_numa_placement(struct task_struct *p)
  
         /* Find the node with the highest number of faults */
         for_each_online_node(nid) {
-               unsigned long faults;
+               unsigned long faults = 0;
                 int priv, i;
  
                 for (priv = 0; priv < 2; priv++) {
@@ -1060,10 +1173,10 @@ static void task_numa_placement(struct task_struct *p)
                         p->numa_faults[i] >>= 1;
                         p->numa_faults[i] += p->numa_faults_buffer[i];
                         p->numa_faults_buffer[i] = 0;
+
+                       faults += p->numa_faults[i];
                 }
  
-               /* Find maximum private faults */
-               faults = p->numa_faults[task_faults_idx(nid, 1)];
                 if (faults > max_faults) {
                         max_faults = faults;
                         max_nid = nid;
@@ -4455,8 +4568,6 @@ static int move_one_task(struct lb_env *env)
         return 0;
  }
  
-static unsigned long task_h_load(struct task_struct *p);
-
  static const unsigned int sched_nr_migrate_break = 32;
  
  /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index 4dc92d016aef430e7b8f55ee752beb060accb529..691e96964dcccf1abfd9cbb1c4066442f5676d56 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -610,9 +610,22 @@ static inline struct sched_domain *highest_flag_domain(int cpu, int flag)
         return hsd;
  }
  
+static inline struct sched_domain *lowest_flag_domain(int cpu, int flag)
+{
+       struct sched_domain *sd;
+
+       for_each_domain(cpu, sd) {
+               if (sd->flags & flag)
+                       break;
+       }
+
+       return sd;
+}
+
  DECLARE_PER_CPU(struct sched_domain *, sd_llc);
  DECLARE_PER_CPU(int, sd_llc_size);
  DECLARE_PER_CPU(int, sd_llc_id);
+DECLARE_PER_CPU(struct sched_domain *, sd_numa);
  
  struct sched_group_power {
         atomic_t ref;
author	Mel Gorman <mgorman@suse.de>
	Mon, 7 Oct 2013 10:29:17 +0000 (11:29 +0100)
committer	Ingo Molnar <mingo@kernel.org>
	Wed, 9 Oct 2013 12:47:25 +0000 (14:47 +0200)
kernel/sched/core.c		patch \| blob \| blame \| history
kernel/sched/fair.c		patch \| blob \| blame \| history
kernel/sched/sched.h		patch \| blob \| blame \| history