sched/numa: Avoid migrating tasks that are placed on their preferred node

author Peter Zijlstra <peterz@infradead.org>
Mon, 7 Oct 2013 10:29:33 +0000 (11:29 +0100)
committer Ingo Molnar <mingo@kernel.org>
Wed, 9 Oct 2013 12:48:10 +0000 (14:48 +0200)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 3e2c893df1730b3ac4e5a82ca6e7c19dbf7ae8b3..8cfd51f6224119b0a25dd8d4ead8bdeb075c8dae 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -4468,6 +4468,35 @@ int migrate_task_to(struct task_struct *p, int target_cpu)
  
         return stop_one_cpu(curr_cpu, migration_cpu_stop, &arg);
  }
+
+/*
+ * Requeue a task on a given node and accurately track the number of NUMA
+ * tasks on the runqueues
+ */
+void sched_setnuma(struct task_struct *p, int nid)
+{
+       struct rq *rq;
+       unsigned long flags;
+       bool on_rq, running;
+
+       rq = task_rq_lock(p, &flags);
+       on_rq = p->on_rq;
+       running = task_current(rq, p);
+
+       if (on_rq)
+               dequeue_task(rq, p, 0);
+       if (running)
+               p->sched_class->put_prev_task(rq, p);
+
+       p->numa_preferred_nid = nid;
+       p->numa_migrate_seq = 1;
+
+       if (running)
+               p->sched_class->set_curr_task(rq);
+       if (on_rq)
+               enqueue_task(rq, p, 0);
+       task_rq_unlock(rq, p, &flags);
+}
  #endif
  
  /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 423316cdee076503e323c0015e63fe4dfd0f887f..5166b9b1af7092faa7e67bf80bbaad4e4a874413 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -888,6 +888,18 @@ static unsigned int task_scan_max(struct task_struct *p)
   */
  unsigned int sysctl_numa_balancing_settle_count __read_mostly = 4;
  
+static void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+       rq->nr_numa_running += (p->numa_preferred_nid != -1);
+       rq->nr_preferred_running += (p->numa_preferred_nid == task_node(p));
+}
+
+static void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+       rq->nr_numa_running -= (p->numa_preferred_nid != -1);
+       rq->nr_preferred_running -= (p->numa_preferred_nid == task_node(p));
+}
+
  struct numa_group {
         atomic_t refcount;
  
@@ -1227,6 +1239,8 @@ static int task_numa_migrate(struct task_struct *p)
         if (env.best_cpu == -1)
                 return -EAGAIN;
  
+       sched_setnuma(p, env.dst_nid);
+
         if (env.best_task == NULL) {
                 int ret = migrate_task_to(p, env.best_cpu);
                 return ret;
@@ -1342,8 +1356,7 @@ static void task_numa_placement(struct task_struct *p)
         /* Preferred node as the node with the most faults */
         if (max_faults && max_nid != p->numa_preferred_nid) {
                 /* Update the preferred nid and migrate task if possible */
-               p->numa_preferred_nid = max_nid;
-               p->numa_migrate_seq = 1;
+               sched_setnuma(p, max_nid);
                 numa_migrate_preferred(p);
         }
  }
@@ -1741,6 +1754,14 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
  static void task_tick_numa(struct rq *rq, struct task_struct *curr)
  {
  }
+
+static inline void account_numa_enqueue(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void account_numa_dequeue(struct rq *rq, struct task_struct *p)
+{
+}
  #endif /* CONFIG_NUMA_BALANCING */
  
  static void
@@ -1750,8 +1771,12 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
         if (!parent_entity(se))
                 update_load_add(&rq_of(cfs_rq)->load, se->load.weight);
  #ifdef CONFIG_SMP
-       if (entity_is_task(se))
-               list_add(&se->group_node, &rq_of(cfs_rq)->cfs_tasks);
+       if (entity_is_task(se)) {
+               struct rq *rq = rq_of(cfs_rq);
+
+               account_numa_enqueue(rq, task_of(se));
+               list_add(&se->group_node, &rq->cfs_tasks);
+       }
  #endif
         cfs_rq->nr_running++;
  }
@@ -1762,8 +1787,10 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
         update_load_sub(&cfs_rq->load, se->load.weight);
         if (!parent_entity(se))
                 update_load_sub(&rq_of(cfs_rq)->load, se->load.weight);
-       if (entity_is_task(se))
+       if (entity_is_task(se)) {
+               account_numa_dequeue(rq_of(cfs_rq), task_of(se));
                 list_del_init(&se->group_node);
+       }
         cfs_rq->nr_running--;
  }
  
@@ -4605,6 +4632,8 @@ static bool yield_to_task_fair(struct rq *rq, struct task_struct *p, bool preemp
  
  static unsigned long __read_mostly max_load_balance_interval = HZ/10;
  
+enum fbq_type { regular, remote, all };
+
  #define LBF_ALL_PINNED 0x01
  #define LBF_NEED_BREAK 0x02
  #define LBF_DST_PINNED  0x04
@@ -4631,6 +4660,8 @@ struct lb_env {
         unsigned int            loop;
         unsigned int            loop_break;
         unsigned int            loop_max;
+
+       enum fbq_type           fbq_type;
  };
  
  /*
@@ -5092,6 +5123,10 @@ struct sg_lb_stats {
         unsigned int group_weight;
         int group_imb; /* Is there an imbalance in the group ? */
         int group_has_capacity; /* Is there extra capacity in the group? */
+#ifdef CONFIG_NUMA_BALANCING
+       unsigned int nr_numa_running;
+       unsigned int nr_preferred_running;
+#endif
  };
  
  /*
@@ -5409,6 +5444,10 @@ static inline void update_sg_lb_stats(struct lb_env *env,
  
                 sgs->group_load += load;
                 sgs->sum_nr_running += nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+               sgs->nr_numa_running += rq->nr_numa_running;
+               sgs->nr_preferred_running += rq->nr_preferred_running;
+#endif
                 sgs->sum_weighted_load += weighted_cpuload(i);
                 if (idle_cpu(i))
                         sgs->idle_cpus++;
@@ -5474,14 +5513,43 @@ static bool update_sd_pick_busiest(struct lb_env *env,
         return false;
  }
  
+#ifdef CONFIG_NUMA_BALANCING
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+       if (sgs->sum_nr_running > sgs->nr_numa_running)
+               return regular;
+       if (sgs->sum_nr_running > sgs->nr_preferred_running)
+               return remote;
+       return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+       if (rq->nr_running > rq->nr_numa_running)
+               return regular;
+       if (rq->nr_running > rq->nr_preferred_running)
+               return remote;
+       return all;
+}
+#else
+static inline enum fbq_type fbq_classify_group(struct sg_lb_stats *sgs)
+{
+       return all;
+}
+
+static inline enum fbq_type fbq_classify_rq(struct rq *rq)
+{
+       return regular;
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
  /**
   * update_sd_lb_stats - Update sched_domain's statistics for load balancing.
   * @env: The load balancing environment.
   * @balance: Should we balance.
   * @sds: variable to hold the statistics for this sched_domain.
   */
-static inline void update_sd_lb_stats(struct lb_env *env,
-                                       struct sd_lb_stats *sds)
+static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sds)
  {
         struct sched_domain *child = env->sd->child;
         struct sched_group *sg = env->sd->groups;
@@ -5538,6 +5606,9 @@ next_group:
  
                 sg = sg->next;
         } while (sg != env->sd->groups);
+
+       if (env->sd->flags & SD_NUMA)
+               env->fbq_type = fbq_classify_group(&sds->busiest_stat);
  }
  
  /**
@@ -5841,15 +5912,39 @@ static struct rq *find_busiest_queue(struct lb_env *env,
         int i;
  
         for_each_cpu_and(i, sched_group_cpus(group), env->cpus) {
-               unsigned long power = power_of(i);
-               unsigned long capacity = DIV_ROUND_CLOSEST(power,
-                                                          SCHED_POWER_SCALE);
-               unsigned long wl;
+               unsigned long power, capacity, wl;
+               enum fbq_type rt;
+
+               rq = cpu_rq(i);
+               rt = fbq_classify_rq(rq);
  
+               /*
+                * We classify groups/runqueues into three groups:
+                *  - regular: there are !numa tasks
+                *  - remote:  there are numa tasks that run on the 'wrong' node
+                *  - all:     there is no distinction
+                *
+                * In order to avoid migrating ideally placed numa tasks,
+                * ignore those when there's better options.
+                *
+                * If we ignore the actual busiest queue to migrate another
+                * task, the next balance pass can still reduce the busiest
+                * queue by moving tasks around inside the node.
+                *
+                * If we cannot move enough load due to this classification
+                * the next pass will adjust the group classification and
+                * allow migration of more tasks.
+                *
+                * Both cases only affect the total convergence complexity.
+                */
+               if (rt > env->fbq_type)
+                       continue;
+
+               power = power_of(i);
+               capacity = DIV_ROUND_CLOSEST(power, SCHED_POWER_SCALE);
                 if (!capacity)
                         capacity = fix_small_capacity(env->sd, group);
  
-               rq = cpu_rq(i);
                 wl = weighted_cpuload(i);
  
                 /*
@@ -5966,6 +6061,7 @@ static int load_balance(int this_cpu, struct rq *this_rq,
                 .idle           = idle,
                 .loop_break     = sched_nr_migrate_break,
                 .cpus           = cpus,
+               .fbq_type       = all,
         };
  
         /*
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index eeb1923812a1285c89288de76d1115dac84ba83f..d69cb325c27e745d8570a1dfb9d5320de4aa11fc 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -409,6 +409,10 @@ struct rq {
          * remote CPUs use both these fields when doing load calculation.
          */
         unsigned int nr_running;
+#ifdef CONFIG_NUMA_BALANCING
+       unsigned int nr_numa_running;
+       unsigned int nr_preferred_running;
+#endif
         #define CPU_LOAD_IDX_MAX 5
         unsigned long cpu_load[CPU_LOAD_IDX_MAX];
         unsigned long last_load_update_tick;
@@ -557,6 +561,7 @@ static inline u64 rq_clock_task(struct rq *rq)
  }
  
  #ifdef CONFIG_NUMA_BALANCING
+extern void sched_setnuma(struct task_struct *p, int node);
  extern int migrate_task_to(struct task_struct *p, int cpu);
  extern int migrate_swap(struct task_struct *, struct task_struct *);
  #endif /* CONFIG_NUMA_BALANCING */
author	Peter Zijlstra <peterz@infradead.org>
	Mon, 7 Oct 2013 10:29:33 +0000 (11:29 +0100)
committer	Ingo Molnar <mingo@kernel.org>
	Wed, 9 Oct 2013 12:48:10 +0000 (14:48 +0200)
kernel/sched/core.c		patch | blob | blame | history
kernel/sched/fair.c		patch | blob | blame | history
kernel/sched/sched.h		patch | blob | blame | history