Shared faults can lead to lots of unnecessary page migrations,
slowing down the system, and causing private faults to hit the
per-pgdat migration ratelimit.
This patch adds sysctl numa_balancing_migrate_deferred, which specifies
how many shared page migrations to skip unconditionally, after each page
migration that is skipped because it is a shared fault.
This reduces the number of page migrations back and forth in
shared fault situations. It also gives a strong preference to
the tasks that are already running where most of the memory is,
and to moving the other tasks to near the memory.
Testing this with a much higher scan rate than the default
still seems to result in fewer page migrations than before.
Memory seems to be somewhat better consolidated than previously,
with multi-instance specjbb runs on a 4 node system.
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1381141781-10992-62-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar <mingo@kernel.org>
feature is too high then the rate the kernel samples for NUMA hinting
faults may be controlled by the numa_balancing_scan_period_min_ms,
numa_balancing_scan_delay_ms, numa_balancing_scan_period_max_ms,
-numa_balancing_scan_size_mb and numa_balancing_settle_count sysctls.
+numa_balancing_scan_size_mb, numa_balancing_settle_count sysctls and
+numa_balancing_migrate_deferred.
==============================================================
gives the scheduler a chance to place the task on an alternative node if the
preferred node is overloaded.
+numa_balancing_migrate_deferred is how many page migrations get skipped
+unconditionally, after a page migration is skipped because a page is shared
+with other tasks. This reduces page migration overhead, and determines
+how much stronger the "move task near its memory" policy scheduler becomes,
+versus the "move memory near its task" memory management policy, for workloads
+with shared memory.
+
==============================================================
osrelease, ostype & version:
int numa_scan_seq;
unsigned int numa_scan_period;
unsigned int numa_scan_period_max;
+ int numa_preferred_nid;
+ int numa_migrate_deferred;
unsigned long numa_migrate_retry;
u64 node_stamp; /* migration stamp */
struct callback_head numa_work;
*/
unsigned long numa_faults_locality[2];
- int numa_preferred_nid;
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
extern pid_t task_numa_group_id(struct task_struct *p);
extern void set_numabalancing_state(bool enabled);
extern void task_numa_free(struct task_struct *p);
+
+extern unsigned int sysctl_numa_balancing_migrate_deferred;
#else
static inline void task_numa_fault(int last_node, int node, int pages,
int flags)
/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
unsigned int sysctl_numa_balancing_scan_delay = 1000;
+/*
+ * After skipping a page migration on a shared page, skip N more numa page
+ * migrations unconditionally. This reduces the number of NUMA migrations
+ * in shared memory workloads, and has the effect of pulling tasks towards
+ * where their memory lives, over pulling the memory towards the task.
+ */
+unsigned int sysctl_numa_balancing_migrate_deferred = 16;
+
static unsigned int task_nr_scan_windows(struct task_struct *p)
{
unsigned long rss = 0;
.mode = 0644,
.proc_handler = proc_dointvec,
},
+ {
+ .procname = "numa_balancing_migrate_deferred",
+ .data = &sysctl_numa_balancing_migrate_deferred,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
#endif /* CONFIG_NUMA_BALANCING */
#endif /* CONFIG_SCHED_DEBUG */
{
kmem_cache_free(sn_cache, n);
}
+#ifdef CONFIG_NUMA_BALANCING
+static bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+ /* Never defer a private fault */
+ if (cpupid_match_pid(p, last_cpupid))
+ return false;
+
+ if (p->numa_migrate_deferred) {
+ p->numa_migrate_deferred--;
+ return true;
+ }
+ return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+ p->numa_migrate_deferred = sysctl_numa_balancing_migrate_deferred;
+}
+#else
+static inline bool numa_migrate_deferred(struct task_struct *p, int last_cpupid)
+{
+ return false;
+}
+
+static inline void defer_numa_migrate(struct task_struct *p)
+{
+}
+#endif /* CONFIG_NUMA_BALANCING */
+
/**
* mpol_misplaced - check whether current page node is valid in policy
*
* relation.
*/
last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
- if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid)
+ if (!cpupid_pid_unset(last_cpupid) && cpupid_to_nid(last_cpupid) != thisnid) {
+
+ /* See sysctl_numa_balancing_migrate_deferred comment */
+ if (!cpupid_match_pid(current, last_cpupid))
+ defer_numa_migrate(current);
+
+ goto out;
+ }
+
+ /*
+ * The quadratic filter above reduces extraneous migration
+ * of shared pages somewhat. This code reduces it even more,
+ * reducing the overhead of page migrations of shared pages.
+ * This makes workloads with shared pages rely more on
+ * "move task near its memory", and less on "move memory
+ * towards its task", which is exactly what we want.
+ */
+ if (numa_migrate_deferred(current, last_cpupid))
goto out;
}