mm: sched: numa: Implement slow start for working set sampling
author     Peter Zijlstra <a.p.zijlstra@chello.nl>
           Thu, 25 Oct 2012 12:16:47 +0000 (14:16 +0200)
committer  Mel Gorman <mgorman@suse.de>
           Tue, 11 Dec 2012 14:42:47 +0000 (14:42 +0000)

Add a 1 second delay before starting to scan the working set of
a task and starting to balance it amongst nodes.

[ note that before the constant per task WSS sampling rate patch
  the initial scan would happen much later still, in effect that
  patch caused this regression. ]

The theory is that short-running tasks benefit very little from NUMA
placement: they come and go quickly and are best left on the node they
were started on. As tasks mature and get rebalanced to other CPUs and
nodes, their NUMA placement has to change as well, and it starts to
matter more and more.

In practice this change fixes an observable kbuild regression:

   # [ a perf stat --null --repeat 10 test of ten bzImage builds to /dev/shm ]

   !NUMA:
   45.291088843 seconds time elapsed                                          ( +-  0.40% )
   45.154231752 seconds time elapsed                                          ( +-  0.36% )

   +NUMA, no slow start:
   46.172308123 seconds time elapsed                                          ( +-  0.30% )
   46.343168745 seconds time elapsed                                          ( +-  0.25% )

   +NUMA, 1 sec slow start:
   45.224189155 seconds time elapsed                                          ( +-  0.25% )
   45.160866532 seconds time elapsed                                          ( +-  0.17% )

and it also fixes an observable perf bench (hackbench) regression:

   # perf stat --null --repeat 10 perf bench sched messaging

   -NUMA:                  0.246225691 seconds time elapsed                   ( +-  1.31% )
   +NUMA no slow start:    0.252620063 seconds time elapsed                   ( +-  1.13% )
   +NUMA 1sec delay:       0.248076230 seconds time elapsed                   ( +-  1.35% )

The implementation is simple and straightforward; most of the patch
deals with adding the /proc/sys/kernel/numa_balancing_scan_delay_ms
tunable knob.
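
To make the mechanism concrete, below is a minimal userspace C sketch of
the slow-start logic, not the kernel implementation: the fork path seeds
a task's first scan period with the delay, and the first tick past that
delay drops the period to the regular minimum. The struct task harness,
the millisecond clock and the simulated tick loop are illustrative
stand-ins, not scheduler API.

/*
 * Minimal userspace sketch of the slow-start idea (not the kernel code):
 * sched_fork() seeds the task's first scan period with the delay, and the
 * tick handler switches to the regular minimum period once that first
 * period expires.
 */
#include <stdio.h>

static unsigned int sysctl_numa_balancing_scan_delay = 1000;		/* ms */
static unsigned int sysctl_numa_balancing_scan_period_min = 100;	/* ms */

struct task {
	unsigned long long node_stamp;	/* time of last scan, 0 = never scanned */
	unsigned int numa_scan_period;	/* current scan period in ms */
};

static void sched_fork(struct task *p)
{
	p->node_stamp = 0;
	/* The first "period" is the slow-start delay, not the scan period. */
	p->numa_scan_period = sysctl_numa_balancing_scan_delay;
}

static void task_tick_numa(struct task *p, unsigned long long now)
{
	if (now - p->node_stamp > p->numa_scan_period) {
		/* First expiry: the delay is up, drop to the normal period. */
		if (!p->node_stamp)
			p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
		p->node_stamp = now;
		printf("%6llu ms: scan working set, next scan in %u ms\n",
		       now, p->numa_scan_period);
	}
}

int main(void)
{
	struct task t;

	sched_fork(&t);
	/* Simulate a 10 ms scheduler tick for 1.5 seconds. */
	for (unsigned long long now = 0; now <= 1500; now += 10)
		task_tick_numa(&t, now);

	return 0;
}

The design point the sketch illustrates is that the delay is simply
consumed as the task's very first scan period; a node_stamp of zero is
enough to tell the first expiry apart from later ones, so no extra timer
or flag is needed.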

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Rik van Riel <riel@redhat.com>
[ Wrote the changelog, ran measurements, tuned the default. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
Signed-off-by: Mel Gorman <mgorman@suse.de>
Reviewed-by: Rik van Riel <riel@redhat.com>
include/linux/sched.h
kernel/sched/core.c
kernel/sched/fair.c
kernel/sysctl.c

index 37841958d234c42af49ce9f709f7382d5e33c3f2..7d95a232b5b963b538f5079a542597904942ea73 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2006,6 +2006,7 @@ enum sched_tunable_scaling {
 };
 extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
 
+extern unsigned int sysctl_numa_balancing_scan_delay;
 extern unsigned int sysctl_numa_balancing_scan_period_min;
 extern unsigned int sysctl_numa_balancing_scan_period_max;
 extern unsigned int sysctl_numa_balancing_scan_size;
index cad0d092ce3b02d1e118d0e165072ab65e7dccbf..fbfc4843063ff17140a18d82e59229499aa7f755 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1543,7 +1543,7 @@ static void __sched_fork(struct task_struct *p)
        p->node_stamp = 0ULL;
        p->numa_scan_seq = p->mm ? p->mm->numa_scan_seq : 0;
        p->numa_migrate_seq = p->mm ? p->mm->numa_scan_seq - 1 : 0;
-       p->numa_scan_period = sysctl_numa_balancing_scan_period_min;
+       p->numa_scan_period = sysctl_numa_balancing_scan_delay;
        p->numa_work.next = &p->numa_work;
 #endif /* CONFIG_NUMA_BALANCING */
 }
index f6e1f25ed2bd6b6c30aa80d93280664d8bfd06ac..7727b01615795b9209641de693f40e5429f541ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -788,6 +788,9 @@ unsigned int sysctl_numa_balancing_scan_period_max = 100*16;
 /* Portion of address space to scan in MB */
 unsigned int sysctl_numa_balancing_scan_size = 256;
 
+/* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */
+unsigned int sysctl_numa_balancing_scan_delay = 1000;
+
 static void task_numa_placement(struct task_struct *p)
 {
        int seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -929,6 +932,8 @@ void task_tick_numa(struct rq *rq, struct task_struct *curr)
        period = (u64)curr->numa_scan_period * NSEC_PER_MSEC;
 
        if (now - curr->node_stamp > period) {
+               if (!curr->node_stamp)
+                       curr->numa_scan_period = sysctl_numa_balancing_scan_period_min;
                curr->node_stamp = now;
 
                if (!time_before(jiffies, curr->mm->numa_next_scan)) {
index 7d3a2e0475e51f29dac7a9674eb08b5b0e1b75db..48a68cc258c12fdbffe6912810fe22c21bfcc1db 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -352,6 +352,13 @@ static struct ctl_table kern_table[] = {
        },
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_NUMA_BALANCING
+       {
+               .procname       = "numa_balancing_scan_delay_ms",
+               .data           = &sysctl_numa_balancing_scan_delay,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = proc_dointvec,
+       },
        {
                .procname       = "numa_balancing_scan_period_min_ms",
                .data           = &sysctl_numa_balancing_scan_period_min,