sched/fair: Fix and optimize the fork() path
authorPeter Zijlstra <peterz@infradead.org>
Thu, 16 Jun 2016 16:51:48 +0000 (18:51 +0200)
committerIngo Molnar <mingo@kernel.org>
Mon, 27 Jun 2016 10:17:50 +0000 (12:17 +0200)
The task_fork_fair() callback already calls __set_task_cpu() and takes
rq->lock.

If we move the sched_class::task_fork callback in sched_fork() under
the existing p->pi_lock, right after its set_task_cpu() call, we can
avoid doing two such calls and omit the IRQ disabling on the rq->lock.

Change to __set_task_cpu() to skip the migration bits, this is a new
task, not a migration. Similarly, make wake_up_new_task() use
__set_task_cpu() for the same reason, the task hasn't actually
migrated as it has never run.

This cures the problem of calling migrate_task_rq_fair(), which does
remove_entity_load_avg() on tasks that have never been added to
the load avg to begin with.

This bug would result in transiently messed up load_avg values, averaged
out after a few dozen milliseconds. This is probably the reason why
this bug was not found for such a long time.

Reported-by: Vincent Guittot <vincent.guittot@linaro.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: linux-kernel@vger.kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
kernel/sched/core.c
kernel/sched/fair.c

index e406ba0c689106d2c7a31eb2bb62cc3a63ba0e74..fa3434dffbbb64c013193d32865dd03cd8aadb27 100644 (file)
@@ -2383,9 +2383,6 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
                p->sched_class = &fair_sched_class;
        }
 
-       if (p->sched_class->task_fork)
-               p->sched_class->task_fork(p);
-
        /*
         * The child is not yet in the pid-hash so no cgroup attach races,
         * and the cgroup is pinned to this child due to cgroup_fork()
@@ -2394,7 +2391,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p)
         * Silence PROVE_RCU.
         */
        raw_spin_lock_irqsave(&p->pi_lock, flags);
-       set_task_cpu(p, cpu);
+       /*
+        * We're setting the cpu for the first time, we don't migrate,
+        * so use __set_task_cpu().
+        */
+       __set_task_cpu(p, cpu);
+       if (p->sched_class->task_fork)
+               p->sched_class->task_fork(p);
        raw_spin_unlock_irqrestore(&p->pi_lock, flags);
 
 #ifdef CONFIG_SCHED_INFO
@@ -2534,8 +2537,11 @@ void wake_up_new_task(struct task_struct *p)
         * Fork balancing, do it here and not earlier because:
         *  - cpus_allowed can change in the fork path
         *  - any previously selected cpu might disappear through hotplug
+        *
+        * Use __set_task_cpu() to avoid calling sched_class::migrate_task_rq,
+        * as we're not fully set-up yet.
         */
-       set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
+       __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0));
 #endif
        rq = __task_rq_lock(p, &rf);
        post_init_entity_util_avg(&p->se);
index 73063560b9ecff8580b01155bd110ecf3ab76d1e..994f5493ee5bb8d2389434d396a668133aeb34b3 100644 (file)
@@ -4448,7 +4448,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 *
                 * note: in the case of encountering a throttled cfs_rq we will
                 * post the final h_nr_running increment below.
-               */
+                */
                if (cfs_rq_throttled(cfs_rq))
                        break;
                cfs_rq->h_nr_running++;
@@ -8289,31 +8289,17 @@ static void task_fork_fair(struct task_struct *p)
 {
        struct cfs_rq *cfs_rq;
        struct sched_entity *se = &p->se, *curr;
-       int this_cpu = smp_processor_id();
        struct rq *rq = this_rq();
-       unsigned long flags;
-
-       raw_spin_lock_irqsave(&rq->lock, flags);
 
+       raw_spin_lock(&rq->lock);
        update_rq_clock(rq);
 
        cfs_rq = task_cfs_rq(current);
        curr = cfs_rq->curr;
-
-       /*
-        * Not only the cpu but also the task_group of the parent might have
-        * been changed after parent->se.parent,cfs_rq were copied to
-        * child->se.parent,cfs_rq. So call __set_task_cpu() to make those
-        * of child point to valid ones.
-        */
-       rcu_read_lock();
-       __set_task_cpu(p, this_cpu);
-       rcu_read_unlock();
-
-       update_curr(cfs_rq);
-
-       if (curr)
+       if (curr) {
+               update_curr(cfs_rq);
                se->vruntime = curr->vruntime;
+       }
        place_entity(cfs_rq, se, 1);
 
        if (sysctl_sched_child_runs_first && curr && entity_before(curr, se)) {
@@ -8326,8 +8312,7 @@ static void task_fork_fair(struct task_struct *p)
        }
 
        se->vruntime -= cfs_rq->min_vruntime;
-
-       raw_spin_unlock_irqrestore(&rq->lock, flags);
+       raw_spin_unlock(&rq->lock);
 }
 
 /*