sched/rt: Fix PI handling vs. sched_setscheduler()
author Peter Zijlstra <peterz@infradead.org>
Mon, 18 Jan 2016 14:27:07 +0000 (15:27 +0100)
committer Ingo Molnar <mingo@kernel.org>
Mon, 29 Feb 2016 08:53:05 +0000 (09:53 +0100)
Andrea Parri reported:

> I found that the following scenario (with CONFIG_RT_GROUP_SCHED=y) is not
> handled correctly:
>
>     T1 (prio = 20)
>        lock(rtmutex);
>
>     T2 (prio = 20)
>        blocks on rtmutex  (rt_nr_boosted = 0 on T1's rq)
>
>     T1 (prio = 20)
>        sys_set_scheduler(prio = 0)
>           [new_effective_prio == oldprio]
>           T1 prio = 20    (rt_nr_boosted = 0 on T1's rq)
>
> The last step is incorrect as T1 is now boosted (c.f., rt_se_boosted());
> in particular, if we continue with
>
>    T1 (prio = 20)
>       unlock(rtmutex)
>          wakeup(T2)
>          adjust_prio(T1)
>             [prio != rt_mutex_getprio(T1)]
>             dequeue(T1)
>                rt_nr_boosted = (unsigned long)(-1)
>                ...
>                T1 prio = 0
>
> then we end up leaving rt_nr_boosted in an "inconsistent" state.
>
> The simple program attached could reproduce the previous scenario; note
> that, as a consequence of the presence of this state, the "assertion"
>
>     WARN_ON(!rt_nr_running && rt_nr_boosted)
>
> from dec_rt_group() may trigger.
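
The program attached to the report is not reproduced here. As a rough,
hypothetical sketch of such a reproducer (assuming a PTHREAD_PRIO_INHERIT
mutex, which contends through a kernel rtmutex, and two SCHED_FIFO threads
at equal priority; build with gcc -pthread, run as root):

  /*
   * Hypothetical reproducer sketch, NOT the program attached to the report.
   * T2 blocking on a PI futex makes T1 "boosted" at an unchanged priority;
   * T1 then drops to SCHED_OTHER and finally unlocks, underflowing
   * rt_nr_boosted on pre-fix kernels with CONFIG_RT_GROUP_SCHED=y.
   */
  #define _GNU_SOURCE
  #include <pthread.h>
  #include <sched.h>
  #include <unistd.h>

  static pthread_mutex_t m;

  static void *t2_fn(void *arg)
  {
          pthread_mutex_lock(&m);         /* T2 (prio 20) blocks behind T1 */
          pthread_mutex_unlock(&m);
          return NULL;
  }

  int main(void)
  {
          struct sched_param fifo   = { .sched_priority = 20 };
          struct sched_param normal = { .sched_priority = 0 };
          pthread_mutexattr_t ma;
          pthread_t t2;

          /* T1: SCHED_FIFO prio 20, holding the PI mutex */
          sched_setscheduler(0, SCHED_FIFO, &fifo);
          pthread_mutexattr_init(&ma);
          pthread_mutexattr_setprotocol(&ma, PTHREAD_PRIO_INHERIT);
          pthread_mutex_init(&m, &ma);
          pthread_mutex_lock(&m);

          /* T2 inherits SCHED_FIFO/20 (NPTL defaults to inherit-sched) and
           * blocks on the rtmutex backing the mutex */
          pthread_create(&t2, NULL, t2_fn, NULL);
          sleep(1);                       /* crude: let T2 reach the lock */

          /* new_effective_prio == oldprio, so the pre-fix shortcut skips
           * the dequeue/enqueue accounting pass */
          sched_setscheduler(0, SCHED_OTHER, &normal);

          /* de-boost on unlock dequeues T1 as boosted: rt_nr_boosted
           * underflows and dec_rt_group()'s WARN_ON may trigger */
          pthread_mutex_unlock(&m);
          pthread_join(t2, NULL);
          return 0;
  }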

So normally we dequeue/enqueue tasks in sched_setscheduler(), which
would ensure the accounting stays correct. However, in the early PI path
(the new_effective_prio == oldprio shortcut) we fail to do so.

So this was introduced around v3.14 by:

  c365c292d059 ("sched: Consider pi boosting in setscheduler()")

which fixed another problem precisely by skipping that dequeue/enqueue, joy.
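
For reference, the shortcut removed below in the __sched_setscheduler()
hunk of core.c had this shape; it bailed out before the dequeue/enqueue
pair, so the boost-related accounting was never redone:

  new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
  if (new_effective_prio == oldprio) {
          __setscheduler_params(p, attr);   /* updates policy/normal_prio */
          task_rq_unlock(rq, p, &flags);
          return 0;                         /* no dequeue/enqueue happens */
  }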

Fix this by teaching rt about DEQUEUE_SAVE/ENQUEUE_RESTORE and having it
preserve runqueue location with that option. This requires decoupling
the on_rt_rq() state from being on the list.

In order to allow for explicit movement during the SAVE/RESTORE,
introduce {DE,EN}QUEUE_MOVE. We still must use SAVE/RESTORE in these
cases to preserve other invariants.
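
Concretely, with the rt.c changes below an rt entity now tracks on_rq and
on_list separately, and the flag combinations behave as follows (an
illustrative summary derived from the new __enqueue_rt_entity() and
__dequeue_rt_entity(), not a literal excerpt):

  DEQUEUE_SAVE                    : rt_se->on_rq = 0, run_list position kept
  DEQUEUE_SAVE | DEQUEUE_MOVE     : rt_se->on_rq = 0, removed from run_list
  ENQUEUE_RESTORE                 : rt_se->on_rq = 1, previous position reused
  ENQUEUE_RESTORE | ENQUEUE_MOVE  : rt_se->on_rq = 1, re-added at the tail
                                    (or at the head with ENQUEUE_HEAD)
  plain enqueue/dequeue (no SAVE) : unchanged, always adds to / removes from
                                    the list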

Respecting the SAVE/RESTORE flags also has the (nice) side effect that
things like sys_nice()/sys_sched_setaffinity() no longer reorder FIFO
tasks (whereas they did before this patch).

Reported-by: Andrea Parri <parri.andrea@gmail.com>
Tested-by: Andrea Parri <parri.andrea@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Juri Lelli <juri.lelli@arm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@kernel.org>
include/linux/sched.h
kernel/sched/core.c
kernel/sched/rt.c
kernel/sched/sched.h

index a292c4b7e94cdeb9ca79d1f446c6c7cd41501d42..87e5f9886ac86be8504ceadf2ed0e704e567069b 100644 (file)
@@ -1293,6 +1293,8 @@ struct sched_rt_entity {
        unsigned long timeout;
        unsigned long watchdog_stamp;
        unsigned int time_slice;
+       unsigned short on_rq;
+       unsigned short on_list;
 
        struct sched_rt_entity *back;
 #ifdef CONFIG_RT_GROUP_SCHED
index 291552b4d8ee4f9361dc2e454664339b6e518071..bb565b4663c8b7fd18310a96c1fd435279b00789 100644 (file)
@@ -2221,6 +2221,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p)
        __dl_clear_params(p);
 
        INIT_LIST_HEAD(&p->rt.run_list);
+       p->rt.timeout           = 0;
+       p->rt.time_slice        = sched_rr_timeslice;
+       p->rt.on_rq             = 0;
+       p->rt.on_list           = 0;
 
 #ifdef CONFIG_PREEMPT_NOTIFIERS
        INIT_HLIST_HEAD(&p->preempt_notifiers);
@@ -3531,7 +3535,7 @@ EXPORT_SYMBOL(default_wake_function);
  */
 void rt_mutex_setprio(struct task_struct *p, int prio)
 {
-       int oldprio, queued, running, enqueue_flag = ENQUEUE_RESTORE;
+       int oldprio, queued, running, queue_flag = DEQUEUE_SAVE | DEQUEUE_MOVE;
        struct rq *rq;
        const struct sched_class *prev_class;
 
@@ -3559,11 +3563,15 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
 
        trace_sched_pi_setprio(p, prio);
        oldprio = p->prio;
+
+       if (oldprio == prio)
+               queue_flag &= ~DEQUEUE_MOVE;
+
        prev_class = p->sched_class;
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, queue_flag);
        if (running)
                put_prev_task(rq, p);
 
@@ -3581,7 +3589,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                if (!dl_prio(p->normal_prio) ||
                    (pi_task && dl_entity_preempt(&pi_task->dl, &p->dl))) {
                        p->dl.dl_boosted = 1;
-                       enqueue_flag |= ENQUEUE_REPLENISH;
+                       queue_flag |= ENQUEUE_REPLENISH;
                } else
                        p->dl.dl_boosted = 0;
                p->sched_class = &dl_sched_class;
@@ -3589,7 +3597,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
                if (dl_prio(oldprio))
                        p->dl.dl_boosted = 0;
                if (oldprio < prio)
-                       enqueue_flag |= ENQUEUE_HEAD;
+                       queue_flag |= ENQUEUE_HEAD;
                p->sched_class = &rt_sched_class;
        } else {
                if (dl_prio(oldprio))
@@ -3604,7 +3612,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio)
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, p, enqueue_flag);
+               enqueue_task(rq, p, queue_flag);
 
        check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
@@ -3960,6 +3968,7 @@ static int __sched_setscheduler(struct task_struct *p,
        const struct sched_class *prev_class;
        struct rq *rq;
        int reset_on_fork;
+       int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE;
 
        /* may grab non-irq protected spin_locks */
        BUG_ON(in_interrupt());
@@ -4142,17 +4151,14 @@ change:
                 * itself.
                 */
                new_effective_prio = rt_mutex_get_effective_prio(p, newprio);
-               if (new_effective_prio == oldprio) {
-                       __setscheduler_params(p, attr);
-                       task_rq_unlock(rq, p, &flags);
-                       return 0;
-               }
+               if (new_effective_prio == oldprio)
+                       queue_flags &= ~DEQUEUE_MOVE;
        }
 
        queued = task_on_rq_queued(p);
        running = task_current(rq, p);
        if (queued)
-               dequeue_task(rq, p, DEQUEUE_SAVE);
+               dequeue_task(rq, p, queue_flags);
        if (running)
                put_prev_task(rq, p);
 
@@ -4162,15 +4168,14 @@ change:
        if (running)
                p->sched_class->set_curr_task(rq);
        if (queued) {
-               int enqueue_flags = ENQUEUE_RESTORE;
                /*
                 * We enqueue to tail when the priority of a task is
                 * increased (user space view).
                 */
-               if (oldprio <= p->prio)
-                       enqueue_flags |= ENQUEUE_HEAD;
+               if (oldprio < p->prio)
+                       queue_flags |= ENQUEUE_HEAD;
 
-               enqueue_task(rq, p, enqueue_flags);
+               enqueue_task(rq, p, queue_flags);
        }
 
        check_class_changed(rq, p, prev_class, oldprio);
@@ -7958,7 +7963,7 @@ void sched_move_task(struct task_struct *tsk)
        queued = task_on_rq_queued(tsk);
 
        if (queued)
-               dequeue_task(rq, tsk, DEQUEUE_SAVE);
+               dequeue_task(rq, tsk, DEQUEUE_SAVE | DEQUEUE_MOVE);
        if (unlikely(running))
                put_prev_task(rq, tsk);
 
@@ -7982,7 +7987,7 @@ void sched_move_task(struct task_struct *tsk)
        if (unlikely(running))
                tsk->sched_class->set_curr_task(rq);
        if (queued)
-               enqueue_task(rq, tsk, ENQUEUE_RESTORE);
+               enqueue_task(rq, tsk, ENQUEUE_RESTORE | ENQUEUE_MOVE);
 
        task_rq_unlock(rq, tsk, &flags);
 }
index 8ec86abe0ea188369ee4e7efd21789d8cb127c14..406a9c20b210f90161f534df7884e044a5862980 100644 (file)
@@ -436,7 +436,7 @@ static void dequeue_top_rt_rq(struct rt_rq *rt_rq);
 
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
-       return !list_empty(&rt_se->run_list);
+       return rt_se->on_rq;
 }
 
 #ifdef CONFIG_RT_GROUP_SCHED
@@ -482,8 +482,8 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
        return rt_se->my_q;
 }
 
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head);
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se);
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags);
 
 static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 {
@@ -499,7 +499,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
                if (!rt_se)
                        enqueue_top_rt_rq(rt_rq);
                else if (!on_rt_rq(rt_se))
-                       enqueue_rt_entity(rt_se, false);
+                       enqueue_rt_entity(rt_se, 0);
 
                if (rt_rq->highest_prio.curr < curr->prio)
                        resched_curr(rq);
@@ -516,7 +516,7 @@ static void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
        if (!rt_se)
                dequeue_top_rt_rq(rt_rq);
        else if (on_rt_rq(rt_se))
-               dequeue_rt_entity(rt_se);
+               dequeue_rt_entity(rt_se, 0);
 }
 
 static inline int rt_rq_throttled(struct rt_rq *rt_rq)
@@ -1166,7 +1166,30 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
        dec_rt_group(rt_se, rt_rq);
 }
 
-static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+/*
+ * Change rt_se->run_list location unless SAVE && !MOVE
+ *
+ * assumes ENQUEUE/DEQUEUE flags match
+ */
+static inline bool move_entity(unsigned int flags)
+{
+       if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) == DEQUEUE_SAVE)
+               return false;
+
+       return true;
+}
+
+static void __delist_rt_entity(struct sched_rt_entity *rt_se, struct rt_prio_array *array)
+{
+       list_del_init(&rt_se->run_list);
+
+       if (list_empty(array->queue + rt_se_prio(rt_se)))
+               __clear_bit(rt_se_prio(rt_se), array->bitmap);
+
+       rt_se->on_list = 0;
+}
+
+static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
@@ -1179,26 +1202,37 @@ static void __enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
         * get throttled and the current group doesn't have any other
         * active members.
         */
-       if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running))
+       if (group_rq && (rt_rq_throttled(group_rq) || !group_rq->rt_nr_running)) {
+               if (rt_se->on_list)
+                       __delist_rt_entity(rt_se, array);
                return;
+       }
 
-       if (head)
-               list_add(&rt_se->run_list, queue);
-       else
-               list_add_tail(&rt_se->run_list, queue);
-       __set_bit(rt_se_prio(rt_se), array->bitmap);
+       if (move_entity(flags)) {
+               WARN_ON_ONCE(rt_se->on_list);
+               if (flags & ENQUEUE_HEAD)
+                       list_add(&rt_se->run_list, queue);
+               else
+                       list_add_tail(&rt_se->run_list, queue);
+
+               __set_bit(rt_se_prio(rt_se), array->bitmap);
+               rt_se->on_list = 1;
+       }
+       rt_se->on_rq = 1;
 
        inc_rt_tasks(rt_se, rt_rq);
 }
 
-static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void __dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct rt_rq *rt_rq = rt_rq_of_se(rt_se);
        struct rt_prio_array *array = &rt_rq->active;
 
-       list_del_init(&rt_se->run_list);
-       if (list_empty(array->queue + rt_se_prio(rt_se)))
-               __clear_bit(rt_se_prio(rt_se), array->bitmap);
+       if (move_entity(flags)) {
+               WARN_ON_ONCE(!rt_se->on_list);
+               __delist_rt_entity(rt_se, array);
+       }
+       rt_se->on_rq = 0;
 
        dec_rt_tasks(rt_se, rt_rq);
 }
@@ -1207,7 +1241,7 @@ static void __dequeue_rt_entity(struct sched_rt_entity *rt_se)
  * Because the prio of an upper entry depends on the lower
  * entries, we must remove entries top - down.
  */
-static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
+static void dequeue_rt_stack(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct sched_rt_entity *back = NULL;
 
@@ -1220,31 +1254,31 @@ static void dequeue_rt_stack(struct sched_rt_entity *rt_se)
 
        for (rt_se = back; rt_se; rt_se = rt_se->back) {
                if (on_rt_rq(rt_se))
-                       __dequeue_rt_entity(rt_se);
+                       __dequeue_rt_entity(rt_se, flags);
        }
 }
 
-static void enqueue_rt_entity(struct sched_rt_entity *rt_se, bool head)
+static void enqueue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct rq *rq = rq_of_rt_se(rt_se);
 
-       dequeue_rt_stack(rt_se);
+       dequeue_rt_stack(rt_se, flags);
        for_each_sched_rt_entity(rt_se)
-               __enqueue_rt_entity(rt_se, head);
+               __enqueue_rt_entity(rt_se, flags);
        enqueue_top_rt_rq(&rq->rt);
 }
 
-static void dequeue_rt_entity(struct sched_rt_entity *rt_se)
+static void dequeue_rt_entity(struct sched_rt_entity *rt_se, unsigned int flags)
 {
        struct rq *rq = rq_of_rt_se(rt_se);
 
-       dequeue_rt_stack(rt_se);
+       dequeue_rt_stack(rt_se, flags);
 
        for_each_sched_rt_entity(rt_se) {
                struct rt_rq *rt_rq = group_rt_rq(rt_se);
 
                if (rt_rq && rt_rq->rt_nr_running)
-                       __enqueue_rt_entity(rt_se, false);
+                       __enqueue_rt_entity(rt_se, flags);
        }
        enqueue_top_rt_rq(&rq->rt);
 }
@@ -1260,7 +1294,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags)
        if (flags & ENQUEUE_WAKEUP)
                rt_se->timeout = 0;
 
-       enqueue_rt_entity(rt_se, flags & ENQUEUE_HEAD);
+       enqueue_rt_entity(rt_se, flags);
 
        if (!task_current(rq, p) && p->nr_cpus_allowed > 1)
                enqueue_pushable_task(rq, p);
@@ -1271,7 +1305,7 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags)
        struct sched_rt_entity *rt_se = &p->rt;
 
        update_curr_rt(rq);
-       dequeue_rt_entity(rt_se);
+       dequeue_rt_entity(rt_se, flags);
 
        dequeue_pushable_task(rq, p);
 }
index 3dc7b8b3f94cdf657171f47765aed09eab8e24d1..d3606e40ea0dcc7809ed31c5787403d76d6a5a55 100644 (file)
@@ -1130,18 +1130,40 @@ static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev)
 extern const int sched_prio_to_weight[40];
 extern const u32 sched_prio_to_wmult[40];
 
+/*
+ * {de,en}queue flags:
+ *
+ * DEQUEUE_SLEEP  - task is no longer runnable
+ * ENQUEUE_WAKEUP - task just became runnable
+ *
+ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks
+ *                are in a known state which allows modification. Such pairs
+ *                should preserve as much state as possible.
+ *
+ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location
+ *        in the runqueue.
+ *
+ * ENQUEUE_HEAD      - place at front of runqueue (tail if not specified)
+ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline)
+ * ENQUEUE_WAKING    - sched_class::task_waking was called
+ *
+ */
+
+#define DEQUEUE_SLEEP          0x01
+#define DEQUEUE_SAVE           0x02 /* matches ENQUEUE_RESTORE */
+#define DEQUEUE_MOVE           0x04 /* matches ENQUEUE_MOVE */
+
 #define ENQUEUE_WAKEUP         0x01
-#define ENQUEUE_HEAD           0x02
+#define ENQUEUE_RESTORE                0x02
+#define ENQUEUE_MOVE           0x04
+
+#define ENQUEUE_HEAD           0x08
+#define ENQUEUE_REPLENISH      0x10
 #ifdef CONFIG_SMP
-#define ENQUEUE_WAKING         0x04    /* sched_class::task_waking was called */
+#define ENQUEUE_WAKING         0x20
 #else
 #define ENQUEUE_WAKING         0x00
 #endif
-#define ENQUEUE_REPLENISH      0x08
-#define ENQUEUE_RESTORE        0x10
-
-#define DEQUEUE_SLEEP          0x01
-#define DEQUEUE_SAVE           0x02
 
 #define RETRY_TASK             ((void *)-1UL)