sched: pull RT tasks from overloaded runqueues
authorSteven Rostedt <srostedt@redhat.com>
Fri, 25 Jan 2008 20:08:07 +0000 (21:08 +0100)
committerIngo Molnar <mingo@elte.hu>
Fri, 25 Jan 2008 20:08:07 +0000 (21:08 +0100)
This patch adds the algorithm to pull tasks from RT overloaded runqueues.

When a pull RT is initiated, all overloaded runqueues are examined for
a RT task that is higher in prio than the highest prio task queued on the
target runqueue. If another runqueue holds a RT task that is of higher
prio than the highest prio task on the target runqueue is found it is pulled
to the target runqueue.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
kernel/sched.c
kernel/sched_rt.c

index 97cab609fc31ac71839edc201d2f3f1ed4422e0d..c91797107913d6e593c5679bd32a4f40cec85b4f 100644 (file)
@@ -3721,6 +3721,8 @@ need_resched_nonpreemptible:
                switch_count = &prev->nvcsw;
        }
 
+       schedule_balance_rt(rq, prev);
+
        if (unlikely(!rq->nr_running))
                idle_balance(cpu, rq);
 
index 547f858b0752394d6c9f70622280cf98744bcb7f..bacb32039e95f35a663dfeab18412a93a699ad0c 100644 (file)
@@ -179,8 +179,17 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
 static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
 
+static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+       if (!task_running(rq, p) &&
+           (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)))
+               return 1;
+       return 0;
+}
+
 /* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
+                                                    int cpu)
 {
        struct rt_prio_array *array = &rq->rt.active;
        struct task_struct *next;
@@ -199,26 +208,36 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
        }
 
        queue = array->queue + idx;
+       BUG_ON(list_empty(queue));
+
        next = list_entry(queue->next, struct task_struct, run_list);
-       if (unlikely(next != rq->curr))
-               return next;
+       if (unlikely(pick_rt_task(rq, next, cpu)))
+               goto out;
 
        if (queue->next->next != queue) {
                /* same prio task */
                next = list_entry(queue->next->next, struct task_struct, run_list);
-               return next;
+               if (pick_rt_task(rq, next, cpu))
+                       goto out;
        }
 
+ retry:
        /* slower, but more flexible */
        idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
-       if (unlikely(idx >= MAX_RT_PRIO)) {
-               WARN_ON(1); /* rt_nr_running was 2 and above! */
+       if (unlikely(idx >= MAX_RT_PRIO))
                return NULL;
-       }
 
        queue = array->queue + idx;
-       next = list_entry(queue->next, struct task_struct, run_list);
+       BUG_ON(list_empty(queue));
+
+       list_for_each_entry(next, queue, run_list) {
+               if (pick_rt_task(rq, next, cpu))
+                       goto out;
+       }
+
+       goto retry;
 
+ out:
        return next;
 }
 
@@ -305,13 +324,15 @@ static int push_rt_task(struct rq *this_rq)
 
        assert_spin_locked(&this_rq->lock);
 
-       next_task = pick_next_highest_task_rt(this_rq);
+       next_task = pick_next_highest_task_rt(this_rq, -1);
        if (!next_task)
                return 0;
 
  retry:
-       if (unlikely(next_task == this_rq->curr))
+       if (unlikely(next_task == this_rq->curr)) {
+               WARN_ON(1);
                return 0;
+       }
 
        /*
         * It's possible that the next_task slipped in of
@@ -335,7 +356,7 @@ static int push_rt_task(struct rq *this_rq)
                 * so it is possible that next_task has changed.
                 * If it has, then try again.
                 */
-               task = pick_next_highest_task_rt(this_rq);
+               task = pick_next_highest_task_rt(this_rq, -1);
                if (unlikely(task != next_task) && task && paranoid--) {
                        put_task_struct(next_task);
                        next_task = task;
@@ -378,6 +399,149 @@ static void push_rt_tasks(struct rq *rq)
                ;
 }
 
+static int pull_rt_task(struct rq *this_rq)
+{
+       struct task_struct *next;
+       struct task_struct *p;
+       struct rq *src_rq;
+       cpumask_t *rto_cpumask;
+       int this_cpu = this_rq->cpu;
+       int cpu;
+       int ret = 0;
+
+       assert_spin_locked(&this_rq->lock);
+
+       /*
+        * If cpusets are used, and we have overlapping
+        * run queue cpusets, then this algorithm may not catch all.
+        * This is just the price you pay on trying to keep
+        * dirtying caches down on large SMP machines.
+        */
+       if (likely(!rt_overloaded()))
+               return 0;
+
+       next = pick_next_task_rt(this_rq);
+
+       rto_cpumask = rt_overload();
+
+       for_each_cpu_mask(cpu, *rto_cpumask) {
+               if (this_cpu == cpu)
+                       continue;
+
+               src_rq = cpu_rq(cpu);
+               if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
+                       /*
+                        * It is possible that overlapping cpusets
+                        * will miss clearing a non overloaded runqueue.
+                        * Clear it now.
+                        */
+                       if (double_lock_balance(this_rq, src_rq)) {
+                               /* unlocked our runqueue lock */
+                               struct task_struct *old_next = next;
+                               next = pick_next_task_rt(this_rq);
+                               if (next != old_next)
+                                       ret = 1;
+                       }
+                       if (likely(src_rq->rt.rt_nr_running <= 1))
+                               /*
+                                * Small chance that this_rq->curr changed
+                                * but it's really harmless here.
+                                */
+                               rt_clear_overload(this_rq);
+                       else
+                               /*
+                                * Heh, the src_rq is now overloaded, since
+                                * we already have the src_rq lock, go straight
+                                * to pulling tasks from it.
+                                */
+                               goto try_pulling;
+                       spin_unlock(&src_rq->lock);
+                       continue;
+               }
+
+               /*
+                * We can potentially drop this_rq's lock in
+                * double_lock_balance, and another CPU could
+                * steal our next task - hence we must cause
+                * the caller to recalculate the next task
+                * in that case:
+                */
+               if (double_lock_balance(this_rq, src_rq)) {
+                       struct task_struct *old_next = next;
+                       next = pick_next_task_rt(this_rq);
+                       if (next != old_next)
+                               ret = 1;
+               }
+
+               /*
+                * Are there still pullable RT tasks?
+                */
+               if (src_rq->rt.rt_nr_running <= 1) {
+                       spin_unlock(&src_rq->lock);
+                       continue;
+               }
+
+ try_pulling:
+               p = pick_next_highest_task_rt(src_rq, this_cpu);
+
+               /*
+                * Do we have an RT task that preempts
+                * the to-be-scheduled task?
+                */
+               if (p && (!next || (p->prio < next->prio))) {
+                       WARN_ON(p == src_rq->curr);
+                       WARN_ON(!p->se.on_rq);
+
+                       /*
+                        * There's a chance that p is higher in priority
+                        * than what's currently running on its cpu.
+                        * This is just that p is wakeing up and hasn't
+                        * had a chance to schedule. We only pull
+                        * p if it is lower in priority than the
+                        * current task on the run queue or
+                        * this_rq next task is lower in prio than
+                        * the current task on that rq.
+                        */
+                       if (p->prio < src_rq->curr->prio ||
+                           (next && next->prio < src_rq->curr->prio))
+                               goto bail;
+
+                       ret = 1;
+
+                       deactivate_task(src_rq, p, 0);
+                       set_task_cpu(p, this_cpu);
+                       activate_task(this_rq, p, 0);
+                       /*
+                        * We continue with the search, just in
+                        * case there's an even higher prio task
+                        * in another runqueue. (low likelyhood
+                        * but possible)
+                        */
+
+                       /*
+                        * Update next so that we won't pick a task
+                        * on another cpu with a priority lower (or equal)
+                        * than the one we just picked.
+                        */
+                       next = p;
+
+               }
+ bail:
+               spin_unlock(&src_rq->lock);
+       }
+
+       return ret;
+}
+
+static void schedule_balance_rt(struct rq *rq,
+                               struct task_struct *prev)
+{
+       /* Try to pull RT tasks here if we lower this rq's prio */
+       if (unlikely(rt_task(prev)) &&
+           rq->rt.highest_prio > prev->prio)
+               pull_rt_task(rq);
+}
+
 static void schedule_tail_balance_rt(struct rq *rq)
 {
        /*
@@ -500,6 +664,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 #else /* CONFIG_SMP */
 # define schedule_tail_balance_rt(rq)  do { } while (0)
+# define schedule_balance_rt(rq, prev) do { } while (0)
 #endif /* CONFIG_SMP */
 
 static void task_tick_rt(struct rq *rq, struct task_struct *p)