sched: pull RT tasks from overloaded runqueues

author Steven Rostedt <srostedt@redhat.com>

Fri, 25 Jan 2008 20:08:07 +0000 (21:08 +0100)

committer Ingo Molnar <mingo@elte.hu>

Fri, 25 Jan 2008 20:08:07 +0000 (21:08 +0100)
author Steven Rostedt <srostedt@redhat.com>
Fri, 25 Jan 2008 20:08:07 +0000 (21:08 +0100)
committer Ingo Molnar <mingo@elte.hu>
Fri, 25 Jan 2008 20:08:07 +0000 (21:08 +0100)
diff --git a/kernel/sched.c b/kernel/sched.c

index 97cab609fc31ac71839edc201d2f3f1ed4422e0d..c91797107913d6e593c5679bd32a4f40cec85b4f 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3721,6 +3721,8 @@ need_resched_nonpreemptible:
                 switch_count = &prev->nvcsw;
         }
  
+       schedule_balance_rt(rq, prev);
+
         if (unlikely(!rq->nr_running))
                 idle_balance(cpu, rq);
  
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c

index 547f858b0752394d6c9f70622280cf98744bcb7f..bacb32039e95f35a663dfeab18412a93a699ad0c 100644 (file)
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -179,8 +179,17 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
  static int double_lock_balance(struct rq *this_rq, struct rq *busiest);
  static void deactivate_task(struct rq *rq, struct task_struct *p, int sleep);
  
+static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu)
+{
+       if (!task_running(rq, p) &&
+           (cpu < 0 || cpu_isset(cpu, p->cpus_allowed)))
+               return 1;
+       return 0;
+}
+
  /* Return the second highest RT task, NULL otherwise */
-static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq,
+                                                    int cpu)
  {
         struct rt_prio_array *array = &rq->rt.active;
         struct task_struct *next;
@@ -199,26 +208,36 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq)
         }
  
         queue = array->queue + idx;
+       BUG_ON(list_empty(queue));
+
         next = list_entry(queue->next, struct task_struct, run_list);
-       if (unlikely(next != rq->curr))
-               return next;
+       if (unlikely(pick_rt_task(rq, next, cpu)))
+               goto out;
  
         if (queue->next->next != queue) {
                 /* same prio task */
                 next = list_entry(queue->next->next, struct task_struct, run_list);
-               return next;
+               if (pick_rt_task(rq, next, cpu))
+                       goto out;
         }
  
+ retry:
         /* slower, but more flexible */
         idx = find_next_bit(array->bitmap, MAX_RT_PRIO, idx+1);
-       if (unlikely(idx >= MAX_RT_PRIO)) {
-               WARN_ON(1); /* rt_nr_running was 2 and above! */
+       if (unlikely(idx >= MAX_RT_PRIO))
                 return NULL;
-       }
  
         queue = array->queue + idx;
-       next = list_entry(queue->next, struct task_struct, run_list);
+       BUG_ON(list_empty(queue));
+
+       list_for_each_entry(next, queue, run_list) {
+               if (pick_rt_task(rq, next, cpu))
+                       goto out;
+       }
+
+       goto retry;
  
+ out:
         return next;
  }
  
@@ -305,13 +324,15 @@ static int push_rt_task(struct rq *this_rq)
  
         assert_spin_locked(&this_rq->lock);
  
-       next_task = pick_next_highest_task_rt(this_rq);
+       next_task = pick_next_highest_task_rt(this_rq, -1);
         if (!next_task)
                 return 0;
  
   retry:
-       if (unlikely(next_task == this_rq->curr))
+       if (unlikely(next_task == this_rq->curr)) {
+               WARN_ON(1);
                 return 0;
+       }
  
         /*
          * It's possible that the next_task slipped in of
@@ -335,7 +356,7 @@ static int push_rt_task(struct rq *this_rq)
                  * so it is possible that next_task has changed.
                  * If it has, then try again.
                  */
-               task = pick_next_highest_task_rt(this_rq);
+               task = pick_next_highest_task_rt(this_rq, -1);
                 if (unlikely(task != next_task) && task && paranoid--) {
                         put_task_struct(next_task);
                         next_task = task;
@@ -378,6 +399,149 @@ static void push_rt_tasks(struct rq *rq)
                 ;
  }
  
+static int pull_rt_task(struct rq *this_rq)
+{
+       struct task_struct *next;
+       struct task_struct *p;
+       struct rq *src_rq;
+       cpumask_t *rto_cpumask;
+       int this_cpu = this_rq->cpu;
+       int cpu;
+       int ret = 0;
+
+       assert_spin_locked(&this_rq->lock);
+
+       /*
+        * If cpusets are used, and we have overlapping
+        * run queue cpusets, then this algorithm may not catch all.
+        * This is just the price you pay on trying to keep
+        * dirtying caches down on large SMP machines.
+        */
+       if (likely(!rt_overloaded()))
+               return 0;
+
+       next = pick_next_task_rt(this_rq);
+
+       rto_cpumask = rt_overload();
+
+       for_each_cpu_mask(cpu, *rto_cpumask) {
+               if (this_cpu == cpu)
+                       continue;
+
+               src_rq = cpu_rq(cpu);
+               if (unlikely(src_rq->rt.rt_nr_running <= 1)) {
+                       /*
+                        * It is possible that overlapping cpusets
+                        * will miss clearing a non overloaded runqueue.
+                        * Clear it now.
+                        */
+                       if (double_lock_balance(this_rq, src_rq)) {
+                               /* unlocked our runqueue lock */
+                               struct task_struct *old_next = next;
+                               next = pick_next_task_rt(this_rq);
+                               if (next != old_next)
+                                       ret = 1;
+                       }
+                       if (likely(src_rq->rt.rt_nr_running <= 1))
+                               /*
+                                * Small chance that this_rq->curr changed
+                                * but it's really harmless here.
+                                */
+                               rt_clear_overload(this_rq);
+                       else
+                               /*
+                                * Heh, the src_rq is now overloaded, since
+                                * we already have the src_rq lock, go straight
+                                * to pulling tasks from it.
+                                */
+                               goto try_pulling;
+                       spin_unlock(&src_rq->lock);
+                       continue;
+               }
+
+               /*
+                * We can potentially drop this_rq's lock in
+                * double_lock_balance, and another CPU could
+                * steal our next task - hence we must cause
+                * the caller to recalculate the next task
+                * in that case:
+                */
+               if (double_lock_balance(this_rq, src_rq)) {
+                       struct task_struct *old_next = next;
+                       next = pick_next_task_rt(this_rq);
+                       if (next != old_next)
+                               ret = 1;
+               }
+
+               /*
+                * Are there still pullable RT tasks?
+                */
+               if (src_rq->rt.rt_nr_running <= 1) {
+                       spin_unlock(&src_rq->lock);
+                       continue;
+               }
+
+ try_pulling:
+               p = pick_next_highest_task_rt(src_rq, this_cpu);
+
+               /*
+                * Do we have an RT task that preempts
+                * the to-be-scheduled task?
+                */
+               if (p && (!next || (p->prio < next->prio))) {
+                       WARN_ON(p == src_rq->curr);
+                       WARN_ON(!p->se.on_rq);
+
+                       /*
+                        * There's a chance that p is higher in priority
+                        * than what's currently running on its cpu.
+                        * This is just that p is wakeing up and hasn't
+                        * had a chance to schedule. We only pull
+                        * p if it is lower in priority than the
+                        * current task on the run queue or
+                        * this_rq next task is lower in prio than
+                        * the current task on that rq.
+                        */
+                       if (p->prio < src_rq->curr->prio ||
+                           (next && next->prio < src_rq->curr->prio))
+                               goto bail;
+
+                       ret = 1;
+
+                       deactivate_task(src_rq, p, 0);
+                       set_task_cpu(p, this_cpu);
+                       activate_task(this_rq, p, 0);
+                       /*
+                        * We continue with the search, just in
+                        * case there's an even higher prio task
+                        * in another runqueue. (low likelyhood
+                        * but possible)
+                        */
+
+                       /*
+                        * Update next so that we won't pick a task
+                        * on another cpu with a priority lower (or equal)
+                        * than the one we just picked.
+                        */
+                       next = p;
+
+               }
+ bail:
+               spin_unlock(&src_rq->lock);
+       }
+
+       return ret;
+}
+
+static void schedule_balance_rt(struct rq *rq,
+                               struct task_struct *prev)
+{
+       /* Try to pull RT tasks here if we lower this rq's prio */
+       if (unlikely(rt_task(prev)) &&
+           rq->rt.highest_prio > prev->prio)
+               pull_rt_task(rq);
+}
+
  static void schedule_tail_balance_rt(struct rq *rq)
  {
         /*
@@ -500,6 +664,7 @@ move_one_task_rt(struct rq *this_rq, int this_cpu, struct rq *busiest,
  }
  #else /* CONFIG_SMP */
  # define schedule_tail_balance_rt(rq)  do { } while (0)
+# define schedule_balance_rt(rq, prev) do { } while (0)
  #endif /* CONFIG_SMP */
  
  static void task_tick_rt(struct rq *rq, struct task_struct *p)
author	Steven Rostedt <srostedt@redhat.com>
	Fri, 25 Jan 2008 20:08:07 +0000 (21:08 +0100)
committer	Ingo Molnar <mingo@elte.hu>
	Fri, 25 Jan 2008 20:08:07 +0000 (21:08 +0100)
kernel/sched.c		patch \| blob \| blame \| history
kernel/sched_rt.c		patch \| blob \| blame \| history