# define INIT_TASK_GROUP_LOAD NICE_0_LOAD
#endif
+#define MIN_SHARES 2
+
static int init_task_group_load = INIT_TASK_GROUP_LOAD;
#endif
*/
struct list_head leaf_cfs_rq_list;
struct task_group *tg; /* group that "owns" this runqueue */
+
+#ifdef CONFIG_SMP
+ unsigned long task_weight;
+ unsigned long shares;
+ /*
+	 * We need space to build a sched_domain wide view of the full task
+	 * group tree. To avoid depending on dynamic memory allocation during
+	 * load balancing, we place this in the per cpu task group hierarchy.
+	 * This limits load balancing to one instance per cpu, but more should
+	 * not be needed anyway.
+ */
+ struct aggregate_struct {
+ /*
+ * load = weight(cpus) * f(tg)
+ *
+ * Where f(tg) is the recursive weight fraction assigned to
+ * this group.
+ */
+ unsigned long load;
+
+ /*
+		 * Part of the group weight distributed to this span.
+ */
+ unsigned long shares;
+
+ /*
+ * The sum of all runqueue weights within this span.
+ */
+ unsigned long rq_weight;
+
+ /*
+ * Weight contributed by tasks; this is the part we can
+ * influence by moving tasks around.
+ */
+ unsigned long task_weight;
+ } aggregate;
+#endif
#endif
};
static inline void cpuacct_charge(struct task_struct *tsk, u64 cputime) {}
#endif
+static inline void inc_cpu_load(struct rq *rq, unsigned long load)
+{
+ update_load_add(&rq->load, load);
+}
+
+static inline void dec_cpu_load(struct rq *rq, unsigned long load)
+{
+ update_load_sub(&rq->load, load);
+}
+
#ifdef CONFIG_SMP
static unsigned long source_load(int cpu, int type);
static unsigned long target_load(int cpu, int type);
static unsigned long cpu_avg_load_per_task(int cpu);
static int task_hot(struct task_struct *p, u64 now, struct sched_domain *sd);
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
+/*
+ * Group load balancing.
+ *
+ * We calculate a few balance domain wide aggregate numbers: load and weight.
+ * Given the picture below, and assuming each item has equal weight:
+ *
+ * root 1 - thread
+ * / | \ A - group
+ * A 1 B
+ * /|\ / \
+ * C 2 D 3 4
+ * | |
+ * 5 6
+ *
+ * load:
+ * A and B get 1/3-rd of the total load. C and D get 1/3-rd of A's 1/3-rd,
+ * which equals 1/9-th of the total load.
+ *
+ * shares:
+ * The weight of this group on the selected cpus.
+ *
+ * rq_weight:
+ *    Direct sum of all the cpus' rq weights, e.g. A would get 3 while
+ * B would get 2.
+ *
+ * task_weight:
+ * Part of the rq_weight contributed by tasks; all groups except B would
+ * get 1, B gets 2.
+ */
+
+static inline struct aggregate_struct *
+aggregate(struct task_group *tg, struct sched_domain *sd)
+{
+ return &tg->cfs_rq[sd->first_cpu]->aggregate;
+}
+
+typedef void (*aggregate_func)(struct task_group *, struct sched_domain *);
+
+/*
+ * Iterate the full tree, calling @down when first entering a node and @up when
+ * leaving it for the final time.
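+ *
+ * For the example hierarchy above this would visit, down: root, A, C, D, B
+ * and up: C, D, A, B, root.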
+ */
+static
+void aggregate_walk_tree(aggregate_func down, aggregate_func up,
+ struct sched_domain *sd)
+{
+ struct task_group *parent, *child;
+
+ rcu_read_lock();
+ parent = &root_task_group;
+down:
+ (*down)(parent, sd);
+ list_for_each_entry_rcu(child, &parent->children, siblings) {
+ parent = child;
+ goto down;
+
+up:
+ continue;
+ }
+ (*up)(parent, sd);
+
+ child = parent;
+ parent = parent->parent;
+ if (parent)
+ goto up;
+ rcu_read_unlock();
+}
+
+/*
+ * Calculate the aggregate runqueue weight.
+ */
+static
+void aggregate_group_weight(struct task_group *tg, struct sched_domain *sd)
+{
+ unsigned long rq_weight = 0;
+ unsigned long task_weight = 0;
+ int i;
+
+ for_each_cpu_mask(i, sd->span) {
+ rq_weight += tg->cfs_rq[i]->load.weight;
+ task_weight += tg->cfs_rq[i]->task_weight;
+ }
+
+ aggregate(tg, sd)->rq_weight = rq_weight;
+ aggregate(tg, sd)->task_weight = task_weight;
+}
+
+/*
+ * Redistribute tg->shares amongst all tg->cfs_rq[]s.
+ */
+static void __aggregate_redistribute_shares(struct task_group *tg)
+{
+ int i, max_cpu = smp_processor_id();
+ unsigned long rq_weight = 0;
+ unsigned long shares, max_shares = 0, shares_rem = tg->shares;
+
+ for_each_possible_cpu(i)
+ rq_weight += tg->cfs_rq[i]->load.weight;
+
+ for_each_possible_cpu(i) {
+ /*
+		 * divide shares in proportion to the rq_weights.
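+		 * The +1 in the divisor below avoids a division by zero when
+		 * the group has no load anywhere.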
+ */
+ shares = tg->shares * tg->cfs_rq[i]->load.weight;
+ shares /= rq_weight + 1;
+
+ tg->cfs_rq[i]->shares = shares;
+
+ if (shares > max_shares) {
+ max_shares = shares;
+ max_cpu = i;
+ }
+ shares_rem -= shares;
+ }
+
+ /*
+	 * Ensure it all adds up to tg->shares; we can lose a few
+ * due to rounding down when computing the per-cpu shares.
+ */
+ if (shares_rem)
+ tg->cfs_rq[max_cpu]->shares += shares_rem;
+}
+
+/*
+ * Compute the weight of this group on the given cpus.
+ */
+static
+void aggregate_group_shares(struct task_group *tg, struct sched_domain *sd)
+{
+ unsigned long shares = 0;
+ int i;
+
+again:
+ for_each_cpu_mask(i, sd->span)
+ shares += tg->cfs_rq[i]->shares;
+
+ /*
+ * When the span doesn't have any shares assigned, but does have
+	 * tasks to run, do a machine wide rebalance (should be rare).
+ */
+ if (unlikely(!shares && aggregate(tg, sd)->rq_weight)) {
+ __aggregate_redistribute_shares(tg);
+ goto again;
+ }
+
+ aggregate(tg, sd)->shares = shares;
+}
+
+/*
+ * Compute the load fraction assigned to this group; this relies on the
+ * aggregate weight and the parent group's load, i.e. it is computed top-down.
+ */
+static
+void aggregate_group_load(struct task_group *tg, struct sched_domain *sd)
+{
+ unsigned long load;
+
+ if (!tg->parent) {
+ int i;
+
+ load = 0;
+ for_each_cpu_mask(i, sd->span)
+ load += cpu_rq(i)->load.weight;
+
+ } else {
+ load = aggregate(tg->parent, sd)->load;
+
+ /*
+		 * shares is our weight in the parent's rq, so
+ * shares/parent->rq_weight gives our fraction of the load
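+		 * (the +1 below avoids a division by zero when the parent
+		 * carries no load in this span)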
+ */
+ load *= aggregate(tg, sd)->shares;
+ load /= aggregate(tg->parent, sd)->rq_weight + 1;
+ }
+
+ aggregate(tg, sd)->load = load;
+}
+
+static void __set_se_shares(struct sched_entity *se, unsigned long shares);
+
+/*
+ * Calculate and set the cpu's group shares.
+ */
+static void
+__update_group_shares_cpu(struct task_group *tg, struct sched_domain *sd,
+ int tcpu)
+{
+ int boost = 0;
+ unsigned long shares;
+ unsigned long rq_weight;
+
+ if (!tg->se[tcpu])
+ return;
+
+ rq_weight = tg->cfs_rq[tcpu]->load.weight;
+
+ /*
+	 * If there are currently no tasks on the cpu, pretend there is one of
+ * average load so that when a new task gets to run here it will not
+ * get delayed by group starvation.
+ */
+ if (!rq_weight) {
+ boost = 1;
+ rq_weight = NICE_0_LOAD;
+ }
+
+ /*
+ * \Sum shares * rq_weight
+ * shares = -----------------------
+ * \Sum rq_weight
+ *
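+	 * For example, a group holding 1024 aggregate shares over two cpus
+	 * with rq weights 2048 and 1024 gets roughly 682 and 341 shares.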
+ */
+ shares = aggregate(tg, sd)->shares * rq_weight;
+ shares /= aggregate(tg, sd)->rq_weight + 1;
+
+ /*
+ * record the actual number of shares, not the boosted amount.
+ */
+ tg->cfs_rq[tcpu]->shares = boost ? 0 : shares;
+
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
+
+ __set_se_shares(tg->se[tcpu], shares);
+}
+
+/*
+ * Re-adjust the weights on the cpu the task came from and on the cpu the
+ * task went to.
+ */
+static void
+__move_group_shares(struct task_group *tg, struct sched_domain *sd,
+ int scpu, int dcpu)
+{
+ unsigned long shares;
+
+ shares = tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+
+ __update_group_shares_cpu(tg, sd, scpu);
+ __update_group_shares_cpu(tg, sd, dcpu);
+
+ /*
+	 * ensure we never lose shares due to rounding errors in the
+ * above redistribution.
+ */
+ shares -= tg->cfs_rq[scpu]->shares + tg->cfs_rq[dcpu]->shares;
+ if (shares)
+ tg->cfs_rq[dcpu]->shares += shares;
+}
+
+/*
+ * Because changing a group's shares changes the weight of the super-group,
+ * we need to walk up the tree and change all shares until we hit the root.
+ */
+static void
+move_group_shares(struct task_group *tg, struct sched_domain *sd,
+ int scpu, int dcpu)
+{
+ while (tg) {
+ __move_group_shares(tg, sd, scpu, dcpu);
+ tg = tg->parent;
+ }
+}
+
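+/*
+ * Redistribute this group's shares over all cpus in the span and fold any
+ * shares lost to rounding back onto sd->first_cpu.
+ */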
+static
+void aggregate_group_set_shares(struct task_group *tg, struct sched_domain *sd)
+{
+ unsigned long shares = aggregate(tg, sd)->shares;
+ int i;
+
+ for_each_cpu_mask(i, sd->span) {
+ struct rq *rq = cpu_rq(i);
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ __update_group_shares_cpu(tg, sd, i);
+ spin_unlock_irqrestore(&rq->lock, flags);
+ }
+
+ aggregate_group_shares(tg, sd);
+
+ /*
+	 * ensure we never lose shares due to rounding errors in the
+ * above redistribution.
+ */
+ shares -= aggregate(tg, sd)->shares;
+ if (shares) {
+ tg->cfs_rq[sd->first_cpu]->shares += shares;
+ aggregate(tg, sd)->shares += shares;
+ }
+}
+
+/*
+ * Calculate the cumulative weight and recursive load of each task group
+ * while walking down the tree.
+ */
+static
+void aggregate_get_down(struct task_group *tg, struct sched_domain *sd)
+{
+ aggregate_group_weight(tg, sd);
+ aggregate_group_shares(tg, sd);
+ aggregate_group_load(tg, sd);
+}
+
+/*
+ * Rebalance the cpu shares while walking back up the tree.
+ */
+static
+void aggregate_get_up(struct task_group *tg, struct sched_domain *sd)
+{
+ aggregate_group_set_shares(tg, sd);
+}
+
+static DEFINE_PER_CPU(spinlock_t, aggregate_lock);
+
+static void __init init_aggregate(void)
+{
+ int i;
+
+ for_each_possible_cpu(i)
+ spin_lock_init(&per_cpu(aggregate_lock, i));
+}
+
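+/*
+ * Recompute the aggregates for this domain; returns 0 when another cpu
+ * already holds the aggregate lock, in which case the caller proceeds
+ * with the existing (possibly stale) aggregate data.
+ */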
+static int get_aggregate(struct sched_domain *sd)
+{
+ if (!spin_trylock(&per_cpu(aggregate_lock, sd->first_cpu)))
+ return 0;
+
+ aggregate_walk_tree(aggregate_get_down, aggregate_get_up, sd);
+ return 1;
+}
+
+static void put_aggregate(struct sched_domain *sd)
+{
+ spin_unlock(&per_cpu(aggregate_lock, sd->first_cpu));
+}
+
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+ cfs_rq->shares = shares;
+}
+
+#else
+
+static inline void init_aggregate(void)
+{
+}
+
+static inline int get_aggregate(struct sched_domain *sd)
+{
+ return 0;
+}
+
+static inline void put_aggregate(struct sched_domain *sd)
+{
+}
+#endif
+
+#else /* CONFIG_SMP */
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares)
+{
+}
+#endif
+
#endif /* CONFIG_SMP */
#include "sched_stats.h"
#define sched_class_highest (&rt_sched_class)
-static inline void inc_load(struct rq *rq, const struct task_struct *p)
-{
- update_load_add(&rq->load, p->se.load.weight);
-}
-
-static inline void dec_load(struct rq *rq, const struct task_struct *p)
-{
- update_load_sub(&rq->load, p->se.load.weight);
-}
-
-static void inc_nr_running(struct task_struct *p, struct rq *rq)
+static void inc_nr_running(struct rq *rq)
{
rq->nr_running++;
- inc_load(rq, p);
}
-static void dec_nr_running(struct task_struct *p, struct rq *rq)
+static void dec_nr_running(struct rq *rq)
{
rq->nr_running--;
- dec_load(rq, p);
}
static void set_load_weight(struct task_struct *p)
rq->nr_uninterruptible--;
enqueue_task(rq, p, wakeup);
- inc_nr_running(p, rq);
+ inc_nr_running(rq);
}
/*
rq->nr_uninterruptible++;
dequeue_task(rq, p, sleep);
- dec_nr_running(p, rq);
+ dec_nr_running(rq);
}
/**
* management (if any):
*/
p->sched_class->task_new(rq, p);
- inc_nr_running(p, rq);
+ inc_nr_running(rq);
}
check_preempt_curr(rq, p);
#ifdef CONFIG_SMP
unsigned long imbalance;
struct rq *busiest;
unsigned long flags;
+ int unlock_aggregate;
cpus_setall(*cpus);
+ unlock_aggregate = get_aggregate(sd);
+
/*
* When power savings policy is enabled for the parent domain, idle
* sibling can pick up load irrespective of busy siblings. In this case,
if (!ld_moved && !sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- return -1;
- return ld_moved;
+ ld_moved = -1;
+
+ goto out;
out_balanced:
schedstat_inc(sd, lb_balanced[idle]);
if (!sd_idle && sd->flags & SD_SHARE_CPUPOWER &&
!test_sd_parent(sd, SD_POWERSAVINGS_BALANCE))
- return -1;
- return 0;
+ ld_moved = -1;
+ else
+ ld_moved = 0;
+out:
+ if (unlock_aggregate)
+ put_aggregate(sd);
+ return ld_moved;
}
/*
goto out_unlock;
}
on_rq = p->se.on_rq;
- if (on_rq) {
+ if (on_rq)
dequeue_task(rq, p, 0);
- dec_load(rq, p);
- }
p->static_prio = NICE_TO_PRIO(nice);
set_load_weight(p);
if (on_rq) {
enqueue_task(rq, p, 0);
- inc_load(rq, p);
/*
* If the task increased its priority or is running and
* lowered its priority, then reschedule its CPU:
SD_INIT(sd, ALLNODES);
set_domain_attribute(sd, attr);
sd->span = *cpu_map;
+ sd->first_cpu = first_cpu(sd->span);
cpu_to_allnodes_group(i, cpu_map, &sd->groups, tmpmask);
p = sd;
sd_allnodes = 1;
SD_INIT(sd, NODE);
set_domain_attribute(sd, attr);
sched_domain_node_span(cpu_to_node(i), &sd->span);
+ sd->first_cpu = first_cpu(sd->span);
sd->parent = p;
if (p)
p->child = sd;
SD_INIT(sd, CPU);
set_domain_attribute(sd, attr);
sd->span = *nodemask;
+ sd->first_cpu = first_cpu(sd->span);
sd->parent = p;
if (p)
p->child = sd;
SD_INIT(sd, MC);
set_domain_attribute(sd, attr);
sd->span = cpu_coregroup_map(i);
+ sd->first_cpu = first_cpu(sd->span);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
SD_INIT(sd, SIBLING);
set_domain_attribute(sd, attr);
sd->span = per_cpu(cpu_sibling_map, i);
+ sd->first_cpu = first_cpu(sd->span);
cpus_and(sd->span, sd->span, *cpu_map);
sd->parent = p;
p->child = sd;
}
#ifdef CONFIG_SMP
+ init_aggregate();
init_defrootdomain();
#endif
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
-static void set_se_shares(struct sched_entity *se, unsigned long shares)
+static void __set_se_shares(struct sched_entity *se, unsigned long shares)
{
struct cfs_rq *cfs_rq = se->cfs_rq;
- struct rq *rq = cfs_rq->rq;
int on_rq;
- spin_lock_irq(&rq->lock);
-
on_rq = se->on_rq;
if (on_rq)
dequeue_entity(cfs_rq, se, 0);
if (on_rq)
enqueue_entity(cfs_rq, se, 0);
+}
- spin_unlock_irq(&rq->lock);
+static void set_se_shares(struct sched_entity *se, unsigned long shares)
+{
+ struct cfs_rq *cfs_rq = se->cfs_rq;
+ struct rq *rq = cfs_rq->rq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&rq->lock, flags);
+ __set_se_shares(se, shares);
+ spin_unlock_irqrestore(&rq->lock, flags);
}
static DEFINE_MUTEX(shares_mutex);
* (The default weight is 1024 - so there's no practical
* limitation from this.)
*/
- if (shares < 2)
- shares = 2;
+ if (shares < MIN_SHARES)
+ shares = MIN_SHARES;
mutex_lock(&shares_mutex);
if (tg->shares == shares)
* w/o tripping rebalance_share or load_balance_fair.
*/
tg->shares = shares;
- for_each_possible_cpu(i)
- set_se_shares(tg->se[i], shares);
+ for_each_possible_cpu(i) {
+ /*
+ * force a rebalance
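+		 * clearing the per-cpu shares makes the next aggregation pass
+		 * redistribute tg->shares; seed each se with an equal slice
+		 * until that happens.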
+ */
+ cfs_rq_set_shares(tg->cfs_rq[i], 0);
+ set_se_shares(tg->se[i], shares/nr_cpu_ids);
+ }
/*
* Enable load balance activity on this group, by inserting it back on
* Scheduling class queueing methods:
*/
+#if defined CONFIG_SMP && defined CONFIG_FAIR_GROUP_SCHED
+static void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+ cfs_rq->task_weight += weight;
+}
+#else
+static inline void
+add_cfs_task_weight(struct cfs_rq *cfs_rq, unsigned long weight)
+{
+}
+#endif
+
static void
account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_add(&cfs_rq->load, se->load.weight);
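+	/*
+	 * only top-level entities (those without a parent entity) add to the
+	 * cpu's rq load; only tasks count towards the cfs_rq's task_weight.
+	 */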
+ if (!parent_entity(se))
+ inc_cpu_load(rq_of(cfs_rq), se->load.weight);
+ if (entity_is_task(se))
+ add_cfs_task_weight(cfs_rq, se->load.weight);
cfs_rq->nr_running++;
se->on_rq = 1;
}
account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
update_load_sub(&cfs_rq->load, se->load.weight);
+ if (!parent_entity(se))
+ dec_cpu_load(rq_of(cfs_rq), se->load.weight);
+ if (entity_is_task(se))
+ add_cfs_task_weight(cfs_rq, -se->load.weight);
cfs_rq->nr_running--;
se->on_rq = 0;
}
return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
}
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
+static unsigned long
+__load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move, struct sched_domain *sd,
+ enum cpu_idle_type idle, int *all_pinned, int *this_best_prio,
+ struct cfs_rq *cfs_rq)
{
- struct sched_entity *curr;
- struct task_struct *p;
-
- if (!cfs_rq->nr_running || !first_fair(cfs_rq))
- return MAX_PRIO;
-
- curr = cfs_rq->curr;
- if (!curr)
- curr = __pick_next_entity(cfs_rq);
+ struct rq_iterator cfs_rq_iterator;
- p = task_of(curr);
+ cfs_rq_iterator.start = load_balance_start_fair;
+ cfs_rq_iterator.next = load_balance_next_fair;
+ cfs_rq_iterator.arg = cfs_rq;
- return p->prio;
+ return balance_tasks(this_rq, this_cpu, busiest,
+ max_load_move, sd, idle, all_pinned,
+ this_best_prio, &cfs_rq_iterator);
}
-#endif
+#ifdef CONFIG_FAIR_GROUP_SCHED
static unsigned long
load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
unsigned long max_load_move,
struct sched_domain *sd, enum cpu_idle_type idle,
int *all_pinned, int *this_best_prio)
{
- struct cfs_rq *busy_cfs_rq;
long rem_load_move = max_load_move;
- struct rq_iterator cfs_rq_iterator;
+ int busiest_cpu = cpu_of(busiest);
+ struct task_group *tg;
- cfs_rq_iterator.start = load_balance_start_fair;
- cfs_rq_iterator.next = load_balance_next_fair;
-
- for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
-#ifdef CONFIG_FAIR_GROUP_SCHED
- struct cfs_rq *this_cfs_rq;
+ rcu_read_lock();
+ list_for_each_entry(tg, &task_groups, list) {
long imbalance;
- unsigned long maxload;
+ unsigned long this_weight, busiest_weight;
+ long rem_load, max_load, moved_load;
+
+ /*
+ * empty group
+ */
+ if (!aggregate(tg, sd)->task_weight)
+ continue;
+
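+		/*
+		 * rem_load_move is in global load units; scale it by this
+		 * group's rq_weight/load ratio to get the equivalent in the
+		 * group's own weight units (the +1 avoids a division by zero).
+		 */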
+ rem_load = rem_load_move * aggregate(tg, sd)->rq_weight;
+ rem_load /= aggregate(tg, sd)->load + 1;
- this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+ this_weight = tg->cfs_rq[this_cpu]->task_weight;
+ busiest_weight = tg->cfs_rq[busiest_cpu]->task_weight;
- imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
- /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
- if (imbalance <= 0)
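+		/*
+		 * pull at most half of the task weight difference; when this
+		 * cpu already runs more of this group than the busiest cpu,
+		 * allow up to the busiest cpu's full task weight instead.
+		 */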
+ imbalance = (busiest_weight - this_weight) / 2;
+
+ if (imbalance < 0)
+ imbalance = busiest_weight;
+
+ max_load = max(rem_load, imbalance);
+ moved_load = __load_balance_fair(this_rq, this_cpu, busiest,
+ max_load, sd, idle, all_pinned, this_best_prio,
+ tg->cfs_rq[busiest_cpu]);
+
+ if (!moved_load)
continue;
- /* Don't pull more than imbalance/2 */
- imbalance /= 2;
- maxload = min(rem_load_move, imbalance);
+ move_group_shares(tg, sd, busiest_cpu, this_cpu);
- *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
-#else
-# define maxload rem_load_move
-#endif
- /*
- * pass busy_cfs_rq argument into
- * load_balance_[start|next]_fair iterators
- */
- cfs_rq_iterator.arg = busy_cfs_rq;
- rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
- maxload, sd, idle, all_pinned,
- this_best_prio,
- &cfs_rq_iterator);
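+		/*
+		 * scale the moved weight back into global load units before
+		 * accounting it against rem_load_move.
+		 */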
+ moved_load *= aggregate(tg, sd)->load;
+ moved_load /= aggregate(tg, sd)->rq_weight + 1;
- if (rem_load_move <= 0)
+ rem_load_move -= moved_load;
+ if (rem_load_move < 0)
break;
}
+ rcu_read_unlock();
return max_load_move - rem_load_move;
}
+#else
+static unsigned long
+load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
+ unsigned long max_load_move,
+ struct sched_domain *sd, enum cpu_idle_type idle,
+ int *all_pinned, int *this_best_prio)
+{
+ return __load_balance_fair(this_rq, this_cpu, busiest,
+ max_load_move, sd, idle, all_pinned,
+ this_best_prio, &busiest->cfs);
+}
+#endif
static int
move_one_task_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,