sched: Fix cgroup smp fairness
Author:     Peter Zijlstra <a.p.zijlstra@chello.nl>
AuthorDate: Mon, 27 Jul 2009 12:04:49 +0000 (14:04 +0200)
Commit:     Ingo Molnar <mingo@elte.hu>
CommitDate: Sun, 2 Aug 2009 12:26:06 +0000 (14:26 +0200)
Commit ec4e0e2fe018992d980910db901637c814575914 ("fix
inconsistency when redistribute per-cpu tg->cfs_rq shares")
broke cgroup smp fairness.

In order to avoid starvation of newly placed tasks, we never
quite set the shares of a group's empty per-cpu runqueue to 0;
instead we set them as if a single NICE-0 task were present.

If, however, we actually store this boosted value in
cfs_rq[cpu]->shares, the total shares for that group end up
slightly inflated every time we balance, causing the observed
unfairness.

Fix this by setting cfs_rq[cpu]->shares to 0 while still
setting the effective weight of the related se to the inflated
number.
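
As a side note (not part of the patch): below is a minimal user-space
sketch of the fixed computation, assuming a single 4-CPU domain, one
NICE-0 task on cpu0, NICE_0_LOAD == 1024, and ignoring the clamping and
threshold checks the real update_group_shares_cpu() performs. It shows
how an empty cpu still gets a boosted se weight while its
cfs_rq->shares stays 0, so the per-group sum of cfs_rq->shares is no
longer inflated.

#include <stdio.h>

#define NR_CPUS		4
#define NICE_0_LOAD	1024UL

int main(void)
{
	unsigned long tg_shares = 1024;			 /* tg->shares */
	unsigned long load[NR_CPUS] = { 1024, 0, 0, 0 }; /* per-cpu load.weight */
	unsigned long rq_weight = 0, eff_weight = 0, sum = 0;
	int i;

	/* first pass, as in tg_shares_up(): sum real and boosted weights */
	for (i = 0; i < NR_CPUS; i++) {
		rq_weight  += load[i];
		eff_weight += load[i] ? load[i] : NICE_0_LOAD;
	}

	/* second pass, as in update_group_shares_cpu() after this patch */
	for (i = 0; i < NR_CPUS; i++) {
		unsigned long w      = load[i] ? load[i] : NICE_0_LOAD;
		unsigned long div    = load[i] ? rq_weight : eff_weight;
		unsigned long shares = tg_shares * w / div;
		unsigned long cfs_shares = load[i] ? shares : 0; /* boost -> 0 */

		sum += cfs_shares;
		printf("cpu%d: se weight %4lu, cfs_rq->shares %4lu\n",
		       i, shares, cfs_shares);
	}
	printf("sum of cfs_rq->shares = %lu (tg->shares = %lu)\n", sum, tg_shares);

	return 0;
}

With these numbers cpu0 keeps the full weight of 1024, each idle cpu's
se is left with a weight of 256 so a newly woken task is not starved,
and the recorded cfs_rq->shares sum to exactly tg->shares.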

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1248696557.6987.1615.camel@twins>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/kernel/sched.c b/kernel/sched.c
index ce1056e9b02ac5d3cc091a517cc2abfc7a75eb5a..26976cd8be0ff5be5b1119d5db1ae4b563a21a33 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1523,13 +1523,18 @@ static void
 update_group_shares_cpu(struct task_group *tg, int cpu,
                        unsigned long sd_shares, unsigned long sd_rq_weight)
 {
-       unsigned long shares;
        unsigned long rq_weight;
+       unsigned long shares;
+       int boost = 0;
 
        if (!tg->se[cpu])
                return;
 
        rq_weight = tg->cfs_rq[cpu]->rq_weight;
+       if (!rq_weight) {
+               boost = 1;
+               rq_weight = NICE_0_LOAD;
+       }
 
        /*
         *           \Sum shares * rq_weight
@@ -1546,8 +1551,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
                unsigned long flags;
 
                spin_lock_irqsave(&rq->lock, flags);
-               tg->cfs_rq[cpu]->shares = shares;
-
+               tg->cfs_rq[cpu]->shares = boost ? 0 : shares;
                __set_se_shares(tg->se[cpu], shares);
                spin_unlock_irqrestore(&rq->lock, flags);
        }
@@ -1560,7 +1564,7 @@ update_group_shares_cpu(struct task_group *tg, int cpu,
  */
 static int tg_shares_up(struct task_group *tg, void *data)
 {
-       unsigned long weight, rq_weight = 0;
+       unsigned long weight, rq_weight = 0, eff_weight = 0;
        unsigned long shares = 0;
        struct sched_domain *sd = data;
        int i;
@@ -1572,11 +1576,13 @@ static int tg_shares_up(struct task_group *tg, void *data)
                 * run here it will not get delayed by group starvation.
                 */
                weight = tg->cfs_rq[i]->load.weight;
+               tg->cfs_rq[i]->rq_weight = weight;
+               rq_weight += weight;
+
                if (!weight)
                        weight = NICE_0_LOAD;
 
-               tg->cfs_rq[i]->rq_weight = weight;
-               rq_weight += weight;
+               eff_weight += weight;
                shares += tg->cfs_rq[i]->shares;
        }
 
@@ -1586,8 +1592,14 @@ static int tg_shares_up(struct task_group *tg, void *data)
        if (!sd->parent || !(sd->parent->flags & SD_LOAD_BALANCE))
                shares = tg->shares;
 
-       for_each_cpu(i, sched_domain_span(sd))
-               update_group_shares_cpu(tg, i, shares, rq_weight);
+       for_each_cpu(i, sched_domain_span(sd)) {
+               unsigned long sd_rq_weight = rq_weight;
+
+               if (!tg->cfs_rq[i]->rq_weight)
+                       sd_rq_weight = eff_weight;
+
+               update_group_shares_cpu(tg, i, shares, sd_rq_weight);
+       }
 
        return 0;
 }