sched: group scheduler, fix fairness of cpu bandwidth allocation for task groups

author Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>

Fri, 25 Jan 2008 20:08:00 +0000 (21:08 +0100)

committer Ingo Molnar <mingo@elte.hu>

Fri, 25 Jan 2008 20:08:00 +0000 (21:08 +0100)
author Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
Fri, 25 Jan 2008 20:08:00 +0000 (21:08 +0100)
committer Ingo Molnar <mingo@elte.hu>
Fri, 25 Jan 2008 20:08:00 +0000 (21:08 +0100)
diff --git a/include/linux/sched.h b/include/linux/sched.h

index d6eacda765ca76a8db94b1d6834652d5be875217..288245f83bd4f89fd8937677f2cf59872961f81d 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1453,6 +1453,10 @@ extern unsigned int sysctl_sched_child_runs_first;
  extern unsigned int sysctl_sched_features;
  extern unsigned int sysctl_sched_migration_cost;
  extern unsigned int sysctl_sched_nr_migrate;
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+extern unsigned int sysctl_sched_min_bal_int_shares;
+extern unsigned int sysctl_sched_max_bal_int_shares;
+#endif
  
  int sched_nr_latency_handler(struct ctl_table *table, int write,
                 struct file *file, void __user *buffer, size_t *length,
diff --git a/kernel/sched.c b/kernel/sched.c

index d9585f15043f2391c4b2bf938f4f19cf1c8c14fd..86e55a9c2de60d564f2d0ba5d676aed58c3270bd 100644 (file)
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -168,7 +168,43 @@ struct task_group {
         struct sched_entity **se;
         /* runqueue "owned" by this group on each cpu */
         struct cfs_rq **cfs_rq;
+
+       /*
+        * shares assigned to a task group governs how much of cpu bandwidth
+        * is allocated to the group. The more shares a group has, the more is
+        * the cpu bandwidth allocated to it.
+        *
+        * For ex, lets say that there are three task groups, A, B and C which
+        * have been assigned shares 1000, 2000 and 3000 respectively. Then,
+        * cpu bandwidth allocated by the scheduler to task groups A, B and C
+        * should be:
+        *
+        *      Bw(A) = 1000/(1000+2000+3000) * 100 = 16.66%
+        *      Bw(B) = 2000/(1000+2000+3000) * 100 = 33.33%
+        *      Bw(C) = 3000/(1000+2000+3000) * 100 = 50%
+        *
+        * The weight assigned to a task group's schedulable entities on every
+        * cpu (task_group.se[a_cpu]->load.weight) is derived from the task
+        * group's shares. For ex: lets say that task group A has been
+        * assigned shares of 1000 and there are two CPUs in a system. Then,
+        *
+        *  tg_A->se[0]->load.weight = tg_A->se[1]->load.weight = 1000;
+        *
+        * Note: It's not necessary that each of a task's group schedulable
+        *       entity have the same weight on all CPUs. If the group
+        *       has 2 of its tasks on CPU0 and 1 task on CPU1, then a
+        *       better distribution of weight could be:
+        *
+        *      tg_A->se[0]->load.weight = 2/3 * 2000 = 1333
+        *      tg_A->se[1]->load.weight = 1/2 * 2000 =  667
+        *
+        * rebalance_shares() is responsible for distributing the shares of a
+        * task groups like this among the group's schedulable entities across
+        * cpus.
+        *
+        */
         unsigned long shares;
+
         struct rcu_head rcu;
  };
  
@@ -188,6 +224,14 @@ static DEFINE_MUTEX(task_group_mutex);
  /* doms_cur_mutex serializes access to doms_cur[] array */
  static DEFINE_MUTEX(doms_cur_mutex);
  
+#ifdef CONFIG_SMP
+/* kernel thread that runs rebalance_shares() periodically */
+static struct task_struct *lb_monitor_task;
+static int load_balance_monitor(void *unused);
+#endif
+
+static void set_se_shares(struct sched_entity *se, unsigned long shares);
+
  /* Default task group.
   *     Every task in system belong to this group at bootup.
   */
@@ -202,6 +246,8 @@ struct task_group init_task_group = {
  # define INIT_TASK_GROUP_LOAD  NICE_0_LOAD
  #endif
  
+#define MIN_GROUP_SHARES       2
+
  static int init_task_group_load = INIT_TASK_GROUP_LOAD;
  
  /* return group to which a task belongs */
@@ -6736,6 +6782,21 @@ void __init sched_init_smp(void)
         if (set_cpus_allowed(current, non_isolated_cpus) < 0)
                 BUG();
         sched_init_granularity();
+
+#ifdef CONFIG_FAIR_GROUP_SCHED
+       if (nr_cpu_ids == 1)
+               return;
+
+       lb_monitor_task = kthread_create(load_balance_monitor, NULL,
+                                        "group_balance");
+       if (!IS_ERR(lb_monitor_task)) {
+               lb_monitor_task->flags |= PF_NOFREEZE;
+               wake_up_process(lb_monitor_task);
+       } else {
+               printk(KERN_ERR "Could not create load balance monitor thread"
+                       "(error = %ld) \n", PTR_ERR(lb_monitor_task));
+       }
+#endif
  }
  #else
  void __init sched_init_smp(void)
@@ -6988,6 +7049,157 @@ void set_curr_task(int cpu, struct task_struct *p)
  
  #ifdef CONFIG_FAIR_GROUP_SCHED
  
+#ifdef CONFIG_SMP
+/*
+ * distribute shares of all task groups among their schedulable entities,
+ * to reflect load distrbution across cpus.
+ */
+static int rebalance_shares(struct sched_domain *sd, int this_cpu)
+{
+       struct cfs_rq *cfs_rq;
+       struct rq *rq = cpu_rq(this_cpu);
+       cpumask_t sdspan = sd->span;
+       int balanced = 1;
+
+       /* Walk thr' all the task groups that we have */
+       for_each_leaf_cfs_rq(rq, cfs_rq) {
+               int i;
+               unsigned long total_load = 0, total_shares;
+               struct task_group *tg = cfs_rq->tg;
+
+               /* Gather total task load of this group across cpus */
+               for_each_cpu_mask(i, sdspan)
+                       total_load += tg->cfs_rq[i]->load.weight;
+
+               /* Nothing to do if this group has no load  */
+               if (!total_load)
+                       continue;
+
+               /*
+                * tg->shares represents the number of cpu shares the task group
+                * is eligible to hold on a single cpu. On N cpus, it is
+                * eligible to hold (N * tg->shares) number of cpu shares.
+                */
+               total_shares = tg->shares * cpus_weight(sdspan);
+
+               /*
+                * redistribute total_shares across cpus as per the task load
+                * distribution.
+                */
+               for_each_cpu_mask(i, sdspan) {
+                       unsigned long local_load, local_shares;
+
+                       local_load = tg->cfs_rq[i]->load.weight;
+                       local_shares = (local_load * total_shares) / total_load;
+                       if (!local_shares)
+                               local_shares = MIN_GROUP_SHARES;
+                       if (local_shares == tg->se[i]->load.weight)
+                               continue;
+
+                       spin_lock_irq(&cpu_rq(i)->lock);
+                       set_se_shares(tg->se[i], local_shares);
+                       spin_unlock_irq(&cpu_rq(i)->lock);
+                       balanced = 0;
+               }
+       }
+
+       return balanced;
+}
+
+/*
+ * How frequently should we rebalance_shares() across cpus?
+ *
+ * The more frequently we rebalance shares, the more accurate is the fairness
+ * of cpu bandwidth distribution between task groups. However higher frequency
+ * also implies increased scheduling overhead.
+ *
+ * sysctl_sched_min_bal_int_shares represents the minimum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * sysctl_sched_max_bal_int_shares represents the maximum interval between
+ * consecutive calls to rebalance_shares() in the same sched domain.
+ *
+ * These settings allows for the appropriate tradeoff between accuracy of
+ * fairness and the associated overhead.
+ *
+ */
+
+/* default: 8ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_min_bal_int_shares = 8;
+
+/* default: 128ms, units: milliseconds */
+const_debug unsigned int sysctl_sched_max_bal_int_shares = 128;
+
+/* kernel thread that runs rebalance_shares() periodically */
+static int load_balance_monitor(void *unused)
+{
+       unsigned int timeout = sysctl_sched_min_bal_int_shares;
+       struct sched_param schedparm;
+       int ret;
+
+       /*
+        * We don't want this thread's execution to be limited by the shares
+        * assigned to default group (init_task_group). Hence make it run
+        * as a SCHED_RR RT task at the lowest priority.
+        */
+       schedparm.sched_priority = 1;
+       ret = sched_setscheduler(current, SCHED_RR, &schedparm);
+       if (ret)
+               printk(KERN_ERR "Couldn't set SCHED_RR policy for load balance"
+                               " monitor thread (error = %d) \n", ret);
+
+       while (!kthread_should_stop()) {
+               int i, cpu, balanced = 1;
+
+               /* Prevent cpus going down or coming up */
+               lock_cpu_hotplug();
+               /* lockout changes to doms_cur[] array */
+               lock_doms_cur();
+               /*
+                * Enter a rcu read-side critical section to safely walk rq->sd
+                * chain on various cpus and to walk task group list
+                * (rq->leaf_cfs_rq_list) in rebalance_shares().
+                */
+               rcu_read_lock();
+
+               for (i = 0; i < ndoms_cur; i++) {
+                       cpumask_t cpumap = doms_cur[i];
+                       struct sched_domain *sd = NULL, *sd_prev = NULL;
+
+                       cpu = first_cpu(cpumap);
+
+                       /* Find the highest domain at which to balance shares */
+                       for_each_domain(cpu, sd) {
+                               if (!(sd->flags & SD_LOAD_BALANCE))
+                                       continue;
+                               sd_prev = sd;
+                       }
+
+                       sd = sd_prev;
+                       /* sd == NULL? No load balance reqd in this domain */
+                       if (!sd)
+                               continue;
+
+                       balanced &= rebalance_shares(sd, cpu);
+               }
+
+               rcu_read_unlock();
+
+               unlock_doms_cur();
+               unlock_cpu_hotplug();
+
+               if (!balanced)
+                       timeout = sysctl_sched_min_bal_int_shares;
+               else if (timeout < sysctl_sched_max_bal_int_shares)
+                       timeout *= 2;
+
+               msleep_interruptible(timeout);
+       }
+
+       return 0;
+}
+#endif /* CONFIG_SMP */
+
  /* allocate runqueue etc for a new task group */
  struct task_group *sched_create_group(void)
  {
@@ -7144,47 +7356,77 @@ done:
         task_rq_unlock(rq, &flags);
  }
  
+/* rq->lock to be locked by caller */
  static void set_se_shares(struct sched_entity *se, unsigned long shares)
  {
         struct cfs_rq *cfs_rq = se->cfs_rq;
         struct rq *rq = cfs_rq->rq;
         int on_rq;
  
-       spin_lock_irq(&rq->lock);
+       if (!shares)
+               shares = MIN_GROUP_SHARES;
  
         on_rq = se->on_rq;
-       if (on_rq)
+       if (on_rq) {
                 dequeue_entity(cfs_rq, se, 0);
+               dec_cpu_load(rq, se->load.weight);
+       }
  
         se->load.weight = shares;
         se->load.inv_weight = div64_64((1ULL<<32), shares);
  
-       if (on_rq)
+       if (on_rq) {
                 enqueue_entity(cfs_rq, se, 0);
-
-       spin_unlock_irq(&rq->lock);
+               inc_cpu_load(rq, se->load.weight);
+       }
  }
  
  int sched_group_set_shares(struct task_group *tg, unsigned long shares)
  {
         int i;
-
-       /*
-        * A weight of 0 or 1 can cause arithmetics problems.
-        * (The default weight is 1024 - so there's no practical
-        *  limitation from this.)
-        */
-       if (shares < 2)
-               shares = 2;
+       struct cfs_rq *cfs_rq;
+       struct rq *rq;
  
         lock_task_group_list();
         if (tg->shares == shares)
                 goto done;
  
+       if (shares < MIN_GROUP_SHARES)
+               shares = MIN_GROUP_SHARES;
+
+       /*
+        * Prevent any load balance activity (rebalance_shares,
+        * load_balance_fair) from referring to this group first,
+        * by taking it off the rq->leaf_cfs_rq_list on each cpu.
+        */
+       for_each_possible_cpu(i) {
+               cfs_rq = tg->cfs_rq[i];
+               list_del_rcu(&cfs_rq->leaf_cfs_rq_list);
+       }
+
+       /* wait for any ongoing reference to this group to finish */
+       synchronize_sched();
+
+       /*
+        * Now we are free to modify the group's share on each cpu
+        * w/o tripping rebalance_share or load_balance_fair.
+        */
         tg->shares = shares;
-       for_each_possible_cpu(i)
+       for_each_possible_cpu(i) {
+               spin_lock_irq(&cpu_rq(i)->lock);
                 set_se_shares(tg->se[i], shares);
+               spin_unlock_irq(&cpu_rq(i)->lock);
+       }
  
+       /*
+        * Enable load balance activity on this group, by inserting it back on
+        * each cpu's rq->leaf_cfs_rq_list.
+        */
+       for_each_possible_cpu(i) {
+               rq = cpu_rq(i);
+               cfs_rq = tg->cfs_rq[i];
+               list_add_rcu(&cfs_rq->leaf_cfs_rq_list, &rq->leaf_cfs_rq_list);
+       }
  done:
         unlock_task_group_list();
         return 0;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c

index 30ae9c2a28614b22721e31d49839f69fa6d85d16..5c208e090ae427de8c9f5dd4240e2c29c344b12f 100644 (file)
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -707,6 +707,8 @@ static inline struct sched_entity *parent_entity(struct sched_entity *se)
         return se->parent;
  }
  
+#define GROUP_IMBALANCE_PCT    20
+
  #else  /* CONFIG_FAIR_GROUP_SCHED */
  
  #define for_each_sched_entity(se) \
@@ -967,25 +969,6 @@ static struct task_struct *load_balance_next_fair(void *arg)
         return __load_balance_iterator(cfs_rq, cfs_rq->rb_load_balance_curr);
  }
  
-#ifdef CONFIG_FAIR_GROUP_SCHED
-static int cfs_rq_best_prio(struct cfs_rq *cfs_rq)
-{
-       struct sched_entity *curr;
-       struct task_struct *p;
-
-       if (!cfs_rq->nr_running)
-               return MAX_PRIO;
-
-       curr = cfs_rq->curr;
-       if (!curr)
-               curr = __pick_next_entity(cfs_rq);
-
-       p = task_of(curr);
-
-       return p->prio;
-}
-#endif
-
  static unsigned long
  load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                   unsigned long max_load_move,
@@ -995,28 +978,45 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
         struct cfs_rq *busy_cfs_rq;
         long rem_load_move = max_load_move;
         struct rq_iterator cfs_rq_iterator;
+       unsigned long load_moved;
  
         cfs_rq_iterator.start = load_balance_start_fair;
         cfs_rq_iterator.next = load_balance_next_fair;
  
         for_each_leaf_cfs_rq(busiest, busy_cfs_rq) {
  #ifdef CONFIG_FAIR_GROUP_SCHED
-               struct cfs_rq *this_cfs_rq;
-               long imbalance;
-               unsigned long maxload;
+               struct cfs_rq *this_cfs_rq = busy_cfs_rq->tg->cfs_rq[this_cpu];
+               unsigned long maxload, task_load, group_weight;
+               unsigned long thisload, per_task_load;
+               struct sched_entity *se = busy_cfs_rq->tg->se[busiest->cpu];
  
-               this_cfs_rq = cpu_cfs_rq(busy_cfs_rq, this_cpu);
+               task_load = busy_cfs_rq->load.weight;
+               group_weight = se->load.weight;
  
-               imbalance = busy_cfs_rq->load.weight - this_cfs_rq->load.weight;
-               /* Don't pull if this_cfs_rq has more load than busy_cfs_rq */
-               if (imbalance <= 0)
+               /*
+                * 'group_weight' is contributed by tasks of total weight
+                * 'task_load'. To move 'rem_load_move' worth of weight only,
+                * we need to move a maximum task load of:
+                *
+                *      maxload = (remload / group_weight) * task_load;
+                */
+               maxload = (rem_load_move * task_load) / group_weight;
+
+               if (!maxload || !task_load)
                         continue;
  
-               /* Don't pull more than imbalance/2 */
-               imbalance /= 2;
-               maxload = min(rem_load_move, imbalance);
+               per_task_load = task_load / busy_cfs_rq->nr_running;
+               /*
+                * balance_tasks will try to forcibly move atleast one task if
+                * possible (because of SCHED_LOAD_SCALE_FUZZ). Avoid that if
+                * maxload is less than GROUP_IMBALANCE_FUZZ% the per_task_load.
+                */
+                if (100 * maxload < GROUP_IMBALANCE_PCT * per_task_load)
+                       continue;
  
-               *this_best_prio = cfs_rq_best_prio(this_cfs_rq);
+               /* Disable priority-based load balance */
+               *this_best_prio = 0;
+               thisload = this_cfs_rq->load.weight;
  #else
  # define maxload rem_load_move
  #endif
@@ -1025,11 +1025,33 @@ load_balance_fair(struct rq *this_rq, int this_cpu, struct rq *busiest,
                  * load_balance_[start|next]_fair iterators
                  */
                 cfs_rq_iterator.arg = busy_cfs_rq;
-               rem_load_move -= balance_tasks(this_rq, this_cpu, busiest,
+               load_moved = balance_tasks(this_rq, this_cpu, busiest,
                                                maxload, sd, idle, all_pinned,
                                                this_best_prio,
                                                &cfs_rq_iterator);
  
+#ifdef CONFIG_FAIR_GROUP_SCHED
+               /*
+                * load_moved holds the task load that was moved. The
+                * effective (group) weight moved would be:
+                *      load_moved_eff = load_moved/task_load * group_weight;
+                */
+               load_moved = (group_weight * load_moved) / task_load;
+
+               /* Adjust shares on both cpus to reflect load_moved */
+               group_weight -= load_moved;
+               set_se_shares(se, group_weight);
+
+               se = busy_cfs_rq->tg->se[this_cpu];
+               if (!thisload)
+                       group_weight = load_moved;
+               else
+                       group_weight = se->load.weight + load_moved;
+               set_se_shares(se, group_weight);
+#endif
+
+               rem_load_move -= load_moved;
+
                 if (rem_load_move <= 0)
                         break;
         }
diff --git a/kernel/sysctl.c b/kernel/sysctl.c

index c68f68dcc605052c9bcc81d1719b343acb482e5b..c95f3ed34474af8d6cf89303515180a4815f28ac 100644 (file)
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -309,6 +309,24 @@ static struct ctl_table kern_table[] = {
                 .mode           = 644,
                 .proc_handler   = &proc_dointvec,
         },
+#if defined(CONFIG_FAIR_GROUP_SCHED) && defined(CONFIG_SMP)
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "sched_min_bal_int_shares",
+               .data           = &sysctl_sched_min_bal_int_shares,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+       {
+               .ctl_name       = CTL_UNNUMBERED,
+               .procname       = "sched_max_bal_int_shares",
+               .data           = &sysctl_sched_max_bal_int_shares,
+               .maxlen         = sizeof(unsigned int),
+               .mode           = 0644,
+               .proc_handler   = &proc_dointvec,
+       },
+#endif
  #endif
         {
                 .ctl_name       = CTL_UNNUMBERED,
author	Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
	Fri, 25 Jan 2008 20:08:00 +0000 (21:08 +0100)
committer	Ingo Molnar <mingo@elte.hu>
	Fri, 25 Jan 2008 20:08:00 +0000 (21:08 +0100)
include/linux/sched.h		patch \| blob \| blame \| history
kernel/sched.c		patch \| blob \| blame \| history
kernel/sched_fair.c		patch \| blob \| blame \| history
kernel/sysctl.c		patch \| blob \| blame \| history