sched/deadline: Reclaim bandwidth not used by dl tasks

author Luca Abeni <luca.abeni@santannapisa.it>

Thu, 18 May 2017 20:13:36 +0000 (22:13 +0200)

committer Ingo Molnar <mingo@kernel.org>

Thu, 8 Jun 2017 08:31:55 +0000 (10:31 +0200)
author Luca Abeni <luca.abeni@santannapisa.it>
Thu, 18 May 2017 20:13:36 +0000 (22:13 +0200)
committer Ingo Molnar <mingo@kernel.org>
Thu, 8 Jun 2017 08:31:55 +0000 (10:31 +0200)
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 8d1a5a6258148f5b4ccdea454e1757839c8c5b04..799647927c4ca79a42294272518472641161d65b 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2444,7 +2444,7 @@ inline struct dl_bw *dl_bw_of(int i)
         return &cpu_rq(i)->rd->dl_bw;
  }
  
-static inline int dl_bw_cpus(int i)
+inline int dl_bw_cpus(int i)
  {
         struct root_domain *rd = cpu_rq(i)->rd;
         int cpus = 0;
@@ -2462,7 +2462,7 @@ inline struct dl_bw *dl_bw_of(int i)
         return &cpu_rq(i)->dl.dl_bw;
  }
  
-static inline int dl_bw_cpus(int i)
+inline int dl_bw_cpus(int i)
  {
         return 1;
  }
@@ -2500,8 +2500,8 @@ static int dl_overflow(struct task_struct *p, int policy,
         if (dl_policy(policy) && !task_has_dl_policy(p) &&
             !__dl_overflow(dl_b, cpus, 0, new_bw)) {
                 if (hrtimer_active(&p->dl.inactive_timer))
-                       __dl_clear(dl_b, p->dl.dl_bw);
-               __dl_add(dl_b, new_bw);
+                       __dl_clear(dl_b, p->dl.dl_bw, cpus);
+               __dl_add(dl_b, new_bw, cpus);
                 err = 0;
         } else if (dl_policy(policy) && task_has_dl_policy(p) &&
                    !__dl_overflow(dl_b, cpus, p->dl.dl_bw, new_bw)) {
@@ -2512,8 +2512,8 @@ static int dl_overflow(struct task_struct *p, int policy,
                  * But this would require to set the task's "inactive
                  * timer" when the task is not inactive.
                  */
-               __dl_clear(dl_b, p->dl.dl_bw);
-               __dl_add(dl_b, new_bw);
+               __dl_clear(dl_b, p->dl.dl_bw, cpus);
+               __dl_add(dl_b, new_bw, cpus);
                 dl_change_utilization(p, new_bw);
                 err = 0;
         } else if (!dl_policy(policy) && task_has_dl_policy(p)) {
@@ -5515,7 +5515,7 @@ int task_can_attach(struct task_struct *p,
                          * We will free resources in the source root_domain
                          * later on (see set_cpus_allowed_dl()).
                          */
-                       __dl_add(dl_b, p->dl.dl_bw);
+                       __dl_add(dl_b, p->dl.dl_bw, cpus);
                 }
                 raw_spin_unlock_irqrestore(&dl_b->lock, flags);
                 rcu_read_unlock_sched();
@@ -6764,9 +6764,12 @@ void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
  {
         if (global_rt_runtime() == RUNTIME_INF) {
                 dl_rq->bw_ratio = 1 << RATIO_SHIFT;
+               dl_rq->extra_bw = 1 << BW_SHIFT;
         } else {
                 dl_rq->bw_ratio = to_ratio(global_rt_runtime(),
                           global_rt_period()) >> (BW_SHIFT - RATIO_SHIFT);
+               dl_rq->extra_bw = to_ratio(global_rt_period(),
+                                                   global_rt_runtime());
         }
  }
  
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c

index 7d2f057780602a26d2d6a95e4f9a0f36372a88df..e3b25dfb74f36a6ee62b6fa32dc5706e48db3799 100644 (file)
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -209,7 +209,7 @@ static void task_non_contending(struct task_struct *p)
                         if (p->state == TASK_DEAD)
                                 sub_rq_bw(p->dl.dl_bw, &rq->dl);
                         raw_spin_lock(&dl_b->lock);
-                       __dl_clear(dl_b, p->dl.dl_bw);
+                       __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
                         __dl_clear_params(p);
                         raw_spin_unlock(&dl_b->lock);
                 }
@@ -955,28 +955,40 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq);
  /*
   * This function implements the GRUB accounting rule:
   * according to the GRUB reclaiming algorithm, the runtime is
- * not decreased as "dq = -dt", but as "dq = -max{u, (1 - Uinact)} dt",
- * where u is the utilization of the task and Uinact is the
- * (per-runqueue) inactive utilization, computed as the difference
- * between the "total runqueue utilization" and the runqueue
- * active utilization.
+ * not decreased as "dq = -dt", but as
+ * "dq = -max{u / Umax, (1 - Uinact - Uextra)} dt",
+ * where u is the utilization of the task, Umax is the maximum reclaimable
+ * utilization, Uinact is the (per-runqueue) inactive utilization, computed
+ * as the difference between the "total runqueue utilization" and the
+ * runqueue active utilization, and Uextra is the (per runqueue) extra
+ * reclaimable utilization.
   * Since rq->dl.running_bw and rq->dl.this_bw contain utilizations
- * multiplied by 2^BW_SHIFT, the result has to be shifted right by BW_SHIFT.
+ * multiplied by 2^BW_SHIFT, the result has to be shifted right by
+ * BW_SHIFT.
+ * Since rq->dl.bw_ratio contains 1 / Umax multiplied by 2^RATIO_SHIFT,
+ * dl_bw is multiplied by rq->dl.bw_ratio and shifted right by RATIO_SHIFT.
+ * Since delta is a 64 bit variable, to have an overflow its value
+ * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds.
+ * So, overflow is not an issue here.
   */
  u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se)
  {
         u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */
         u64 u_act;
+       u64 u_act_min = (dl_se->dl_bw * rq->dl.bw_ratio) >> RATIO_SHIFT;
  
         /*
-        * Instead of computing max{u, (1 - u_inact)}, we compare
-        * u_inact with 1 - u, because u_inact can be larger than 1
-        * (so, 1 - u_inact would be negative leading to wrong results)
+        * Instead of computing max{u * bw_ratio, (1 - u_inact - u_extra)},
+        * we compare u_inact + rq->dl.extra_bw with
+        * 1 - (u * rq->dl.bw_ratio >> RATIO_SHIFT), because
+        * u_inact + rq->dl.extra_bw can be larger than
+        * 1 (so, 1 - u_inact - rq->dl.extra_bw would be negative
+        * leading to wrong results)
          */
-       if (u_inact > BW_UNIT - dl_se->dl_bw)
-               u_act = dl_se->dl_bw;
+       if (u_inact + rq->dl.extra_bw > BW_UNIT - u_act_min)
+               u_act = u_act_min;
         else
-               u_act = BW_UNIT - u_inact;
+               u_act = BW_UNIT - u_inact - rq->dl.extra_bw;
  
         return (delta * u_act) >> BW_SHIFT;
  }
@@ -1085,7 +1097,7 @@ static enum hrtimer_restart inactive_task_timer(struct hrtimer *timer)
                 }
  
                 raw_spin_lock(&dl_b->lock);
-               __dl_clear(dl_b, p->dl.dl_bw);
+               __dl_clear(dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
                 raw_spin_unlock(&dl_b->lock);
                 __dl_clear_params(p);
  
@@ -2054,7 +2066,7 @@ static void set_cpus_allowed_dl(struct task_struct *p,
                  * until we complete the update.
                  */
                 raw_spin_lock(&src_dl_b->lock);
-               __dl_clear(src_dl_b, p->dl.dl_bw);
+               __dl_clear(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
                 raw_spin_unlock(&src_dl_b->lock);
         }
  
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index b7321dac03c11cf5c3e7fbdda553fd99c5a21d85..f1e400c6403cf9106618210cc63ddd44dae66314 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -219,22 +219,27 @@ static inline int dl_bandwidth_enabled(void)
  }
  
  extern struct dl_bw *dl_bw_of(int i);
+extern int dl_bw_cpus(int i);
  
  struct dl_bw {
         raw_spinlock_t lock;
         u64 bw, total_bw;
  };
  
+static inline void __dl_update(struct dl_bw *dl_b, s64 bw);
+
  static inline
-void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw)
+void __dl_clear(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
  {
         dl_b->total_bw -= tsk_bw;
+       __dl_update(dl_b, (s32)tsk_bw / cpus);
  }
  
  static inline
-void __dl_add(struct dl_bw *dl_b, u64 tsk_bw)
+void __dl_add(struct dl_bw *dl_b, u64 tsk_bw, int cpus)
  {
         dl_b->total_bw += tsk_bw;
+       __dl_update(dl_b, -((s32)tsk_bw / cpus));
  }
  
  static inline
@@ -576,6 +581,7 @@ struct dl_rq {
          * runqueue (inactive utilization = this_bw - running_bw).
          */
         u64 this_bw;
+       u64 extra_bw;
  
         /*
          * Inverse of the fraction of CPU utilization that can be reclaimed
@@ -1958,6 +1964,33 @@ extern void nohz_balance_exit_idle(unsigned int cpu);
  static inline void nohz_balance_exit_idle(unsigned int cpu) { }
  #endif
  
+
+#ifdef CONFIG_SMP
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+       struct root_domain *rd = container_of(dl_b, struct root_domain, dl_bw);
+       int i;
+
+       RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held(),
+                        "sched RCU must be held");
+       for_each_cpu_and(i, rd->span, cpu_active_mask) {
+               struct rq *rq = cpu_rq(i);
+
+               rq->dl.extra_bw += bw;
+       }
+}
+#else
+static inline
+void __dl_update(struct dl_bw *dl_b, s64 bw)
+{
+       struct dl_rq *dl = container_of(dl_b, struct dl_rq, dl_bw);
+
+       dl->extra_bw += bw;
+}
+#endif
+
+
  #ifdef CONFIG_IRQ_TIME_ACCOUNTING
  struct irqtime {
         u64                     total;
author	Luca Abeni <luca.abeni@santannapisa.it>
	Thu, 18 May 2017 20:13:36 +0000 (22:13 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Thu, 8 Jun 2017 08:31:55 +0000 (10:31 +0200)
kernel/sched/core.c		patch \| blob \| blame \| history
kernel/sched/deadline.c		patch \| blob \| blame \| history
kernel/sched/sched.h		patch \| blob \| blame \| history