FIXUP: sched/tune: fix accounting for runnable tasks

author Patrick Bellasi <patrick.bellasi@arm.com>

Thu, 28 Jul 2016 17:44:40 +0000 (18:44 +0100)

committer John Stultz <john.stultz@linaro.org>

Thu, 11 Aug 2016 21:26:41 +0000 (14:26 -0700)
author Patrick Bellasi <patrick.bellasi@arm.com>
Thu, 28 Jul 2016 17:44:40 +0000 (18:44 +0100)
committer John Stultz <john.stultz@linaro.org>
Thu, 11 Aug 2016 21:26:41 +0000 (14:26 -0700)
diff --git a/kernel/exit.c b/kernel/exit.c

index 07110c6020a04ea37c04bc18bd0b9287cd0466dc..92ff63200287f38c29e5ed205801bf1adb8bfb44 100644 (file)
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -54,6 +54,8 @@
  #include <linux/writeback.h>
  #include <linux/shm.h>
  
+#include "sched/tune.h"
+
  #include <asm/uaccess.h>
  #include <asm/unistd.h>
  #include <asm/pgtable.h>
@@ -699,6 +701,9 @@ void do_exit(long code)
         }
  
         exit_signals(tsk);  /* sets PF_EXITING */
+
+       schedtune_exit_task(tsk);
+
         /*
          * tsk->flags are checked in the futex code to protect against
          * an exiting task cleaning up the robust pi futexes.
diff --git a/kernel/sched/core.c b/kernel/sched/core.c

index 42c45ccc1f62ed3eba61629f51ca7505e222cbc1..f56a528c8ae7cbbb529794b92c827619b6b25d4c 100644 (file)
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -287,6 +287,18 @@ int sysctl_sched_rt_runtime = 950000;
  /* cpus with isolated domains */
  cpumask_var_t cpu_isolated_map;
  
+struct rq *
+lock_rq_of(struct task_struct *p, unsigned long *flags)
+{
+       return task_rq_lock(p, flags);
+}
+
+void
+unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags)
+{
+       task_rq_unlock(rq, p, flags);
+}
+
  /*
   * this_rq_lock - lock this runqueue and disable interrupts.
   */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c

index 6390b4cb916c2157f5edb7beb649b635f57d5541..a26f40cb7fd021bc9ff1f0b84f0bdb71684316e0 100644 (file)
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4234,8 +4234,6 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                     cpu_overutilized(rq->cpu))
                         rq->rd->overutilized = true;
  
-               schedtune_enqueue_task(p, cpu_of(rq));
-
                 /*
                  * We want to potentially trigger a freq switch
                  * request only for tasks that are waking up; this is
@@ -4246,6 +4244,10 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 if (task_new || task_wakeup)
                         update_capacity_of(cpu_of(rq));
         }
+
+       /* Update SchedTune accounting */
+       schedtune_enqueue_task(p, cpu_of(rq));
+
  #endif /* CONFIG_SMP */
  
         hrtick_update(rq);
@@ -4311,7 +4313,6 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
  #ifdef CONFIG_SMP
  
         if (!se) {
-               schedtune_dequeue_task(p, cpu_of(rq));
  
                 /*
                  * We want to potentially trigger a freq switch
@@ -4329,6 +4330,9 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags)
                 }
         }
  
+       /* Update SchedTune accounting */
+       schedtune_dequeue_task(p, cpu_of(rq));
+
  #endif /* CONFIG_SMP */
  
         hrtick_update(rq);
@@ -5604,7 +5608,6 @@ static inline int find_best_target(struct task_struct *p)
                  * The target CPU can be already at a capacity level higher
                  * than the one required to boost the task.
                  */
-
                 if (new_util > capacity_orig_of(i))
                         continue;
  
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h

index a06c19c4805798aff42c648e7921b1e6825cdbc8..144d4782a4558a245098dc33ce8f13afc19e755f 100644 (file)
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1701,6 +1701,9 @@ task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
         raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
  }
  
+extern struct rq *lock_rq_of(struct task_struct *p, unsigned long *flags);
+extern void unlock_rq_of(struct rq *rq, struct task_struct *p, unsigned long *flags);
+
  #ifdef CONFIG_SMP
  #ifdef CONFIG_PREEMPT
  
diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c

index a691b8db28881eb38e962da142f25207215a2102..4c77cc23e65b1163f581531cae56b507601ca47e 100644 (file)
--- a/kernel/sched/tune.c
+++ b/kernel/sched/tune.c
@@ -11,6 +11,10 @@
  #include "sched.h"
  #include "tune.h"
  
+#ifdef CONFIG_CGROUP_SCHEDTUNE
+static bool schedtune_initialized = false;
+#endif
+
  unsigned int sysctl_sched_cfs_boost __read_mostly;
  
  extern struct target_nrg schedtune_target_nrg;
@@ -222,6 +226,8 @@ struct boost_groups {
                 /* Count of RUNNABLE tasks on that boost group */
                 unsigned tasks;
         } group[BOOSTGROUPS_COUNT];
+       /* CPU's boost group locking */
+       raw_spinlock_t lock;
  };
  
  /* Boost groups affecting each CPU in the system */
@@ -298,28 +304,24 @@ schedtune_boostgroup_update(int idx, int boost)
         return 0;
  }
  
+#define ENQUEUE_TASK  1
+#define DEQUEUE_TASK -1
+
  static inline void
  schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
  {
-       struct boost_groups *bg;
-       int tasks;
-
-       bg = &per_cpu(cpu_boost_groups, cpu);
+       struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+       int tasks = bg->group[idx].tasks + task_count;
  
         /* Update boosted tasks count while avoiding to make it negative */
-       if (task_count < 0 && bg->group[idx].tasks <= -task_count)
-               bg->group[idx].tasks = 0;
-       else
-               bg->group[idx].tasks += task_count;
-
-       /* Boost group activation or deactivation on that RQ */
-       tasks = bg->group[idx].tasks;
-       if (tasks == 1 || tasks == 0)
-               schedtune_cpu_update(cpu);
+       bg->group[idx].tasks = max(0, tasks);
  
         trace_sched_tune_tasks_update(p, cpu, tasks, idx,
                         bg->group[idx].boost, bg->boost_max);
  
+       /* Boost group activation or deactivation on that RQ */
+       if (tasks == 1 || tasks == 0)
+               schedtune_cpu_update(cpu);
  }
  
  /*
@@ -327,9 +329,14 @@ schedtune_tasks_update(struct task_struct *p, int cpu, int idx, int task_count)
   */
  void schedtune_enqueue_task(struct task_struct *p, int cpu)
  {
+       struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+       unsigned long irq_flags;
         struct schedtune *st;
         int idx;
  
+       if (!unlikely(schedtune_initialized))
+               return;
+
         /*
          * When a task is marked PF_EXITING by do_exit() it's going to be
          * dequeued and enqueued multiple times in the exit path.
@@ -339,13 +346,109 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
         if (p->flags & PF_EXITING)
                 return;
  
-       /* Get task boost group */
+       /*
+        * Boost group accounting is protected by a per-cpu lock and requires
+        * interrupts to be disabled to avoid race conditions for example on
+        * do_exit()::cgroup_exit() and task migration.
+        */
+       raw_spin_lock_irqsave(&bg->lock, irq_flags);
         rcu_read_lock();
+
         st = task_schedtune(p);
         idx = st->idx;
+
+       schedtune_tasks_update(p, cpu, idx, ENQUEUE_TASK);
+
         rcu_read_unlock();
+       raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
  
-       schedtune_tasks_update(p, cpu, idx, 1);
+int schedtune_allow_attach(struct cgroup_taskset *tset)
+{
+       /* We always allow tasks to be moved between existing CGroups */
+       return 0;
+}
+
+/*
+ * Move the RUNNABLE-task accounting of each task in @tset from its current
+ * boost group to the boost group of the destination css. Always returns 0.
+ */
+int schedtune_can_attach(struct cgroup_taskset *tset)
+{
+       struct task_struct *task;
+       struct cgroup_subsys_state *css;
+       struct boost_groups *bg;
+       unsigned long irq_flags;
+       unsigned int cpu;
+       struct rq *rq;
+       int src_bg; /* Source boost group index */
+       int dst_bg; /* Destination boost group index */
+       int tasks;
+
+       if (unlikely(!schedtune_initialized))
+               return 0;
+
+       cgroup_taskset_for_each(task, css, tset) {
+               /*
+                * Lock the CPU's RQ the task is enqueued to avoid race
+                * conditions with migration code while the task is being
+                * accounted
+                */
+               rq = lock_rq_of(task, &irq_flags);
+
+               if (!task->on_rq) {
+                       unlock_rq_of(rq, task, &irq_flags);
+                       continue;
+               }
+
+               /* Boost group accounting is protected by a per-cpu lock */
+               cpu = cpu_of(rq);
+               bg = &per_cpu(cpu_boost_groups, cpu);
+               raw_spin_lock(&bg->lock);
+
+               dst_bg = css_st(css)->idx;
+               src_bg = task_schedtune(task)->idx;
+
+               /*
+                * Current task is not changing boostgroup, which can
+                * happen when the new hierarchy is in use.
+                */
+               if (unlikely(dst_bg == src_bg)) {
+                       raw_spin_unlock(&bg->lock);
+                       unlock_rq_of(rq, task, &irq_flags);
+                       continue;
+               }
+
+               /*
+                * This is the case of a RUNNABLE task which is switching its
+                * current boost group: move it from src to dst boost group.
+                */
+               tasks = bg->group[src_bg].tasks - 1;
+               bg->group[src_bg].tasks = max(0, tasks);
+               bg->group[dst_bg].tasks += 1;
+
+               /*
+                * Update the CPU boost while both locks are still held, so
+                * the counters and the CPU checked are the ones just updated
+                * (once unlocked, the task may migrate to another CPU).
+                */
+               if (bg->group[src_bg].tasks == 0 || bg->group[dst_bg].tasks == 1)
+                       schedtune_cpu_update(cpu);
+
+               raw_spin_unlock(&bg->lock);
+               unlock_rq_of(rq, task, &irq_flags);
+       }
+
+       return 0;
+}
+
+void schedtune_cancel_attach(struct cgroup_taskset *tset)
+{
+       /* This can happen only if SchedTune controller is mounted with
+        * other hierarchies and one of them fails. Since usually SchedTune is
+        * mounted on its own hierarchy, for the time being we do not implement
+        * a proper rollback mechanism */
+       WARN(1, "SchedTune cancel attach not implemented");
+}
  
  /*
@@ -353,26 +456,62 @@ void schedtune_enqueue_task(struct task_struct *p, int cpu)
   */
  void schedtune_dequeue_task(struct task_struct *p, int cpu)
  {
+       struct boost_groups *bg = &per_cpu(cpu_boost_groups, cpu);
+       unsigned long irq_flags;
         struct schedtune *st;
         int idx;
  
+       if (!unlikely(schedtune_initialized))
+               return;
+
         /*
          * When a task is marked PF_EXITING by do_exit() it's going to be
          * dequeued and enqueued multiple times in the exit path.
          * Thus we avoid any further update, since we do not want to change
          * CPU boosting while the task is exiting.
-        * The last dequeue will be done by cgroup exit() callback.
+        * The last dequeue is already enforce by the do_exit() code path
+        * via schedtune_exit_task().
          */
         if (p->flags & PF_EXITING)
                 return;
  
-       /* Get task boost group */
+       /*
+        * Boost group accouting is protected by a per-cpu lock and requires
+        * interrupt to be disabled to avoid race conditions on...
+        */
+       raw_spin_lock_irqsave(&bg->lock, irq_flags);
         rcu_read_lock();
+
         st = task_schedtune(p);
         idx = st->idx;
+
+       schedtune_tasks_update(p, cpu, idx, DEQUEUE_TASK);
+
         rcu_read_unlock();
+       raw_spin_unlock_irqrestore(&bg->lock, irq_flags);
+}
+
+/* Remove an exiting task from the RUNNABLE accounting of its boost group. */
+void schedtune_exit_task(struct task_struct *tsk)
+{
+       struct schedtune *st;
+       unsigned long irq_flags;
+       unsigned int cpu;
+       struct rq *rq;
+       int idx;
+
+       if (unlikely(!schedtune_initialized))
+               return;
  
-       schedtune_tasks_update(p, cpu, idx, -1);
+       /* Lock the task's RQ to avoid races with migration while accounting */
+       rq = lock_rq_of(tsk, &irq_flags);
+       rcu_read_lock();
+       cpu = cpu_of(rq);
+       st = task_schedtune(tsk);
+       idx = st->idx;
+       schedtune_tasks_update(tsk, cpu, idx, DEQUEUE_TASK);
+       rcu_read_unlock();
+       unlock_rq_of(rq, tsk, &irq_flags);
  }
  
  int schedtune_cpu_boost(int cpu)
@@ -518,6 +657,9 @@ schedtune_css_free(struct cgroup_subsys_state *css)
  struct cgroup_subsys schedtune_cgrp_subsys = {
         .css_alloc      = schedtune_css_alloc,
         .css_free       = schedtune_css_free,
+//     .allow_attach   = schedtune_allow_attach,
+       .can_attach     = schedtune_can_attach,
+       .cancel_attach  = schedtune_cancel_attach,
         .legacy_cftypes = files,
         .early_init     = 1,
  };
diff --git a/kernel/sched/tune.h b/kernel/sched/tune.h

index 99637758a8af087f30773a41e09fce0b43e43dd6..be1785eb1c5b2dcce92361a5a71b95d549b9fab9 100644 (file)
--- a/kernel/sched/tune.h
+++ b/kernel/sched/tune.h
@@ -17,6 +17,8 @@ struct target_nrg {
  int schedtune_cpu_boost(int cpu);
  int schedtune_task_boost(struct task_struct *tsk);
  
+void schedtune_exit_task(struct task_struct *tsk);
+
  void schedtune_enqueue_task(struct task_struct *p, int cpu);
  void schedtune_dequeue_task(struct task_struct *p, int cpu);
  
@@ -25,6 +27,8 @@ void schedtune_dequeue_task(struct task_struct *p, int cpu);
  #define schedtune_cpu_boost(cpu)  get_sysctl_sched_cfs_boost()
  #define schedtune_task_boost(tsk) get_sysctl_sched_cfs_boost()
  
+#define schedtune_exit_task(task) do { } while (0)
+
  #define schedtune_enqueue_task(task, cpu) do { } while (0)
  #define schedtune_dequeue_task(task, cpu) do { } while (0)
  
@@ -39,6 +43,8 @@ int schedtune_accept_deltas(int nrg_delta, int cap_delta,
  #define schedtune_cpu_boost(cpu)  0
  #define schedtune_task_boost(tsk) 0
  
+#define schedtune_exit_task(task) do { } while (0)
+
  #define schedtune_enqueue_task(task, cpu) do { } while (0)
  #define schedtune_dequeue_task(task, cpu) do { } while (0)
author	Patrick Bellasi <patrick.bellasi@arm.com>
	Thu, 28 Jul 2016 17:44:40 +0000 (18:44 +0100)
committer	John Stultz <john.stultz@linaro.org>
	Thu, 11 Aug 2016 21:26:41 +0000 (14:26 -0700)
kernel/exit.c		patch \| blob \| blame \| history
kernel/sched/core.c		patch \| blob \| blame \| history
kernel/sched/fair.c		patch \| blob \| blame \| history
kernel/sched/sched.h		patch \| blob \| blame \| history
kernel/sched/tune.c		patch \| blob \| blame \| history
kernel/sched/tune.h		patch \| blob \| blame \| history