sched/cputime: Guarantee stime + utime == rtime

author Peter Zijlstra <peterz@infradead.org>

Tue, 30 Jun 2015 09:30:54 +0000 (11:30 +0200)

committer Ingo Molnar <mingo@kernel.org>

Mon, 3 Aug 2015 10:21:21 +0000 (12:21 +0200)
author Peter Zijlstra <peterz@infradead.org>
Tue, 30 Jun 2015 09:30:54 +0000 (11:30 +0200)
committer Ingo Molnar <mingo@kernel.org>
Mon, 3 Aug 2015 10:21:21 +0000 (12:21 +0200)
diff --git a/include/linux/init_task.h b/include/linux/init_task.h

index e8493fee81602d626a80e1022ac85fa0b69d695a..d0b380ee7d67abbd421bf69fdd63ff10b2aa88b1 100644 (file)
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -32,6 +32,14 @@ extern struct fs_struct init_fs;
  #define INIT_CPUSET_SEQ(tsk)
  #endif
  
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+#define INIT_PREV_CPUTIME(x)   .prev_cputime = {                       \
+       .lock = __RAW_SPIN_LOCK_UNLOCKED(x.prev_cputime.lock),          \
+},
+#else
+#define INIT_PREV_CPUTIME(x)
+#endif
+
  #define INIT_SIGNALS(sig) {                                            \
         .nr_threads     = 1,                                            \
         .thread_head    = LIST_HEAD_INIT(init_task.thread_node),        \
@@ -46,6 +54,7 @@ extern struct fs_struct init_fs;
                 .cputime_atomic = INIT_CPUTIME_ATOMIC,                  \
                 .running        = 0,                                    \
         },                                                              \
+       INIT_PREV_CPUTIME(sig)                                          \
         .cred_guard_mutex =                                             \
                  __MUTEX_INITIALIZER(sig.cred_guard_mutex),             \
  }
@@ -246,6 +255,7 @@ extern struct task_group root_task_group;
         INIT_TASK_RCU_TASKS(tsk)                                        \
         INIT_CPUSET_SEQ(tsk)                                            \
         INIT_RT_MUTEXES(tsk)                                            \
+       INIT_PREV_CPUTIME(tsk)                                          \
         INIT_VTIME(tsk)                                                 \
         INIT_NUMA_BALANCING(tsk)                                        \
         INIT_KASAN(tsk)                                                 \
diff --git a/include/linux/sched.h b/include/linux/sched.h

index ae21f1591615e06cec2115563c3f821fe36c868e..7412070a25ccc906558e5ddd2328ff94f28ad3f4 100644 (file)
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -530,39 +530,49 @@ struct cpu_itimer {
  };
  
  /**
- * struct cputime - snaphsot of system and user cputime
+ * struct prev_cputime - snapshot of system and user cputime
   * @utime: time spent in user mode
   * @stime: time spent in system mode
+ * @lock: protects the above two fields
   *
- * Gathers a generic snapshot of user and system time.
+ * Stores previous user/system time values such that we can guarantee
+ * monotonicity.
   */
-struct cputime {
+struct prev_cputime {
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
         cputime_t utime;
         cputime_t stime;
+       raw_spinlock_t lock;
+#endif
  };
  
+static inline void prev_cputime_init(struct prev_cputime *prev)
+{
+#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
+       prev->utime = prev->stime = 0;
+       raw_spin_lock_init(&prev->lock);
+#endif
+}
+
  /**
   * struct task_cputime - collected CPU time counts
   * @utime:             time spent in user mode, in &cputime_t units
   * @stime:             time spent in kernel mode, in &cputime_t units
   * @sum_exec_runtime:  total time spent on the CPU, in nanoseconds
   *
- * This is an extension of struct cputime that includes the total runtime
- * spent by the task from the scheduler point of view.
- *
- * As a result, this structure groups together three kinds of CPU time
- * that are tracked for threads and thread groups.  Most things considering
- * CPU time want to group these counts together and treat all three
- * of them in parallel.
+ * This structure groups together three kinds of CPU time that are tracked for
+ * threads and thread groups.  Most things considering CPU time want to group
+ * these counts together and treat all three of them in parallel.
   */
  struct task_cputime {
         cputime_t utime;
         cputime_t stime;
         unsigned long long sum_exec_runtime;
  };
+
  /* Alternate field names when used to cache expirations. */
-#define prof_exp       stime
  #define virt_exp       utime
+#define prof_exp       stime
  #define sched_exp      sum_exec_runtime
  
  #define INIT_CPUTIME   \
@@ -715,9 +725,7 @@ struct signal_struct {
         cputime_t utime, stime, cutime, cstime;
         cputime_t gtime;
         cputime_t cgtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-       struct cputime prev_cputime;
-#endif
+       struct prev_cputime prev_cputime;
         unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
         unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
         unsigned long inblock, oublock, cinblock, coublock;
@@ -1481,9 +1489,7 @@ struct task_struct {
  
         cputime_t utime, stime, utimescaled, stimescaled;
         cputime_t gtime;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-       struct cputime prev_cputime;
-#endif
+       struct prev_cputime prev_cputime;
  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
         seqlock_t vtime_seqlock;
         unsigned long long vtime_snap;
diff --git a/kernel/fork.c b/kernel/fork.c

index 1bfefc6f96a4ea92507741cf1e935c6dab04c2b1..6e8f807c57169e57f928b717d8c1a7f5ad9ec6c5 100644 (file)
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1067,6 +1067,7 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
         rcu_assign_pointer(tsk->sighand, sig);
         if (!sig)
                 return -ENOMEM;
+
         atomic_set(&sig->count, 1);
         memcpy(sig->action, current->sighand->action, sizeof(sig->action));
         return 0;
@@ -1128,6 +1129,7 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
         init_sigpending(&sig->shared_pending);
         INIT_LIST_HEAD(&sig->posix_timers);
         seqlock_init(&sig->stats_lock);
+       prev_cputime_init(&sig->prev_cputime);
  
         hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
         sig->real_timer.function = it_real_fn;
@@ -1335,9 +1337,8 @@ static struct task_struct *copy_process(unsigned long clone_flags,
  
         p->utime = p->stime = p->gtime = 0;
         p->utimescaled = p->stimescaled = 0;
-#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
-       p->prev_cputime.utime = p->prev_cputime.stime = 0;
-#endif
+       prev_cputime_init(&p->prev_cputime);
+
  #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
         seqlock_init(&p->vtime_seqlock);
         p->vtime_snap = 0;
diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c

index f5a64ffad176f12b01381cb1dc2e25a05f02508d..8cbc3db671df5290f93b136175da42fdb5522bbb 100644 (file)
--- a/kernel/sched/cputime.c
+++ b/kernel/sched/cputime.c
@@ -555,48 +555,43 @@ drop_precision:
  }
  
  /*
- * Atomically advance counter to the new value. Interrupts, vcpu
- * scheduling, and scaling inaccuracies can cause cputime_advance
- * to be occasionally called with a new value smaller than counter.
- * Let's enforce atomicity.
+ * Adjust tick based cputime random precision against scheduler runtime
+ * accounting.
   *
- * Normally a caller will only go through this loop once, or not
- * at all in case a previous caller updated counter the same jiffy.
- */
-static void cputime_advance(cputime_t *counter, cputime_t new)
-{
-       cputime_t old;
-
-       while (new > (old = READ_ONCE(*counter)))
-               cmpxchg_cputime(counter, old, new);
-}
-
-/*
- * Adjust tick based cputime random precision against scheduler
- * runtime accounting.
+ * Tick based cputime accounting depends on random scheduling timeslices of a
+ * task to be interrupted or not by the timer.  Depending on these
+ * circumstances, the number of these interrupts may be over or
+ * under-optimistic, matching the real user and system cputime with a variable
+ * precision.
+ *
+ * Fix this by scaling these tick based values against the total runtime
+ * accounted by the CFS scheduler.
+ *
+ * This code provides the following guarantees:
+ *
+ *   stime + utime == rtime
+ *   stime_i+1 >= stime_i, utime_i+1 >= utime_i
+ *
+ * Assuming that rtime_i+1 >= rtime_i.
   */
  static void cputime_adjust(struct task_cputime *curr,
-                          struct cputime *prev,
+                          struct prev_cputime *prev,
                            cputime_t *ut, cputime_t *st)
  {
         cputime_t rtime, stime, utime;
+       unsigned long flags;
  
-       /*
-        * Tick based cputime accounting depend on random scheduling
-        * timeslices of a task to be interrupted or not by the timer.
-        * Depending on these circumstances, the number of these interrupts
-        * may be over or under-optimistic, matching the real user and system
-        * cputime with a variable precision.
-        *
-        * Fix this by scaling these tick based values against the total
-        * runtime accounted by the CFS scheduler.
-        */
+       /* Serialize concurrent callers such that we can honour our guarantees */
+       raw_spin_lock_irqsave(&prev->lock, flags);
         rtime = nsecs_to_cputime(curr->sum_exec_runtime);
  
         /*
-        * Update userspace visible utime/stime values only if actual execution
-        * time is bigger than already exported. Note that can happen, that we
-        * provided bigger values due to scaling inaccuracy on big numbers.
+        * This is possible under two circumstances:
+        *  - rtime isn't monotonic after all (a bug);
+        *  - we got reordered by the lock.
+        *
+        * In both cases this acts as a filter such that the rest of the code
+        * can assume it is monotonic regardless of anything else.
          */
         if (prev->stime + prev->utime >= rtime)
                 goto out;
@@ -606,22 +601,46 @@ static void cputime_adjust(struct task_cputime *curr,
  
         if (utime == 0) {
                 stime = rtime;
-       } else if (stime == 0) {
-               utime = rtime;
-       } else {
-               cputime_t total = stime + utime;
+               goto update;
+       }
  
-               stime = scale_stime((__force u64)stime,
-                                   (__force u64)rtime, (__force u64)total);
-               utime = rtime - stime;
+       if (stime == 0) {
+               utime = rtime;
+               goto update;
         }
  
-       cputime_advance(&prev->stime, stime);
-       cputime_advance(&prev->utime, utime);
+       stime = scale_stime((__force u64)stime, (__force u64)rtime,
+                           (__force u64)(stime + utime));
+
+       /*
+        * Make sure stime doesn't go backwards; this preserves monotonicity
+        * for utime because rtime is monotonic.
+        *
+        *  utime_i+1 = rtime_i+1 - stime_i
+        *            = rtime_i+1 - (rtime_i - utime_i)
+        *            = (rtime_i+1 - rtime_i) + utime_i
+        *            >= utime_i
+        */
+       if (stime < prev->stime)
+               stime = prev->stime;
+       utime = rtime - stime;
+
+       /*
+        * Make sure utime doesn't go backwards; this still preserves
+        * monotonicity for stime, analogous argument to above.
+        */
+       if (utime < prev->utime) {
+               utime = prev->utime;
+               stime = rtime - utime;
+       }
  
+update:
+       prev->stime = stime;
+       prev->utime = utime;
  out:
         *ut = prev->utime;
         *st = prev->stime;
+       raw_spin_unlock_irqrestore(&prev->lock, flags);
  }
  
  void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
author	Peter Zijlstra <peterz@infradead.org>
	Tue, 30 Jun 2015 09:30:54 +0000 (11:30 +0200)
committer	Ingo Molnar <mingo@kernel.org>
	Mon, 3 Aug 2015 10:21:21 +0000 (12:21 +0200)
include/linux/init_task.h		patch \| blob \| blame \| history
include/linux/sched.h		patch \| blob \| blame \| history
kernel/fork.c		patch \| blob \| blame \| history
kernel/sched/cputime.c		patch \| blob \| blame \| history