sched/cputime: Fix ksoftirqd cputime accounting regression
authorFrederic Weisbecker <fweisbec@gmail.com>
Tue, 25 Apr 2017 14:10:48 +0000 (16:10 +0200)
committerIngo Molnar <mingo@kernel.org>
Thu, 27 Apr 2017 07:08:26 +0000 (09:08 +0200)
irq_time_read() returns the irqtime minus the ksoftirqd time. This
is necessary because irq_time_read() is used to substract the IRQ time
from the sum_exec_runtime of a task. If we were to include the softirq
time of ksoftirqd, this task would substract its own CPU time everytime
it updates ksoftirqd->sum_exec_runtime which would therefore never
progress.

But this behaviour got broken by:

  a499a5a14db ("sched/cputime: Increment kcpustat directly on irqtime account")

... which now includes ksoftirqd softirq time in the time returned by
irq_time_read().

This has resulted in wrong ksoftirqd cputime reported to userspace
through /proc/stat and thus "top" not showing ksoftirqd when it should
after intense networking load.

ksoftirqd->stime happens to be correct but it gets scaled down by
sum_exec_runtime through task_cputime_adjusted().

To fix this, just account the strict IRQ time in a separate counter and
use it to report the IRQ time.

Reported-and-tested-by: Jesper Dangaard Brouer <brouer@redhat.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Jesper Dangaard Brouer <brouer@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Stanislaw Gruszka <sgruszka@redhat.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Wanpeng Li <wanpeng.li@hotmail.com>
Link: http://lkml.kernel.org/r/1493129448-5356-1-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
kernel/sched/cputime.c
kernel/sched/sched.h

index f3778e2b46c8dc00c90d9165f6ae9efbf1f16dc7..aea3135c5d90f434ee72980c30f0db1129ef752b 100644 (file)
@@ -34,6 +34,18 @@ void disable_sched_clock_irqtime(void)
        sched_clock_irqtime = 0;
 }
 
+static void irqtime_account_delta(struct irqtime *irqtime, u64 delta,
+                                 enum cpu_usage_stat idx)
+{
+       u64 *cpustat = kcpustat_this_cpu->cpustat;
+
+       u64_stats_update_begin(&irqtime->sync);
+       cpustat[idx] += delta;
+       irqtime->total += delta;
+       irqtime->tick_delta += delta;
+       u64_stats_update_end(&irqtime->sync);
+}
+
 /*
  * Called before incrementing preempt_count on {soft,}irq_enter
  * and before decrementing preempt_count on {soft,}irq_exit.
@@ -41,7 +53,6 @@ void disable_sched_clock_irqtime(void)
 void irqtime_account_irq(struct task_struct *curr)
 {
        struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime);
-       u64 *cpustat = kcpustat_this_cpu->cpustat;
        s64 delta;
        int cpu;
 
@@ -52,22 +63,16 @@ void irqtime_account_irq(struct task_struct *curr)
        delta = sched_clock_cpu(cpu) - irqtime->irq_start_time;
        irqtime->irq_start_time += delta;
 
-       u64_stats_update_begin(&irqtime->sync);
        /*
         * We do not account for softirq time from ksoftirqd here.
         * We want to continue accounting softirq time to ksoftirqd thread
         * in that case, so as not to confuse scheduler with a special task
         * that do not consume any time, but still wants to run.
         */
-       if (hardirq_count()) {
-               cpustat[CPUTIME_IRQ] += delta;
-               irqtime->tick_delta += delta;
-       } else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) {
-               cpustat[CPUTIME_SOFTIRQ] += delta;
-               irqtime->tick_delta += delta;
-       }
-
-       u64_stats_update_end(&irqtime->sync);
+       if (hardirq_count())
+               irqtime_account_delta(irqtime, delta, CPUTIME_IRQ);
+       else if (in_serving_softirq() && curr != this_cpu_ksoftirqd())
+               irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ);
 }
 EXPORT_SYMBOL_GPL(irqtime_account_irq);
 
index 5cbf92214ad89287d111ab8300e5b55923d83ffe..767aab3505a81d14789382686f9c4ccba095cfaa 100644 (file)
@@ -1869,6 +1869,7 @@ static inline void nohz_balance_exit_idle(unsigned int cpu) { }
 
 #ifdef CONFIG_IRQ_TIME_ACCOUNTING
 struct irqtime {
+       u64                     total;
        u64                     tick_delta;
        u64                     irq_start_time;
        struct u64_stats_sync   sync;
@@ -1876,16 +1877,20 @@ struct irqtime {
 
 DECLARE_PER_CPU(struct irqtime, cpu_irqtime);
 
+/*
+ * Returns the irqtime minus the softirq time computed by ksoftirqd.
+ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime
+ * and never move forward.
+ */
 static inline u64 irq_time_read(int cpu)
 {
        struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu);
-       u64 *cpustat = kcpustat_cpu(cpu).cpustat;
        unsigned int seq;
        u64 total;
 
        do {
                seq = __u64_stats_fetch_begin(&irqtime->sync);
-               total = cpustat[CPUTIME_SOFTIRQ] + cpustat[CPUTIME_IRQ];
+               total = irqtime->total;
        } while (__u64_stats_fetch_retry(&irqtime->sync, seq));
 
        return total;