Both times() and clock_gettime(CLOCK_PROCESS_CPUTIME_ID) have scalability
issues on large systems, due to both functions being serialized with a
lock.
The lock protects against reporting a wrong value, due to a thread in the
task group exiting, its statistics reporting up to the signal struct, and
that exited task's statistics being counted twice (or not at all).
Protecting that with a lock results in times() and clock_gettime() being
completely serialized on large systems.
This can be fixed by using a seqlock around the events that gather and
propagate statistics. As an additional benefit, the protection code can
be moved into thread_group_cputime(), slightly simplifying the calling
functions.
In the case of posix_cpu_clock_get_task() things can be simplified a
lot, because the calling function already ensures that the task sticks
around, and the rest is now taken care of in thread_group_cputime().
This way the statistics reporting code can run lockless.
Signed-off-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Alex Thorlton <athorlton@sgi.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Daeseok Youn <daeseok.youn@gmail.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Dongsheng Yang <yangds.fnst@cn.fujitsu.com>
Cc: Geert Uytterhoeven <geert@linux-m68k.org>
Cc: Guillaume Morin <guillaume@morinfr.org>
Cc: Ionut Alexa <ionut.m.alexa@gmail.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Li Zefan <lizefan@huawei.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Michal Schmidt <mschmidt@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Cc: umgwanakikbuti@gmail.com
Cc: fweisbec@gmail.com
Cc: srao@redhat.com
Cc: lwoodman@redhat.com
Cc: atheurer@redhat.com
Link: http://lkml.kernel.org/r/20140816134010.26a9b572@annuminas.surriel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
* Live threads maintain their own counters and add to these
* in __exit_signal, except for the group leader.
*/
+ seqlock_t stats_lock;
cputime_t utime, stime, cutime, cstime;
cputime_t gtime;
cputime_t cgtime;
* the signal_struct.
*/
task_cputime(tsk, &utime, &stime);
+ write_seqlock(&sig->stats_lock);
sig->utime += utime;
sig->stime += stime;
sig->gtime += task_gtime(tsk);
sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
sig->nr_threads--;
__unhash_process(tsk, group_dead);
+ write_sequnlock(&sig->stats_lock);
/*
* Do this under ->siglock, we can race with another thread
spin_lock_irq(&p->real_parent->sighand->siglock);
psig = p->real_parent->signal;
sig = p->signal;
+ write_seqlock(&psig->stats_lock);
psig->cutime += tgutime + sig->cutime;
psig->cstime += tgstime + sig->cstime;
psig->cgtime += task_gtime(p) + sig->gtime + sig->cgtime;
psig->cmaxrss = maxrss;
task_io_accounting_add(&psig->ioac, &p->ioac);
task_io_accounting_add(&psig->ioac, &sig->ioac);
+ write_sequnlock(&psig->stats_lock);
spin_unlock_irq(&p->real_parent->sighand->siglock);
}
sig->curr_target = tsk;
init_sigpending(&sig->shared_pending);
INIT_LIST_HEAD(&sig->posix_timers);
+ seqlock_init(&sig->stats_lock);
hrtimer_init(&sig->real_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
sig->real_timer.function = it_real_fn;
struct signal_struct *sig = tsk->signal;
cputime_t utime, stime;
struct task_struct *t;
-
- times->utime = sig->utime;
- times->stime = sig->stime;
- times->sum_exec_runtime = sig->sum_sched_runtime;
+ unsigned int seq, nextseq;
rcu_read_lock();
- for_each_thread(tsk, t) {
- task_cputime(t, &utime, &stime);
- times->utime += utime;
- times->stime += stime;
- times->sum_exec_runtime += task_sched_runtime(t);
- }
+ /* Attempt a lockless read on the first round. */
+ nextseq = 0;
+ do {
+ seq = nextseq;
+ read_seqbegin_or_lock(&sig->stats_lock, &seq);
+ times->utime = sig->utime;
+ times->stime = sig->stime;
+ times->sum_exec_runtime = sig->sum_sched_runtime;
+
+ for_each_thread(tsk, t) {
+ task_cputime(t, &utime, &stime);
+ times->utime += utime;
+ times->stime += stime;
+ times->sum_exec_runtime += task_sched_runtime(t);
+ }
+ /* If lockless access failed, take the lock. */
+ nextseq = 1;
+ } while (need_seqretry(&sig->stats_lock, seq));
+ done_seqretry(&sig->stats_lock, seq);
rcu_read_unlock();
}
cputime_adjust(&cputime, &p->prev_cputime, ut, st);
}
-/*
- * Must be called with siglock held.
- */
void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st)
{
struct task_cputime cputime;
{
cputime_t tgutime, tgstime, cutime, cstime;
- spin_lock_irq(¤t->sighand->siglock);
thread_group_cputime_adjusted(current, &tgutime, &tgstime);
cutime = current->signal->cutime;
cstime = current->signal->cstime;
- spin_unlock_irq(¤t->sighand->siglock);
tms->tms_utime = cputime_to_clock_t(tgutime);
tms->tms_stime = cputime_to_clock_t(tgstime);
tms->tms_cutime = cputime_to_clock_t(cutime);
if (same_thread_group(tsk, current))
err = cpu_clock_sample(which_clock, tsk, &rtn);
} else {
- unsigned long flags;
- struct sighand_struct *sighand;
-
- /*
- * while_each_thread() is not yet entirely RCU safe,
- * keep locking the group while sampling process
- * clock for now.
- */
- sighand = lock_task_sighand(tsk, &flags);
- if (!sighand)
- return err;
-
if (tsk == current || thread_group_leader(tsk))
err = cpu_clock_sample_group(which_clock, tsk, &rtn);
-
- unlock_task_sighand(tsk, &flags);
}
if (!err)