[PATCH] per-task-delay-accounting: delay accounting usage of taskstats interface
authorShailabh Nagar <nagar@watson.ibm.com>
Fri, 14 Jul 2006 07:24:41 +0000 (00:24 -0700)
committerLinus Torvalds <torvalds@g5.osdl.org>
Sat, 15 Jul 2006 04:53:56 +0000 (21:53 -0700)
Usage of taskstats interface by delay accounting.

Signed-off-by: Shailabh Nagar <nagar@us.ibm.com>
Signed-off-by: Balbir Singh <balbir@in.ibm.com>
Cc: Jes Sorensen <jes@sgi.com>
Cc: Peter Chubb <peterc@gelato.unsw.edu.au>
Cc: Erich Focht <efocht@ess.nec.de>
Cc: Levent Serinol <lserinol@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
include/linux/delayacct.h
include/linux/sched.h
include/linux/taskstats.h
include/linux/taskstats_kern.h
init/Kconfig
kernel/delayacct.c
kernel/taskstats.c

index 0ecbf9aad8e127a93281a00c1c64d78fbf24159a..d955078a1441109289e45b492d6bafac069757c7 100644 (file)
@@ -18,6 +18,7 @@
 #define _LINUX_DELAYACCT_H
 
 #include <linux/sched.h>
+#include <linux/taskstats_kern.h>
 
 /*
  * Per-task flags relevant to delay accounting
@@ -35,6 +36,7 @@ extern void __delayacct_tsk_init(struct task_struct *);
 extern void __delayacct_tsk_exit(struct task_struct *);
 extern void __delayacct_blkio_start(void);
 extern void __delayacct_blkio_end(void);
+extern int __delayacct_add_tsk(struct taskstats *, struct task_struct *);
 
 static inline void delayacct_set_flag(int flag)
 {
@@ -74,6 +76,16 @@ static inline void delayacct_blkio_end(void)
                __delayacct_blkio_end();
 }
 
+static inline int delayacct_add_tsk(struct taskstats *d,
+                                       struct task_struct *tsk)
+{
+       if (likely(!delayacct_on))
+               return -EINVAL;
+       if (!tsk->delays)
+               return 0;
+       return __delayacct_add_tsk(d, tsk);
+}
+
 #else
 static inline void delayacct_set_flag(int flag)
 {}
@@ -89,6 +101,9 @@ static inline void delayacct_blkio_start(void)
 {}
 static inline void delayacct_blkio_end(void)
 {}
+static inline int delayacct_add_tsk(struct taskstats *d,
+                                       struct task_struct *tsk)
+{ return 0; }
 #endif /* CONFIG_TASK_DELAY_ACCT */
 
 #endif
index f751062d89a27fe6700fd7e871b3283880bb4fe5..3c5610ca0c923a0208315082f7f1f51d58a0af43 100644 (file)
@@ -990,6 +990,7 @@ struct task_struct {
         */
        struct pipe_inode_info *splice_pipe;
 #ifdef CONFIG_TASK_DELAY_ACCT
+       spinlock_t delays_lock;
        struct task_delay_info *delays;
 #endif
 };
index 51f62759bea9fd23a5158e19f42b5480fde60e29..c6aeca32348e63268f9e0346a57ba9def2c86fe2 100644 (file)
 struct taskstats {
 
        /* Version 1 */
-       __u64   version;
+       __u16   version;
+       __u16   padding[3];     /* Userspace should not interpret the padding
+                                * field which can be replaced by useful
+                                * fields if struct taskstats is extended.
+                                */
+
+       /* Delay accounting fields start
+        *
+        * All values, until comment "Delay accounting fields end" are
+        * available only if delay accounting is enabled, even though the last
+        * few fields are not delays
+        *
+        * xxx_count is the number of delay values recorded
+        * xxx_delay_total is the corresponding cumulative delay in nanoseconds
+        *
+        * xxx_delay_total wraps around to zero on overflow
+        * xxx_count incremented regardless of overflow
+        */
+
+       /* Delay waiting for cpu, while runnable
+        * count, delay_total NOT updated atomically
+        */
+       __u64   cpu_count;
+       __u64   cpu_delay_total;
+
+       /* Following four fields atomically updated using task->delays->lock */
+
+       /* Delay waiting for synchronous block I/O to complete
+        * does not account for delays in I/O submission
+        */
+       __u64   blkio_count;
+       __u64   blkio_delay_total;
+
+       /* Delay waiting for page fault I/O (swap in only) */
+       __u64   swapin_count;
+       __u64   swapin_delay_total;
+
+       /* cpu "wall-clock" running time
+        * On some architectures, value will adjust for cpu time stolen
+        * from the kernel in involuntary waits due to virtualization.
+        * Value is cumulative, in nanoseconds, without a corresponding count
+        * and wraps around to zero silently on overflow
+        */
+       __u64   cpu_run_real_total;
+
+       /* cpu "virtual" running time
+        * Uses time intervals seen by the kernel i.e. no adjustment
+        * for kernel's involuntary waits due to virtualization.
+        * Value is cumulative, in nanoseconds, without a corresponding count
+        * and wraps around to zero silently on overflow
+        */
+       __u64   cpu_run_virtual_total;
+       /* Delay accounting fields end */
+       /* version 1 ends here */
 };
 
 
index bd0ecb969c263a1541193725b6ffab55f1740bf0..fc9da2e26443911d266883924da1499d73713e4a 100644 (file)
@@ -17,6 +17,7 @@ enum {
 
 #ifdef CONFIG_TASKSTATS
 extern kmem_cache_t *taskstats_cache;
+extern struct mutex taskstats_exit_mutex;
 
 static inline void taskstats_exit_alloc(struct taskstats **ptidstats,
                                        struct taskstats **ptgidstats)
index 56a7093b4e4ce367b14a5ff4d60a454493e7405b..a099fc6526d9277cf140c70c04399022ce1aaf43 100644 (file)
@@ -173,6 +173,7 @@ config TASKSTATS
 
 config TASK_DELAY_ACCT
        bool "Enable per-task delay accounting (EXPERIMENTAL)"
+       depends on TASKSTATS
        help
          Collect information on time spent by a task waiting for system
          resources like cpu, synchronous block I/O completion and swapping
index 3546b0800f9f393a4196a8ed154a6f8833ba5950..1be274a462ca0760bfec7e89a8d916e2d866d8c3 100644 (file)
@@ -41,6 +41,10 @@ void delayacct_init(void)
 
 void __delayacct_tsk_init(struct task_struct *tsk)
 {
+       spin_lock_init(&tsk->delays_lock);
+       /* No need to acquire tsk->delays_lock for allocation here unless
+          __delayacct_tsk_init called after tsk is attached to tasklist
+       */
        tsk->delays = kmem_cache_zalloc(delayacct_cache, SLAB_KERNEL);
        if (tsk->delays)
                spin_lock_init(&tsk->delays->lock);
@@ -48,8 +52,11 @@ void __delayacct_tsk_init(struct task_struct *tsk)
 
 void __delayacct_tsk_exit(struct task_struct *tsk)
 {
-       kmem_cache_free(delayacct_cache, tsk->delays);
+       struct task_delay_info *delays = tsk->delays;
+       spin_lock(&tsk->delays_lock);
        tsk->delays = NULL;
+       spin_unlock(&tsk->delays_lock);
+       kmem_cache_free(delayacct_cache, delays);
 }
 
 /*
@@ -104,3 +111,56 @@ void __delayacct_blkio_end(void)
                        &current->delays->blkio_delay,
                        &current->delays->blkio_count);
 }
+
+int __delayacct_add_tsk(struct taskstats *d, struct task_struct *tsk)
+{
+       s64 tmp;
+       struct timespec ts;
+       unsigned long t1,t2,t3;
+
+       spin_lock(&tsk->delays_lock);
+
+       /* Though tsk->delays accessed later, early exit avoids
+        * unnecessary returning of other data
+        */
+       if (!tsk->delays)
+               goto done;
+
+       tmp = (s64)d->cpu_run_real_total;
+       cputime_to_timespec(tsk->utime + tsk->stime, &ts);
+       tmp += timespec_to_ns(&ts);
+       d->cpu_run_real_total = (tmp < (s64)d->cpu_run_real_total) ? 0 : tmp;
+
+       /*
+        * No locking available for sched_info (and too expensive to add one)
+        * Mitigate by taking snapshot of values
+        */
+       t1 = tsk->sched_info.pcnt;
+       t2 = tsk->sched_info.run_delay;
+       t3 = tsk->sched_info.cpu_time;
+
+       d->cpu_count += t1;
+
+       jiffies_to_timespec(t2, &ts);
+       tmp = (s64)d->cpu_delay_total + timespec_to_ns(&ts);
+       d->cpu_delay_total = (tmp < (s64)d->cpu_delay_total) ? 0 : tmp;
+
+       tmp = (s64)d->cpu_run_virtual_total + (s64)jiffies_to_usecs(t3) * 1000;
+       d->cpu_run_virtual_total =
+               (tmp < (s64)d->cpu_run_virtual_total) ? 0 : tmp;
+
+       /* zero XXX_total, non-zero XXX_count implies XXX stat overflowed */
+
+       spin_lock(&tsk->delays->lock);
+       tmp = d->blkio_delay_total + tsk->delays->blkio_delay;
+       d->blkio_delay_total = (tmp < d->blkio_delay_total) ? 0 : tmp;
+       tmp = d->swapin_delay_total + tsk->delays->swapin_delay;
+       d->swapin_delay_total = (tmp < d->swapin_delay_total) ? 0 : tmp;
+       d->blkio_count += tsk->delays->blkio_count;
+       d->swapin_count += tsk->delays->swapin_count;
+       spin_unlock(&tsk->delays->lock);
+
+done:
+       spin_unlock(&tsk->delays_lock);
+       return 0;
+}
index 82ec9137d908cd35c321be2bf832ad515d30945a..ea9506de3b855fd82414102a63ce0a37b24d9881 100644 (file)
 
 #include <linux/kernel.h>
 #include <linux/taskstats_kern.h>
+#include <linux/delayacct.h>
 #include <net/genetlink.h>
 #include <asm/atomic.h>
 
 static DEFINE_PER_CPU(__u32, taskstats_seqnum) = { 0 };
 static int family_registered;
 kmem_cache_t *taskstats_cache;
-static DEFINE_MUTEX(taskstats_exit_mutex);
 
 static struct genl_family family = {
        .id             = GENL_ID_GENERATE,
@@ -120,7 +120,10 @@ static int fill_pid(pid_t pid, struct task_struct *pidtsk,
         *              goto err;
         */
 
-err:
+       rc = delayacct_add_tsk(stats, tsk);
+       stats->version = TASKSTATS_VERSION;
+
+       /* Define err: label here if needed */
        put_task_struct(tsk);
        return rc;
 
@@ -152,8 +155,14 @@ static int fill_tgid(pid_t tgid, struct task_struct *tgidtsk,
                 *              break;
                 */
 
+               rc = delayacct_add_tsk(stats, tsk);
+               if (rc)
+                       break;
+
        } while_each_thread(first, tsk);
        read_unlock(&tasklist_lock);
+       stats->version = TASKSTATS_VERSION;
+
 
        /*
         * Accounting subsytems can also add calls here if they don't
@@ -233,8 +242,6 @@ void taskstats_exit_send(struct task_struct *tsk, struct taskstats *tidstats,
        if (!family_registered || !tidstats)
                return;
 
-       mutex_lock(&taskstats_exit_mutex);
-
        is_thread_group = !thread_group_empty(tsk);
        rc = 0;
 
@@ -292,7 +299,6 @@ nla_put_failure:
 err_skb:
        nlmsg_free(rep_skb);
 ret:
-       mutex_unlock(&taskstats_exit_mutex);
        return;
 }