function-graph: ignore times across schedule
authorSteven Rostedt <srostedt@redhat.com>
Tue, 24 Mar 2009 05:10:15 +0000 (01:10 -0400)
committerSteven Rostedt <srostedt@redhat.com>
Tue, 24 Mar 2009 13:33:30 +0000 (09:33 -0400)
Impact: more accurate timings

The current method of function graph tracing does not take into
account the time spent when a task is not running. This shows functions
that call schedule have increased costs:

 3) + 18.664 us   |      }
 ------------------------------------------
 3)    <idle>-0    =>  kblockd-123
 ------------------------------------------

 3)               |      finish_task_switch() {
 3)   1.441 us    |        _spin_unlock_irq();
 3)   3.966 us    |      }
 3) ! 2959.433 us |    }
 3) ! 2961.465 us |  }

This patch uses the tracepoint in the scheduling context switch to
account for time that has elapsed while a task is scheduled out.
Now we see:

 ------------------------------------------
 3)    <idle>-0    =>  edac-po-1067
 ------------------------------------------

 3)               |      finish_task_switch() {
 3)   0.685 us    |        _spin_unlock_irq();
 3)   2.331 us    |      }
 3) + 41.439 us   |    }
 3) + 42.663 us   |  }

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
include/linux/sched.h
kernel/trace/ftrace.c

index 89cd308cc7a5bdf92e510b1fc9a785d79bd3ec9c..471e36d3012352ecf5e01d2bd237be1467b01c66 100644 (file)
@@ -1409,6 +1409,8 @@ struct task_struct {
        int curr_ret_stack;
        /* Stack of return addresses for return function tracing */
        struct ftrace_ret_stack *ret_stack;
+       /* time stamp for last schedule */
+       unsigned long long ftrace_timestamp;
        /*
         * Number of functions that haven't been traced
         * because of depth overrun.
index c81a759fbf76b2eecb767cb35a218c761ed35881..0b90364d1a2cefce0c6fefd621acc73250a5522f 100644 (file)
@@ -29,6 +29,8 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 
+#include <trace/sched.h>
+
 #include <asm/ftrace.h>
 
 #include "trace.h"
@@ -2590,6 +2592,31 @@ free:
        return ret;
 }
 
+static void
+ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
+                               struct task_struct *next)
+{
+       unsigned long long timestamp;
+       int index;
+
+       timestamp = trace_clock_local();
+
+       prev->ftrace_timestamp = timestamp;
+
+       /* only process tasks that we timestamped */
+       if (!next->ftrace_timestamp)
+               return;
+
+       /*
+        * Update all the counters in next to make up for the
+        * time next was sleeping.
+        */
+       timestamp -= next->ftrace_timestamp;
+
+       for (index = next->curr_ret_stack; index >= 0; index--)
+               next->ret_stack[index].calltime += timestamp;
+}
+
 /* Allocate a return stack for each task */
 static int start_graph_tracing(void)
 {
@@ -2611,6 +2638,13 @@ static int start_graph_tracing(void)
                ret = alloc_retstack_tasklist(ret_stack_list);
        } while (ret == -EAGAIN);
 
+       if (!ret) {
+               ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch);
+               if (ret)
+                       pr_info("ftrace_graph: Couldn't activate tracepoint"
+                               " probe to kernel_sched_switch\n");
+       }
+
        kfree(ret_stack_list);
        return ret;
 }
@@ -2674,6 +2708,7 @@ void unregister_ftrace_graph(void)
        mutex_lock(&ftrace_lock);
 
        atomic_dec(&ftrace_graph_active);
+       unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
        ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
        ftrace_graph_entry = ftrace_graph_entry_stub;
        ftrace_shutdown(FTRACE_STOP_FUNC_RET);
@@ -2694,6 +2729,7 @@ void ftrace_graph_init_task(struct task_struct *t)
                t->curr_ret_stack = -1;
                atomic_set(&t->tracing_graph_pause, 0);
                atomic_set(&t->trace_overrun, 0);
+               t->ftrace_timestamp = 0;
        } else
                t->ret_stack = NULL;
 }