dm stats: collect and report histogram of IO latencies
authorMikulas Patocka <mpatocka@redhat.com>
Tue, 9 Jun 2015 21:22:05 +0000 (17:22 -0400)
committerMike Snitzer <snitzer@redhat.com>
Wed, 17 Jun 2015 16:40:40 +0000 (12:40 -0400)
Add an option to dm statistics to collect and report a histogram of
IO latencies.

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Documentation/device-mapper/statistics.txt
drivers/md/dm-stats.c

index ff6baeaa71f7f7e42b52fd6d8fad9f85a0268718..4919b2dfd1b391c4169de1b66f5e78bd9ba2d9cc 100644 (file)
@@ -13,9 +13,10 @@ the range specified.
 The I/O statistics counters for each step-sized area of a region are
 in the same format as /sys/block/*/stat or /proc/diskstats (see:
 Documentation/iostats.txt).  But two extra counters (12 and 13) are
-provided: total time spent reading and writing.  All these counters may
-be accessed by sending the @stats_print message to the appropriate DM
-device via dmsetup.
+provided: total time spent reading and writing.  When the histogram
+argument is used, the 14th parameter is reported that represents the
+histogram of latencies.  All these counters may be accessed by sending
+the @stats_print message to the appropriate DM device via dmsetup.
 
 The reported times are in milliseconds and the granularity depends on
 the kernel ticks.  When the option precise_timestamps is used, the
@@ -64,6 +65,18 @@ Messages
                used, the resulting times are in nanoseconds instead of
                milliseconds.  Precise timestamps are a little bit slower
                to obtain than jiffies-based timestamps.
+         histogram:n1,n2,n3,n4,... - collect histogram of latencies.  The
+               numbers n1, n2, etc are times that represent the boundaries
+               of the histogram.  If precise_timestamps is not used, the
+               times are in milliseconds, otherwise they are in
+               nanoseconds.  For each range, the kernel will report the
+               number of requests that completed within this range. For
+               example, if we use "histogram:10,20,30", the kernel will
+               report four numbers a:b:c:d. a is the number of requests
+               that took 0-10 ms to complete, b is the number of requests
+               that took 10-20 ms to complete, c is the number of requests
+               that took 20-30 ms to complete and d is the number of
+               requests that took more than 30 ms to complete.
 
        <program_id>
          An optional parameter.  A name that uniquely identifies
index 4bfd84ab1d4a04b95c2ec2ff95d1add5b30d61f7..faf1071ef63142384d0b8621cbf8da30dcc20779 100644 (file)
@@ -29,6 +29,7 @@ struct dm_stat_percpu {
        unsigned long long io_ticks[2];
        unsigned long long io_ticks_total;
        unsigned long long time_in_queue;
+       unsigned long long *histogram;
 };
 
 struct dm_stat_shared {
@@ -45,11 +46,14 @@ struct dm_stat {
        sector_t start;
        sector_t end;
        sector_t step;
+       unsigned n_histogram_entries;
+       unsigned long long *histogram_boundaries;
        const char *program_id;
        const char *aux_data;
        struct rcu_head rcu_head;
        size_t shared_alloc_size;
        size_t percpu_alloc_size;
+       size_t histogram_alloc_size;
        struct dm_stat_percpu *stat_percpu[NR_CPUS];
        struct dm_stat_shared stat_shared[0];
 };
@@ -173,8 +177,11 @@ static void dm_stat_free(struct rcu_head *head)
 
        kfree(s->program_id);
        kfree(s->aux_data);
-       for_each_possible_cpu(cpu)
+       for_each_possible_cpu(cpu) {
+               dm_kvfree(s->stat_percpu[cpu][0].histogram, s->histogram_alloc_size);
                dm_kvfree(s->stat_percpu[cpu], s->percpu_alloc_size);
+       }
+       dm_kvfree(s->stat_shared[0].tmp.histogram, s->histogram_alloc_size);
        dm_kvfree(s, s->shared_alloc_size);
 }
 
@@ -228,6 +235,8 @@ void dm_stats_cleanup(struct dm_stats *stats)
 
 static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
                           sector_t step, unsigned stat_flags,
+                          unsigned n_histogram_entries,
+                          unsigned long long *histogram_boundaries,
                           const char *program_id, const char *aux_data,
                           void (*suspend_callback)(struct mapped_device *),
                           void (*resume_callback)(struct mapped_device *),
@@ -239,6 +248,7 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
        size_t ni;
        size_t shared_alloc_size;
        size_t percpu_alloc_size;
+       size_t histogram_alloc_size;
        struct dm_stat_percpu *p;
        int cpu;
        int ret_id;
@@ -262,7 +272,12 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
        if (percpu_alloc_size / sizeof(struct dm_stat_percpu) != n_entries)
                return -EOVERFLOW;
 
-       if (!check_shared_memory(shared_alloc_size + num_possible_cpus() * percpu_alloc_size))
+       histogram_alloc_size = (n_histogram_entries + 1) * (size_t)n_entries * sizeof(unsigned long long);
+       if (histogram_alloc_size / (n_histogram_entries + 1) != (size_t)n_entries * sizeof(unsigned long long))
+               return -EOVERFLOW;
+
+       if (!check_shared_memory(shared_alloc_size + histogram_alloc_size +
+                                num_possible_cpus() * (percpu_alloc_size + histogram_alloc_size)))
                return -ENOMEM;
 
        s = dm_kvzalloc(shared_alloc_size, NUMA_NO_NODE);
@@ -276,6 +291,15 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
        s->step = step;
        s->shared_alloc_size = shared_alloc_size;
        s->percpu_alloc_size = percpu_alloc_size;
+       s->histogram_alloc_size = histogram_alloc_size;
+
+       s->n_histogram_entries = n_histogram_entries;
+       s->histogram_boundaries = kmemdup(histogram_boundaries,
+                                         s->n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+       if (!s->histogram_boundaries) {
+               r = -ENOMEM;
+               goto out;
+       }
 
        s->program_id = kstrdup(program_id, GFP_KERNEL);
        if (!s->program_id) {
@@ -293,6 +317,19 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
                atomic_set(&s->stat_shared[ni].in_flight[WRITE], 0);
        }
 
+       if (s->n_histogram_entries) {
+               unsigned long long *hi;
+               hi = dm_kvzalloc(s->histogram_alloc_size, NUMA_NO_NODE);
+               if (!hi) {
+                       r = -ENOMEM;
+                       goto out;
+               }
+               for (ni = 0; ni < n_entries; ni++) {
+                       s->stat_shared[ni].tmp.histogram = hi;
+                       hi += s->n_histogram_entries + 1;
+               }
+       }
+
        for_each_possible_cpu(cpu) {
                p = dm_kvzalloc(percpu_alloc_size, cpu_to_node(cpu));
                if (!p) {
@@ -300,6 +337,18 @@ static int dm_stats_create(struct dm_stats *stats, sector_t start, sector_t end,
                        goto out;
                }
                s->stat_percpu[cpu] = p;
+               if (s->n_histogram_entries) {
+                       unsigned long long *hi;
+                       hi = dm_kvzalloc(s->histogram_alloc_size, cpu_to_node(cpu));
+                       if (!hi) {
+                               r = -ENOMEM;
+                               goto out;
+                       }
+                       for (ni = 0; ni < n_entries; ni++) {
+                               p[ni].histogram = hi;
+                               hi += s->n_histogram_entries + 1;
+                       }
+               }
        }
 
        /*
@@ -377,9 +426,11 @@ static int dm_stats_delete(struct dm_stats *stats, int id)
         * vfree can't be called from RCU callback
         */
        for_each_possible_cpu(cpu)
-               if (is_vmalloc_addr(s->stat_percpu))
+               if (is_vmalloc_addr(s->stat_percpu) ||
+                   is_vmalloc_addr(s->stat_percpu[cpu][0].histogram))
                        goto do_sync_free;
-       if (is_vmalloc_addr(s)) {
+       if (is_vmalloc_addr(s) ||
+           is_vmalloc_addr(s->stat_shared[0].tmp.histogram)) {
 do_sync_free:
                synchronize_rcu_expedited();
                dm_stat_free(&s->rcu_head);
@@ -486,15 +537,32 @@ static void dm_stat_for_entry(struct dm_stat *s, size_t entry,
                dm_stat_round(s, shared, p);
                atomic_inc(&shared->in_flight[idx]);
        } else {
+               unsigned long long duration;
                dm_stat_round(s, shared, p);
                atomic_dec(&shared->in_flight[idx]);
                p->sectors[idx] += len;
                p->ios[idx] += 1;
                p->merges[idx] += stats_aux->merged;
-               if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS))
+               if (!(s->stat_flags & STAT_PRECISE_TIMESTAMPS)) {
                        p->ticks[idx] += duration_jiffies;
-               else
+                       duration = jiffies_to_msecs(duration_jiffies);
+               } else {
                        p->ticks[idx] += stats_aux->duration_ns;
+                       duration = stats_aux->duration_ns;
+               }
+               if (s->n_histogram_entries) {
+                       unsigned lo = 0, hi = s->n_histogram_entries + 1;
+                       while (lo + 1 < hi) {
+                               unsigned mid = (lo + hi) / 2;
+                               if (s->histogram_boundaries[mid - 1] > duration) {
+                                       hi = mid;
+                               } else {
+                                       lo = mid;
+                               }
+
+                       }
+                       p->histogram[lo]++;
+               }
        }
 
 #if BITS_PER_LONG == 32
@@ -600,7 +668,22 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
        dm_stat_round(s, shared, p);
        local_irq_enable();
 
-       memset(&shared->tmp, 0, sizeof(shared->tmp));
+       shared->tmp.sectors[READ] = 0;
+       shared->tmp.sectors[WRITE] = 0;
+       shared->tmp.ios[READ] = 0;
+       shared->tmp.ios[WRITE] = 0;
+       shared->tmp.merges[READ] = 0;
+       shared->tmp.merges[WRITE] = 0;
+       shared->tmp.ticks[READ] = 0;
+       shared->tmp.ticks[WRITE] = 0;
+       shared->tmp.io_ticks[READ] = 0;
+       shared->tmp.io_ticks[WRITE] = 0;
+       shared->tmp.io_ticks_total = 0;
+       shared->tmp.time_in_queue = 0;
+
+       if (s->n_histogram_entries)
+               memset(shared->tmp.histogram, 0, (s->n_histogram_entries + 1) * sizeof(unsigned long long));
+
        for_each_possible_cpu(cpu) {
                p = &s->stat_percpu[cpu][x];
                shared->tmp.sectors[READ] += ACCESS_ONCE(p->sectors[READ]);
@@ -615,6 +698,11 @@ static void __dm_stat_init_temporary_percpu_totals(struct dm_stat_shared *shared
                shared->tmp.io_ticks[WRITE] += ACCESS_ONCE(p->io_ticks[WRITE]);
                shared->tmp.io_ticks_total += ACCESS_ONCE(p->io_ticks_total);
                shared->tmp.time_in_queue += ACCESS_ONCE(p->time_in_queue);
+               if (s->n_histogram_entries) {
+                       unsigned i;
+                       for (i = 0; i < s->n_histogram_entries + 1; i++)
+                               shared->tmp.histogram[i] += ACCESS_ONCE(p->histogram[i]);
+               }
        }
 }
 
@@ -644,6 +732,15 @@ static void __dm_stat_clear(struct dm_stat *s, size_t idx_start, size_t idx_end,
                p->io_ticks_total -= shared->tmp.io_ticks_total;
                p->time_in_queue -= shared->tmp.time_in_queue;
                local_irq_enable();
+               if (s->n_histogram_entries) {
+                       unsigned i;
+                       for (i = 0; i < s->n_histogram_entries + 1; i++) {
+                               local_irq_disable();
+                               p = &s->stat_percpu[smp_processor_id()][x];
+                               p->histogram[i] -= shared->tmp.histogram[i];
+                               local_irq_enable();
+                       }
+               }
        }
 }
 
@@ -733,7 +830,7 @@ static int dm_stats_print(struct dm_stats *stats, int id,
 
                __dm_stat_init_temporary_percpu_totals(shared, s, x);
 
-               DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu\n",
+               DMEMIT("%llu+%llu %llu %llu %llu %llu %llu %llu %llu %llu %d %llu %llu %llu %llu",
                       (unsigned long long)start,
                       (unsigned long long)step,
                       shared->tmp.ios[READ],
@@ -749,6 +846,13 @@ static int dm_stats_print(struct dm_stats *stats, int id,
                       dm_jiffies_to_msec64(s, shared->tmp.time_in_queue),
                       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[READ]),
                       dm_jiffies_to_msec64(s, shared->tmp.io_ticks[WRITE]));
+               if (s->n_histogram_entries) {
+                       unsigned i;
+                       for (i = 0; i < s->n_histogram_entries + 1; i++) {
+                               DMEMIT("%s%llu", !i ? " " : ":", shared->tmp.histogram[i]);
+                       }
+               }
+               DMEMIT("\n");
 
                if (unlikely(sz + 1 >= maxlen))
                        goto buffer_overflow;
@@ -790,10 +894,47 @@ static int dm_stats_set_aux(struct dm_stats *stats, int id, const char *aux_data
        return 0;
 }
 
+static int parse_histogram(const char *h, unsigned *n_histogram_entries,
+                          unsigned long long **histogram_boundaries)
+{
+       const char *q;
+       unsigned n;
+       unsigned long long last;
+
+       *n_histogram_entries = 1;
+       for (q = h; *q; q++)
+               if (*q == ',')
+                       (*n_histogram_entries)++;
+
+       *histogram_boundaries = kmalloc(*n_histogram_entries * sizeof(unsigned long long), GFP_KERNEL);
+       if (!*histogram_boundaries)
+               return -ENOMEM;
+
+       n = 0;
+       last = 0;
+       while (1) {
+               unsigned long long hi;
+               int s;
+               char ch;
+               s = sscanf(h, "%llu%c", &hi, &ch);
+               if (!s || (s == 2 && ch != ','))
+                       return -EINVAL;
+               if (hi <= last)
+                       return -EINVAL;
+               last = hi;
+               (*histogram_boundaries)[n] = hi;
+               if (s == 1)
+                       return 0;
+               h = strchr(h, ',') + 1;
+               n++;
+       }
+}
+
 static int message_stats_create(struct mapped_device *md,
                                unsigned argc, char **argv,
                                char *result, unsigned maxlen)
 {
+       int r;
        int id;
        char dummy;
        unsigned long long start, end, len, step;
@@ -801,6 +942,9 @@ static int message_stats_create(struct mapped_device *md,
        const char *program_id, *aux_data;
        unsigned stat_flags = 0;
 
+       unsigned n_histogram_entries = 0;
+       unsigned long long *histogram_boundaries = NULL;
+
        struct dm_arg_set as, as_backup;
        const char *a;
        unsigned feature_args;
@@ -811,7 +955,7 @@ static int message_stats_create(struct mapped_device *md,
         */
 
        if (argc < 3)
-               return -EINVAL;
+               goto ret_einval;
 
        as.argc = argc;
        as.argv = argv;
@@ -825,11 +969,11 @@ static int message_stats_create(struct mapped_device *md,
                        len = 1;
        } else if (sscanf(a, "%llu+%llu%c", &start, &len, &dummy) != 2 ||
                   start != (sector_t)start || len != (sector_t)len)
-               return -EINVAL;
+               goto ret_einval;
 
        end = start + len;
        if (start >= end)
-               return -EINVAL;
+               goto ret_einval;
 
        a = dm_shift_arg(&as);
        if (sscanf(a, "/%u%c", &divisor, &dummy) == 1) {
@@ -842,7 +986,7 @@ static int message_stats_create(struct mapped_device *md,
                        step = 1;
        } else if (sscanf(a, "%llu%c", &step, &dummy) != 1 ||
                   step != (sector_t)step || !step)
-               return -EINVAL;
+               goto ret_einval;
 
        as_backup = as;
        a = dm_shift_arg(&as);
@@ -850,11 +994,16 @@ static int message_stats_create(struct mapped_device *md,
                while (feature_args--) {
                        a = dm_shift_arg(&as);
                        if (!a)
-                               return -EINVAL;
+                               goto ret_einval;
                        if (!strcasecmp(a, "precise_timestamps"))
                                stat_flags |= STAT_PRECISE_TIMESTAMPS;
-                       else
-                               return -EINVAL;
+                       else if (!strncasecmp(a, "histogram:", 10)) {
+                               if (n_histogram_entries)
+                                       goto ret_einval;
+                               if ((r = parse_histogram(a + 10, &n_histogram_entries, &histogram_boundaries)))
+                                       goto ret;
+                       } else
+                               goto ret_einval;
                }
        } else {
                as = as_backup;
@@ -872,7 +1021,7 @@ static int message_stats_create(struct mapped_device *md,
                aux_data = a;
 
        if (as.argc)
-               return -EINVAL;
+               goto ret_einval;
 
        /*
         * If a buffer overflow happens after we created the region,
@@ -881,17 +1030,29 @@ static int message_stats_create(struct mapped_device *md,
         * leaked).  So we must detect buffer overflow in advance.
         */
        snprintf(result, maxlen, "%d", INT_MAX);
-       if (dm_message_test_buffer_overflow(result, maxlen))
-               return 1;
+       if (dm_message_test_buffer_overflow(result, maxlen)) {
+               r = 1;
+               goto ret;
+       }
 
-       id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags, program_id, aux_data,
+       id = dm_stats_create(dm_get_stats(md), start, end, step, stat_flags,
+                            n_histogram_entries, histogram_boundaries, program_id, aux_data,
                             dm_internal_suspend_fast, dm_internal_resume_fast, md);
-       if (id < 0)
-               return id;
+       if (id < 0) {
+               r = id;
+               goto ret;
+       }
 
        snprintf(result, maxlen, "%d", id);
 
-       return 1;
+       r = 1;
+       goto ret;
+
+ret_einval:
+       r = -EINVAL;
+ret:
+       kfree(histogram_boundaries);
+       return r;
 }
 
 static int message_stats_delete(struct mapped_device *md,