blk-mq: map all HWQ also in hyperthreaded system
author Max Gurtovoy <maxg@mellanox.com>
Thu, 29 Jun 2017 14:40:11 +0000 (08:40 -0600)
committer Jens Axboe <axboe@kernel.dk>
Thu, 29 Jun 2017 14:40:11 +0000 (08:40 -0600)
This patch performs sequential mapping between CPUs and queues.
If the system has more CPUs than HWQs, there are still CPUs left to
map to HWQs. In a hyperthreaded system, map the unmapped CPUs
and their siblings to the same HWQ.
This actually fixes a bug that left some HWQs unmapped in a system with
2 sockets, 18 cores per socket, 2 threads per core (72 CPUs in total)
running NVMEoF (which opens up to a maximum of 64 HWQs).

Performance results running fio (72 jobs, iodepth 128)
using null_blk (each cell is with/without the patch):

bs      IOPS(read submit_queues=72)   IOPS(write submit_queues=72)   IOPS(read submit_queues=24)  IOPS(write submit_queues=24)
-----  ----------------------------  ------------------------------ ---------------------------- -----------------------------
512    4890.4K/4723.5K                 4524.7K/4324.2K                   4280.2K/4264.3K               3902.4K/3909.5K
1k     4910.1K/4715.2K                 4535.8K/4309.6K                   4296.7K/4269.1K               3906.8K/3914.9K
2k     4906.3K/4739.7K                 4526.7K/4330.6K                   4301.1K/4262.4K               3890.8K/3900.1K
4k     4918.6K/4730.7K                 4556.1K/4343.6K                   4297.6K/4264.5K               3886.9K/3893.9K
8k     4906.4K/4748.9K                 4550.9K/4346.7K                   4283.2K/4268.8K               3863.4K/3858.2K
16k    4903.8K/4782.6K                 4501.5K/4233.9K                   4292.3K/4282.3K               3773.1K/3773.5K
32k    4885.8K/4782.4K                 4365.9K/4184.2K                   4307.5K/4289.4K               3780.3K/3687.3K
64k    4822.5K/4762.7K                 2752.8K/2675.1K                   4308.8K/4312.3K               2651.5K/2655.7K
128k   2388.5K/2313.8K                 1391.9K/1375.7K                   2142.8K/2152.2K               1395.5K/1374.2K

Signed-off-by: Max Gurtovoy <maxg@mellanox.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
block/blk-mq-cpumap.c

index 8e61e8640e1701a8736ab791aaac79983bde6225..2cca4fc43f45a962120b7c775e32dacfb26566c3 100644 (file)
 #include "blk.h"
 #include "blk-mq.h"
 
-static int cpu_to_queue_index(unsigned int nr_cpus, unsigned int nr_queues,
-                             const int cpu)
+static int cpu_to_queue_index(unsigned int nr_queues, const int cpu,
+                             const struct cpumask *online_mask)
 {
-       return cpu * nr_queues / nr_cpus;
+       /*
+        * Non online CPU will be mapped to queue index 0.
+        */
+       if (!cpumask_test_cpu(cpu, online_mask))
+               return 0;
+       return cpu % nr_queues;
 }
 
 static int get_first_sibling(unsigned int cpu)
@@ -36,55 +41,26 @@ int blk_mq_map_queues(struct blk_mq_tag_set *set)
        unsigned int *map = set->mq_map;
        unsigned int nr_queues = set->nr_hw_queues;
        const struct cpumask *online_mask = cpu_online_mask;
-       unsigned int i, nr_cpus, nr_uniq_cpus, queue, first_sibling;
-       cpumask_var_t cpus;
-
-       if (!alloc_cpumask_var(&cpus, GFP_ATOMIC))
-               return -ENOMEM;
-
-       cpumask_clear(cpus);
-       nr_cpus = nr_uniq_cpus = 0;
-       for_each_cpu(i, online_mask) {
-               nr_cpus++;
-               first_sibling = get_first_sibling(i);
-               if (!cpumask_test_cpu(first_sibling, cpus))
-                       nr_uniq_cpus++;
-               cpumask_set_cpu(i, cpus);
-       }
-
-       queue = 0;
-       for_each_possible_cpu(i) {
-               if (!cpumask_test_cpu(i, online_mask)) {
-                       map[i] = 0;
-                       continue;
-               }
+       unsigned int cpu, first_sibling;
 
+       for_each_possible_cpu(cpu) {
                /*
-                * Easy case - we have equal or more hardware queues. Or
-                * there are no thread siblings to take into account. Do
-                * 1:1 if enough, or sequential mapping if less.
+                * First do sequential mapping between CPUs and queues.
+                * In case we still have CPUs to map, and we have some number of
+                * threads per core then map sibling threads to the same queue for
+                * performance optimizations.
                 */
-               if (nr_queues >= nr_cpus || nr_cpus == nr_uniq_cpus) {
-                       map[i] = cpu_to_queue_index(nr_cpus, nr_queues, queue);
-                       queue++;
-                       continue;
+               if (cpu < nr_queues) {
+                       map[cpu] = cpu_to_queue_index(nr_queues, cpu, online_mask);
+               } else {
+                       first_sibling = get_first_sibling(cpu);
+                       if (first_sibling == cpu)
+                               map[cpu] = cpu_to_queue_index(nr_queues, cpu, online_mask);
+                       else
+                               map[cpu] = map[first_sibling];
                }
-
-               /*
-                * Less then nr_cpus queues, and we have some number of
-                * threads per cores. Map sibling threads to the same
-                * queue.
-                */
-               first_sibling = get_first_sibling(i);
-               if (first_sibling == i) {
-                       map[i] = cpu_to_queue_index(nr_uniq_cpus, nr_queues,
-                                                       queue);
-                       queue++;
-               } else
-                       map[i] = map[first_sibling];
        }
 
-       free_cpumask_var(cpus);
        return 0;
 }
 EXPORT_SYMBOL_GPL(blk_mq_map_queues);